In [42]:
import pandas as pd
import sklearn as skl
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import pickle

In [43]:
df = pd.read_csv("Resources/application_record.csv")
df.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [44]:
# Drop Unwanted Columns
df2 = df.drop(['ID','DAYS_BIRTH', 'DAYS_EMPLOYED', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'NAME_HOUSING_TYPE'],axis=1)

In [45]:
# Rename columns
df2 = df2.rename(columns={'CODE_GENDER': 'GENDER', 
                   'FLAG_OWN_CAR': 'CAR',
                   'FLAG_OWN_REALTY': 'HOUSE',
                   'AMT_INCOME_TOTAL': 'TOTAL INCOME',
                   'NAME_INCOME_TYPE': 'INCOME CATEGORY',
                   'NAME_EDUCATION_TYPE': 'EDUCATION LEVEL',
                   'NAME_FAMILY_STATUS': 'MARITAL STATUS'})
                   
df2.head()

Unnamed: 0,GENDER,CAR,HOUSE,CNT_CHILDREN,TOTAL INCOME,INCOME CATEGORY,EDUCATION LEVEL,MARITAL STATUS
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married


In [46]:

# make a list of the income categories converted to integers
income_categories = {'Working': 1, 'Commercial associate': 2, 'Pensioner': 3, 'State servant': 4, 'Student': 5}
education = {'Academic degree': 1, 'Higher education': 2, 'Incomplete higher': 3, 'Lower secondary': 4, 'Secondary / secondary special': 5}
family_status = {'Civil marriage': 1, 'Married': 2, 'Separated': 3, 'Single / not married': 4, 'Widow': 5}
gender = {'F': 0, 'M': 1}
car = {'N': 0, 'Y': 1}
house = {'N': 0, 'Y': 1}



df2['INCOME CATEGORY'] = df2['INCOME CATEGORY'].map(income_categories)
df2['EDUCATION LEVEL'] = df2['EDUCATION LEVEL'].map(education)
df2['MARITAL STATUS'] = df2['MARITAL STATUS'].map(family_status)
df2['GENDER'] = df2['GENDER'].map(gender)
df2['CAR'] = df2['CAR'].map(car)
df2['HOUSE'] = df2['HOUSE'].map(house)

df2


Unnamed: 0,GENDER,CAR,HOUSE,CNT_CHILDREN,TOTAL INCOME,INCOME CATEGORY,EDUCATION LEVEL,MARITAL STATUS
0,1,1,1,0,427500.0,1,2,1
1,1,1,1,0,427500.0,1,2,1
2,1,1,1,0,112500.0,1,5,2
3,0,0,1,0,270000.0,2,5,4
4,0,0,1,0,270000.0,2,5,4
...,...,...,...,...,...,...,...,...
438552,1,0,1,0,135000.0,3,5,3
438553,0,0,0,0,103500.0,1,5,4
438554,0,0,0,0,54000.0,2,2,4
438555,0,0,1,0,72000.0,3,5,2


In [52]:
# Generate our categorical variable lists
# app_cat = df2.dtypes[df2.dtypes == 'object'].index.to_list()

In [51]:
# Check the number of unique values in each column
# df2[app_cat].nunique()

In [50]:
# # Create OneHotEncoder instance
# encoder = OneHotEncoder(sparse=False)

# # Fit and transform the OneHotEncoder using the categorical variable list
# encode_df = pd.DataFrame(encoder.fit_transform(df2[app_cat]))

# # Add the encoded variable names to the dataframe
# encode_df.columns = encoder.get_feature_names(app_cat)
# encode_df.head()

In [53]:
# # Merge one-hot encoded features and drop the originals
# df2 = df2.merge(encode_df,left_index=True, right_index=True)
# df2 = df2.drop(app_cat, axis=1)
# df2.head()

In [54]:
# Split our preprocessed data into our features and target arrays
y = df2["TOTAL INCOME"].values
X = df2.drop(["TOTAL INCOME"],axis=1)

X.dtypes

GENDER             int64
CAR                int64
HOUSE              int64
CNT_CHILDREN       int64
INCOME CATEGORY    int64
EDUCATION LEVEL    int64
MARITAL STATUS     int64
dtype: object

In [55]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler for ridge hyperparameter tuning
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [56]:
# Scale the data
X_scaler = scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [57]:
# Find best alpha hyperparameter for Ridge Regression model
model = Ridge()
alphas = [0.1, 1.0, 10.0, 100.0, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
param_grid = {'alpha': alphas}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_scaled, y)
best_alpha = grid_search.best_params_['alpha']
best_model = grid_search.best_estimator_

print("Best Alpha:", best_alpha)

Best Alpha: 800


In [58]:
# Model: Ridge Regression
model = Ridge(alpha=800)
model.fit(X_train_scaled, y_train)

# Predictions
y_pred_ridge = model.predict(X_test_scaled)

# Metrics
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("Ridge Regression")
print("Mean Squared Error:", round(mse_ridge,2))
print("Mean Absolute Error:", round(mae_ridge,2))
print("R-squared:", round(r2_ridge,2))

Ridge Regression
Mean Squared Error: 10741745392.96
Mean Absolute Error: 66853.5
R-squared: 0.09


In [60]:
# Model: Lasso Regression
lasso_model = Lasso(alpha=800) 
lasso_model.fit(X_train, y_train)

# Predict
y_pred_lasso = lasso_model.predict(X_test)

# Metrics
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

print("Lasso Regression")
print("Mean Squared Error:", round(mse_lasso,2))
print("Mean Absolute Error:", round(mae_lasso,2))
print("R-squared:", round(r2_lasso,2))

Lasso Regression
Mean Squared Error: 10750379769.0
Mean Absolute Error: 66827.62
R-squared: 0.09


In [65]:
# Model: Decision Tree Regressor

decision_tree_model = DecisionTreeRegressor(max_depth=150, random_state=78)
decision_tree_model.fit(X_train, y_train)

# Predict
y_pred = decision_tree_model.predict(X_test)

# Metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(X_train.shape)
print(X_test.shape)
print("Mean Squared Error:", round(mse,2))
print("Mean Absolute Error:", round(mae,2))
print("R-squared:", round(r2,2))

(350845, 7)
(87712, 7)
Mean Squared Error: 10179140269.38
Mean Absolute Error: 64756.69
R-squared: 0.14


In [66]:
# Save model and scaler with pickle
with open('scaler.pkl', 'wb') as f:
    pickle.dump(X_scaler, f)

with open("decision_tree_model.pkl", 'wb') as f:
    pickle.dump(decision_tree_model, f)