In [47]:
import pickle

import numpy as np
import pandas as pd
import sklearn as skl
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

In [63]:
# Load the raw application records from disk and preview the first five rows
df = pd.read_csv(filepath_or_buffer="Resources/application_record.csv")
df.head(n=5)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [64]:
# Build the working frame by removing the columns that are not used below
df2 = df.drop(
    columns=[
        'ID',
        'DAYS_BIRTH',
        'DAYS_EMPLOYED',
        'FLAG_MOBIL',
        'FLAG_WORK_PHONE',
        'FLAG_PHONE',
        'FLAG_EMAIL',
        'CNT_FAM_MEMBERS',
        'NAME_HOUSING_TYPE',
    ]
)

In [65]:
# Give the remaining columns shorter names, then discard every row that
# still has a missing value (returns a new frame instead of mutating in place)
df2 = df2.rename(
    columns={
        'CODE_GENDER': 'GENDER',
        'FLAG_OWN_CAR': 'CAR',
        'FLAG_OWN_REALTY': 'HOUSE',
        'AMT_INCOME_TOTAL': 'TOTAL INCOME',
        'NAME_INCOME_TYPE': 'INCOME CATEGORY',
        'NAME_EDUCATION_TYPE': 'EDUCATION LEVEL',
        'NAME_FAMILY_STATUS': 'MARITAL STATUS',
        'OCCUPATION_TYPE': 'OCCUPATION',
    }
).dropna()

# Show the distinct occupation labels left after the cleanup
df2['OCCUPATION'].unique()

array(['Security staff', 'Sales staff', 'Accountants', 'Laborers',
       'Managers', 'Drivers', 'Core staff', 'High skill tech staff',
       'Cleaning staff', 'Private service staff', 'Cooking staff',
       'Low-skill Laborers', 'Medicine staff', 'Secretaries',
       'Waiters/barmen staff', 'HR staff', 'Realty agents', 'IT staff'],
      dtype=object)

In [66]:

# Integer codes for every categorical column (one dict per column).
# NOTE(review): these codes are arbitrary labels, not a meaningful order; the
# linear models below will treat them as ordered magnitudes. OneHotEncoder is
# imported but unused - confirm whether one-hot encoding was intended.
income_categories = {'Working': 1, 'Commercial associate': 2, 'Pensioner': 3, 'State servant': 4, 'Student': 5}
education = {'Academic degree': 1, 'Higher education': 2, 'Incomplete higher': 3, 'Lower secondary': 4, 'Secondary / secondary special': 5}
family_status = {'Civil marriage': 1, 'Married': 2, 'Separated': 3, 'Single / not married': 4, 'Widow': 5}
gender = {'F': 0, 'M': 1}
car = {'N': 0, 'Y': 1}
house = {'N': 0, 'Y': 1}
occupancy = {'Security staff':1, 'Sales staff':2, 'Accountants':3, 'Laborers':4, 'Managers':5, 'Drivers':6,
             'Core staff':7, 'High skill tech staff':8,
             'Cleaning staff':9, 'Private service staff':10, 'Cooking staff':11,
             'Low-skill Laborers':12, 'Medicine staff':13, 'Secretaries':14,
             'Waiters/barmen staff':15, 'HR staff':16, 'Realty agents':17, 'IT staff':18}

# Which mapping applies to which column of df2
column_encodings = {
    'INCOME CATEGORY': income_categories,
    'EDUCATION LEVEL': education,
    'MARITAL STATUS': family_status,
    'GENDER': gender,
    'CAR': car,
    'HOUSE': house,
    'OCCUPATION': occupancy,
}

for column, mapping in column_encodings.items():
    # Series.map() silently converts any label missing from the dict to NaN
    # (this also happens when the cell is re-run on already-encoded data),
    # so verify coverage up front and fail with a clear message instead.
    unknown_labels = set(df2[column].dropna().unique()) - set(mapping)
    if unknown_labels:
        raise ValueError(
            f"Column {column!r} contains values with no integer code: "
            f"{sorted(unknown_labels, key=str)}. Extend the mapping, or rebuild "
            "df2 if this cell has already been run."
        )
    df2[column] = df2[column].map(mapping)

df2


Unnamed: 0,GENDER,CAR,HOUSE,CNT_CHILDREN,TOTAL INCOME,INCOME CATEGORY,EDUCATION LEVEL,MARITAL STATUS,OCCUPATION
2,1,1,1,0,112500.0,1,5,2,1
3,0,0,1,0,270000.0,2,5,4,2
4,0,0,1,0,270000.0,2,5,4,2
5,0,0,1,0,270000.0,2,5,4,2
6,0,0,1,0,270000.0,2,5,4,2
...,...,...,...,...,...,...,...,...,...
438541,1,0,1,0,202500.0,1,2,1,4
438548,1,1,1,1,135000.0,1,5,2,4
438553,0,0,0,0,103500.0,1,5,4,4
438554,0,0,0,0,54000.0,2,2,4,2


In [67]:
# Separate the predictors from the regression target:
# X keeps every column except the income, y is the income as a NumPy array
X = df2.drop(columns=["TOTAL INCOME"])
y = df2["TOTAL INCOME"].to_numpy()

print(X.shape, y.shape)

(304354, 8) (304354,)


In [69]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only.
# fit() returns the scaler itself, so X_scaled is the fitted StandardScaler
# (not a scaled array); it is the object pickled at the end of the notebook.
scaler = StandardScaler()
X_scaled = scaler.fit(X_train)

# Standardize both splits with the statistics learned from the training rows
X_train_scaled, X_test_scaled = (
    X_scaled.transform(split) for split in (X_train, X_test)
)

print(X_train_scaled.shape)
print(y_train.shape)

(243483, 8)
(243483,)


In [70]:
# Tune the Ridge regularization strength (alpha) with a 5-fold grid search
# scored by negative mean squared error
alphas = [0.1, 1.0, 10.0, 100.0, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
param_grid = {'alpha': alphas}
model = Ridge()
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train_scaled, y_train)

# Winning alpha and the estimator fitted with it
best_model = grid_search.best_estimator_
best_alpha = grid_search.best_params_['alpha']

# Score that estimator on the held-out split
y_pred = best_model.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred)

print("Best Alpha:", best_alpha)
print("R2 Score:", round(r2,2))

Best Alpha: 100.0
R2 Score: 0.09


In [73]:
# Model: Ridge Regression
# Use the alpha selected by the grid search above rather than a hand-copied
# constant, so this model cannot drift out of sync with the tuning result.
model = Ridge(alpha=best_alpha)
model.fit(X_train_scaled, y_train)

# Predictions on the held-out split
y_pred_ridge = model.predict(X_test_scaled)

# Metrics
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("Ridge Regression")
print("Mean Squared Error:", round(mse_ridge,2))
print("Mean Absolute Error:", round(mae_ridge,2))
print("R-squared:", round(r2_ridge,2))

Ridge Regression
Mean Squared Error: 12006292876.54
Mean Absolute Error: 68185.26
R-squared: 0.09


In [74]:
# Model: Lasso Regression with a fixed alpha of 100
# NOTE(review): 100 matches the alpha tuned for Ridge; no separate search is
# run for Lasso in this notebook - confirm the value is intended.
lasso_model = Lasso(alpha=100)
lasso_model.fit(X_train_scaled, y_train)

# Predictions on the held-out split
y_pred_lasso = lasso_model.predict(X_test_scaled)

# Evaluate the three error metrics against the true test targets
mse_lasso, mae_lasso, r2_lasso = (
    metric(y_test, y_pred_lasso)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("Lasso Regression")
print("Mean Squared Error:", round(mse_lasso,2))
print("Mean Absolute Error:", round(mae_lasso,2))
print("R-squared:", round(r2_lasso,2))

Lasso Regression
Mean Squared Error: 12006570707.83
Mean Absolute Error: 68181.54
R-squared: 0.09


In [75]:
# Model: Decision Tree Regressor, tuned with a 5-fold grid search over
# tree depth and the minimum number of samples required to split a node
decision_tree_model = DecisionTreeRegressor(random_state=78)
param_grid = {
    'max_depth': [50, 100, 150, 200, 400, 500, 1000, None],
    'min_samples_split': [2, 5, 10],
}
grid_search = GridSearchCV(
    estimator=decision_tree_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train_scaled, y_train)

# Selected settings, plus the estimator fitted with them
# (best_model is the trained tree used for the predictions below)
best_model = grid_search.best_estimator_
best_max_depth = grid_search.best_params_['max_depth']
best_min_samples_split = grid_search.best_params_['min_samples_split']

# Predictions on the held-out split
y_pred = best_model.predict(X_test_scaled)

# Evaluate the three error metrics against the true test targets
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("Best Hyperparameters:")
print("Best max_depth:", best_max_depth)
print("Best min_samples_split:", best_min_samples_split)
print("\nDecision Tree Regressor")
print("Mean Squared Error:", round(mse,2))
print("Mean Absolute Error:", round(mae,2))
print("R-squared:", round(r2,2))

Best Hyperparameters:
Best max_depth: 50
Best min_samples_split: 2

Decision Tree Regressor
Mean Squared Error: 9961163772.83
Mean Absolute Error: 60285.17
R-squared: 0.25


In [76]:
# Random forest hyperparameter search.
# (RandomForestRegressor is imported in the notebook's import cell.)

# Candidate values for each hyperparameter: 5 * 5 * 5 * 2 = 250 combinations
param_grid = {
    'n_estimators': [400, 500, 700, 800, 900],
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [45, 55, 65, 75, 85],
    'min_samples_leaf': [6, 7],
}

# Search on a small subset to keep the run time manageable:
# the first 1000 rows of the training split and of the test split
X_train_small = X_train_scaled[:1000]
y_train_small = y_train[:1000]
X_test_small = X_test_scaled[:1000]
y_test_small = y_test[:1000]

# Seed the forest so the search selects the same hyperparameters on every run
random_forest = RandomForestRegressor(random_state=42)

# 3-fold grid search scored by negative MSE; n_jobs=-1 runs the candidate
# fits in parallel on all available cores without changing the results
grid_search = GridSearchCV(
    random_forest,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Fit the GridSearchCV to the smaller training data
grid_search.fit(X_train_small, y_train_small)

# Get the best hyperparameters and the best model from the grid search
forest_best_params = grid_search.best_params_
forest_best_model = grid_search.best_estimator_

# Predict on the small test subset using the best model
forest_y_pred = forest_best_model.predict(X_test_small)

# Metrics
forest_mse = mean_squared_error(y_test_small, forest_y_pred)
forest_mae = mean_absolute_error(y_test_small, forest_y_pred)
forest_r2 = r2_score(y_test_small, forest_y_pred)

# Print the best hyperparameters and the subset metrics
print("Best Hyperparameters:")
print(forest_best_params)
print("Mean Squared Error:", round(forest_mse, 2))
print("Mean Absolute Error:", round(forest_mae, 2))
print("R-squared:", round(forest_r2, 2))

Best Hyperparameters:
{'max_depth': 50, 'min_samples_leaf': 7, 'min_samples_split': 45, 'n_estimators': 400}
Mean Squared Error: 9592711338.81
Mean Absolute Error: 70330.27
R-squared: 0.13


In [77]:
# Train the final random forest on the full training split.
# The hyperparameters come straight from the grid search result instead of
# being copied by hand from its printed output, and the seed is fixed so the
# reported metrics are reproducible. n_jobs=-1 builds the trees in parallel.
forest_model = RandomForestRegressor(**forest_best_params, random_state=42, n_jobs=-1)
forest_model.fit(X_train_scaled, y_train)

# Predictions on the full held-out split
y_forest = forest_model.predict(X_test_scaled)

# Metrics
mse_forest = mean_squared_error(y_test, y_forest)
mae_forest = mean_absolute_error(y_test, y_forest)
r2_forest = r2_score(y_test, y_forest)

print("Mean Squared Error:", round(mse_forest,2))
print("Mean Absolute Error:", round(mae_forest,2))
print("R-squared:", round(r2_forest,2))

Mean Squared Error: 10294942730.77
Mean Absolute Error: 62747.11
R-squared: 0.22


In [78]:
# Save model and scaler with pickle

# X_scaled is the fitted StandardScaler (the return value of scaler.fit)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(X_scaled, f)

# GridSearchCV trains clones of the estimator it is given, so
# decision_tree_model itself was never fitted. The trained tree is the
# best estimator from the decision tree search, held in best_model.
with open("decision_tree_model.pkl", 'wb') as f:
    pickle.dump(best_model, f)