In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv('Data/cleaned_cc_data.csv')
df.head()

cc_data_scaled = StandardScaler().fit_transform(df[["LIMIT_BAL", "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6"]])
# Use original column names for the scaled data and concatenate with the columns left out from scaling
cc_data_scaled = pd.DataFrame(cc_data_scaled, columns=["LIMIT_BAL", "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6"])
cc_data_scaled = pd.concat([cc_data_scaled, df[["LIMIT_BAL","SEX","EDUCATION","MARRIAGE","AGE","default.payment.next.month"]]], join='inner',  axis=1)
cc_data_scaled.head()

Unnamed: 0,LIMIT_BAL,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,...,BILL_AMT3.1,BILL_AMT4,BILL_AMT5,BILL_AMT6,LIMIT_BAL.1,SEX,EDUCATION,MARRIAGE,AGE,default.payment.next.month
0,-1.135512,1.791873,1.776654,-0.697265,-0.666863,-1.527923,-1.486255,-0.641203,-0.646339,-0.667135,...,-0.667135,-0.67174,-0.662877,-0.652876,20000.0,2,2,1,24,1
1,-0.365937,-0.876018,1.776654,0.136226,0.186234,0.232334,1.985406,-0.657981,-0.665755,-0.638302,...,-0.638302,-0.620771,-0.605957,-0.598087,120000.0,2,2,2,26,1
2,-0.59681,0.013279,0.109462,0.136226,0.186234,0.232334,0.249576,-0.296016,-0.492299,-0.480945,...,-0.480945,-0.448499,-0.41661,-0.391631,90000.0,2,2,2,34,0
3,-0.90464,0.013279,0.109462,0.136226,0.186234,0.232334,0.249576,-0.054075,-0.009999,0.035991,...,0.035991,-0.23068,-0.18578,-0.156445,50000.0,2,2,1,37,0
4,-0.90464,-0.876018,0.109462,-0.697265,0.186234,0.232334,0.249576,-0.577089,-0.610131,-0.158677,...,-0.158677,-0.345548,-0.347449,-0.331449,50000.0,1,2,1,57,0


In [2]:
# Define the features set.
X = cc_data_scaled.copy()
X = X.drop("default.payment.next.month", axis=1)
X.head()

Unnamed: 0,LIMIT_BAL,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,...,BILL_AMT2.1,BILL_AMT3.1,BILL_AMT4,BILL_AMT5,BILL_AMT6,LIMIT_BAL.1,SEX,EDUCATION,MARRIAGE,AGE
0,-1.135512,1.791873,1.776654,-0.697265,-0.666863,-1.527923,-1.486255,-0.641203,-0.646339,-0.667135,...,-0.646339,-0.667135,-0.67174,-0.662877,-0.652876,20000.0,2,2,1,24
1,-0.365937,-0.876018,1.776654,0.136226,0.186234,0.232334,1.985406,-0.657981,-0.665755,-0.638302,...,-0.665755,-0.638302,-0.620771,-0.605957,-0.598087,120000.0,2,2,2,26
2,-0.59681,0.013279,0.109462,0.136226,0.186234,0.232334,0.249576,-0.296016,-0.492299,-0.480945,...,-0.492299,-0.480945,-0.448499,-0.41661,-0.391631,90000.0,2,2,2,34
3,-0.90464,0.013279,0.109462,0.136226,0.186234,0.232334,0.249576,-0.054075,-0.009999,0.035991,...,-0.009999,0.035991,-0.23068,-0.18578,-0.156445,50000.0,2,2,1,37
4,-0.90464,-0.876018,0.109462,-0.697265,0.186234,0.232334,0.249576,-0.577089,-0.610131,-0.158677,...,-0.610131,-0.158677,-0.345548,-0.347449,-0.331449,50000.0,1,2,1,57


In [3]:
# Define the target set.
y = df["default.payment.next.month"].ravel()

In [4]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [5]:
rf_model_1 = RandomForestClassifier(n_estimators=128, random_state=78)

In [6]:
rf_model_1 = rf_model_1.fit(X_train, y_train)

In [7]:
predictions = rf_model_1.fit(X_train, y_train).predict(X_test)

In [8]:
balanced_accuracy_score(y_test, predictions)

0.6615554659576599

In [12]:
# This accuracy is not bad, but we can try to improve it by tuning the model parameters.
# Let's start by checking the feature importances.
importances = rf_model_1.feature_importances_
importances

# We can sort the features by their importance.
sorted(zip(rf_model_1.feature_importances_, X.columns), reverse=True)

# We can see that the most important features are PAY_0, AGE, and LIMIT_BAL.
# We can try to drop the less important features and see if the model improves.
X = cc_data_scaled.copy()
X = X.drop(["default.payment.next.month", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6"], axis=1)
X.head()

# Define the target set.
y = df["default.payment.next.month"].ravel()

# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

rf_model_2 = RandomForestClassifier(n_estimators=128, random_state=78)

rf_model_2 = rf_model_2.fit(X_train, y_train)

predictions = rf_model_2.fit(X_train, y_train).predict(X_test)

balanced_accuracy_score(y_test, predictions)

# The accuracy has improved slightly, but not enough to reach our target of 75%
# Let's try to improve the model by tuning the hyperparameters.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 4, 6]
}

grid = GridSearchCV(rf_model_2, param_grid, verbose=3)

grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)
predictions = grid.predict(X_test)
balanced_accuracy_score(y_test, predictions)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END max_depth=5, min_samples_split=2, n_estimators=100;, score=0.810 total time=   0.8s
[CV 2/5] END max_depth=5, min_samples_split=2, n_estimators=100;, score=0.818 total time=   0.8s
[CV 3/5] END max_depth=5, min_samples_split=2, n_estimators=100;, score=0.815 total time=   0.8s
[CV 4/5] END max_depth=5, min_samples_split=2, n_estimators=100;, score=0.820 total time=   0.8s
[CV 5/5] END max_depth=5, min_samples_split=2, n_estimators=100;, score=0.821 total time=   0.8s
[CV 1/5] END max_depth=5, min_samples_split=2, n_estimators=200;, score=0.812 total time=   1.7s
[CV 2/5] END max_depth=5, min_samples_split=2, n_estimators=200;, score=0.820 total time=   1.7s
[CV 3/5] END max_depth=5, min_samples_split=2, n_estimators=200;, score=0.818 total time=   1.7s
[CV 4/5] END max_depth=5, min_samples_split=2, n_estimators=200;, score=0.822 total time=   1.7s
[CV 5/5] END max_depth=5, min_samples_split=2, n_estimators=200;,

0.6648122121086484

In [13]:
#Running the best hyperparameters still doesn't get us to 75% accuracy. We need to use a different model to predict default
rf_model_3 = RandomForestClassifier(max_depth=10, min_samples_split=4, n_estimators=300,
                       random_state=78)

rf_model_3 = rf_model_3.fit(X_train, y_train)

predictions = rf_model_3.fit(X_train, y_train).predict(X_test)

balanced_accuracy_score(y_test, predictions)

0.6648122121086484