# Explainable AI: Application in Credit Scoring

## Training Random forests

Thesis: Explainable AI: Applications in Credit Scoring
Degree: Master of Information Management
Dataset: Give Me Some Credit (GMC), taken from Kaggle

In [80]:
# Pre-processing libraries
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [81]:
# ML-related libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split

In [2]:
# Import the training dataset
path = "~/Desktop/Master of Information Management/Master's Thesis/GiveMeSomeCredit/cs-training.csv"
df_training = pd.read_csv(path)

In [3]:
df_training.drop(['Unnamed: 0'], axis=1, inplace=True)

In [4]:
# Dealing with missing values
imputer = SimpleImputer(strategy="median")

In [5]:
imputer.fit(df_training)

SimpleImputer(strategy='median')

In [6]:
X = imputer.transform(df_training)

In [7]:
df_training = pd.DataFrame(X, columns=df_training.columns, index=df_training.index)

In [8]:
df_training.isna().sum()

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

In [9]:
# Feature scaling
scaler = MinMaxScaler()

In [10]:
scaler.fit(df_training)

MinMaxScaler()

In [11]:
Y = scaler.transform(df_training)

In [12]:
df_training = pd.DataFrame(Y, columns=df_training.columns, index=df_training.index)

In [13]:
df_training.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0
mean,0.06684,0.0001192798,0.479773,0.004296,0.001070803,0.002133,0.145737,0.002714,0.018856,0.002453,0.036871
std,0.249746,0.004925364,0.135522,0.042783,0.006181502,0.004284,0.088723,0.042544,0.020922,0.0424,0.055351
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,5.890085e-07,0.376147,0.0,5.310675e-07,0.001297,0.086207,0.0,0.0,0.0,0.0
50%,0.0,3.04056e-06,0.477064,0.0,1.111762e-06,0.001795,0.137931,0.0,0.018519,0.0,0.0
75%,0.0,1.102481e-05,0.577982,0.0,2.633754e-06,0.002459,0.189655,0.0,0.037037,0.0,0.05
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Dealing with multicollinearity (based on VIF)

df_training = df_training.drop(['NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfTimes90DaysLate'], axis=1)

In [20]:
df_training.shape

(150000, 9)

In [22]:
df_training.columns

Index(['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines',
       'NumberOfDependents'],
      dtype='object')

In [82]:
# Split the dataset into features and label

features = df_training.drop(['SeriousDlqin2yrs'], axis=1)
labels = df_training['SeriousDlqin2yrs']

In [88]:
# Create a training set and a test set

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.20)

In [89]:
# Instantiate the model

rf = RandomForestClassifier()

In [None]:
# Calculating the optimal candidate settings for max_features

[np.trunc(np.sqrt((np.log2(150000) + 1)*0.1)), np.trunc(np.sqrt((np.log2(150000) + 1)*0.25)), 
 np.trunc(np.sqrt((np.log2(150000) + 1)*0.5)), np.trunc(np.sqrt((np.log2(150000) + 1)*1)), 
 np.trunc(np.sqrt((np.log2(150000) + 1)*2)), np.trunc(np.sqrt((np.log2(150000) + 1)*4))]

In [90]:
# Create the parameter grid
param_grid = {
    'n_estimators': [100, 250, 500, 750, 1000],
    'max_features': [1, 2, 3, 4, 6, 8]
}

In [91]:
# Instantiate the grid search model
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5, verbose=3)

In [None]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END ...............max_features=1, n_estimators=100; total time=   8.4s
[CV 2/5] END ...............max_features=1, n_estimators=100; total time=   8.6s
[CV 3/5] END ...............max_features=1, n_estimators=100; total time=   8.0s
[CV 4/5] END ...............max_features=1, n_estimators=100; total time=   8.4s
[CV 5/5] END ...............max_features=1, n_estimators=100; total time=   8.9s
[CV 1/5] END ...............max_features=1, n_estimators=250; total time=  20.8s
[CV 2/5] END ...............max_features=1, n_estimators=250; total time=  20.7s
[CV 3/5] END ...............max_features=1, n_estimators=250; total time=  22.7s
[CV 4/5] END ...............max_features=1, n_estimators=250; total time=  21.2s
[CV 5/5] END ...............max_features=1, n_estimators=250; total time=  20.7s
[CV 1/5] END ...............max_features=1, n_estimators=500; total time=  43.9s
[CV 2/5] END ...............max_features=1, n_e

[CV 2/5] END ...............max_features=6, n_estimators=100; total time=  35.0s
[CV 3/5] END ...............max_features=6, n_estimators=100; total time=  38.5s
[CV 4/5] END ...............max_features=6, n_estimators=100; total time=  35.2s
[CV 5/5] END ...............max_features=6, n_estimators=100; total time=  35.9s
[CV 1/5] END ...............max_features=6, n_estimators=250; total time= 1.6min
[CV 2/5] END ...............max_features=6, n_estimators=250; total time=27.4min
[CV 3/5] END ...............max_features=6, n_estimators=250; total time= 1.7min
[CV 4/5] END ...............max_features=6, n_estimators=250; total time= 1.6min
[CV 5/5] END ...............max_features=6, n_estimators=250; total time= 1.7min
[CV 1/5] END ...............max_features=6, n_estimators=500; total time= 3.5min
[CV 2/5] END ...............max_features=6, n_estimators=500; total time= 4.1min
[CV 3/5] END ...............max_features=6, n_estimators=500; total time= 3.2min
[CV 4/5] END ...............