# Explainable AI: Applications in Credit Scoring

## Pre-processing

- Thesis: Explainable AI: Applications in Credit Scoring
- Degree: Master of Information Management
- Dataset: Give Me Some Credit (GMC), taken from Kaggle

In [42]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [7]:
# Load the Give Me Some Credit training CSV into a DataFrame.
# NOTE(review): the location is tied to one user's home directory; consider a
# configurable data directory so the notebook runs on other machines.
path = "~/Desktop/Master of Information Management/Master's Thesis/GiveMeSomeCredit/cs-training.csv"
df_training = pd.read_csv(filepath_or_buffer=path)

In [11]:
# Remove the 'Unnamed: 0' column (presumably the row index saved in the CSV — confirm).
# Reassigning instead of using inplace=True, together with errors='ignore', keeps
# this cell safe to re-run: a second execution is a no-op rather than a KeyError.
df_training = df_training.drop(columns=['Unnamed: 0'], errors='ignore')

In [13]:
# Dealing with missing values
# Missing entries will be replaced by the median of their column.
imputer = SimpleImputer(strategy="median")

In [15]:
# Learn the per-column medians from the whole frame ('SeriousDlqin2yrs' included).
imputer.fit(df_training)

SimpleImputer(strategy='median')

In [18]:
# Fill the missing values with the fitted imputer; the index and column labels
# are reattached when the result is wrapped in a DataFrame below.
X = imputer.transform(df_training)

In [23]:
# Wrap the imputed values in a DataFrame again, reusing the existing index and column names.
df_training = pd.DataFrame(
    data=X,
    index=df_training.index,
    columns=df_training.columns,
)

In [30]:
# Count the missing values left in each column (all zeros expected after imputation).
df_training.isnull().sum()

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

In [32]:
# Feature scaling
# Min-max scaling with default settings: each column is mapped onto [0, 1].
scaler = MinMaxScaler()

In [33]:
# Learn each column's minimum and maximum from the imputed frame ('SeriousDlqin2yrs' included).
scaler.fit(df_training)

MinMaxScaler()

In [34]:
# Rescale every column with the fitted scaler; the index and column labels are
# reattached when the result is wrapped in a DataFrame below.
Y = scaler.transform(df_training)

In [37]:
# Wrap the scaled values in a DataFrame again, reusing the existing index and column names.
df_training = pd.DataFrame(
    data=Y,
    index=df_training.index,
    columns=df_training.columns,
)

In [40]:
# Summary statistics of the scaled frame; every column should show min 0 and max 1.
df_training.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0
mean,0.06684,0.0001192798,0.479773,0.004296,0.001070803,0.002133,0.145737,0.002714,0.018856,0.002453,0.036871
std,0.249746,0.004925364,0.135522,0.042783,0.006181502,0.004284,0.088723,0.042544,0.020922,0.0424,0.055351
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,5.890085e-07,0.376147,0.0,5.310675e-07,0.001297,0.086207,0.0,0.0,0.0,0.0
50%,0.0,3.04056e-06,0.477064,0.0,1.111762e-06,0.001795,0.137931,0.0,0.018519,0.0,0.0
75%,0.0,1.102481e-05,0.577982,0.0,2.633754e-06,0.002459,0.189655,0.0,0.037037,0.0,0.05
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [44]:
# Dealing with multicollinearity based on VIF

## independent variables set: every column except 'SeriousDlqin2yrs'
X = df_training.loc[:, [
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]]

In [46]:
## VIF dataframe
# Starts empty; the feature names and their VIF values are added as columns below.
vif_data = pd.DataFrame()

In [47]:
# One row per column of X.
vif_data["feature"] = X.columns

In [48]:
## calculating VIF for each feature
# NOTE(review): X holds only the feature columns (no constant column). To my
# understanding statsmodels' variance_inflation_factor does not add an intercept
# itself, so these would be uncentered VIFs — confirm against the statsmodels docs.
vif_data["VIF"] = [
    variance_inflation_factor(X.values, col_idx)
    for col_idx in range(X.shape[1])
]

In [49]:
# Show the VIF computed for each feature.
print(vif_data)

                                feature        VIF
0  RevolvingUtilizationOfUnsecuredLines   1.000777
1                                   age   3.638439
2  NumberOfTime30-59DaysPastDueNotWorse  41.173243
3                             DebtRatio   1.049552
4                         MonthlyIncome   1.269632
5       NumberOfOpenCreditLinesAndLoans   4.570548
6               NumberOfTimes90DaysLate  73.196237
7          NumberRealEstateLoansOrLines   2.304678
8  NumberOfTime60-89DaysPastDueNotWorse  91.181441
9                    NumberOfDependents   1.403443


In [51]:
## drop 'NumberOfTimes90DaysLate' and 'NumberOfTime60-89DaysPastDueNotWorse' (the two highest VIFs above) and repeat process
## 'NumberOfTime30-59DaysPastDueNotWorse' is kept
X = df_training[['RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines',
       'NumberOfDependents']]

In [52]:
# Reset the VIF table so it only reflects the reduced feature set.
vif_data = pd.DataFrame()

In [53]:
# One row per column of the reduced X.
vif_data["feature"] = X.columns

In [54]:
# Recompute the VIF of every remaining feature.
# NOTE(review): X holds only the feature columns (no constant column) — confirm
# whether variance_inflation_factor expects an intercept in the design matrix.
vif_data["VIF"] = [
    variance_inflation_factor(X.values, col_idx)
    for col_idx in range(X.shape[1])
]

In [55]:
# Show the VIF of each feature in the reduced set.
print(vif_data)

                                feature       VIF
0  RevolvingUtilizationOfUnsecuredLines  1.000777
1                                   age  3.623498
2  NumberOfTime30-59DaysPastDueNotWorse  1.007052
3                             DebtRatio  1.049539
4                         MonthlyIncome  1.269621
5       NumberOfOpenCreditLinesAndLoans  4.502202
6          NumberRealEstateLoansOrLines  2.303571
7                    NumberOfDependents  1.399348


In [57]:
# Remove the two features excluded from the second VIF pass from the working frame.
df_training = df_training.drop(
    columns=['NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfTimes90DaysLate'],
)

In [60]:
# (rows, columns) of the frame after dropping the two features.
df_training.shape

(150000, 9)

In [61]:
# Remaining column names.
df_training.columns

Index(['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines',
       'NumberOfDependents'],
      dtype='object')

In [62]:
# Column dtypes and non-null counts of the reduced frame.
df_training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 9 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      150000 non-null  float64
 1   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 2   age                                   150000 non-null  float64
 3   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  float64
 4   DebtRatio                             150000 non-null  float64
 5   MonthlyIncome                         150000 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       150000 non-null  float64
 7   NumberRealEstateLoansOrLines          150000 non-null  float64
 8   NumberOfDependents                    150000 non-null  float64
dtypes: float64(9)
memory usage: 10.3 MB


In [67]:
# Distribution of the scaled DebtRatio column.
# NOTE(review): the 75th percentile is ~2.6e-06 while the maximum is 1.0, so a few
# extreme values dominate the min-max range — consider handling outliers before scaling.
df_training["DebtRatio"].describe()

count    1.500000e+05
mean     1.070803e-03
std      6.181502e-03
min      0.000000e+00
25%      5.310675e-07
50%      1.111762e-06
75%      2.633754e-06
max      1.000000e+00
Name: DebtRatio, dtype: float64