In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, roc_auc_score, confusion_matrix, precision_recall_curve, auc
from sklearn import svm
from scipy.stats import chi2_contingency

#Importing the data
df_credit = pd.read_csv("german_credit_data.csv",index_col=0)

In [2]:
df_credit.iloc[:,-1:]

Unnamed: 0,Risk
0,good
1,bad
2,good
3,good
4,bad
...,...
995,good
996,good
997,good
998,bad


In [3]:
df_credit.dtypes

Age                  int64
Sex                 object
Job                  int64
Housing             object
Saving accounts     object
Checking account    object
Credit amount        int64
Duration             int64
Purpose             object
Risk                object
dtype: object

In [4]:
df_credit.isna().sum()

Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
Risk                  0
dtype: int64

In [5]:
df_credit['Saving accounts'].unique()

array([nan, 'little', 'quite rich', 'rich', 'moderate'], dtype=object)

In [6]:
df_credit['Checking account'].unique()

array(['little', 'moderate', nan, 'rich'], dtype=object)

In [7]:
def clean1(x):
    
    if x != x:
        return 0
    elif x == 'little':
        return 1
    elif x == 'moderate':
        return 2
    elif x == 'quite rich':
        return 3
    else:
        return 4
    
df_credit['Saving accounts'] = df_credit['Saving accounts'].map(clean1)
df_credit['Checking account'] = df_credit['Checking account'].map(clean1)
df_credit

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,0,1,1169,6,radio/TV,good
1,22,female,2,own,1,2,5951,48,radio/TV,bad
2,49,male,1,own,1,0,2096,12,education,good
3,45,male,2,free,1,1,7882,42,furniture/equipment,good
4,53,male,2,free,1,1,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,1,0,1736,12,furniture/equipment,good
996,40,male,3,own,1,1,3857,30,car,good
997,38,male,2,own,1,0,804,12,radio/TV,good
998,23,male,2,free,1,1,1845,45,radio/TV,bad


## Chi-squared Test for Categorical Features

In [8]:
categorical_X = df_credit.select_dtypes(include = 'object')
numerical_X = df_credit.select_dtypes(include='number')

y = df_credit['Risk']

In [9]:
categorical_X

Unnamed: 0,Sex,Housing,Purpose,Risk
0,male,own,radio/TV,good
1,female,own,radio/TV,bad
2,male,own,education,good
3,male,free,furniture/equipment,good
4,male,free,car,bad
...,...,...,...,...
995,female,own,furniture/equipment,good
996,male,own,car,good
997,male,own,radio/TV,good
998,male,free,radio/TV,bad


In [10]:
numerical_X

Unnamed: 0,Age,Job,Saving accounts,Checking account,Credit amount,Duration
0,67,2,0,1,1169,6
1,22,2,1,2,5951,48
2,49,1,1,0,2096,12
3,45,2,1,1,7882,42
4,53,2,1,1,4870,24
...,...,...,...,...,...,...
995,31,1,1,0,1736,12
996,40,3,1,1,3857,30
997,38,2,1,0,804,12
998,23,2,1,1,1845,45


In [11]:
chisquare = {}

for column in categorical_X:
    for column2 in categorical_X:
        if column != column2:
            chi, p, dof, ex = chi2_contingency(pd.crosstab(categorical_X[column], categorical_X[column2]))
            chisquare.setdefault('Feature 1', []).append(column)
            chisquare.setdefault('Feature 2', []).append(column2)
            chisquare.setdefault('chi', []).append(round(chi,3))

chisquare_result = pd.DataFrame(data=chisquare)
chisquare_result

Unnamed: 0,Feature 1,Feature 2,chi
0,Sex,Housing,53.955
1,Sex,Purpose,21.262
2,Sex,Risk,5.349
3,Housing,Sex,53.955
4,Housing,Purpose,65.018
5,Housing,Risk,18.2
6,Purpose,Sex,21.262
7,Purpose,Housing,65.018
8,Purpose,Risk,13.642
9,Risk,Sex,5.349


Since Sex and Risk have a very low chi-square statistic value, it shows a greater extent of independence between the 2 variables. For our analysis in classification later on, we will not consider Sex variable.

In [12]:
categorical_X = categorical_X.drop('Sex', axis= 1)
categorical_X

Unnamed: 0,Housing,Purpose,Risk
0,own,radio/TV,good
1,own,radio/TV,bad
2,own,education,good
3,free,furniture/equipment,good
4,free,car,bad
...,...,...,...
995,own,furniture/equipment,good
996,own,car,good
997,own,radio/TV,good
998,free,radio/TV,bad


In [13]:
dummy_df = pd.DataFrame()
for column in categorical_X:
    dummies = pd.get_dummies(categorical_X[column], prefix=column)
    dummy_df = pd.concat([dummy_df, dummies], axis=1)

In [14]:
X = pd.concat([dummy_df, numerical_X], axis=1)

In [39]:
mapped_y = y.map(lambda x: 1 if x == 'good' else 0)

In [41]:
cleaned_df = pd.concat([X,mapped_y],axis=1)
cleaned_df.to_csv("cleaned_german_credit_data.csv")

# ML Classification

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

In [17]:
clf = svm.SVC(kernel = 'linear')
clf.fit(X_train, y_train)

SVC(kernel='linear')

In [31]:
preds = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, preds)}")

Accuracy: 0.99


In [36]:
X_test.reset_index()

Unnamed: 0,index,Housing_free,Housing_own,Housing_rent,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Risk_bad,Risk_good,Age,Job,Saving accounts,Checking account,Credit amount,Duration
0,977,0,1,0,1,0,0,0,0,0,0,0,0,1,42,2,0,2,2427,18
1,735,0,1,0,0,0,1,0,0,0,0,0,0,1,29,0,0,2,3990,36
2,615,0,1,0,1,0,0,0,0,0,0,0,0,1,48,3,0,2,12204,48
3,413,0,0,1,0,1,0,0,0,0,0,0,0,1,40,1,3,0,1597,10
4,563,1,0,0,0,1,0,0,0,0,0,0,1,0,37,2,0,2,12389,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,138,0,1,0,0,0,0,0,0,1,0,0,0,1,35,2,0,2,2728,15
196,442,0,1,0,0,0,0,0,0,0,0,1,0,1,29,2,1,2,2629,20
197,153,0,0,1,0,1,0,0,0,0,0,0,0,1,29,2,4,2,7758,24
198,325,0,1,0,0,1,0,0,0,0,0,0,0,1,39,1,1,1,3398,8


In [33]:
pd.DataFrame(preds)

Unnamed: 0,0
0,good
1,good
2,good
3,good
4,bad
...,...
195,good
196,good
197,good
198,good


In [None]:
print(f"Precision Score: {precision_score(y_test, preds, pos_label='good')}")

In [20]:
print(f"Recall: {recall_score(y_test, preds, pos_label='good')}")

Recall: 0.9857142857142858


## Classification with Neural Networks

In [21]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from copy import deepcopy

### Normalize Numerical Features - Age, Credit Amount and Duration

In [22]:
scl = StandardScaler()
numerical_X_NN = deepcopy(numerical_X)
for column in numerical_X_NN:
    if column in ['Age','Credit amount', 'Duration']:
        numerical_X_NN[column] = scl.fit_transform(numerical_X_NN[column].values.reshape(-1,1))
numerical_X_NN

Unnamed: 0,Age,Job,Saving accounts,Checking account,Credit amount,Duration
0,2.766456,2,0,1,-0.745131,-1.236478
1,-1.191404,2,1,2,0.949817,2.248194
2,1.183312,1,1,0,-0.416562,-0.738668
3,0.831502,2,1,1,1.634247,1.750384
4,1.535122,2,1,1,0.566664,0.256953
...,...,...,...,...,...,...
995,-0.399832,1,1,0,-0.544162,-0.738668
996,0.391740,3,1,1,0.207612,0.754763
997,0.215835,2,1,0,-0.874503,-0.738668
998,-1.103451,2,1,1,-0.505528,1.999289


In [23]:
X_NN = pd.concat([numerical_X_NN, dummy_df], axis=1)
X_NN

Unnamed: 0,Age,Job,Saving accounts,Checking account,Credit amount,Duration,Housing_free,Housing_own,Housing_rent,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Risk_bad,Risk_good
0,2.766456,2,0,1,-0.745131,-1.236478,0,1,0,0,0,0,0,0,1,0,0,0,1
1,-1.191404,2,1,2,0.949817,2.248194,0,1,0,0,0,0,0,0,1,0,0,1,0
2,1.183312,1,1,0,-0.416562,-0.738668,0,1,0,0,0,0,1,0,0,0,0,0,1
3,0.831502,2,1,1,1.634247,1.750384,1,0,0,0,0,0,0,1,0,0,0,0,1
4,1.535122,2,1,1,0.566664,0.256953,1,0,0,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.399832,1,1,0,-0.544162,-0.738668,0,1,0,0,0,0,0,1,0,0,0,0,1
996,0.391740,3,1,1,0.207612,0.754763,0,1,0,0,1,0,0,0,0,0,0,0,1
997,0.215835,2,1,0,-0.874503,-0.738668,0,1,0,0,0,0,0,0,1,0,0,0,1
998,-1.103451,2,1,1,-0.505528,1.999289,1,0,0,0,0,0,0,0,1,0,0,1,0


In [24]:
y_NN = y.map(lambda x: 1 if x == 'good' else 0)
y_NN

0      1
1      0
2      1
3      1
4      0
      ..
995    1
996    1
997    1
998    0
999    1
Name: Risk, Length: 1000, dtype: int64

In [25]:
model = tf.keras.models.Sequential()
model.add(Dense(16, activation='relu', input_dim=19))
model.add(Dense(8, activation='relu'))
#model.add(Dropout(0.4))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [26]:
X_train_NN, X_test_NN, y_train_NN, y_test_NN = train_test_split(X_NN, y_NN, test_size = 0.2, random_state = 42, stratify = y_NN)

In [27]:
model.fit(X_train_NN, y_train_NN, epochs = 300, verbose =False)

<tensorflow.python.keras.callbacks.History at 0x1584a4909c8>

In [28]:
scores = model.evaluate(X_train_NN, y_train_NN, verbose=False)
print("Training Accuracy: %.2f%%\n" % (scores[1]*100))

Training Accuracy: 100.00%



In [29]:
scores = model.evaluate(X_test_NN, y_test_NN, verbose=False)
print("Testing Accuracy: %.2f%%\n" % (scores[1]*100))

Testing Accuracy: 100.00%



In [30]:
preds = model.predict(X_test_NN)
results = pd.DataFrame(preds).reset_index()
results['predicted'] = results[0].map(lambda x: 1 if x >= 0.5 else 0)
results = pd.concat([results, pd.DataFrame(y_test_NN).reset_index()], axis=1)
results.to_csv("results.csv")