# Credit Default Prediction

Predict the customers with high risk i.e. bad customers
Bad: these indicate if customer is bad (=1)


In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import imblearn


## Exploratory Data Analysis

In [37]:
data=pd.read_excel('dataset_final.xlsx')

In [38]:
data.shape

(22620, 276)

In [3]:
data.isna().sum().sum()

0

In [4]:
data.describe()

Unnamed: 0,count,cust,bad,a1,a2,a3,a4,a5,a6,a7,...,a264,a265,a266,a267,a268,a269,a270,a271,a272,a273
count,22620.0,22620.0,22620.0,22620.0,22620.0,22620.0,22620.0,22620.0,22620.0,22620.0,...,22620.0,22620.0,22620.0,22620.0,22620.0,22620.0,22620.0,22620.0,22620.0,22620.0
mean,11310.5,5618493.0,0.090451,1.876923,1.445225,1.507383,1.132759,2.581565,1.539346,2.505172,...,2.661981,1.93992,2.382891,1.392396,1.349027,1.2958,1.430283,2.395225,3.131919,2.143899
std,6529.975881,4419618.0,0.286833,0.328533,0.497002,0.499957,0.339321,1.116545,0.498461,0.564211,...,0.89483,0.237639,0.597043,0.488295,0.476673,0.456412,0.717027,0.614842,0.958725,1.016218
min,1.0,16480.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,5655.75,1822630.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,2.0,...,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0
50%,11310.5,4246013.0,0.0,2.0,1.0,2.0,1.0,3.0,2.0,3.0,...,3.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,3.0,2.0
75%,16965.25,9065963.0,0.0,2.0,2.0,2.0,1.0,4.0,2.0,3.0,...,3.0,2.0,3.0,2.0,2.0,2.0,2.0,3.0,4.0,3.0
max,22620.0,15685830.0,1.0,2.0,2.0,2.0,2.0,4.0,2.0,3.0,...,4.0,2.0,3.0,2.0,2.0,2.0,3.0,3.0,4.0,4.0


In [39]:
# Putting feature variables to X
x = data.drop(['bad','cust','count'],axis=1)

# Putting response variable to y
y = pd.DataFrame(data['bad'])

In [40]:
y['bad'].value_counts()

0    20574
1     2046
Name: bad, dtype: int64

As the target variable is highly imbalanced we are oversampling the minority class using SMOTE 

In [41]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(sampling_strategy='minority',random_state=27)
X,Y = sm.fit_sample(x, y)

In [42]:
Y['bad'].value_counts()

1    20574
0    20574
Name: bad, dtype: int64

### Data Preparation and Model Building

In [43]:
# Splitting the data into train and test

from sklearn.model_selection import train_test_split,RepeatedStratifiedKFold,cross_val_score

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, train_size=0.7,test_size=0.3,stratify=Y,random_state=100)

#### Default Hyperparameters
Let's first fit a random forest model with default hyperparameters.

In [15]:
# Importing random forest classifier from sklearn library
from sklearn.ensemble import RandomForestClassifier

# Running the random forest with default parameters.
rfc = RandomForestClassifier()
rfc.fit(X_train,Y_train)

  


RandomForestClassifier()

In [16]:
y_pred=rfc.predict(X_test)

In [18]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score,f1_score,roc_auc_score

In [19]:
print(roc_auc_score(Y_test,y_pred))
print(f1_score(Y_test,y_pred))

0.9535827473540776
0.9525858502275547


### Hyperparameter Tuning
Grid Search to Find Optimal Hyperparameters

In [44]:
# Create the parameter grid based on the results of random search 
from sklearn.model_selection import GridSearchCV
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=10)

param_grid = {
    'max_depth': [4,8,10],
    'min_samples_leaf': range(100, 400, 200),
    'min_samples_split': range(200, 500, 200),
    'n_estimators': [100,200, 300]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = cv, n_jobs = -1,verbose = 1)

In [46]:
# Fit the grid search to the data
grid_search.fit(X_train, Y_train)

Fitting 50 folds for each of 36 candidates, totalling 1800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 17.4min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 34.2min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 58.9min
[Parallel(n_jobs=-1)]: Done 1800 out of 1800 | elapsed: 59.8min finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=10),
             estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [4, 8, 10],
                         'min_samples_leaf': range(100, 400, 200),
                         'min_samples_split': range(200, 500, 200),
                         'n_estimators': [100, 200, 300]},
             verbose=1)

In [47]:
grid_search.best_score_,grid_search.best_params_

(0.8408778708203168,
 {'max_depth': 10,
  'min_samples_leaf': 100,
  'min_samples_split': 200,
  'n_estimators': 100})

In [48]:
model=RandomForestClassifier(n_estimators=100,max_depth=10,min_samples_leaf=100,min_samples_split=200)
model.fit(X_train, Y_train)

  


RandomForestClassifier(max_depth=10, min_samples_leaf=100,
                       min_samples_split=200)

In [49]:
y_pred=model.predict(X_test)

In [50]:
print(f1_score(y_pred,Y_test))
print(roc_auc_score(y_pred,Y_test))

0.8391293871978155
0.8378493522991007
