In [1]:
import pandas as pd
import seaborn as sns 
import numpy as np
import matplotlib.pyplot as plt

# Silence every warning for the rest of the session.
# NOTE(review): this also hides any warnings raised by the models fitted below.
import warnings
warnings.filterwarnings('ignore')

# Show all columns whenever a DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [2]:
# Read the dataset from disk and preview its first rows.
DATA_PATH = "data/cleaned_Data.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [3]:
# Number of rows for each value of the column that is later used as the label `y`.
df["default payment next month"].value_counts()

0    23364
1     6636
Name: default payment next month, dtype: int64

In [4]:
# (rows, columns) of the loaded frame.
df.shape

(30000, 24)

## Remove outliers (z-score filter)

In [5]:
from scipy.stats import zscore

In [6]:
# Keep only the rows in which every column has an absolute z-score below the
# threshold.
# NOTE(review): the z-scores are computed over all columns of `df`, including
# the label column that is later split off as `y` — confirm this is intended.
Z_THRESHOLD = 3

z_scores = zscore(df)
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < Z_THRESHOLD).all(axis=1)
new_df = df.loc[filtered_entries]

# Handle class imbalance - Undersampling (NearMiss)

In [7]:
from imblearn.under_sampling import NearMiss

In [8]:
# Separate the outlier-filtered frame into features (X) and label (y).
TARGET_COL = "default payment next month"
X = new_df.drop(columns=[TARGET_COL])
y = new_df[TARGET_COL]

In [9]:
# Undersample the feature/label pair with imblearn's NearMiss (default settings).
# NOTE(review): resampling runs on the full X / y before the train/test split
# made in a later cell, so the held-out split is drawn from resampled data.
nm = NearMiss()
X_res,y_res=nm.fit_resample(X,y)

In [10]:
# Display the resampled feature frame.
X_res

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,360000,1,2,1,42,1,-2,-2,-2,-2,-2,0,0,0,0,0,0,0,0,0,0,0,0
1,360000,2,1,1,36,-2,-2,-2,-2,-2,-2,0,0,0,0,0,0,0,0,0,0,0,0
2,360000,2,2,2,34,1,-2,-2,-2,-2,-2,0,0,0,0,0,0,0,0,0,0,0,0
3,360000,1,2,1,37,1,-2,-2,-2,-2,-2,0,0,0,0,0,0,0,0,0,0,0,0
4,360000,1,1,2,29,-2,-2,-2,-2,-2,-2,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11907,40000,1,2,2,47,2,2,3,2,2,2,52358,54892,53415,51259,47151,46934,4000,0,2000,0,3520,0
11908,90000,1,2,1,36,0,0,0,0,0,0,7752,9112,10306,11328,12036,14329,1500,1500,1500,1200,2500,0
11909,210000,1,2,1,34,3,2,2,2,2,2,2500,2500,2500,2500,2500,2500,0,0,0,0,0,0
11910,80000,1,2,2,34,2,2,2,2,2,2,72557,77708,79384,77519,82607,81158,7000,3500,0,7000,0,4000


## One Hot encoding

In [11]:
# One-hot encode the categorical columns; drop_first removes the first dummy
# of each encoded column.
CATEGORICAL_COLS = ['SEX', 'EDUCATION', 'MARRIAGE']
X_res_encoded = pd.get_dummies(X_res, columns=CATEGORICAL_COLS, drop_first=True)

## StandardScaler

In [12]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the encoded features, then standardise those same features.
# NOTE(review): the scaler is fitted before the train/test split made in a
# later cell, so rows that end up in the test split contribute to its statistics.
scalar = StandardScaler()
scalar.fit(X_res_encoded)
X_scaled = scalar.transform(X_res_encoded)

In [13]:
# Per-feature means learned by the fitted scaler.
scalar.mean_

array([ 1.20120551e+05,  3.53382304e+01,  1.74194090e-01, -6.56480860e-02,
       -1.34905977e-01, -2.31950974e-01, -2.99278039e-01, -3.33613163e-01,
        2.52112550e+04,  2.46952551e+04,  2.38540303e+04,  2.22039338e+04,
        2.08006163e+04,  2.03096474e+04,  1.88354172e+03,  1.82219577e+03,
        1.59847641e+03,  1.46892117e+03,  1.46876419e+03,  1.46838037e+03,
        5.87558764e-01,  4.81783076e-01,  1.85946944e-01,  9.98992612e-03,
        4.61635326e-01,  5.23673606e-01,  1.24244459e-02])

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X_scaled, y_res, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [15]:
# Shapes of the train and test feature matrices.
X_train.shape, X_test.shape

((9529, 27), (2383, 27))

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier


In [17]:
# Candidate classifiers compared in this section.
# Estimators that use randomness during fitting get an explicit seed so that a
# fresh run reproduces the same scores.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
# The tree permutes features at each split; without a seed, ties between
# equally good splits can be resolved differently from run to run.
dtc = DecisionTreeClassifier(max_depth=5, random_state=42)
# The liblinear solver shuffles the data, so it is seeded as well.
lrc = LogisticRegression(solver='liblinear', penalty='l1', random_state=42)
rfc = RandomForestClassifier(n_estimators=50, random_state=42)
abc = AdaBoostClassifier(n_estimators=50, random_state=42)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=42)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=42)
xgb = XGBClassifier(n_estimators=50,random_state=42)

In [18]:
# Short display name -> estimator; insertion order is the evaluation order.
clfs = dict(
    SVC=svc,
    KN=knc,
    DT=dtc,
    LR=lrc,
    RF=rfc,
    AdaBoost=abc,
    BgC=bc,
    ETC=etc,
    GBDT=gbdt,
    xgb=xgb,
)

In [19]:
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,f1_score, recall_score

In [20]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Returns a 4-tuple ``(accuracy, precision, f1, recall)`` obtained by calling
    ``accuracy_score``, ``precision_score``, ``f1_score`` and ``recall_score``
    on ``y_test`` and ``clf.predict(X_test)``.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # Evaluate the metrics in the same order in which they are returned.
    metric_fns = (accuracy_score, precision_score, f1_score, recall_score)
    return tuple(metric(y_test, predictions) for metric in metric_fns)

In [21]:
# One list per metric; entries follow the iteration order of `clfs`.
accuracy_scores, precision_scores, f1_score_scores, recall_scores = [], [], [], []

for name, clf in clfs.items():
    # Fit the model and score it on the held-out split.
    (current_accuracy,
     current_precision,
     current_f1_score,
     current_recall_score) = train_classifier(clf, X_train, y_train, X_test, y_test)

    # Print the four metrics for this model.
    print("For ",name)
    report_lines = (
        ("Accuracy - ", current_accuracy),
        ("Precision - ", current_precision),
        ("f1_score - ", current_f1_score),
        ("recall_score - ", current_recall_score),
    )
    for label, value in report_lines:
        print(label, value)
    print("--------------------------------------------------------")

    # Keep the metrics for the summary table built in a later cell.
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)
    f1_score_scores.append(current_f1_score)
    recall_scores.append(current_recall_score)

For  SVC
Accuracy -  0.5539236256819136
Precision -  0.541913632514818
f1_score -  0.5463081519419547
recall_score -  0.5507745266781411
--------------------------------------------------------
For  KN
Accuracy -  0.7129668485102811
Precision -  0.7564377682403434
f1_score -  0.6733524355300861
recall_score -  0.6067125645438899
--------------------------------------------------------
For  DT
Accuracy -  0.726814939152329
Precision -  0.7829457364341085
f1_score -  0.6847457627118644
recall_score -  0.608433734939759
--------------------------------------------------------
For  LR
Accuracy -  0.7402433906840118
Precision -  0.7539756782039289
f1_score -  0.7225459435230839
recall_score -  0.693631669535284
--------------------------------------------------------
For  RF
Accuracy -  0.756189676877885
Precision -  0.7727699530516432
f1_score -  0.7391109115401886
recall_score -  0.7082616179001722
--------------------------------------------------------
For  AdaBoost
Accuracy -  0.759127

In [22]:
# Gather the per-model metrics into one table, highest precision first.
metrics_by_column = {
    'Algorithm': clfs.keys(),
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'f1_score': f1_score_scores,
    'recall_score': recall_scores,
}
performance_df = pd.DataFrame(metrics_by_column).sort_values('Precision', ascending=False)

performance_df

Unnamed: 0,Algorithm,Accuracy,Precision,f1_score,recall_score
8,GBDT,0.765422,0.818374,0.734945,0.666954
5,AdaBoost,0.759127,0.816129,0.725621,0.653184
9,xgb,0.762065,0.801418,0.736156,0.680723
2,DT,0.726815,0.782946,0.684746,0.608434
4,RF,0.75619,0.77277,0.739111,0.708262
6,BgC,0.749475,0.770853,0.729252,0.69191
7,ETC,0.743181,0.760911,0.723827,0.690189
1,KN,0.712967,0.756438,0.673352,0.606713
3,LR,0.740243,0.753976,0.722546,0.693632
0,SVC,0.553924,0.541914,0.546308,0.550775


# Handle class imbalance - Combined over- and undersampling (SMOTETomek)

In [23]:
from imblearn.combine import SMOTETomek

In [24]:
# Separate the outlier-filtered frame into features (X) and label (y).
TARGET_COL = "default payment next month"
X = new_df.drop(columns=[TARGET_COL])
y = new_df[TARGET_COL]

In [27]:
# Resample the feature/label pair with imblearn's SMOTETomek.
# The sampler is seeded so the resampled data is the same on every run.
# NOTE(review): resampling runs on the full X / y before the train/test split
# made in a later cell, so the held-out split is drawn from resampled data.
smk = SMOTETomek(random_state=42)
X_res, y_res = smk.fit_resample(X, y)

In [29]:
# (rows, columns) of the resampled feature frame.
X_res.shape

(40270, 23)

## One Hot encoding

In [30]:
# One-hot encode the categorical columns; drop_first removes the first dummy
# of each encoded column.
CATEGORICAL_COLS = ['SEX', 'EDUCATION', 'MARRIAGE']
X_res_encoded = pd.get_dummies(X_res, columns=CATEGORICAL_COLS, drop_first=True)

## StandardScaler

In [31]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the encoded features, then standardise those same features.
# NOTE(review): the scaler is fitted before the train/test split made in a
# later cell, so rows that end up in the test split contribute to its statistics.
scalar = StandardScaler()
scalar.fit(X_res_encoded)
X_scaled = scalar.transform(X_res_encoded)

In [32]:
# Per-feature means learned by the fitted scaler.
scalar.mean_

array([ 1.40605159e+05,  3.51252049e+01,  1.35758629e-01,  2.01142290e-03,
       -5.35634467e-02, -1.24559225e-01, -1.78048175e-01, -2.12937671e-01,
        4.03073920e+04,  3.86838898e+04,  3.67661908e+04,  3.37759672e+04,
        3.13434935e+04,  3.03048525e+04,  3.31535339e+03,  3.35427765e+03,
        2.94718145e+03,  2.66591361e+03,  2.63791778e+03,  2.64997244e+03,
        5.12664515e-01,  4.88626769e-01,  1.22895456e-01,  9.51080209e-03,
        5.46908369e-01,  4.44127142e-01,  7.30072014e-03])

In [33]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X_scaled, y_res, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [34]:
# Shapes of the train and test feature matrices.
X_train.shape, X_test.shape

((32216, 27), (8054, 27))

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier


In [36]:
# Candidate classifiers compared in this section.
# Estimators that use randomness during fitting get an explicit seed so that a
# fresh run reproduces the same scores.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
# The tree permutes features at each split; without a seed, ties between
# equally good splits can be resolved differently from run to run.
dtc = DecisionTreeClassifier(max_depth=5, random_state=42)
# The liblinear solver shuffles the data, so it is seeded as well.
lrc = LogisticRegression(solver='liblinear', penalty='l1', random_state=42)
rfc = RandomForestClassifier(n_estimators=50, random_state=42)
abc = AdaBoostClassifier(n_estimators=50, random_state=42)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=42)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=42)
xgb = XGBClassifier(n_estimators=50,random_state=42)

In [37]:
# Short display name -> estimator; insertion order is the evaluation order.
clfs = dict(
    SVC=svc,
    KN=knc,
    DT=dtc,
    LR=lrc,
    RF=rfc,
    AdaBoost=abc,
    BgC=bc,
    ETC=etc,
    GBDT=gbdt,
    xgb=xgb,
)

In [38]:
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,f1_score, recall_score

In [39]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Returns a 4-tuple ``(accuracy, precision, f1, recall)`` obtained by calling
    ``accuracy_score``, ``precision_score``, ``f1_score`` and ``recall_score``
    on ``y_test`` and ``clf.predict(X_test)``.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # Evaluate the metrics in the same order in which they are returned.
    metric_fns = (accuracy_score, precision_score, f1_score, recall_score)
    return tuple(metric(y_test, predictions) for metric in metric_fns)

In [40]:
# One list per metric; entries follow the iteration order of `clfs`.
accuracy_scores, precision_scores, f1_score_scores, recall_scores = [], [], [], []

for name, clf in clfs.items():
    # Fit the model and score it on the held-out split.
    (current_accuracy,
     current_precision,
     current_f1_score,
     current_recall_score) = train_classifier(clf, X_train, y_train, X_test, y_test)

    # Print the four metrics for this model.
    print("For ",name)
    report_lines = (
        ("Accuracy - ", current_accuracy),
        ("Precision - ", current_precision),
        ("f1_score - ", current_f1_score),
        ("recall_score - ", current_recall_score),
    )
    for label, value in report_lines:
        print(label, value)
    print("--------------------------------------------------------")

    # Keep the metrics for the summary table built in a later cell.
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)
    f1_score_scores.append(current_f1_score)
    recall_scores.append(current_recall_score)

For  SVC
Accuracy -  0.5398559721877328
Precision -  0.5441396508728179
f1_score -  0.5407682775712515
recall_score -  0.5374384236453202
--------------------------------------------------------
For  KN
Accuracy -  0.7623541097591259
Precision -  0.7509354536950421
f1_score -  0.7703934740882917
recall_score -  0.7908866995073892
--------------------------------------------------------
For  DT
Accuracy -  0.7273404519493419
Precision -  0.7324189526184539
f1_score -  0.7278810408921934
recall_score -  0.7233990147783251
--------------------------------------------------------
For  LR
Accuracy -  0.727216290042215
Precision -  0.7237569060773481
f1_score -  0.7328225708378937
recall_score -  0.7421182266009853
--------------------------------------------------------
For  RF
Accuracy -  0.8395828159920536
Precision -  0.8482637141419225
f1_score -  0.8391834702514315
recall_score -  0.8302955665024631
--------------------------------------------------------
For  AdaBoost
Accuracy -  0.75

In [41]:
# Gather the per-model metrics into one table, highest precision first.
metrics_by_column = {
    'Algorithm': clfs.keys(),
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'f1_score': f1_score_scores,
    'recall_score': recall_scores,
}
performance_df = pd.DataFrame(metrics_by_column).sort_values('Precision', ascending=False)

performance_df

Unnamed: 0,Algorithm,Accuracy,Precision,f1_score,recall_score
7,ETC,0.838838,0.84909,0.838074,0.82734
4,RF,0.839583,0.848264,0.839183,0.830296
6,BgC,0.832133,0.845938,0.830449,0.815517
9,xgb,0.817854,0.832351,0.815727,0.799754
8,GBDT,0.772908,0.792552,0.767687,0.744335
5,AdaBoost,0.756394,0.770361,0.752897,0.736207
1,KN,0.762354,0.750935,0.770393,0.790887
2,DT,0.72734,0.732419,0.727881,0.723399
3,LR,0.727216,0.723757,0.732823,0.742118
0,SVC,0.539856,0.54414,0.540768,0.537438


# Handle class imbalance - Oversampling (RandomOverSampler)

In [42]:
## RandomOverSampler to handle imbalanced data

from imblearn.over_sampling import RandomOverSampler

In [44]:
# Oversample the feature/label pair with imblearn's RandomOverSampler
# (default sampling strategy). The sampler is seeded so the resampled data is
# the same on every run.
# NOTE(review): resampling runs on the full X / y before the train/test split
# made in a later cell, so copies of one row can end up in both splits.
# The name `os` is kept because a later cell pickles the sampler under it.
os = RandomOverSampler(random_state=42)
X_train_res, y_train_res = os.fit_resample(X, y)

In [45]:
# (rows, columns) of the oversampled feature frame.
X_train_res.shape

(41482, 23)

## One Hot encoding

In [46]:
# One-hot encode the categorical columns; drop_first removes the first dummy
# of each encoded column.
CATEGORICAL_COLS = ['SEX', 'EDUCATION', 'MARRIAGE']
X_res_encoded = pd.get_dummies(X_train_res, columns=CATEGORICAL_COLS, drop_first=True)

## StandardScaler

In [47]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the encoded features, then standardise those same features.
# NOTE(review): the scaler is fitted on the oversampled data before the
# train/test split made in a later cell, so rows that end up in the test split
# contribute to its statistics.
scalar = StandardScaler()
scalar.fit(X_res_encoded)
X_scaled = scalar.transform(X_res_encoded)

In [48]:
# Per-feature means learned by the fitted scaler.
scalar.mean_

array([ 1.41933176e+05,  3.53192710e+01,  1.86900342e-01,  1.85381611e-02,
       -4.72494094e-02, -1.27115375e-01, -1.94518104e-01, -2.27134661e-01,
        4.01770937e+04,  3.85200433e+04,  3.65372627e+04,  3.35643225e+04,
        3.11111918e+04,  3.00005970e+04,  3.38158399e+03,  3.37250236e+03,
        2.99042197e+03,  2.73352676e+03,  2.67289608e+03,  2.72809472e+03,
        6.00573743e-01,  4.79581505e-01,  1.73255870e-01,  1.10650403e-02,
        4.62923678e-01,  5.24227376e-01,  1.14989634e-02])

In [50]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X_scaled, y_train_res, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [51]:
# Shapes of the train and test feature matrices.
X_train.shape, X_test.shape

((33185, 27), (8297, 27))

In [70]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier


In [71]:
# Candidate classifiers compared in this section.
# Estimators that use randomness during fitting get an explicit seed so that a
# fresh run reproduces the same scores.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
# The tree permutes features at each split; without a seed, ties between
# equally good splits can be resolved differently from run to run.
dtc = DecisionTreeClassifier(max_depth=5, random_state=42)
# The liblinear solver shuffles the data, so it is seeded as well.
lrc = LogisticRegression(solver='liblinear', penalty='l1', random_state=42)
rfc = RandomForestClassifier(n_estimators=50, random_state=42)
abc = AdaBoostClassifier(n_estimators=50, random_state=42)
bc = BaggingClassifier(n_estimators=50, random_state=2)
# Unlike the other sections, this ExtraTrees model uses the entropy criterion.
etc = ExtraTreesClassifier( criterion= "entropy", n_estimators=50, random_state=42)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=42)
xgb = XGBClassifier(n_estimators=50,random_state=42)

In [72]:
# Short display name -> estimator; insertion order is the evaluation order.
clfs = dict(
    SVC=svc,
    KN=knc,
    DT=dtc,
    LR=lrc,
    RF=rfc,
    AdaBoost=abc,
    BgC=bc,
    ETC=etc,
    GBDT=gbdt,
    xgb=xgb,
)

In [73]:
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,f1_score, recall_score

In [74]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Returns a 4-tuple ``(accuracy, precision, f1, recall)`` obtained by calling
    ``accuracy_score``, ``precision_score``, ``f1_score`` and ``recall_score``
    on ``y_test`` and ``clf.predict(X_test)``.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # Evaluate the metrics in the same order in which they are returned.
    metric_fns = (accuracy_score, precision_score, f1_score, recall_score)
    return tuple(metric(y_test, predictions) for metric in metric_fns)

In [75]:
# One list per metric; entries follow the iteration order of `clfs`.
accuracy_scores, precision_scores, f1_score_scores, recall_scores = [], [], [], []

for name, clf in clfs.items():
    # Fit the model and score it on the held-out split.
    (current_accuracy,
     current_precision,
     current_f1_score,
     current_recall_score) = train_classifier(clf, X_train, y_train, X_test, y_test)

    # Print the four metrics for this model.
    print("For ",name)
    report_lines = (
        ("Accuracy - ", current_accuracy),
        ("Precision - ", current_precision),
        ("f1_score - ", current_f1_score),
        ("recall_score - ", current_recall_score),
    )
    for label, value in report_lines:
        print(label, value)
    print("--------------------------------------------------------")

    # Keep the metrics for the summary table built in a later cell.
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)
    f1_score_scores.append(current_f1_score)
    recall_scores.append(current_recall_score)

For  SVC
Accuracy -  0.5512835964806556
Precision -  0.5453003875968992
f1_score -  0.547355623100304
recall_score -  0.5494264095679766
--------------------------------------------------------
For  KN
Accuracy -  0.7505122333373508
Precision -  0.7122513089005236
f1_score -  0.7666816952209197
recall_score -  0.8301195997071028
--------------------------------------------------------
For  DT
Accuracy -  0.7001325780402555
Precision -  0.7181990778410632
f1_score -  0.6803699897225078
recall_score -  0.646326580424701
--------------------------------------------------------
For  LR
Accuracy -  0.6630107267687115
Precision -  0.6642262055036607
f1_score -  0.6530156366344005
recall_score -  0.64217720283134
--------------------------------------------------------
For  RF
Accuracy -  0.9344341328190913
Precision -  0.9079219288174512
f1_score -  0.9356365357311879
recall_score -  0.9650964120087869
--------------------------------------------------------
For  AdaBoost
Accuracy -  0.70772

In [76]:
# Gather the per-model metrics into one table, highest precision first.
metrics_by_column = {
    'Algorithm': clfs.keys(),
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'f1_score': f1_score_scores,
    'recall_score': recall_scores,
}
performance_df = pd.DataFrame(metrics_by_column).sort_values('Precision', ascending=False)

performance_df

Unnamed: 0,Algorithm,Accuracy,Precision,f1_score,recall_score
7,ETC,0.943594,0.92524,0.944046,0.963632
4,RF,0.934434,0.907922,0.935637,0.965096
6,BgC,0.929372,0.899977,0.930945,0.96412
9,xgb,0.766301,0.776525,0.757595,0.739566
8,GBDT,0.715078,0.750506,0.687136,0.633634
5,AdaBoost,0.707726,0.746899,0.675932,0.617281
2,DT,0.700133,0.718199,0.68037,0.646327
1,KN,0.750512,0.712251,0.766682,0.83012
3,LR,0.663011,0.664226,0.653016,0.642177
0,SVC,0.551284,0.5453,0.547356,0.549426


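# Best model

- ExtraTreesClassifier trained on RandomOverSampler-balanced data (highest accuracy, precision and F1 in the table above).
- Caveat: the oversampling was applied before the train/test split, so copies of the same row can land in both splits and the reported scores are likely optimistic.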

In [77]:
# Score the fitted ExtraTrees model on the current X_test / y_test split.
# NOTE(review): this split was taken after oversampling, so it may contain
# copies of rows that are also in the training split.
etc.score(X_test,y_test)

0.9435940701458359

In [78]:
import pickle

In [79]:
# Persist the fitted ExtraTrees model. The context manager closes the file
# handle even if pickling raises, so the file is always flushed and released.
with open('best_model.pkl', 'wb') as model_file:
    pickle.dump(etc, model_file)

In [80]:
# Persist the RandomOverSampler instance. The context manager closes the file
# handle even if pickling raises, so the file is always flushed and released.
with open('RandomOverSampler.pkl', 'wb') as sampler_file:
    pickle.dump(os, sampler_file)

In [81]:
# Persist the fitted StandardScaler. The context manager closes the file
# handle even if pickling raises, so the file is always flushed and released.
with open('StandardScaler.pkl', 'wb') as scaler_file:
    pickle.dump(scalar, scaler_file)