In [6]:
#--------Basic Imports---------

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

%matplotlib inline
import sklearn 
import plotly
import plotly.offline as offline
import plotly.graph_objs as go
import plotly.express as px

# Data Modelling 

import warnings
warnings.filterwarnings('ignore')

In [7]:
"""lets create a random color generator
     which we can use inside our visualization"""
     
def random_colours(number_of_colors):
   
    colors = []
    for i in range(number_of_colors):
        colors.append("#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)]))
    return colors

In [8]:
df=pd.read_csv('train_indessa.csv')

In [9]:
df.shape

(532428, 45)

# Lets Find Columns with missing Values

In [10]:
cols_with_missing=[col for col in df.columns if df[col].isnull().any()]

In [11]:
for col in cols_with_missing:
    print(col,"----->",df[col].isna().sum())

batch_enrolled -----> 85149
emp_title -----> 30833
emp_length -----> 26891
annual_inc -----> 3
desc -----> 456829
title -----> 90
delinq_2yrs -----> 16
inq_last_6mths -----> 16
mths_since_last_delinq -----> 272554
mths_since_last_record -----> 450305
open_acc -----> 16
pub_rec -----> 16
revol_util -----> 287
total_acc -----> 16
collections_12_mths_ex_med -----> 95
mths_since_last_major_derog -----> 399448
verification_status_joint -----> 532123
acc_now_delinq -----> 16
tot_coll_amt -----> 42004
tot_cur_bal -----> 42004
total_rev_hi_lim -----> 42004


In [12]:
for col in cols_with_missing:
    print(col,"--------------------->",np.round(df[col].isnull().mean(),3)," % missing values ")

batch_enrolled ---------------------> 0.16  % missing values 
emp_title ---------------------> 0.058  % missing values 
emp_length ---------------------> 0.051  % missing values 
annual_inc ---------------------> 0.0  % missing values 
desc ---------------------> 0.858  % missing values 
title ---------------------> 0.0  % missing values 
delinq_2yrs ---------------------> 0.0  % missing values 
inq_last_6mths ---------------------> 0.0  % missing values 
mths_since_last_delinq ---------------------> 0.512  % missing values 
mths_since_last_record ---------------------> 0.846  % missing values 
open_acc ---------------------> 0.0  % missing values 
pub_rec ---------------------> 0.0  % missing values 
revol_util ---------------------> 0.001  % missing values 
total_acc ---------------------> 0.0  % missing values 
collections_12_mths_ex_med ---------------------> 0.0  % missing values 
mths_since_last_major_derog ---------------------> 0.75  % missing values 
verification_status_joint 

In [13]:
cols_with_missing=[col for col in df.columns if df[col].isnull().any()]

In [14]:
cols_with_missing

['batch_enrolled',
 'emp_title',
 'emp_length',
 'annual_inc',
 'desc',
 'title',
 'delinq_2yrs',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_util',
 'total_acc',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'verification_status_joint',
 'acc_now_delinq',
 'tot_coll_amt',
 'tot_cur_bal',
 'total_rev_hi_lim']

# Dataframe which exclude object dtype


In [15]:
df=df.select_dtypes(exclude=[object])

In [16]:
df.fillna(0)

Unnamed: 0,member_id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,annual_inc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,...,total_rec_late_fee,recoveries,collection_recovery_fee,collections_12_mths_ex_med,mths_since_last_major_derog,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,loan_status
0,58189336,14350,14350,14350.0,19.19,28700.0,33.88,0.0,1.0,50.0,...,0.0,0.0,0.0,0.0,74.0,0.0,0.0,28699.0,30800.0,0
1,70011223,4800,4800,4800.0,10.99,65000.0,3.64,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9974.0,32900.0,0
2,70255675,10000,10000,10000.0,7.26,45000.0,18.42,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,65.0,38295.0,34900.0,0
3,1893936,15000,15000,15000.0,19.72,105000.0,14.97,0.0,2.0,46.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55564.0,24700.0,0
4,7652106,16000,16000,16000.0,10.64,52000.0,20.16,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,47159.0,47033.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532423,31296187,20000,20000,20000.0,12.49,75000.0,14.53,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,83087.0,24800.0,0
532424,29403184,12000,12000,12000.0,14.99,59000.0,22.97,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,227812.0,17800.0,0
532425,7357607,18725,18725,18725.0,20.80,42504.0,27.27,0.0,1.0,26.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26010.0,24200.0,1
532426,23182668,21000,21000,21000.0,16.29,50000.0,14.91,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29197.0,23300.0,0


In [17]:
for col in df.columns:
    print(df[col].isna().any())

False
False
False
False
False
True
False
True
True
True
True
True
True
False
True
True
False
False
False
False
True
True
True
True
True
True
False


In [18]:
X=df.drop(['loan_status'],axis=1)

In [19]:
y=df['loan_status']

In [20]:
X

Unnamed: 0,member_id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,annual_inc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,...,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,collections_12_mths_ex_med,mths_since_last_major_derog,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim
0,58189336,14350,14350,14350.0,19.19,28700.0,33.88,0.0,1.0,50.0,...,1173.84,0.0,0.0,0.0,0.0,74.0,0.0,0.0,28699.0,30800.0
1,70011223,4800,4800,4800.0,10.99,65000.0,3.64,0.0,1.0,,...,83.95,0.0,0.0,0.0,0.0,,0.0,0.0,9974.0,32900.0
2,70255675,10000,10000,10000.0,7.26,45000.0,18.42,0.0,0.0,,...,56.47,0.0,0.0,0.0,0.0,,0.0,65.0,38295.0,34900.0
3,1893936,15000,15000,15000.0,19.72,105000.0,14.97,0.0,2.0,46.0,...,4858.62,0.0,0.0,0.0,0.0,,0.0,0.0,55564.0,24700.0
4,7652106,16000,16000,16000.0,10.64,52000.0,20.16,0.0,0.0,,...,2296.41,0.0,0.0,0.0,0.0,,0.0,0.0,47159.0,47033.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532423,31296187,20000,20000,20000.0,12.49,75000.0,14.53,0.0,0.0,,...,2595.45,0.0,0.0,0.0,0.0,,0.0,0.0,83087.0,24800.0
532424,29403184,12000,12000,12000.0,14.99,59000.0,22.97,0.0,0.0,,...,2182.92,0.0,0.0,0.0,0.0,,0.0,0.0,227812.0,17800.0
532425,7357607,18725,18725,18725.0,20.80,42504.0,27.27,0.0,1.0,26.0,...,645.32,0.0,0.0,0.0,0.0,,0.0,0.0,26010.0,24200.0
532426,23182668,21000,21000,21000.0,16.29,50000.0,14.91,0.0,1.0,,...,4619.79,0.0,0.0,0.0,0.0,,0.0,0.0,29197.0,23300.0


In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30) 

In [22]:
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import roc_curve, auc 
from sklearn.metrics import roc_auc_score

In [23]:
from catboost import CatBoostClassifier
cat_clf = CatBoostClassifier(learning_rate=0.1, n_estimators=1400, subsample=0.70, max_depth=6, scale_pos_weight=2.5, silent=True)
cat_clf.fit(X_train, y_train)

# evaluate predictions
y_train_predict_cat = cat_clf.predict(X_train)

print('Train Accuracy: %.3f' % metrics.accuracy_score(y_train, y_train_predict_cat))

# make predictions for test data
y_pred_cat = cat_clf.predict(X_test)
predictions = [round(value) for value in y_pred_cat]


accuracy_per_roc_auc = roc_auc_score(y_test, predictions)
print("ROC-AUC: %.10f%%" % (accuracy_per_roc_auc * 100))

print('Test Accuracy: %.3f' % metrics.accuracy_score(y_test, predictions))

print(metrics.confusion_matrix(y_test, predictions))

print(metrics.classification_report(y_test, predictions))


print( "precision------------>",metrics.precision_score(y_test, predictions))
print( "Recall------------>", metrics.recall_score(y_test, predictions))
print( "F1-score------------>",metrics.f1_score(y_test, predictions))

Train Accuracy: 0.927
ROC-AUC: 89.6664919563%
Test Accuracy: 0.920
[[114709   7288]
 [  5544  32188]]
              precision    recall  f1-score   support

           0       0.95      0.94      0.95    121997
           1       0.82      0.85      0.83     37732

    accuracy                           0.92    159729
   macro avg       0.88      0.90      0.89    159729
weighted avg       0.92      0.92      0.92    159729

Precision Score: 0.815
Recall Score: 0.853
F1 Score: 0.834


In [24]:
from sklearn.ensemble import AdaBoostClassifier

In [25]:
from xgboost import XGBClassifier

xg = XGBClassifier() 
    
xg.fit(X_train, y_train)


# evaluate predictions
y_train_predict_xg = xg.predict(X_train)

print('Train Accuracy %.3f' % metrics.accuracy_score(y_train, y_train_predict_xg))

# make predictions for test data
y_pred_xg = xg.predict(X_test)
predictions = [round(value) for value in y_pred_xg]

accuracy_per_roc_auc = roc_auc_score(y_test, predictions)
print("ROC-AUC: %.10f%%" % (accuracy_per_roc_auc * 100))


print('Test Accuracy %.3f' % metrics.accuracy_score(y_test, predictions))

print(metrics.confusion_matrix(y_test, predictions))

print(metrics.classification_report(y_test, predictions))

print( "precision------------>",metrics.precision_score(y_test, predictions))
print( "Recall------------>", metrics.recall_score(y_test, predictions))
print( "F1-score------------>",metrics.f1_score(y_test, predictions))

Train Accuracy 0.920
ROC-AUC: 85.2135691202%
Test Accuracy 0.914
[[118203   3794]
 [  9985  27747]]
              precision    recall  f1-score   support

           0       0.92      0.97      0.94    121997
           1       0.88      0.74      0.80     37732

    accuracy                           0.91    159729
   macro avg       0.90      0.85      0.87    159729
weighted avg       0.91      0.91      0.91    159729

0.879712120731746
0.7353705077917948
0.8010913342860855


# <font color=red>WE are getting accuracy of 88 percent using catboost and 86 using xgboost lets see how much we can improve using data cleaning and feature engineering</font>