# Vehicle loan default predection

Importing necessary libs and custom functions

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from IPython.display import Image
from sklearn.ensemble import ExtraTreesClassifier,GradientBoostingClassifier,RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,RepeatedStratifiedKFold
from sklearn.metrics import classification_report
import shap
import numpy as np

#importing custom functions from functions.py file
from functions import dateconvert
from functions import yrscalc
from functions import time_diff_months
from functions import resample
from functions import model_perf

In [2]:
df = pd.read_csv(r'../Data/train.csv')

In [3]:
df = df.head(20000)



In [None]:
df.info()

In [4]:
#We can see some columns has wrong data types like date and have missing values in Employment type
df['Date.of.Birth'] = df['Date.of.Birth'].apply(dateconvert,delim='-')
df['DisbursalDate'] = df['DisbursalDate'].apply(dateconvert,delim='-')
df['Employment.Type'].fillna('Unknown',inplace=True)
df.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,...,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES,loan_default
0,420825,50578,58400,89.55,67,22807,45,1441,1984-01-01 00:00:00+00:00,Salaried,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
1,537409,47145,65550,73.23,67,22807,45,1502,1985-07-31 00:00:00+00:00,Self employed,...,0,0,1991,0,0,1,1yrs 11mon,1yrs 11mon,0,1
2,417566,53278,61360,89.63,67,22807,45,1497,1985-08-24 00:00:00+00:00,Self employed,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0,0
3,624493,57513,66113,88.48,67,22807,45,1501,1993-12-30 00:00:00+00:00,Self employed,...,0,0,31,0,0,0,0yrs 8mon,1yrs 3mon,1,1
4,539055,52378,60300,88.39,67,22807,45,1495,1977-12-09 00:00:00+00:00,Self employed,...,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,1,1


In [5]:
#for purpose of labeling in human format
df['labels'] = df['loan_default'].apply(lambda x:'Paid' if x==0 else "default")

In [6]:
#Feature engineering
#We have features some of which might not be useful or have negligible effect on models.
#we also replace string values with integers.
#Date of birth and loan disbursal dates are used to calculate age of person and time since disbursement of loan
emp_type = {'Unknown':0,'Self employed':1,'Salaried':2}

cns_desc = {'No Bureau History Available':0,
            'Not Scored: Sufficient History Not Available':0,
            'Not Scored: Not Enough Info available on the customer':0,
           'Not Scored: No Activity seen on the customer (Inactive)':0,
           'Not Scored: No Updates available in last 36 months':0,
           'Not Scored: Only a Guarantor':1,
           'Not Scored: More than 50 active Accounts found':1,
           'M-Very High Risk':2,
           'L-Very High Risk':3,
           'K-High Risk':4,
            'J-High Risk':5,
            'H-Medium Risk':6,
            'I-Medium Risk':7,
            'F-Low Risk':8,
            'E-Low Risk':9,
            'G-Low Risk':10,
            'C-Very Low Risk':11,
            'A-Very Low Risk':12,
            'D-Very Low Risk':13,
            'B-Very Low Risk':14
           }
df['Age'] = df['Date.of.Birth'].apply(time_diff_months)
df['AVERAGE.ACCT.AGE'] = df['AVERAGE.ACCT.AGE'].apply(yrscalc)
df['CREDIT.HISTORY.LENGTH'] = df['CREDIT.HISTORY.LENGTH'].apply(yrscalc)
df['Age.of.Loan'] = df['DisbursalDate'].apply(time_diff_months)
#combaining to sum of ids
df['No.of.Proofs'] = df[['Aadhar_flag','VoterID_flag','PAN_flag','Driving_flag','Passport_flag','MobileNo_Avl_Flag']].sum(axis=1)

In [7]:
df['PERFORM_CNS.SCORE.DESCRIPTION'] = df['PERFORM_CNS.SCORE.DESCRIPTION'].map(cns_desc)
df['Employment.Type'] = df['Employment.Type'].map(emp_type)


In [None]:
#Now we can see columns in desired datatypes and 
df.info()

In [None]:
#imbalanced data
fig = px.pie(df, names='labels')
fig.show()

In [None]:
#Static plot for no dynamic render.
fig.write_image(r"../Images/imbalanced_data.png")
img_bytes = fig.to_image(format="png")
Image(img_bytes)

In [None]:
#feature importance. We drop rudumentry features.
X = df.drop(columns=['UniqueID','loan_default','labels','Date.of.Birth','DisbursalDate','Aadhar_flag','PAN_flag','VoterID_flag','Driving_flag','Passport_flag','MobileNo_Avl_Flag'])
y = df[['loan_default']]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=27)

In [None]:
#dealing with imbalanced data
os_X,os_y = resample(X_train,y_train,'minority')

In [None]:
#dealing with imbalanced data
#We use ExtraTressClassifer for feture selection which failry robust and also computationaly inexpensive.
#os_X,os_y = resample(X,y,'minority')
etc_model = ExtraTreesClassifier(oob_score=True,random_state=27,bootstrap=True,n_estimators=15)
etc_model.fit(os_X,os_y)

In [None]:
#Creating a dataframe for a tubular column and sorting them in decending order.
#We can see that features with secondary accounts have negligible. So we combine both primary and secondary account features.
columns = list(X_train.columns)
pd_feature = pd.DataFrame(data=[columns,etc_model.feature_importances_])
pd_feature = pd_feature.T
pd_feature.columns=['features','scores']
pd_feature.sort_values(['scores'],ascending=False)

In [None]:
plt.figure(figsize=(25,25))
fig = px.bar(pd_feature,x='features',y='scores',text='scores', width=900, height=700)
fig.show()

In [None]:
#non dynamic render.For purpose of being displayed in Github
#Image(filename=r'../Images/feature_score.png')
fig.write_image(r"../Images/feature_score_before_clubbing.png")
img_bytes = fig.to_image(format="png")
Image(img_bytes)

In [None]:
etc_model.oob_score_

In [8]:
#we could see that the secondary account features dont contribute much. We will combanie them with the primary account features.
df['Total.ACCTS'] = df[['PRI.NO.OF.ACCTS','SEC.NO.OF.ACCTS']].sum(axis=1)
df['Total.Active.ACCTS'] = df[['PRI.ACTIVE.ACCTS','SEC.ACTIVE.ACCTS']].sum(axis=1)
df['Total.Overdue.ACCTS'] = df[['PRI.OVERDUE.ACCTS','SEC.OVERDUE.ACCTS']].sum(axis=1)
df['Total.CurrentBalance'] = df[['PRI.CURRENT.BALANCE','SEC.CURRENT.BALANCE']].sum(axis=1)
df['Total.SanctionedAmount'] = df[['PRI.SANCTIONED.AMOUNT','SEC.SANCTIONED.AMOUNT']].sum(axis=1)
df['Total.DisbursedAmount'] = df[['PRI.DISBURSED.AMOUNT','SEC.DISBURSED.AMOUNT']].sum(axis=1)
df['Total.InstalAmount'] = df[['PRIMARY.INSTAL.AMT','SEC.INSTAL.AMT']].sum(axis=1)

In [9]:
X = df.drop(columns=['UniqueID','loan_default','labels','Date.of.Birth','DisbursalDate','Aadhar_flag','PAN_flag','VoterID_flag','Driving_flag','Passport_flag','MobileNo_Avl_Flag','PRI.NO.OF.ACCTS',
                    'SEC.NO.OF.ACCTS','PRI.ACTIVE.ACCTS','SEC.ACTIVE.ACCTS','PRI.OVERDUE.ACCTS','SEC.OVERDUE.ACCTS','PRI.CURRENT.BALANCE','SEC.CURRENT.BALANCE',
                    'PRI.SANCTIONED.AMOUNT','SEC.SANCTIONED.AMOUNT','PRI.DISBURSED.AMOUNT','SEC.DISBURSED.AMOUNT','PRIMARY.INSTAL.AMT','SEC.INSTAL.AMT'])
y = df[['loan_default']]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=27)
os_X,os_y = resample(X_train,y_train,'minority')
os_X_test,os_y_test = resample(X_test,y_test,'minority')

In [None]:
os_X,os_y = resample(X_train,y_train,'minority')
etc_model = ExtraTreesClassifier(oob_score=True,random_state=27,bootstrap=True,n_estimators=15)
etc_model.fit(os_X,os_y)
columns = list(os_X.columns)
pd_feature = pd.DataFrame(data=[columns,etc_model.feature_importances_])
pd_feature = pd_feature.T
pd_feature.columns=['features','scores']
pd_feature.sort_values(['scores'],ascending=False)

In [None]:
plt.figure(figsize=(25,25))
fig = px.bar(pd_feature,x='features',y='scores',text='scores', width=900, height=700)
fig.show()

In [None]:
#non dynamic render.For purpose of being displayed in Github
#Image(filename=r'../Images/feature_score.png')
fig.write_image(r"../Images/feature_score_after_clubbing.png")
img_bytes = fig.to_image(format="png")
Image(img_bytes)

In [None]:
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold
from imblearn.ensemble import BalancedRandomForestClassifier
brc_model = BalancedRandomForestClassifier(n_estimators=10)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=27)
scores = cross_val_score(brc_model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print(f'Mean AUC_ROC score : {scores.mean()}')

In [None]:
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold
from imblearn.ensemble import BalancedRandomForestClassifier
brc_model = BalancedRandomForestClassifier(n_estimators=10)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=27)
scores = cross_val_score(brc_model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(f'Accuracy : {scores.mean()}')

In [None]:
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=15,random_state=27)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=27)
scores = cross_val_score(rf_model, os_X, os_y, scoring='precision', cv=cv, n_jobs=-1)
print(f'Mean AUC_ROC score : {scores.mean()}')

In [None]:
os_X_test,os_y_test = resample(X_test,y_test,'minority')
rf_model.fit(os_X,os_y)
y_pred = rf_model.predict(os_X_test)
c_report = classification_report(os_y_test,y_pred,target_names=['Paid','Default'])
print(c_report)

In [None]:
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=15)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=27)
scores = cross_val_score(rf_model, os_X, os_y, scoring='accuracy', cv=cv, n_jobs=-1)
print(f'Accuracy : {scores.mean()}')

In [None]:
from sklearn.metrics import classification_report
c_report = classification_report(test_targets,y_pred_logmodel,target_names=['Negative','Positive'])

In [None]:
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=10)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=27)
scores = cross_val_score(rf_model, os_X1, os_y1, scoring='roc_auc', cv=cv, n_jobs=-1)
print(f'Mean AUC_ROC score : {scores.mean()}')

In [None]:
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=10)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=27)
scores = cross_val_score(rf_model, os_X1, os_y1, scoring='accuracy', cv=cv, n_jobs=-1)
print(f'Accuracy : {scores.mean()}')

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbc_model = GradientBoostingClassifier(learning_rate=0.1,loss='deviance',random_state=10,n_estimators=100)
gbc_model.fit(os_X,os_y.loan_default)
y_pred = gbc_model.predict(os_X_test)
c_report = classification_report(os_y_test.loan_default,y_pred,target_names=['Paid','Default'])
print(c_report)

In [None]:
grid_params = [{'loss':['deviance', 'exponential'],'learning_rate':[0.3,0.5,0.7,0.9],'n_estimators':[50,100,150,250,350],'max_depth':[3,5,7,10,15]}]
grid_search = GridSearchCV(estimator=GradientBoostingClassifier(),param_grid=grid_params,scoring=['precision','recall'],return_train_score=True,refit='precision',verbose=5,n_jobs=-1)
gs_val1 = grid_search.fit(os_X,os_y.loan_default)

In [None]:
gs_val1.best_params_

In [11]:
grid_params = [{'loss':['deviance', 'exponential'],'learning_rate':[0.5,0.7,0.9],'n_estimators':[150,250,350],'max_depth':[3,5,7,10,15]}]
#grid_search = GridSearchCV(estimator=GradientBoostingClassifier(),param_grid=grid_params,scoring=['precision','recall'],return_train_score=True,refit='recall',verbose=5,n_jobs=-1)
#gs_val2 = grid_search.fit(os_X,os_y.loan_default)

In [None]:
gs_val2.best_params_

In [None]:
def model_perf(model,params,scoring,refit_val,X,y):
    grid_search = GridSearchCV(estimator=model,param_grid=params,scoring=scoring,return_train_score=True,refit=refit_val,verbose=5,n_jobs=-1)
    gs_val = grid_search.fit(X,y)
    return gs_val.best_params_,gs_val.best_score_

In [13]:
model_perf(GradientBoostingClassifier(),grid_params,['precision','recall'],'precision',os_X,os_y)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.1min


KeyboardInterrupt: 

In [None]:
from sklearn.ensemble import AdaBoostClassifier
ada_model = AdaBoostClassifier(random_state=10)
ada_model.fit(os_X,os_y)
y_pred1 = ada_model.predict(os_X_test)
c_report = classification_report(os_y_test,y_pred1,target_names=['Paid','Default'])
print(c_report)

In [None]:
y_pred1[:50]

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbc_model = GradientBoostingClassifier(learning_rate=0.5,loss='exponential',random_state=10,n_estimators=150)
gbc_model.fit(os_X,os_y.loan_default)
y_pred = gbc_model.predict(os_X_test)
c_report = classification_report(os_y_test.loan_default,y_pred,target_names=['Paid','Default'])
print(c_report)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbc_model = GradientBoostingClassifier(learning_rate=0.5,loss='exponential',random_state=10,n_estimators=150)
gbc_model.fit(os_X,os_y.loan_default)
y_pred = gbc_model.predict(os_X_test)
c_report = classification_report(os_y_test.loan_default,y_pred,target_names=['Paid','Default'])
print(c_report)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbc_model = GradientBoostingClassifier(learning_rate=0.9,loss='deviance',random_state=10,n_estimators=250)
gbc_model.fit(os_X,os_y.loan_default)
y_pred = gbc_model.predict(os_X_test)
c_report = classification_report(os_y_test.loan_default,y_pred,target_names=['Paid','Default'])
print(c_report)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbc_model = GradientBoostingClassifier(learning_rate=0.01,loss='deviance',random_state=10,n_estimators=350,max_depth=15)
gbc_model.fit(os_X,os_y.loan_default)
y_pred = gbc_model.predict(os_X_test)
c_report = classification_report(os_y_test.loan_default,y_pred,target_names=['Paid','Default'])
print(c_report)

In [None]:
from sklearn.preprocessing import RobustScaler
X = RobustScaler().fit(X)

In [None]:
X = df.drop(columns=['UniqueID','loan_default','labels','Date.of.Birth','DisbursalDate','Aadhar_flag','PAN_flag','VoterID_flag','Driving_flag','Passport_flag','MobileNo_Avl_Flag','PRI.NO.OF.ACCTS',
                    'SEC.NO.OF.ACCTS','PRI.ACTIVE.ACCTS','SEC.ACTIVE.ACCTS','PRI.OVERDUE.ACCTS','SEC.OVERDUE.ACCTS','PRI.CURRENT.BALANCE','SEC.CURRENT.BALANCE',
                    'PRI.SANCTIONED.AMOUNT','SEC.SANCTIONED.AMOUNT','PRI.DISBURSED.AMOUNT','SEC.DISBURSED.AMOUNT','PRIMARY.INSTAL.AMT','SEC.INSTAL.AMT'])
y = df[['loan_default']]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=27)
os_X,os_y = resample(X_train,y_train,'minority')
os_X_test,os_y_test = resample(X_test,y_test,'minority')

In [15]:
from sklearn.ensemble import GradientBoostingClassifier
gbc_model = GradientBoostingClassifier(learning_rate=0.3,loss='deviance',random_state=10,n_estimators=50)
gbc_model.fit(os_X,os_y.loan_default)
y_pred = gbc_model.predict(os_X_test)
c_report = classification_report(os_y_test.loan_default,y_pred,target_names=['Paid','Default'])
print(c_report)

              precision    recall  f1-score   support

        Paid       0.78      0.96      0.86      4016
     Default       0.94      0.72      0.82      4016

    accuracy                           0.84      8032
   macro avg       0.86      0.84      0.84      8032
weighted avg       0.86      0.84      0.84      8032



In [None]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=27)
scores = cross_val_score(gbc_model, os_X_test, os_y_test.loan_default, scoring='roc_auc', cv=cv, n_jobs=-1)
print(f'Mean AUC_ROC score : {scores.mean()}')

In [None]:
shap.initjs()

In [None]:
explainer = shap.TreeExplainer(gbc_model)
shap_values = explainer.shap_values(os_X_test)
shap.summary_plot(shap_values, features=os_X_test, feature_names=os_X_test.columns)

In [None]:
def shapTree_plot(model,i,rand_picks):
    ex_model = shap.TreeExplainer(model)
    shap_vals = ex_model.shap_values(rand_picks)
    plot = shap.force_plot(ex_model.expected_value,shap_vals[i],picks.iloc[[i]])
    return plot

In [None]:
random_picks = np.arange(1,20000,25) # Every 50 rows
picks = os_X.iloc[random_picks]
picks

In [None]:
shapTree_plot(gbc_model,45,S)

In [None]:
shapTree_plot(gbc_model,1,S)

In [None]:
os_X.mean()

In [None]:
random_picks = np.arange(1,8032,25) # Every 50 rows
picks1 = os_y_test.iloc[random_picks]
picks1.mean()

In [14]:
len(os_y_test)

8032