# <font color='blue'>LOAN STATUS PREDICTION<font/>

#### Importing Required Libraries for Model Building

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix,classification_report,plot_confusion_matrix,roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn import set_config
set_config(display='diagram')

#### Pre-Processed Cleaned Data from EDA

In [4]:
df = pd.read_csv('credit_train_cleaned.csv')

df.head()

Unnamed: 0,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,Fully Paid,445412.0,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,32.0,6.0,1.0,228190.0,416746.0,1.0,0.0
1,Fully Paid,262328.0,Short Term,732.0,1219961.5,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35.0,0.0,229976.0,850784.0,0.0,0.0
2,Fully Paid,789250.0,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,0.0,0.0
3,Fully Paid,347666.0,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,32.0,9.0,0.0,256329.0,386958.0,0.0,0.0
4,Fully Paid,176220.0,Short Term,732.0,1219961.5,5 years,Rent,Debt Consolidation,20639.7,6.1,32.0,15.0,0.0,253460.0,427174.0,0.0,0.0


# <font color='blue'> Categorical variable handling<font/>

#### Checking cardinality of Categorical columns

In [19]:
cat_df  = df.select_dtypes(include='O')

In [6]:
cat_columns = cat_df.columns

cat_columns

Index(['Loan Status', 'Term', 'Years in current job', 'Home Ownership',
       'Purpose'],
      dtype='object')

In [7]:
card_list=[]
for i in cat_columns:
        card = df[i].nunique()
        card_list.append(card)
        
card_df = pd.DataFrame({'Cardinality':card_list},index = cat_columns)  
card_df.sort_values(by='Cardinality')[1:]

Unnamed: 0,Cardinality
Term,2
Home Ownership,4
Years in current job,11
Purpose,16


* Ideally from Machine Learning Model Building perspective we should ideally consider the categorical columns with low cardinality.
* So, any column with cardinality > 10 should not be considered.
* But , as we have few categorical columns , we consider all the categorical features for our model.
* We prefer dummy encoding here , as we have cardinal categorical data mostly. 

In [8]:
df=pd.get_dummies(df,drop_first=True)

#### List of Columns after Performing Dummy Encoding

In [9]:
df.columns

Index(['Current Loan Amount', 'Credit Score', 'Annual Income', 'Monthly Debt',
       'Years of Credit History', 'Months since last delinquent',
       'Number of Open Accounts', 'Number of Credit Problems',
       'Current Credit Balance', 'Maximum Open Credit', 'Bankruptcies',
       'Tax Liens', 'Loan Status_Fully Paid', 'Term_Short Term',
       'Years in current job_10+ years', 'Years in current job_2 years',
       'Years in current job_3 years', 'Years in current job_4 years',
       'Years in current job_5 years', 'Years in current job_6 years',
       'Years in current job_7 years', 'Years in current job_8 years',
       'Years in current job_9 years', 'Years in current job_< 1 year',
       'Home Ownership_Home Mortgage', 'Home Ownership_Own Home',
       'Home Ownership_Rent', 'Purpose_Buy House', 'Purpose_Buy a Car',
       'Purpose_Debt Consolidation', 'Purpose_Educational Expenses',
       'Purpose_Home Improvements', 'Purpose_Medical Bills', 'Purpose_Other',
       'Purp

In [10]:
df.head()

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,...,Purpose_Medical Bills,Purpose_Other,Purpose_Take a Trip,Purpose_major_purchase,Purpose_moving,Purpose_other,Purpose_renewable_energy,Purpose_small_business,Purpose_vacation,Purpose_wedding
0,445412.0,709.0,1167493.0,5214.74,17.2,32.0,6.0,1.0,228190.0,416746.0,...,0,0,0,0,0,0,0,0,0,0
1,262328.0,732.0,1219961.5,33295.98,21.1,8.0,35.0,0.0,229976.0,850784.0,...,0,0,0,0,0,0,0,0,0,0
2,789250.0,741.0,2231892.0,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,...,0,0,0,0,0,0,0,0,0,0
3,347666.0,721.0,806949.0,8741.9,12.0,32.0,9.0,0.0,256329.0,386958.0,...,0,0,0,0,0,0,0,0,0,0
4,176220.0,732.0,1219961.5,20639.7,6.1,32.0,15.0,0.0,253460.0,427174.0,...,0,0,0,0,0,0,0,0,0,0


### <font color='blue'>4.b) Spliting data in Train and Test Set <font/>

In [11]:
x=df.drop('Loan Status_Fully Paid',axis=1)
y=df['Loan Status_Fully Paid']
x_train,x_test,y_train,y_test=train_test_split(x,y,stratify=y,test_size=0.3,random_state=42)

In [12]:
print('x_train',x_train.shape)
print('x_test',x_test.shape)
print('y_train',y_train.shape)
print('y_test',y_test.shape)

x_train (57395, 41)
x_test (24599, 41)
y_train (57395,)
y_test (24599,)


In [13]:
from imblearn.over_sampling import SMOTE

In [14]:
sm=SMOTE(random_state=42)

x_res,y_res=sm.fit_resample(x,y)

#### After applying SMOTE Over-sampling technique, both classes counts are equal.

In [15]:
print('before_smote\n')
print(y.value_counts())
print('after smote\n')  
print(y_res.value_counts())

before_smote

1    59360
0    22634
Name: Loan Status_Fully Paid, dtype: int64
after smote

1    59360
0    59360
Name: Loan Status_Fully Paid, dtype: int64


In [16]:
print('Before smote')
print('x',x.shape)
print('After smote')
print('x_res',x_res.shape)

Before smote
x (81994, 41)
After smote
x_res (118720, 41)


### Performing Train_Test Split in a Stratified Fashion So that Balance of data is consistent in Train and Test for both classes. 

In [17]:
x_train_res,x_test_res,y_train_res,y_test_res=train_test_split(x_res,y_res,stratify=y_res,random_state=42)

In [18]:
import eli5
from eli5.sklearn import PermutationImportance
model4_fs2.fit(x_train_res, y_train_res)
perm = PermutationImportance(model4_fs2, random_state=10).fit(x_test_res, y_test_res)
eli5.show_weights(perm, feature_names = x_train_res.columns.tolist())

NameError: name 'model4_fs2' is not defined

In [None]:
model4_fs3=DecisionTreeClassifier(random_state=10)
model4_fs3.fit(x_train_res,y_train_res)

In [None]:
list(zip(x_train_res.columns,model4_fs3.feature_importances_))

In [None]:
d=pd.DataFrame({'features':x_train_res.columns,'feature_importance':model4_fs3.feature_importances_})
d

In [None]:
s1=set(d.sort_values(by='feature_importance',ascending=False).iloc[:6,:]['features'])
s2=set(['Annual Income','Current Loan Amount','Monthly Debt','Maximum Open Credit'])
important_features=s1.union(s2)

In [None]:
imp_features=list(important_features)
imp_features

### <font color='blue'>4.c.4)Logistic Regression Model with balanced data and important features<font/>

In [None]:
x_train_imp=x_train_res[imp_features]
x_test_imp=x_test_res[imp_features]

model4=LogisticRegression(random_state=10)

model4.fit(x_train_imp,y_train_res)

y_pred_logit_imp_feature=model4.predict(x_test_imp)

In [None]:
print('training accuracy',model4.score(x_train_imp,y_train_res))
print('testing accuracy',model4.score(x_test_imp,y_test_res))

In [None]:
confusion_matrix(y_test_res,y_pred_logit_imp_feature)

In [None]:
print(plot_confusion_matrix(estimator=model4,X=x_test_imp,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_logit_imp_feature))

In [None]:
model4_roc_auc=roc_auc_score(y_test_res,model4.predict_proba(x_test_imp)[:,1])
model4_roc_auc

In [None]:
model_4_test_accuracy=model4.score(x_test_imp,y_test_res)
model_4_test_accuracy

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['Logistic model with imp_features(balanced_data)'],'accuracy':[model_4_test_accuracy],'roc_auc':[model4_roc_auc]}),ignore_index=True)

In [None]:
model_performance

### <font color='blue'>4.d) Decision tree classifier<font/>

### <font color='blue'>4.d.1) Decision Tree Full model with balanced data<font/>

In [None]:
model5=DecisionTreeClassifier(random_state=10)
model5.fit(x_train_res,y_train_res)
y_pred_dt_full_model_balance_data=model5.predict(x_test_res)

In [None]:
print('training accuracy',model5.score(x_train_res,y_train_res))
print('testing accuracy',model5.score(x_test_res,y_test_res))

In [None]:
model5_test_accuracy=model5.score(x_test_res,y_test_res)
model5_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_dt_full_model_balance_data)

In [None]:
print(plot_confusion_matrix(estimator=model5,X=x_test_res,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_dt_full_model_balance_data))

In [None]:
model5_roc_auc=roc_auc_score(y_test_res,model5.predict_proba(x_test_res)[:,1])
model5_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['Decision Tree full model(balanced_data)'],'accuracy':[model5_test_accuracy],'roc_auc':[model5_roc_auc]}),ignore_index=True)

In [None]:
model_performance

### <font color='blue'>4.d.1)Decision Tree with selected features and balanced data<font/>

In [None]:
x_train_imp=x_train_res[imp_features]
x_test_imp=x_test_res[imp_features]

model6=DecisionTreeClassifier(random_state=10)

model6.fit(x_train_imp,y_train_res)

y_pred_dt_imp_feature=model6.predict(x_test_imp)

In [None]:
print('training accuracy',model6.score(x_train_imp,y_train_res))
print('testing accuracy',model6.score(x_test_imp,y_test_res))

In [None]:
model6_test_accuracy=model6.score(x_test_imp,y_test_res)
model6_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_dt_imp_feature)

In [None]:
print(plot_confusion_matrix(estimator=model6,X=x_test_imp,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_dt_imp_feature))

In [None]:
model6_roc_auc=roc_auc_score(y_test_res,model6.predict_proba(x_test_imp)[:,1])
model6_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['Decision Tree with important features(balanced_data)'],'accuracy':[model6_test_accuracy],'roc_auc':[model6_roc_auc]}),ignore_index=True)

In [None]:
model_performance

### <font color='blue'>4.d) Random Forest classifier<font/>

### <font color='blue'>4.d.1) Random Forest Full model with balanced data<font/>

In [None]:
model7=RandomForestClassifier(random_state=10)
model7.fit(x_train_res,y_train_res)
y_pred_rf_full_model_balance_data=model7.predict(x_test_res)

In [None]:
print('training accuracy',model7.score(x_train_res,y_train_res))
print('testing accuracy',model7.score(x_test_res,y_test_res))

In [None]:
model7_test_accuracy=model7.score(x_test_res,y_test_res)
model7_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_rf_full_model_balance_data)

In [None]:
print(plot_confusion_matrix(estimator=model7,X=x_test_res,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_rf_full_model_balance_data))

In [None]:
model7_roc_auc=roc_auc_score(y_test_res,model7.predict_proba(x_test_res)[:,1])
model7_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['Random Forest full model(balanced_data)'],'accuracy':[model7_test_accuracy],'roc_auc':[model7_roc_auc]}),ignore_index=True)

In [None]:
model_performance

### <font color='blue'>4.d.1)Random Forest with selected features and balanced data<font/>

In [None]:
x_train_imp=x_train_res[imp_features]
x_test_imp=x_test_res[imp_features]

model8=RandomForestClassifier(random_state=10)

model8.fit(x_train_imp,y_train_res)

y_pred_rf_imp_feature=model8.predict(x_test_imp)

In [None]:
print('training accuracy',model8.score(x_train_imp,y_train_res))
print('testing accuracy',model8.score(x_test_imp,y_test_res))

In [None]:
model8_test_accuracy=model8.score(x_test_imp,y_test_res)
model8_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_rf_imp_feature)

In [None]:
print(plot_confusion_matrix(estimator=model8,X=x_test_imp,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_rf_imp_feature))

In [None]:
model8_roc_auc=roc_auc_score(y_test_res,model8.predict_proba(x_test_imp)[:,1])
model8_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['Random Forest with important feature(balanced_data)'],'accuracy':[model8_test_accuracy],'roc_auc':[model8_roc_auc]}),ignore_index=True)

In [None]:
model_performance

### <font color='blue'>4.d) K-Nearest Neighbors classifier<font/>

### <font color='blue'>4.d.1) KNN Full model with balanced data<font/>

In [None]:
model9=KNeighborsClassifier()
model9.fit(x_train_res,y_train_res)
y_pred_knn_full_model_balance_data=model9.predict(x_test_res)

In [None]:
print('training accuracy',model9.score(x_train_res,y_train_res))
print('testing accuracy',model9.score(x_test_res,y_test_res))

In [None]:
model9_test_accuracy=model9.score(x_test_res,y_test_res)
model9_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_knn_full_model_balance_data)

In [None]:
print(plot_confusion_matrix(estimator=model9,X=x_test_res,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_knn_full_model_balance_data))

In [None]:
model9_roc_auc=roc_auc_score(y_test_res,model9.predict_proba(x_test_res)[:,1])
model9_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['KNN full model(balanced_data)'],'accuracy':[model9_test_accuracy],'roc_auc':[model9_roc_auc]}),ignore_index=True)

In [None]:
model_performance

### <font color='blue'>4.d.1)KNN with selected features and balanced data<font/>

In [None]:
x_train_imp=x_train_res[imp_features]
x_test_imp=x_test_res[imp_features]

model10=RandomForestClassifier(random_state=10)

model10.fit(x_train_imp,y_train_res)

y_pred_knn_imp_feature=model8.predict(x_test_imp)

In [None]:
print('training accuracy',model10.score(x_train_imp,y_train_res))
print('testing accuracy',model10.score(x_test_imp,y_test_res))

In [None]:
model10_test_accuracy=model10.score(x_test_imp,y_test_res)
model10_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_knn_imp_feature)

In [None]:
print(plot_confusion_matrix(estimator=model10,X=x_test_imp,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_knn_imp_feature))

In [None]:
model10_roc_auc=roc_auc_score(y_test_res,model10.predict_proba(x_test_imp)[:,1])
model10_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['KNN with important feature(balanced_data)'],'accuracy':[model10_test_accuracy],'roc_auc':[model10_roc_auc]}),ignore_index=True)

In [None]:
model_performance

### <font color='blue'>4.d) Gaussian Naive Bayes classifier<font/>

### <font color='blue'>4.d.1) Gaussian NB Full model with balanced data<font/>

In [None]:
model11=GaussianNB()
model11.fit(x_train_res,y_train_res)
y_pred_gnb_full_model_balance_data=model11.predict(x_test_res)

In [None]:
print('training accuracy',model11.score(x_train_res,y_train_res))
print('testing accuracy',model11.score(x_test_res,y_test_res))

In [None]:
model11_test_accuracy=model11.score(x_test_res,y_test_res)
model11_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_gnb_full_model_balance_data)

In [None]:
print(plot_confusion_matrix(estimator=model11,X=x_test_res,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_gnb_full_model_balance_data))

In [None]:
model11_roc_auc=roc_auc_score(y_test_res,model11.predict_proba(x_test_res)[:,1])
model11_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['Gaussian NB full model(balanced_data)'],'accuracy':[model11_test_accuracy],'roc_auc':[model11_roc_auc]}),ignore_index=True)

In [None]:
model_performance

### <font color='blue'>4.d.1)Gaussian Naive Bayes with selected features and balanced data<font/>

In [None]:
x_train_imp=x_train_res[imp_features]
x_test_imp=x_test_res[imp_features]

model12=GaussianNB()

model12.fit(x_train_imp,y_train_res)

y_pred_gnb_imp_feature=model8.predict(x_test_imp)

In [None]:
print('training accuracy',model12.score(x_train_imp,y_train_res))
print('testing accuracy',model12.score(x_test_imp,y_test_res))

In [None]:
model12_test_accuracy=model12.score(x_test_imp,y_test_res)
model12_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_gnb_imp_feature)

In [None]:
print(plot_confusion_matrix(estimator=model12,X=x_test_imp,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_gnb_imp_feature))

In [None]:
model12_roc_acuc=roc_auc_score(y_test_res,model12.predict_proba(x_test_imp)[:,1])
model12_roc_acuc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['Gaussian NB with important feature(balanced_data)'],'accuracy':[model12_test_accuracy],'roc_auc':[model12_roc_acuc]}),ignore_index=True)

In [None]:
model_performance

### <font color='blue'>4.d) Multinomial Naive Bayes classifier<font/>

### <font color='blue'>4.d.1) Multinomial NB Full model with balanced data<font/>

In [None]:
model13=MultinomialNB()
model13.fit(x_train_res,y_train_res)
y_pred_mnb_full_model_balance_data=model13.predict(x_test_res)

In [None]:
print('training accuracy',model13.score(x_train_res,y_train_res))
print('testing accuracy',model13.score(x_test_res,y_test_res))

In [None]:
model13_test_accuracy=model13.score(x_test_res,y_test_res)
model13_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_mnb_full_model_balance_data)

In [None]:
print(plot_confusion_matrix(estimator=model13,X=x_test_res,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_mnb_full_model_balance_data))

In [None]:
model13_roc_auc=roc_auc_score(y_test_res,model13.predict_proba(x_test_res)[:,1])
model13_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['Multinomial NB full model(balanced_data)'],'accuracy':[model13_test_accuracy],'roc_auc':[model13_roc_auc]}),ignore_index=True)

In [None]:
model_performance

### <font color='blue'>4.d.1)Multinomial Naive Bayes with selected features and balanced data<font/>

In [None]:
x_train_imp=x_train_res[imp_features]
x_test_imp=x_test_res[imp_features]

model14=MultinomialNB()

model14.fit(x_train_imp,y_train_res)

y_pred_mnb_imp_feature=model14.predict(x_test_imp)

In [None]:
print('training accuracy',model14.score(x_train_imp,y_train_res))
print('testing accuracy',model14.score(x_test_imp,y_test_res))

In [None]:
model14_test_accuracy=model14.score(x_test_imp,y_test_res)
model14_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_mnb_imp_feature)

In [None]:
print(plot_confusion_matrix(estimator=model14,X=x_test_imp,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_mnb_imp_feature))

In [None]:
model14_roc_auc=roc_auc_score(y_test_res,model14.predict_proba(x_test_imp)[:,1])
model14_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['Multinomial NB with important feature(balanced_data)'],'accuracy':[model14_test_accuracy],'roc_auc':[model14_roc_auc]}),ignore_index=True)

In [None]:
model_performance

### <font color='blue'>4.d.1)Bagged classifier<font/>

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

In [None]:
lr=LogisticRegression(random_state=10)
knn=KNeighborsClassifier()
svc=SVC(random_state=10)
g_nb=GaussianNB()
m_nb=MultinomialNB()





In [None]:
bagged_lr=BaggingClassifier(base_estimator=lr,random_state=10)
bagged_lr.fit(x_train_res,y_train_res)
y_pred_bagged_lr=bagged_lr.predict(x_test_res)

In [None]:
print('training accuracy',bagged_lr.score(x_train_res,y_train_res))
print('testing accuracy',bagged_lr.score(x_test_res,y_test_res))

In [None]:
bagged_lr_test_accuracy=bagged_lr.score(x_test_res,y_test_res)
bagged_lr_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_bagged_lr)

In [None]:
print(plot_confusion_matrix(estimator=bagged_lr,X=x_test_res,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_bagged_lr))

In [None]:
bagged_lr_roc_auc=roc_auc_score(y_test_res,bagged_lr.predict_proba(x_test_res)[:,1])
bagged_lr_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['Bagged_lr'],'accuracy':[bagged_lr_test_accuracy],'roc_auc':[bagged_lr_roc_auc]}),ignore_index=True)

In [None]:
model_performance

### Bagged_knn

In [None]:
bagged_knn=BaggingClassifier(base_estimator=knn,random_state=10)
bagged_knn.fit(x_train_res,y_train_res)
y_pred_bagged_knn=bagged_knn.predict(x_test_res)

In [None]:
print('training accuracy',bagged_knn.score(x_train_res,y_train_res))
print('testing accuracy',bagged_knn.score(x_test_res,y_test_res))

In [None]:
bagged_knn_test_accuracy=bagged_knn.score(x_test_res,y_test_res)
bagged_knn_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_bagged_knn)

In [None]:
print(plot_confusion_matrix(estimator=bagged_knn,X=x_test_res,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_bagged_knn))

In [None]:
bagged_knn_roc_auc=roc_auc_score(y_test_res,bagged_knn.predict_proba(x_test_res)[:,1])
bagged_knn_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['Bagged_knn'],'accuracy':[bagged_knn_test_accuracy],'roc_auc':[bagged_knn_roc_auc]}),ignore_index=True)

In [None]:
model_performance

In [None]:
# bagged SVC
bagged_svc=BaggingClassifier(base_estimator=svc,random_state=10)
bagged_svc.fit(x_train_res,y_train_res)
y_pred_bagged_svc=bagged_svc.predict(x_test_res)

In [None]:
print('training accuracy',bagged_svc.score(x_train_res,y_train_res))
print('testing accuracy',bagged_svc.score(x_test_res,y_test_res))

In [None]:
bagged_svc_test_accuracy=bagged_knn.score(x_test_res,y_test_res)
bagged_svc_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_bagged_svc)

In [None]:
print(plot_confusion_matrix(estimator=bagged_svc,X=x_test_res,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_bagged_svc))

In [None]:
bagged_svc_roc_auc=roc_auc_score(y_test_res,bagged_knn.predict_proba(x_test_res)[:,1])
bagged_svc_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['Bagged_svc'],'accuracy':[bagged_svc_test_accuracy],'roc_auc':[bagged_svc_roc_auc]}),ignore_index=True)

In [None]:
model_performance

In [None]:
# bagged g_nb

bagged_g_nb=BaggingClassifier(base_estimator=g_nb,random_state=10)
bagged_g_nb.fit(x_train_res,y_train_res)
y_pred_bagged_g_nb=bagged_g_nb.predict(x_test_res)

In [None]:
print('training accuracy',bagged_g_nb.score(x_train_res,y_train_res))
print('testing accuracy',bagged_g_nb.score(x_test_res,y_test_res))

In [None]:
bagged_g_nb_test_accuracy=bagged_g_nb.score(x_test_res,y_test_res)
bagged_g_nb_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_bagged_g_nb)

In [None]:
print(plot_confusion_matrix(estimator=bagged_g_nb,X=x_test_res,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_bagged_g_nb))

In [None]:
bagged_g_nb_roc_auc=roc_auc_score(y_test_res,bagged_g_nb.predict_proba(x_test_res)[:,1])
bagged_g_nb_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['Bagged_g_nb'],'accuracy':[bagged_g_nb_test_accuracy],'roc_auc':[bagged_g_nb_roc_auc]}),ignore_index=True)

In [None]:
model_performance

In [None]:
# bagged m_nb

bagged_m_nb=BaggingClassifier(base_estimator=g_nb,random_state=10)
bagged_m_nb.fit(x_train_res,y_train_res)
y_pred_bagged_m_nb=bagged_m_nb.predict(x_test_res)

In [None]:
print('training accuracy',bagged_m_nb.score(x_train_res,y_train_res))
print('testing accuracy',bagged_m_nb.score(x_test_res,y_test_res))

In [None]:
bagged_m_nb_test_accuracy=bagged_m_nb.score(x_test_res,y_test_res)
bagged_m_nb_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_bagged_m_nb)

In [None]:
print(plot_confusion_matrix(estimator=bagged_m_nb,X=x_test_res,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_bagged_m_nb))

In [None]:
bagged_m_nb_roc_auc=roc_auc_score(y_test_res,bagged_m_nb.predict_proba(x_test_res)[:,1])
bagged_g_nb_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['Bagged_m_nb'],'accuracy':[bagged_m_nb_test_accuracy],'roc_auc':[bagged_m_nb_roc_auc]}),ignore_index=True)

In [None]:
model_performance

#### <font color='blue'>Boosting algorithms</font>

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# ada_boost_with_logostic_regression
ada_lr=AdaBoostClassifier(base_estimator=lr,random_state=10)
ada_lr.fit(x_train_res,y_train_res)
y_pred_ada_lr=ada_lr.predict(x_test_res)


In [None]:
print('training accuracy',ada_lr.score(x_train_res,y_train_res))
print('testing accuracy',ada_lr.score(x_test_res,y_test_res))

In [None]:
ada_lr_test_accuracy=ada_lr.score(x_test_res,y_test_res)
ada_lr_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_ada_lr)

In [None]:
print(plot_confusion_matrix(estimator=ada_lr,X=x_test_res,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_ada_lr))

In [None]:
ada_lr_roc_auc=roc_auc_score(y_test_res,ada_lr.predict_proba(x_test_res)[:,1])
ada_lr_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['ada_lr'],'accuracy':[ada_lr_test_accuracy],'roc_auc':[ada_lr_roc_auc]}),ignore_index=True)

In [None]:
model_performance

In [None]:
# ada_knn

# ada_knn=AdaBoostClassifier(base_estimator=knn,random_state=10)
# ada_knn.fit(x_train_res,y_train_res)
# y_pred_ada_knn=ada_knn.predict(x_test_res)

In [None]:
# print('training accuracy',ada_knn.score(x_train_res,y_train_res))
# print('testing accuracy',ada_knn.score(x_test_res,y_test_res))

In [None]:
# ada_knn_test_accuracy=bagged_knn.score(x_test_res,y_test_res)
# ada_knn_test_accuracy

In [None]:
# confusion_matrix(y_test_res,y_pred_ada_knn)

In [None]:
# print(plot_confusion_matrix(estimator=ada_knn,X=x_test_res,y_true=y_test_res,
#                             cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
# plt.title('Confussion matrix for testing data',size=15)
# plt.show()

In [None]:
# print(classification_report(y_test_res,y_pred_ada_knn))

In [None]:
# ada_knn_roc_auc=roc_auc_score(y_test_res,ada_knn.predict_proba(x_test_res)[:,1])
# ada_knn_roc_auc

In [None]:
# model_performance=model_performance.append(pd.DataFrame({'Model':['ada_knn'],'accuracy':[ada_knn_test_accuracy],'roc_auc':[ada_knn_roc_auc]}),ignore_index=True)

In [None]:
# model_performance

In [None]:
# ada_g_nb

ada_g_nb=AdaBoostClassifier(base_estimator=g_nb,random_state=10)
ada_g_nb.fit(x_train_res,y_train_res)
y_pred_ada_g_nb=ada_g_nb.predict(x_test_res)

In [None]:
print('training accuracy',ada_g_nb.score(x_train_res,y_train_res))
print('testing accuracy',ada_g_nb.score(x_test_res,y_test_res))

In [None]:
ada_g_nb_test_accuracy=bagged_g_nb.score(x_test_res,y_test_res)
ada_g_nb_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_ada_g_nb)

In [None]:
print(plot_confusion_matrix(estimator=ada_g_nb,X=x_test_res,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_ada_g_nb))

In [None]:
ada_g_nb_roc_auc=roc_auc_score(y_test_res,ada_g_nb.predict_proba(x_test_res)[:,1])
ada_g_nb_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['ada_g_nb'],'accuracy':[ada_g_nb_test_accuracy],'roc_auc':[ada_g_nb_roc_auc]}),ignore_index=True)

In [None]:
model_performance

In [None]:
# ada_m_nb
ada_m_nb=AdaBoostClassifier(base_estimator=m_nb,random_state=10)
ada_m_nb.fit(x_train_res,y_train_res)
y_pred_ada_m_nb=ada_m_nb.predict(x_test_res)

In [None]:
print('training accuracy',ada_m_nb.score(x_train_res,y_train_res))
print('testing accuracy',ada_m_nb.score(x_test_res,y_test_res))

In [None]:
ada_m_nb_test_accuracy=bagged_m_nb.score(x_test_res,y_test_res)
ada_m_nb_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_ada_m_nb)

In [None]:
print(plot_confusion_matrix(estimator=ada_m_nb,X=x_test_res,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_ada_m_nb))

In [None]:
ada_m_nb_roc_auc=roc_auc_score(y_test_res,ada_m_nb.predict_proba(x_test_res)[:,1])
ada_m_nb_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['ada_m_nb'],'accuracy':[ada_m_nb_test_accuracy],'roc_auc':[ada_m_nb_roc_auc]}),ignore_index=True)

In [None]:
model_performance

## Gradient_boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# gradient_boost
gradient_b=GradientBoostingClassifier(random_state=10)
gradient_b.fit(x_train_res,y_train_res)
y_pred_gradient_b=gradient_b.predict(x_test_res)

In [None]:
print('training accuracy',gradient_b.score(x_train_res,y_train_res))
print('testing accuracy',gradient_b.score(x_test_res,y_test_res))

In [None]:
gradient_b_test_accuracy=gradient_b.score(x_test_res,y_test_res)
gradient_b_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_gradient_b)

In [None]:
print(plot_confusion_matrix(estimator=gradient_b,X=x_test_res,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_gradient_b))

In [None]:
gradient_b_roc_auc=roc_auc_score(y_test_res,gradient_b.predict_proba(x_test_res)[:,1])
gradient_b_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['gradient_b'],'accuracy':[gradient_b_test_accuracy],'roc_auc':[gradient_b_roc_auc]}),ignore_index=True)

In [None]:
model_performance

### Xgboost

In [None]:
# Titasta

In [None]:
import re
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

In [None]:
from xgboost import XGBClassifier

In [None]:
x_train_res_copy = x_train_res.copy()
x_test_res_copy = x_test_res.copy()

In [None]:
# gradient_boost
xg_b=XGBClassifier(random_state=10)

x_train_res_copy.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in x_train_res_copy.columns.values]

x_test_res_copy.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in x_test_res_copy.columns.values]

xg_b.fit(x_train_res_copy,y_train_res)

y_pred_xg_b=xg_b.predict(x_test_res_copy)

In [None]:
print('training accuracy',xg_b.score(x_train_res,y_train_res))
print('testing accuracy',xg_b.score(x_test_res,y_test_res))

In [None]:
xg_b_test_accuracy=xg_b.score(x_test_res,y_test_res)
xg_b_test_accuracy

In [None]:
confusion_matrix(y_test_res,y_pred_xg_b)

In [None]:
print(plot_confusion_matrix(estimator=xg_b,X=x_test_res,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_xg_b))

In [None]:
xg_b_roc_auc=roc_auc_score(y_test_res,xg_b.predict_proba(x_test_res)[:,1])
xg_b_roc_auc

In [None]:
model_performance=model_performance.append(pd.DataFrame({'Model':['xg_b'],'accuracy':[xg_b_test_accuracy],'roc_auc':[xg_b_roc_auc]}),ignore_index=True)

In [None]:
# model_performance = model_performance.drop(['level_0','index'],axis=1)

In [None]:
model_performance = model_performance.drop_duplicates(keep='first',ignore_index=True)
model_performance.style.background_gradient()

In [None]:
selective_model_performance = model_performance[(model_performance['accuracy']>0.80) & (model_performance['roc_auc']>0.80)]
selective_model_performance.drop_duplicates(subset=['accuracy','roc_auc'])


In [None]:
selective_model_performance.style.background_gradient()

### <font color='blue'> Plotting Model Performance metrics for different Models</font>

In [None]:
plt.figure(figsize=(30,15))
sns.lineplot(x=model_performance.index, y = 'accuracy',color='red',data= model_performance,label='accuracy')
sns.lineplot(x=model_performance.index, y = 'roc_auc',color='blue',data= model_performance,label='roc_auc')
plt.axhline(0.80,color='black',linestyle='--')
plt.xticks(np.arange(0, len(model_performance)+1, 1),fontsize=15)
plt.xlabel('Models',fontsize=25)
plt.ylabel('Accuracy / ROC_AUC',fontsize=25)
plt.grid()
plt.rc('xtick', labelsize=30) 
plt.rc('ytick', labelsize=30) 
plt.legend(fontsize=20)


### <font color='blue'>Inference:
    
    



### <font color='red'>DecisionTreeClassifier full model with Hyperparamter Tuning<font/>

### <font color='blue'> Plotting Model Performance metrics for different Models</font>

In [None]:
plt.figure(figsize=(25,8))
sns.lineplot(x=model_performance.index, y = 'accuracy',color='red',data= model_performance,label='accuracy')
sns.lineplot(x=model_performance.index, y = 'roc_auc',color='blue',data= model_performance,label='roc_auc')
plt.axhline(0.8,color='black',linestyle='--')
plt.axhline(0.9,color='black',linestyle='--')
plt.xticks(np.arange(0, len(model_performance)+1, 1))
plt.grid()
plt.legend()

# Future agenda:
* Try other models like XGBoost, ADABoost,SVM etc. 
* Follow the above procedure to hyper parameter tune rest of the selected models and eventually decide which model is to be chosen .
* Try to combine few features to make composite features.
* Check Optimal threshold for feature importance.
* Check cardinality issue.

In [None]:
#Rndom Forest and XG Boost HPT is left

In [None]:
#Random Forest with HPT

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in range(200,2000,200)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier(random_state=10)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(x_train_res, y_train_res)

In [None]:
rf_random.best_params_

In [None]:
# Extract best params from above and add below

In [None]:
model_rf_hpt=RandomForestClassifier(random_state=10,n_estimators=400,min_samples_split=5,
                                    max_features='auto',max_depth=60,bootstrap=False)
model_rf_hpt.fit(x_train_res,y_train_res)
y_pred_rf_full_model_balance_data=model_rf_hpt.predict(x_test_res)

print('training accuracy',model_rf_hpt.score(x_train_res,y_train_res))
print('testing accuracy',model_rf_hpt.score(x_test_res,y_test_res))

model_rf_hpt_test_accuracy=model_rf_hpt.score(x_test_res,y_test_res)
model_rf_hpt_test_accuracy

confusion_matrix(y_test_res,y_pred_rf_full_model_balance_data)

print(plot_confusion_matrix(estimator=model_rf_hpt,X=x_test_res,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

print(classification_report(y_test_res,y_pred_rf_full_model_balance_data))

model_rf_hpt_roc_auc=roc_auc_score(y_test_res,model_rf_hpt.predict_proba(x_test_res)[:,1])
model_rf_hpt_roc_auc

In [None]:
# XGB with HPT

In [None]:
# Titasta

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in range(200,2000,200)]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

#learning rate
learning_rate = [0.00001,0.0001,0.001,0.01,0.1]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'learning_rate':learning_rate
               }
print(random_grid)

In [None]:
xgb_model = XGBClassifier(random_state=10)

# use GridSearchCV() to find the optimal value of the hyperparameters
# estimator: pass the XGBoost classifier model
# param_grid: pass the list 'tuned_parameters'
# cv: number of folds in k-fold i.e. here cv = 3
# scoring: pass a measure to evaluate the model on test set
xgb_grid = RandomizedSearchCV(estimator = xgb_model, param_distributions = random_grid,n_jobs=-1, cv = 3, n_iter = 100,scoring = 'accuracy')

# fit the model on xtrain and y_train using fit()
xgb_grid.fit(x_train_res_copy, y_train_res)



In [None]:
print('Best parameters for XGBoost classifier: ', xgb_grid.best_params_, '\n')

In [None]:
# Titasta

In [None]:
# gradient_boost with HPT
xg_b=XGBClassifier(random_state=10,###????)

x_train_res_copy.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in x_train_res_copy.columns.values]

x_test_res_copy.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in x_test_res_copy.columns.values]

xg_b.fit(x_train_res_copy,y_train_res)

y_pred_xg_b=xg_b.predict(x_test_res_copy)

In [None]:
####################################### FINAL MODEL SELECTION ################################################

In [None]:
# Stacked classifier full model with DT and RF

In [None]:
from sklearn.ensemble import StackingClassifier,GradientBoostingClassifier

In [None]:
base_learners = [('Decision Tree', DecisionTreeClassifier(min_samples_split=41,min_samples_leaf=63,
                                                          max_depth=36,criterion='entropy')),
                 ('Random Forest', RandomForestClassifier(random_state=10,n_estimators=400,min_samples_split=5,
                                    max_features='auto',max_depth=60,bootstrap=False))]

In [None]:
stack_model_gdBoost = StackingClassifier(estimators = base_learners, final_estimator = GradientBoostingClassifier(random_state = 8))

In [None]:
stack_model_gdBoost.fit(x_train_res, y_train_res)

In [None]:
y_pred = stack_model_gdBoost.predict(x_test_res)

In [None]:
print('training accuracy',stack_model_gdBoost.score(x_train_res,y_train_res))
print('testing accuracy',stack_model_gdBoost.score(x_test_res,y_test_res))

In [None]:
print(plot_confusion_matrix(estimator=stack_model_gdBoost,X=x_test_res,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred))

In [None]:
model_rf_hpt_roc_auc=roc_auc_score(y_test_res,stack_model_gdBoost.predict_proba(x_test_res)[:,1])
model_rf_hpt_roc_auc

In [None]:
import eli5
from eli5.sklearn import PermutationImportance
stack_model_gdBoost.fit(x_train_res, y_train_res)
perm = PermutationImportance(stack_model_gdBoost, random_state=10).fit(x_test_res, y_test_res)
eli5.show_weights(perm, feature_names = x_train_res.columns.tolist())

In [None]:
# 20 most important features considered

In [None]:
imp_features = ['Annual Income','Home Ownership_Home Mortgage','Credit Score','Current Loan Amount','Home Ownership_Rent','Home Ownership_Own Home','Term_Short Term','Years in current job_10+ years','Purpose_Debt Consolidation','Monthly Debt','Current Credit Balance','Number of Credit Problems','Maximum Open Credit','Years in current job_2 years','Years in current job_3 years','Number of Open Accounts','Years of Credit History','Years in current job_< 1 year','Months since last delinquent','Years in current job_4 years']

In [None]:
# Stacked classifier model with selective features DT and RF 

In [None]:
x_train_imp=x_train_res[imp_features]
x_test_imp=x_test_res[imp_features]

base_learners = [('Decision Tree', DecisionTreeClassifier(min_samples_split=41,min_samples_leaf=63,
                                                          max_depth=36,criterion='entropy')),
                 ('Random Forest', RandomForestClassifier(random_state=10,n_estimators=400,min_samples_split=5,
                                    max_features='auto',max_depth=60,bootstrap=False))]

stack_model_gdBoost_partial = StackingClassifier(estimators = base_learners, final_estimator = GradientBoostingClassifier(random_state = 8))

In [None]:
stack_model_gdBoost_partial.fit(x_train_imp, y_train_res)

y_pred_imp_feature = stack_model_gdBoost_partial.predict(x_test_imp)

In [None]:
print('training accuracy',stack_model_gdBoost_partial.score(x_train_imp,y_train_res))
print('testing accuracy',stack_model_gdBoost_partial.score(x_test_imp,y_test_res))

In [None]:
confusion_matrix(y_test_res,y_pred_imp_feature)

In [None]:
print(plot_confusion_matrix(estimator=stack_model_gdBoost_partial,X=x_test_imp,y_true=y_test_res,
                            cmap="YlGnBu",colorbar=False,display_labels=['Charged off','fully paid']))
plt.title('Confussion matrix for testing data',size=15)
plt.show()

In [None]:
print(classification_report(y_test_res,y_pred_imp_feature))

In [None]:
model_roc_auc=roc_auc_score(y_test_res,stack_model_gdBoost_partial.predict_proba(x_test_imp)[:,1])
model_roc_auc