In [54]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report,confusion_matrix,roc_auc_score
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.model_selection import cross_val_score


In [55]:
train = pd.read_csv('train_modified.csv')

In [56]:
X = train.drop(columns = ['Disbursed','ID'])
y = train['Disbursed'].values.ravel()

In [53]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)

gbc = GradientBoostingClassifier()
gbc.fit(X_train,y_train)

pred = gbc.predict(X_test)
proba_pred = gbc.predict_proba(X_test)[:,1]

print(classification_report(y_test,pred))
print(roc_auc_score(y_test,proba_pred))

KeyboardInterrupt: 

In [None]:
cross_val_score(gbc,X,y,cv=4,scoring ='f1') #worst

In [8]:
pd.Series(gbc.feature_importances_,index = X.columns).sort_values(ascending = False)[:5]

Monthly_Income    0.332669
Var5              0.180331
Existing_EMI      0.166368
age               0.081106
Filled_Form_0     0.036072
dtype: float64

In [9]:
cross_val_score(gbc,X,y,cv=4,scoring ='roc_auc') #worst

array([0.82326532, 0.82107817, 0.85228913, 0.82565632])

In [10]:
pd.Series(gbc.feature_importances_,index = X.columns).sort_values(ascending = False)[:5]

Monthly_Income    0.332669
Var5              0.180331
Existing_EMI      0.166368
age               0.081106
Filled_Form_0     0.036072
dtype: float64

In [34]:
# first lets tune the model wrt n_estimators once we find high enough value for n_estimators we can proceed towards finding other estimators
param_test1 = {'n_estimators':np.arange(20,81,10)}
clf_grid_search1 = GradientBoostingClassifier(learning_rate =0.1,
                                              min_samples_split = 500,
                                              min_samples_leaf = 50,
                                              max_depth = 8,
                                              max_features = 'sqrt',
                                              subsample = 0.8
                                             )
grid_search1 = GridSearchCV(clf_grid_search1,param_grid = param_test1,scoring = 'roc_auc',cv=5,verbose = 34,n_jobs = -1)

In [35]:
grid_search1.fit(X,y)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   21.6s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=8,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=50, min_sa...      subsample=0.8, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': array([20, 30, 40, 50, 60, 70, 80])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=34)

In [36]:
grid_search1.best_params_

{'n_estimators': 60}

In [37]:
grid_search1.best_score_

0.8383373857198241

In [38]:
pd.DataFrame(grid_search1.cv_results_).T



Unnamed: 0,0,1,2,3,4,5,6
mean_fit_time,3.64108,4.76747,5.90024,7.04577,7.88844,9.01478,10.1599
std_fit_time,0.202035,0.158662,0.198599,0.14963,0.156455,0.264341,0.207947
mean_score_time,0.0428205,0.0475873,0.0575617,0.0680201,0.0832341,0.0810894,0.0854017
std_score_time,0.00513611,0.00413443,0.00450982,0.00468051,0.0213093,0.00658597,0.00679404
param_n_estimators,20,30,40,50,60,70,80
params,{'n_estimators': 20},{'n_estimators': 30},{'n_estimators': 40},{'n_estimators': 50},{'n_estimators': 60},{'n_estimators': 70},{'n_estimators': 80}
split0_test_score,0.825687,0.817626,0.815522,0.826911,0.83171,0.824332,0.816112
split1_test_score,0.827472,0.828591,0.825036,0.826994,0.830362,0.830699,0.832849
split2_test_score,0.840703,0.842643,0.844986,0.845035,0.848935,0.847544,0.850509
split3_test_score,0.845909,0.850286,0.851747,0.845874,0.847255,0.8484,0.843686


In [60]:
param_test2 = {'max_depth':np.arange(5,15,2),
               'min_samples_split':np.arange(200,1001,200)
              }
# we got n estimator as 60 lets substitute it and tune other parameters
clf_grid_search2 = GradientBoostingClassifier(learning_rate = 0.1,
                                              n_estimators = 60,
                                              max_features = 'sqrt',
                                              subsample = 0.8,
                                             )
grid_search2 = GridSearchCV(clf_grid_search2,param_grid = param_test2,cv=5,verbose = 40,n_jobs = -1,scoring = 'roc_auc')

In [61]:
grid_search2.fit(X,y)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   15.9s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sam...      subsample=0.8, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': array([ 5,  7,  9, 11, 13]), 'min_samples_split': array([ 200,  400,  600,  800, 1000])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=40)

In [67]:
grid_search2.best_score_

0.8363839709717107

In [68]:
grid_search2.best_params_

{'max_depth': 7, 'min_samples_split': 800}

In [87]:
param_test3 = {
               'min_samples_leaf':np.arange(30,71,10)
              }
# we got n estimator as 60 lets substitute it and tune other parameters
clf_grid_search3 = GradientBoostingClassifier(learning_rate = 0.1,
                                              n_estimators = 60,
                                              max_features = 'sqrt',
                                              subsample = 0.8,
                                              max_depth = 7,
                                              min_samples_split = 800
                                             )
grid_search3 = GridSearchCV(clf_grid_search3,param_grid = param_test3,cv=5,verbose = 40,n_jobs = -1,scoring = 'roc_auc')

In [88]:
grid_search3.fit(X,y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   22.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   29.9s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=7,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sam...      subsample=0.8, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'min_samples_leaf': array([30, 40, 50, 60, 70])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=40)

In [89]:
grid_search3.best_score_

0.83625454631656

In [90]:
grid_search3.best_params_

{'min_samples_leaf': 40}

In [93]:
param_test4 = {
               'max_features':np.arange(7,20,2)
              }
# we got n estimator as 60 lets substitute it and tune other parameters
clf_grid_search4 = GradientBoostingClassifier(learning_rate = 0.1,
                                              n_estimators = 60,
                                              max_features = 'sqrt',
                                              subsample = 0.8,
                                              max_depth = 7,
                                              min_samples_split = 800,
                                              min_samples_leaf = 40
                                  
                                             
                                             )
grid_search4 = GridSearchCV(clf_grid_search4,param_grid = param_test4,cv=5,verbose = 40,n_jobs = -1,scoring = 'roc_auc')

In [94]:
grid_search4.fit(X,y)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   22.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   33.0s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   33.9s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=7,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=40, min_sa...      subsample=0.8, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_features': array([ 7,  9, 11, 13, 15, 17, 19])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=40)

In [96]:
grid_search4.best_score_

0.8377024361750197

In [97]:
grid_search4.best_params_

{'max_features': 9}

In [101]:
param_test5 = {
               'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]
              }
# we got n estimator as 60 lets substitute it and tune other parameters
clf_grid_search5 = GradientBoostingClassifier(learning_rate = 0.1,
                                              n_estimators = 60,
                                              subsample = 0.8,
                                              max_depth = 7,
                                              min_samples_split = 800,
                                              min_samples_leaf = 40,
                                              max_features = 9
                                             )
grid_search4 = GridSearchCV(clf_grid_search5,param_grid = param_test5,cv=5,verbose = 40,n_jobs = -1,scoring = 'roc_auc')

In [102]:
grid_search4.fit(X,y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   27.5s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   33.6s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   34.1s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=7,
              max_features=9, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=40, min_samples...      subsample=0.8, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=40)

In [103]:
grid_search4.best_params_

{'subsample': 0.7}

In [104]:
grid_search4.best_score_

0.8375717339026726

In [134]:

# we got n estimator as 60 lets substitute it and tune other parameters
clf_tuned = GradientBoostingClassifier(learning_rate = 0.05,
                                              n_estimators = 100,
                                              subsample = 0.8,
                                              max_depth = 7,
                                              min_samples_split = 800,
                                              min_samples_leaf = 40,
                                              max_features = 9
                                             )
clf_tuned.fit(X_train,y_train)

pred = clf_tuned.predict(X_test)
prob = clf_tuned.predict_proba(X_test)[:,1]
y_test.shape

print(classification_report(y_test,pred))
print(roc_auc_score(y_test,prob))

from sklearn.model_selection import cross_validate,cross_val_score


cross_val_params_1 = cross_validate(clf_tuned,X,y,cv = 10,verbose = 40,n_jobs = -1,scoring = 'roc_auc')

pd.DataFrame(cross_val_params_1)



  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99     17105
         1.0       0.00      0.00      0.00       299

   micro avg       0.98      0.98      0.98     17404
   macro avg       0.49      0.50      0.50     17404
weighted avg       0.97      0.98      0.97     17404

0.8481100892676455


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:   16.3s remaining:   24.4s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   30.3s remaining:   30.3s
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   31.2s remaining:   20.8s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   31.3s remaining:   13.4s
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:   31.9s remaining:    8.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   45.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   45.5s finished


Unnamed: 0,fit_time,score_time,test_score,train_score
0,15.035654,0.050419,0.831507,0.88873
1,14.38251,0.050216,0.827173,0.889689
2,14.461257,0.050871,0.850127,0.889338
3,15.639527,0.051536,0.817147,0.892709
4,14.777629,0.050445,0.858326,0.887919
5,15.630012,0.051447,0.851609,0.891717
6,14.911692,0.046337,0.866044,0.891283
7,15.043274,0.058667,0.844128,0.892765
8,14.315552,0.0461,0.841679,0.88962
9,13.769304,0.048413,0.832778,0.891364


In [142]:
pd.Series(clf_tuned.feature_importances_,index = X_train.columns).sort_values(ascending = False)[:6]

Monthly_Income         0.253750
Existing_EMI           0.156107
Var5                   0.151878
age                    0.091141
Loan_Amount_Applied    0.052345
Var4                   0.028262
dtype: float64

In [144]:

# we got n estimator as 60 lets substitute it and tune other parameters
clf_tuned = GradientBoostingClassifier(learning_rate = 0.005,
                                              n_estimators = 1500,
                                              subsample = 0.8,
                                              max_depth = 7,
                                              min_samples_split = 1200,
                                              min_samples_leaf = 40,
                                              max_features = 9
                                             )
clf_tuned.fit(X_train,y_train)

pred = clf_tuned.predict(X_test)
prob = clf_tuned.predict_proba(X_test)[:,1]


print(classification_report(y_test,pred))
print(roc_auc_score(y_test,prob))



              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99     17105
         1.0       0.00      0.00      0.00       299

   micro avg       0.98      0.98      0.98     17404
   macro avg       0.49      0.50      0.50     17404
weighted avg       0.97      0.98      0.97     17404

0.8520915963667257


  'precision', 'predicted', average, warn_for)


In [None]:
from sklearn.model_selection import cross_validate,cross_val_score


cross_val_params_1 = cross_validate(clf_tuned,X,y,cv = 10,verbose = 40,n_jobs = -1,scoring = 'roc_auc')

pd.DataFrame(cross_val_params_1)



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:  3.4min remaining:  5.1min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  6.7min remaining:  6.7min
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  6.7min remaining:  4.5min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  6.7min remaining:  2.9min
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:  6.8min remaining:  1.7min
