In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

In [41]:
df = pd.read_csv('loan_data.csv')

In [42]:
df.columns

Index(['credit.policy', 'purpose', 'int.rate', 'installment', 'log.annual.inc',
       'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
       'inq.last.6mths', 'delinq.2yrs', 'pub.rec', 'not.fully.paid'],
      dtype='object')

In [43]:
df.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [44]:
y = df.pop('not.fully.paid')

In [45]:
y.head

<bound method NDFrame.head of 0       0
1       0
2       0
3       0
4       0
       ..
9573    1
9574    1
9575    1
9576    1
9577    1
Name: not.fully.paid, Length: 9578, dtype: int64>

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
dtypes: float64(6), int64(6), object(1)
memory usage: 972.9+ KB


In [47]:
df['purpose'].value_counts()

debt_consolidation    3957
all_other             2331
credit_card           1262
home_improvement       629
small_business         619
major_purchase         437
educational            343
Name: purpose, dtype: int64

In [49]:
# convert purpose - from string to float

In [50]:
encoded_df = pd.get_dummies(df['purpose'])

In [51]:
encoded_df.head()

Unnamed: 0,all_other,credit_card,debt_consolidation,educational,home_improvement,major_purchase,small_business
0,0,0,1,0,0,0,0
1,0,1,0,0,0,0,0
2,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0
4,0,1,0,0,0,0,0


In [52]:
df.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0


In [53]:
df.drop('purpose', axis=1,inplace=True)

In [54]:
df.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0
2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0
3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0
4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0


In [55]:
modelling_df = pd.concat([encoded_df, df], axis=1)

In [56]:
modelling_df.head()

Unnamed: 0,all_other,credit_card,debt_consolidation,educational,home_improvement,major_purchase,small_business,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec
0,0,0,1,0,0,0,0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0
1,0,1,0,0,0,0,0,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0
2,0,0,1,0,0,0,0,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0
3,0,0,1,0,0,0,0,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0
4,0,1,0,0,0,0,0,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0


In [57]:
# simple logistic regression

In [58]:
train, test, y_train, y_test = train_test_split(modelling_df, y, test_size = 0.2, random_state=101)

In [59]:
train.shape

(7662, 19)

In [60]:
test.shape

(1916, 19)

In [61]:
lr = LogisticRegression()

In [62]:
lr.fit(train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [63]:
y_pred = lr.predict(test)

In [64]:
print(accuracy_score(y_test, y_pred))

0.8460334029227558


In [65]:
# Decision tree classifier

In [66]:
def fit_predict(train, test, y_train, y_test, scaler, max_depth,
               criterion = 'entropy', max_features=1, min_samples_split=4):
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)
    dt = DecisionTreeClassifier(criterion = criterion, max_depth=max_depth,
                               random_state= 42, max_features=max_features,
                               min_samples_split=min_samples_split)
    dt.fit(train_scaled, y_train)
    y_pred = dt.predict(test_scaled)
    print(accuracy_score(y_test, y_pred))

In [67]:
dt = DecisionTreeClassifier()
dt.fit(train, y_train)
y_pred = dt.predict(test)
print(accuracy_score(y_test, y_pred))

0.7317327766179541


In [68]:
# max depth tuning

In [69]:
for i in range(1,30):
    print("Accuracy score using max_depth = ", i, end = ':')
    fit_predict(train, test, y_train, y_test, StandardScaler(), i)

Accuracy score using max_depth =  1:0.8470772442588727
Accuracy score using max_depth =  2:0.8470772442588727
Accuracy score using max_depth =  3:0.8470772442588727
Accuracy score using max_depth =  4:0.8470772442588727
Accuracy score using max_depth =  5:0.8470772442588727
Accuracy score using max_depth =  6:0.8455114822546973
Accuracy score using max_depth =  7:0.8465553235908142
Accuracy score using max_depth =  8:0.8434237995824635
Accuracy score using max_depth =  9:0.8402922755741128
Accuracy score using max_depth =  10:0.837160751565762
Accuracy score using max_depth =  11:0.8355949895615866
Accuracy score using max_depth =  12:0.8319415448851775
Accuracy score using max_depth =  13:0.826722338204593
Accuracy score using max_depth =  14:0.8308977035490606
Accuracy score using max_depth =  15:0.81419624217119
Accuracy score using max_depth =  16:0.7875782881002088
Accuracy score using max_depth =  17:0.8021920668058455
Accuracy score using max_depth =  18:0.8016701461377871
Accur

In [70]:
# max features tuning

In [71]:
for i in np.arange(0.1,1.0,0.1):
    print('Accuracy score using max features =', i, end = ":")
    fit_predict(train, test,y_train, y_test, StandardScaler(), max_depth=5, max_features=i)

Accuracy score using max features = 0.1:0.8470772442588727
Accuracy score using max features = 0.2:0.8465553235908142
Accuracy score using max features = 0.30000000000000004:0.8382045929018789
Accuracy score using max features = 0.4:0.8413361169102297
Accuracy score using max features = 0.5:0.8470772442588727
Accuracy score using max features = 0.6:0.8434237995824635
Accuracy score using max features = 0.7000000000000001:0.8444676409185804
Accuracy score using max features = 0.8:0.8460334029227558
Accuracy score using max features = 0.9:0.8418580375782881


In [72]:
# min samples split tuning

In [73]:
for i in range(2, 10):
    print('Accuracy score using min samples split=', i, end=":")
    fit_predict(train, test, y_train, y_test, StandardScaler(), 5, max_features=0.5, min_samples_split=i)

Accuracy score using min samples split= 2:0.8470772442588727
Accuracy score using min samples split= 3:0.8470772442588727
Accuracy score using min samples split= 4:0.8470772442588727
Accuracy score using min samples split= 5:0.8470772442588727
Accuracy score using min samples split= 6:0.8470772442588727
Accuracy score using min samples split= 7:0.8470772442588727
Accuracy score using min samples split= 8:0.8470772442588727
Accuracy score using min samples split= 9:0.8470772442588727


In [74]:
# criterion tuning

In [75]:
for i in ['gini', 'entropy']:
    print("Accuracy score using criterion: ", i, end = ':')
    fit_predict(train, test, y_train, y_test, StandardScaler(), 5, max_features = 0.5, min_samples_split=3, criterion= i)

Accuracy score using criterion:  gini:0.8397703549060542
Accuracy score using criterion:  entropy:0.8470772442588727


In [76]:
def create_poly(train, test, degree):
    poly = PolynomialFeatures(degree = degree)
    train_poly = poly.fit_transform(train)
    test_poly = poly.fit_transform(test)
    return train_poly, test_poly

In [77]:
for degree in [1,2,3,4]:
    train_poly, test_poly = create_poly(train, test, degree)
    print("polynomial degree", degree)
    fit_predict(train_poly, test_poly, y_train, y_test, StandardScaler(), 5, max_features = 0.5,min_samples_split=3, criterion='entropy')
    print(10 *'-')

polynomial degree 1
0.842901878914405
----------
polynomial degree 2
0.837160751565762
----------
polynomial degree 3
0.8408141962421712
----------
polynomial degree 4
0.8376826722338204
----------


In [78]:
train_poly, test_poly = create_poly(train, test, 1)

In [79]:
fit_predict(train_poly, test_poly, y_train, y_test, StandardScaler(), 5, max_features=0.5, min_samples_split=3, criterion='entropy')

0.842901878914405


In [80]:
# Random forest classifier

In [82]:
from sklearn.ensemble import RandomForestClassifier

In [83]:
rf = RandomForestClassifier(criterion='gini', oob_score=True)

In [84]:
rf.fit(train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [85]:
pred_rf = rf.predict(test)

In [86]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, pred_rf))

0.8444676409185804


In [88]:
# parameter tuning using Grid Search 

In [89]:
from sklearn.model_selection import GridSearchCV

In [90]:
params = {
    'n_estimators': [200,500,700],
    'max_depth': [10,15,18,20],
    'min_samples_leaf': [3,5,7]
}

In [93]:
gs = GridSearchCV(rf, params, cv=3, verbose=3, n_jobs=3)

In [94]:
gs.fit(train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  1.4min
[Parallel(n_jobs=3)]: Done 108 out of 108 | elapsed:  5.9min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=True, random_state=None,
                                 

In [95]:
gs.best_params_

{'max_depth': 15, 'min_samples_leaf': 7, 'n_estimators': 200}

In [96]:
gs.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=7, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [97]:
# confusion matrix

In [98]:
from sklearn.metrics import confusion_matrix, classification_report

In [99]:
print(confusion_matrix(y_test,pred_rf))

[[1618    5]
 [ 293    0]]


In [100]:
print(classification_report(y_test, pred_rf))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92      1623
           1       0.00      0.00      0.00       293

    accuracy                           0.84      1916
   macro avg       0.42      0.50      0.46      1916
weighted avg       0.72      0.84      0.78      1916



In [101]:
# retraining the model with best estimators

In [102]:
rf1 = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=7, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [103]:
rf1.fit(train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=7, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [104]:
rf1.oob_score_

0.8377708170190551

In [105]:
pred_rf1 = rf1.predict(test)

In [106]:
print(accuracy_score(y_test, pred_rf1))

0.8465553235908142


In [107]:
rf1.feature_importances_

array([0.01052798, 0.00506202, 0.01285918, 0.00169063, 0.00393122,
       0.00142577, 0.01971086, 0.03679339, 0.1124729 , 0.11274196,
       0.10111725, 0.09230417, 0.08687504, 0.10411224, 0.10221962,
       0.106632  , 0.07528383, 0.00781293, 0.00642702])

In [108]:
sorted(list(zip(rf1.feature_importances_, train.columns)), reverse=True)

[(0.11274196070693109, 'installment'),
 (0.112472896697181, 'int.rate'),
 (0.10663199682957385, 'revol.util'),
 (0.10411223901649774, 'days.with.cr.line'),
 (0.10221962024204304, 'revol.bal'),
 (0.10111725298986507, 'log.annual.inc'),
 (0.09230416587536827, 'dti'),
 (0.08687504486742545, 'fico'),
 (0.07528383338964718, 'inq.last.6mths'),
 (0.036793385360500634, 'credit.policy'),
 (0.019710864832421005, 'small_business'),
 (0.0128591811638758, 'debt_consolidation'),
 (0.010527977719442087, 'all_other'),
 (0.007812928111679142, 'delinq.2yrs'),
 (0.0064270159792398735, 'pub.rec'),
 (0.0050620204467118875, 'credit_card'),
 (0.0039312182636221955, 'home_improvement'),
 (0.0016906251794005355, 'educational'),
 (0.001425772328574211, 'major_purchase')]

In [109]:
# overall we get an accuracy of 0.846 in predicting the lending score.