In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy  as np
import pandas as pd
import seaborn as sns
import re
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
dtypes = {'issue_d': 'str', 'last_pymnt_d': 'str'}
parse_dates = ['issue_d', 'last_pymnt_d']
accepted = pd.read_csv("C:\\Users\\nilli\\Downloads\\accepted_b_2015_clean.csv",
                       dtype=dtypes, parse_dates=parse_dates)

In [4]:
accepted.select_dtypes(include = 'object').columns

Index(['term', 'grade', 'sub_grade', 'emp_title', 'emp_length',
       'home_ownership', 'verification_status', 'loan_status', 'pymnt_plan',
       'url', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line',
       'initial_list_status', 'last_credit_pull_d', 'application_type',
       'hardship_flag', 'disbursement_method', 'debt_settlement_flag'],
      dtype='object')

In [5]:
accepted["term"] = accepted["term"].apply(lambda x: re.sub('months', '',x)).astype('int64')  

In [6]:
accepted["emp_length"] = accepted['emp_length'].apply(lambda x: re.sub('years', '',x))#.astype('int64')

In [7]:
accepted["emp_length"] = accepted['emp_length'].apply(lambda x: re.sub('year', '',x))

In [8]:
accepted["emp_length"] = accepted['emp_length'].apply(lambda x: re.sub('< 1 ', '0',x))

In [9]:
accepted["emp_length"] = accepted["emp_length"].str.replace(r'\D', '', ).astype('int64')

In [10]:
# accepted["emp_length"].unique()
accepted["emp_length"].value_counts()/ len(accepted['emp_length'])

10    0.319656
2     0.134311
3     0.078722
0     0.078045
5     0.066406
1     0.063732
4     0.060294
6     0.056297
7     0.056264
8     0.048055
9     0.038217
Name: emp_length, dtype: float64

In [11]:
accepted["emp_length"].describe()

count    453647.000000
mean          5.798310
std           3.637896
min           0.000000
25%           2.000000
50%           6.000000
75%          10.000000
max          10.000000
Name: emp_length, dtype: float64

In [12]:
#Let's create categories for annual_income since most of the bad loans are located below 100k
accepted['employ_cat'] = np.nan
lst = [accepted]

for col in lst:
    col.loc[col["emp_length"] <3, 'employ_cat'] = 'under_three'
    col.loc[(col["emp_length"] >= 3) & (col["emp_length"] <= 9), 'employ_cat'] = 'between_three_ten'
    col.loc[col["emp_length"] >= 10, 'employ_cat'] = 'Over_ten'

In [13]:
accepted['employ_cat'].value_counts()

between_three_ten    183389
Over_ten             145011
under_three          125247
Name: employ_cat, dtype: int64

In [16]:
#categories for annual_income since most of the bad loans are located below 100k
accepted['income_cat'] = np.nan
lst = [accepted]

for col in lst:
    col.loc[col['annual_inc'] <= 100000, 'income_cat'] = 'Low'
    col.loc[(col['annual_inc'] > 100000) & (col['annual_inc'] <= 200000), 'income_cat'] = 'Medium'
    col.loc[col['annual_inc'] > 200000, 'income_cat'] = 'High'

In [77]:
accepted['income_cat'].value_counts()

1    378514
2     67606
3      7527
Name: income_cat, dtype: int64

In [18]:
accepted['issue_d'].head()
dt_series = pd.to_datetime(accepted['issue_d'])
accepted['year'] = dt_series.dt.year

In [19]:
accepted.groupby(['income_cat', 'purpose'], as_index=False).int_rate.mean()
group_dti_purpose = accepted.groupby(['income_cat', 'purpose'], as_index=False).funded_amnt.mean()
loan_a = group_dti_purpose['funded_amnt'].values

In [20]:
group_dti_purpose 

Unnamed: 0,income_cat,purpose,funded_amnt
0,High,car,14267.857143
1,High,credit_card,25292.954244
2,High,debt_consolidation,24798.220043
3,High,educational,11000.0
4,High,home_improvement,23745.907738
5,High,house,26466.304348
6,High,major_purchase,19280.585106
7,High,medical,17331.701031
8,High,moving,19066.363636
9,High,other,20982.611111


In [21]:
accepted.groupby(['income_cat', 'purpose'], as_index=False).int_rate.mean()

Unnamed: 0,income_cat,purpose,int_rate
0,High,car,10.453
1,High,credit_card,11.792971
2,High,debt_consolidation,13.412477
3,High,educational,11.42
4,High,home_improvement,13.035308
5,High,house,15.745362
6,High,major_purchase,12.733777
7,High,medical,14.969691
8,High,moving,16.211273
9,High,other,15.770333


In [22]:
dc= accepted[['term', 'grade', "emp_length",'purpose','year','income_cat']]
dc

Unnamed: 0,term,grade,emp_length,purpose,year,income_cat
0,36,A,8,credit_card,2014,Low
1,60,C,10,debt_consolidation,2014,Low
2,36,C,10,debt_consolidation,2014,Low
3,36,C,0,debt_consolidation,2014,Low
4,60,D,6,credit_card,2014,Low
...,...,...,...,...,...,...
453642,36,D,7,debt_consolidation,2012,Low
453643,36,C,3,small_business,2012,Low
453644,36,C,3,small_business,2012,Medium
453645,36,D,2,other,2012,Low


In [23]:
accepted.select_dtypes(include = ['int64', 'float64']).columns

Index(['id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'emp_length', 'annual_inc', 'dti', 'delinq_2yrs',
       'fico_range_low', 'fico_range_high', 'inq_last_6mths',
       'mths_since_last_delinq', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'last_fico_range_high', 'last_fico_range_low',
       'collections_12_mths_ex_med', 'policy_code', 'acc_now_delinq',
       'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_inq',
       'num_acct

In [24]:
#accepted['fico_mid'] = (accepted['fico_range_high'] + accepted['fico_range_low'])/2

In [25]:
# dn =accepted[['funded_amnt','int_rate','annual_inc','term', 'dti','delinq_2yrs', 'fico_mid']]
#dn =accepted[['funded_amnt','int_rate','annual_inc','term', 'dti', 'fico_range_low','emp_length', 'fico_range_high']]



In [26]:
#dn.corr()

In [27]:
grade_ord = {'A': 7,'B': 6,'C': 5,'D': 4,'E': 3,'F': 2,'G': 1}
accepted['grade']= accepted['grade'].map(grade_ord)               

In [28]:
# accepted['grade'] = gradeo

In [29]:
income_cat_ord = {'Low':1, 'Medium':2, 'High':3}
accepted['income_cat'] = accepted['income_cat'].map(income_cat_ord) 

In [30]:
loan_status_cat_ord = {'Defaulted':0, 'FullyPaid':1}
accepted['loan_status'] = accepted['loan_status'].map(loan_status_cat_ord) 

In [31]:
# df =accepted[['funded_amnt','int_rate','annual_inc','term', 'dti','delinq_2yrs', 
#               'fico_mid','grade', "emp_length",'purpose','income_cat']]

# df =accepted[['funded_amnt','int_rate','annual_inc','term', 'dti','grade','purpose',
#               'fico_range_low','emp_length', 'fico_range_high','loan_status']]

df =accepted[['funded_amnt','int_rate','annual_inc','term', 'dti',
              'fico_range_low','emp_length', 'fico_range_high']]

In [32]:
#####
dnn =accepted[['funded_amnt','int_rate','annual_inc','term', 'dti', 'fico_range_low','emp_length', 'fico_range_high','grade','loan_status']]
dnn

Unnamed: 0,funded_amnt,int_rate,annual_inc,term,dti,fico_range_low,emp_length,fico_range_high,grade,loan_status
0,10400.0,6.99,58000.0,36,14.92,710.0,8,714.0,7,0
1,15000.0,12.39,78000.0,60,12.03,750.0,10,754.0,5,1
2,9600.0,13.66,69000.0,36,25.81,680.0,10,684.0,5,1
3,7650.0,13.66,50000.0,36,34.81,685.0,0,689.0,5,0
4,21425.0,15.59,63800.0,60,18.49,685.0,6,689.0,4,1
...,...,...,...,...,...,...,...,...,...,...
453642,20500.0,16.77,60000.0,36,16.40,700.0,7,704.0,4,1
453643,15000.0,15.27,57600.0,36,8.35,680.0,3,684.0,5,1
453644,35000.0,15.96,160000.0,36,4.90,720.0,3,724.0,5,1
453645,12000.0,16.29,35000.0,36,12.93,675.0,2,679.0,4,0


In [33]:
dnn.corr()

Unnamed: 0,funded_amnt,int_rate,annual_inc,term,dti,fico_range_low,emp_length,fico_range_high,grade,loan_status
funded_amnt,1.0,0.163956,0.371911,0.401712,0.053609,0.102555,0.129892,0.102554,-0.157811,-0.057663
int_rate,0.163956,1.0,-0.046972,0.445421,0.157369,-0.502135,0.026557,-0.502132,-0.951612,-0.228207
annual_inc,0.371911,-0.046972,1.0,0.067911,-0.189287,0.09471,0.0935,0.09471,0.039077,0.049932
term,0.401712,0.445421,0.067911,1.0,0.081117,-0.004186,0.084482,-0.004186,-0.450824,-0.178129
dti,0.053609,0.157369,-0.189287,0.081117,1.0,-0.111698,0.038574,-0.1117,-0.1435,-0.089888
fico_range_low,0.102555,-0.502135,0.09471,-0.004186,-0.111698,1.0,0.010825,1.0,0.467726,0.117705
emp_length,0.129892,0.026557,0.0935,0.084482,0.038574,0.010825,1.0,0.010826,-0.015946,0.006993
fico_range_high,0.102554,-0.502132,0.09471,-0.004186,-0.1117,1.0,0.010826,1.0,0.467723,0.117704
grade,-0.157811,-0.951612,0.039077,-0.450824,-0.1435,0.467726,-0.015946,0.467723,1.0,0.230671
loan_status,-0.057663,-0.228207,0.049932,-0.178129,-0.089888,0.117705,0.006993,0.117704,0.230671,1.0


In [34]:
# from sklearn.preprocessing import OneHotEncoder

# #dummify purpose column
# purpose_ohe = OneHotEncoder( drop='first', sparse = False )
# purpose_enc = purpose_ohe.fit_transform( df[['purpose']] )
# purpose_enc = pd.DataFrame( purpose_enc, columns=purpose_ohe.get_feature_names(['purpose']) )
# df = pd.concat( (df.drop(["purpose"], axis=1).reset_index(drop = True), purpose_enc), axis = 1 )


In [35]:
df.head()

Unnamed: 0,funded_amnt,int_rate,annual_inc,term,dti,fico_range_low,emp_length,fico_range_high
0,10400.0,6.99,58000.0,36,14.92,710.0,8,714.0
1,15000.0,12.39,78000.0,60,12.03,750.0,10,754.0
2,9600.0,13.66,69000.0,36,25.81,680.0,10,684.0
3,7650.0,13.66,50000.0,36,34.81,685.0,0,689.0
4,21425.0,15.59,63800.0,60,18.49,685.0,6,689.0


In [36]:
df.shape

(453647, 8)

In [37]:
X = df

In [38]:
# accepted['loan_status']


In [39]:
accepted['loan_status'].value_counts()

1    376150
0     77497
Name: loan_status, dtype: int64

In [40]:
Y =accepted['loan_status']


In [41]:
#np.random.seed(0)
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split( X,Y, test_size = 0.2 ,random_state=42 )
print(Xtrain.shape)
print(ytrain.shape)

(362917, 8)
(362917,)


In [42]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
Xtrain = pd.DataFrame( ss.fit_transform(Xtrain), columns = Xtrain.columns)
Xtrain.head()
Xtest = pd.DataFrame( ss.fit_transform(Xtest), columns = Xtest.columns)
Xtest.head()

Unnamed: 0,funded_amnt,int_rate,annual_inc,term,dti,fico_range_low,emp_length,fico_range_high
0,0.220601,0.870349,-0.191115,-0.586438,-1.013048,-0.688897,1.155058,-0.688893
1,0.099497,0.870349,-0.297576,-0.586438,-1.905528,-1.174373,1.155058,-1.174365
2,-0.506023,0.645894,0.148799,1.705211,0.794892,-0.850723,1.155058,-0.850717
3,1.673847,2.546883,0.5397,1.705211,0.492292,-0.041595,-0.768767,-0.041597
4,-0.566575,0.48328,-0.225107,-0.586438,1.600549,-0.850723,-1.318431,-0.850717


In [43]:
from sklearn. linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
#logit = LogisticRegression( class_weight='balanced')
#logit = LogisticRegression( )
logit = LogisticRegression( class_weight='balanced',multi_class='ovr',solver='liblinear')
param_grid={"C":np.logspace(-3,3,7)}
#param_grid={"C":np.linspace(1,1000,20)}
#param_grid={"C":np.linspace(10,1000,20)}
logit_cv=GridSearchCV(logit, param_grid, cv =5, n_jobs = -1)
logit_cv.fit(Xtrain,ytrain)


# C=1.0, class_weight=None, dual=False, fit_intercept=True,
#           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
#           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
#           verbose=0, warm_start=False
            
           # solver='lbfgs',

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight='balanced',
                                          dual=False, fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='ovr',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [44]:
#best hyperparemeter
logit_cv.best_params_ 


{'C': 1.0}

{'C': 53.578947368421055}

In [45]:
from sklearn.metrics import classification_report, roc_auc_score

In [46]:
roc=roc_auc_score(ytest, logit_cv.predict_proba(Xtest)[:,1])
roc

0.6929786859230131

In [47]:
# auc = roc_auc_score(Y,logit_cv.predict(X))
# auc

In [48]:
logit_cv.predict_proba(Xtest)

array([[0.5089866 , 0.4910134 ],
       [0.50439092, 0.49560908],
       [0.65675965, 0.34324035],
       ...,
       [0.47516339, 0.52483661],
       [0.66557658, 0.33442342],
       [0.49486881, 0.50513119]])

In [49]:
#best score
logit_cv.best_score_

0.6484926344708672

In [50]:
logit_cv.score(Xtrain,ytrain)

0.6485587613696796

In [51]:
logit_cv.score(Xtest,ytest)

0.6509203130166428

In [52]:
from sklearn.metrics import confusion_matrix,classification_report
confusion_matrix(ytest, logit_cv.predict(Xtest))

array([[ 9631,  5742],
       [25930, 49427]], dtype=int64)

In [53]:
print(classification_report(ytest, logit_cv.predict(Xtest)))

              precision    recall  f1-score   support

           0       0.27      0.63      0.38     15373
           1       0.90      0.66      0.76     75357

    accuracy                           0.65     90730
   macro avg       0.58      0.64      0.57     90730
weighted avg       0.79      0.65      0.69     90730



To handle the imbalance in the dataset,the class weight was as 'balanced' in the logistic regression model.
Thisimplies the sampling of the small class to match the weight of the bigger class

In [54]:
df.columns

Index(['funded_amnt', 'int_rate', 'annual_inc', 'term', 'dti',
       'fico_range_low', 'emp_length', 'fico_range_high'],
      dtype='object')

In [55]:
df2 =accepted[['funded_amnt','int_rate','annual_inc','term', 'dti', 'fico_range_low',
          'emp_length', 'fico_range_high','grade','loan_status']]

In [56]:
from sklearn.utils import resample
#Separate majority and minority classes
df_majority = df2[df2['loan_status']==1]
df_minority = df2[df2['loan_status']==0]
 
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=77497,     # to match minority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
 
# Display new class counts
df_downsampled.loan_status.value_counts()
# 1    49
# 0    49
# Name: balance, dtype: int64

1    77497
0    77497
Name: loan_status, dtype: int64

In [57]:
X2 = df_downsampled[['funded_amnt','int_rate','annual_inc','term', 'dti', 'fico_range_low',
          'emp_length', 'fico_range_high','grade']]
X2.shape

(154994, 9)

In [58]:
Y2 =df_downsampled['loan_status']
Y2.shape

(154994,)

In [59]:
from sklearn.model_selection import train_test_split

X2train, X2test, y2train, y2test = train_test_split( X2,Y2, test_size = 0.2 ,random_state=42 )
print(X2train.shape)
print(y2train.shape)

(123995, 9)
(123995,)


In [60]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X2train = pd.DataFrame( ss.fit_transform(X2train), columns = X2train.columns)
X2train.head()
X2test = pd.DataFrame( ss.fit_transform(X2test), columns = X2test.columns)
X2test.head()

Unnamed: 0,funded_amnt,int_rate,annual_inc,term,dti,fico_range_low,emp_length,fico_range_high,grade
0,1.242246,-0.381373,2.729055,-0.694058,0.295829,0.238535,-1.590592,0.238525,0.782769
1,0.343193,0.358689,-0.653875,1.440801,0.103482,-0.78715,-0.765402,-0.787137,0.055247
2,0.882624,0.795998,0.297574,1.440801,0.152834,-0.958098,1.160042,-0.95808,-0.672276
3,-0.555861,-0.448651,-0.653875,-0.694058,-0.154666,-0.10336,-1.315528,-0.103362,0.782769
4,0.403129,0.347476,-0.125292,1.440801,-0.145808,-0.445255,-1.040465,-0.44525,0.055247


In [67]:
from sklearn. linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
#logit = LogisticRegression( class_weight='balanced')
#logit = LogisticRegression( )
logit = LogisticRegression( class_weight='balanced',multi_class='ovr',solver='liblinear')
param_grid={"C":np.logspace(-3,3,7)}
#param_grid={"C":np.linspace(1,1000,20)}
#param_grid={"C":np.linspace(10,1000,20)}
logit_cv=GridSearchCV(logit, param_grid, cv =5, n_jobs = -1)
logit_cv.fit(X2train,y2train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight='balanced',
                                          dual=False, fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='ovr',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [68]:
logit_cv.best_params_ 

{'C': 1.0}

In [69]:
from sklearn.metrics import classification_report, roc_auc_score

In [70]:
roc2=roc_auc_score(y2test, logit_cv.predict_proba(X2test)[:,1])
roc2

0.6952235952493924

In [71]:
from sklearn.metrics import confusion_matrix,classification_report
confusion_matrix(y2test, logit_cv.predict(X2test))

array([[ 9727,  5892],
       [ 5220, 10160]], dtype=int64)

In [72]:
print(classification_report(y2test, logit_cv.predict(X2test)))

              precision    recall  f1-score   support

           0       0.65      0.62      0.64     15619
           1       0.63      0.66      0.65     15380

    accuracy                           0.64     30999
   macro avg       0.64      0.64      0.64     30999
weighted avg       0.64      0.64      0.64     30999

