In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler


%config InlineBackend.figure_format = 'svg'
%matplotlib inline



In [2]:
df = pd.read_csv("data/loan.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df_test = df.sample(50000, random_state=25)

In [4]:
df_test.shape

(50000, 74)

In [5]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 348281 to 351889
Data columns (total 74 columns):
id                             50000 non-null int64
member_id                      50000 non-null int64
loan_amnt                      50000 non-null float64
funded_amnt                    50000 non-null float64
funded_amnt_inv                50000 non-null float64
term                           50000 non-null object
int_rate                       50000 non-null float64
installment                    50000 non-null float64
grade                          50000 non-null object
sub_grade                      50000 non-null object
emp_title                      47101 non-null object
emp_length                     47499 non-null object
home_ownership                 50000 non-null object
annual_inc                     50000 non-null float64
verification_status            50000 non-null object
issue_d                        50000 non-null object
loan_status                    50

In [6]:
df_test.sample(5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
161186,3495613,4377902,8400.0,8400.0,8400.0,36 months,6.03,255.66,A,A1,...,,,,,,,30600.0,,,
357043,20048815,22311450,31575.0,31575.0,31275.0,36 months,14.49,1086.69,C,C4,...,,,,,,,63900.0,,,
187266,1705746,1998620,10075.0,10075.0,10075.0,36 months,17.77,363.08,D,D1,...,,,,,,,19800.0,,,
146161,4376393,5608659,16750.0,16750.0,16750.0,36 months,12.12,557.31,B,B3,...,,,,,,,22100.0,,,
145904,4434907,5667087,10000.0,10000.0,10000.0,36 months,15.31,348.18,C,C2,...,,,,,,,17600.0,,,


In [7]:
df_test.recoveries.value_counts()

0.00        48559
453.04          2
400.00          2
12.00           2
12.58           2
996.74          2
10.30           2
10.10           2
792.01          1
1150.30         1
5137.34         1
2031.56         1
725.56          1
2812.40         1
955.08          1
942.93          1
1057.26         1
4368.57         1
1273.30         1
1953.01         1
2926.29         1
15958.26        1
2088.89         1
406.42          1
304.52          1
4090.34         1
10.86           1
4574.36         1
1882.36         1
2820.17         1
            ...  
460.93          1
523.03          1
747.00          1
2769.36         1
310.94          1
2329.07         1
403.38          1
742.94          1
1362.12         1
1608.27         1
1517.32         1
1573.14         1
2729.74         1
170.48          1
1143.15         1
3090.57         1
2966.01         1
109.25          1
22.50           1
1158.20         1
807.51          1
2785.30         1
1492.50         1
237.09          1
1432.94   

In [8]:
df_test.loan_status.value_counts()

Current                                                33939
Fully Paid                                             11702
Charged Off                                             2590
Late (31-120 days)                                       614
Issued                                                   440
In Grace Period                                          345
Late (16-30 days)                                        160
Does not meet the credit policy. Status:Fully Paid        92
Default                                                   78
Does not meet the credit policy. Status:Charged Off       40
Name: loan_status, dtype: int64

In [9]:
df_test.grade.value_counts()

B    14392
C    13863
A     8277
D     7909
E     3904
F     1334
G      321
Name: grade, dtype: int64

In [10]:
df_test.verification_status.value_counts()

Source Verified    18640
Verified           16381
Not Verified       14979
Name: verification_status, dtype: int64

In [50]:
df_short = df_test.loc[:,["loan_status", "funded_amnt_inv", "term", "int_rate",
                           "installment", "emp_title", "emp_length", "home_ownership",
                            "annual_inc", "purpose", "addr_state", "grade", "delinq_2yrs",
                             "mths_since_last_delinq", "total_rec_late_fee"]]

In [51]:
df_short.head()

Unnamed: 0,loan_status,funded_amnt_inv,term,int_rate,installment,emp_title,emp_length,home_ownership,annual_inc,purpose,addr_state,grade,delinq_2yrs,mths_since_last_delinq,total_rec_late_fee
348281,Current,6650.0,36 months,14.49,228.87,Reseach Associate,< 1 year,RENT,36000.0,car,CA,C,0.0,,0.0
440716,Late (31-120 days),5000.0,36 months,9.67,160.57,Executive Assistant,4 years,RENT,67000.0,credit_card,NY,B,1.0,13.0,0.0
84524,Fully Paid,4000.0,36 months,14.3,137.3,Mental Health worker,10+ years,RENT,50000.0,credit_card,CA,C,1.0,20.0,0.0
522105,Current,21000.0,36 months,9.99,677.52,Electrician,< 1 year,RENT,88000.0,credit_card,CA,B,0.0,,0.0
490550,Current,18400.0,36 months,10.64,599.27,Server,10+ years,RENT,40000.0,debt_consolidation,TN,B,0.0,59.0,0.0


In [52]:
df_short.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 348281 to 351889
Data columns (total 15 columns):
loan_status               50000 non-null object
funded_amnt_inv           50000 non-null float64
term                      50000 non-null object
int_rate                  50000 non-null float64
installment               50000 non-null float64
emp_title                 47101 non-null object
emp_length                47499 non-null object
home_ownership            50000 non-null object
annual_inc                50000 non-null float64
purpose                   50000 non-null object
addr_state                50000 non-null object
grade                     50000 non-null object
delinq_2yrs               49998 non-null float64
mths_since_last_delinq    24442 non-null float64
total_rec_late_fee        50000 non-null float64
dtypes: float64(7), object(8)
memory usage: 6.1+ MB


In [53]:
df_short.term.value_counts()

 36 months    35164
 60 months    14836
Name: term, dtype: int64

In [54]:
df_short["term_yrs"] = df_short.term.str.extract('(\d\d)')

In [55]:
df_short["term_yrs"] = pd.to_numeric(df_short.term_yrs)/12

In [56]:
df_short.drop("term",axis=1,inplace=True)

In [57]:
df_short.emp_length.value_counts()

10+ years    16248
2 years       4408
< 1 year      4066
3 years       3909
1 year        3306
5 years       3082
4 years       2963
7 years       2577
8 years       2501
6 years       2483
9 years       1956
Name: emp_length, dtype: int64

In [58]:
df_short["emp_length_yrs"]= 0

temp = df_short["emp_length"] == "< 1 year"
df_short.loc[temp,"emp_length_yrs"] = 0

temp = df_short["emp_length"] == "1 year"
df_short.loc[temp,"emp_length_yrs"] = 1

temp = df_short.emp_length == "10+ years"
df_short.loc[temp,"emp_length_yrs"] = 10

for i in range(2,10):
    temp = df_short.emp_length == f"{i} years"
    df_short.loc[temp,"emp_length_yrs"] = i
    
temp = df_short.emp_length.isnull()
df_short.loc[temp,"emp_length_yrs"] = np.median(df_short.emp_length_yrs)

In [59]:
df_short.emp_length_yrs.value_counts()

10.0    16248
6.0      4984
2.0      4408
0.0      4066
3.0      3909
1.0      3306
5.0      3082
4.0      2963
7.0      2577
8.0      2501
9.0      1956
Name: emp_length_yrs, dtype: int64

In [60]:
sum(df_short.emp_length_yrs.isnull())

0

In [61]:
df_short.drop(columns="emp_length", inplace=True)

In [62]:
df_short.head()

Unnamed: 0,loan_status,funded_amnt_inv,int_rate,installment,emp_title,home_ownership,annual_inc,purpose,addr_state,grade,delinq_2yrs,mths_since_last_delinq,total_rec_late_fee,term_yrs,emp_length_yrs
348281,Current,6650.0,14.49,228.87,Reseach Associate,RENT,36000.0,car,CA,C,0.0,,0.0,3.0,0.0
440716,Late (31-120 days),5000.0,9.67,160.57,Executive Assistant,RENT,67000.0,credit_card,NY,B,1.0,13.0,0.0,3.0,4.0
84524,Fully Paid,4000.0,14.3,137.3,Mental Health worker,RENT,50000.0,credit_card,CA,C,1.0,20.0,0.0,3.0,10.0
522105,Current,21000.0,9.99,677.52,Electrician,RENT,88000.0,credit_card,CA,B,0.0,,0.0,3.0,0.0
490550,Current,18400.0,10.64,599.27,Server,RENT,40000.0,debt_consolidation,TN,B,0.0,59.0,0.0,3.0,10.0


In [63]:
df_short.grade.value_counts()

B    14392
C    13863
A     8277
D     7909
E     3904
F     1334
G      321
Name: grade, dtype: int64

In [64]:
df_short["credit_score"] = 0
rank = ["A","B","C","D","E","F","G"]

for i in range(len(rank)):
    temp = df_short.grade == rank[i]
    df_short.loc[temp,"credit_score"] = len(rank)-i

In [65]:
df_short.credit_score.value_counts()

6    14392
5    13863
7     8277
4     7909
3     3904
2     1334
1      321
Name: credit_score, dtype: int64

In [66]:
df_short.drop(columns="grade",inplace=True)

In [67]:
df_short.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 348281 to 351889
Data columns (total 15 columns):
loan_status               50000 non-null object
funded_amnt_inv           50000 non-null float64
int_rate                  50000 non-null float64
installment               50000 non-null float64
emp_title                 47101 non-null object
home_ownership            50000 non-null object
annual_inc                50000 non-null float64
purpose                   50000 non-null object
addr_state                50000 non-null object
delinq_2yrs               49998 non-null float64
mths_since_last_delinq    24442 non-null float64
total_rec_late_fee        50000 non-null float64
term_yrs                  50000 non-null float64
emp_length_yrs            50000 non-null float64
credit_score              50000 non-null int64
dtypes: float64(9), int64(1), object(5)
memory usage: 6.1+ MB


In [68]:
df_short.emp_title.value_counts()

Teacher                                    746
Manager                                    587
Owner                                      352
Registered Nurse                           314
Supervisor                                 290
RN                                         253
Sales                                      242
Project Manager                            227
Office Manager                             198
General Manager                            185
manager                                    179
Driver                                     179
teacher                                    173
Director                                   171
owner                                      167
Engineer                                   152
President                                  148
driver                                     125
Attorney                                   121
Administrative Assistant                   121
Vice President                             120
Accountant   

In [69]:
df_short.home_ownership.value_counts()

MORTGAGE    24908
RENT        20176
OWN          4902
OTHER          10
NONE            4
Name: home_ownership, dtype: int64

In [70]:
df_short["own_home"] = 0

musk = df_short.home_ownership=="OWN"
df_short.loc[musk,"own_home"]=2
musk = df_short.home_ownership=="MORTGAGE"
df_short.loc[musk,"own_home"]=1

In [71]:
df_short.own_home.value_counts()

1    24908
0    20190
2     4902
Name: own_home, dtype: int64

In [73]:
df_short.loan_status.value_counts()

Current                                                33939
Fully Paid                                             11702
Charged Off                                             2590
Late (31-120 days)                                       614
Issued                                                   440
In Grace Period                                          345
Late (16-30 days)                                        160
Does not meet the credit policy. Status:Fully Paid        92
Default                                                   78
Does not meet the credit policy. Status:Charged Off       40
Name: loan_status, dtype: int64

In [74]:
good = ["Current", "Fully Paid", "Issued"]
df_short.loc[df_short.loan_status.isin(good), "loan_status"] = "Good"
df_short.loc[df_short.loan_status != "Good", "loan_status"] = "Bad"

In [75]:
df_short.loan_status.value_counts()

Good    46081
Bad      3919
Name: loan_status, dtype: int64

In [99]:
df_short.loan_status.head()

348281    Good
440716     Bad
84524     Good
522105    Good
490550    Good
Name: loan_status, dtype: object

In [76]:
from patsy import dmatrices

In [77]:
y,X=dmatrices('loan_status ~ funded_amnt_inv+ \
              int_rate+ \
              installment+ \
              annual_inc+ \
              term_yrs+ \
              emp_length_yrs+ \
              credit_score+ \
              own_home',
              data=df_short,return_type='dataframe')

In [106]:
y = y.iloc[:,1]

In [107]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25)

In [108]:
print(X_train.shape)
print(X_test.shape)

(35000, 9)
(15000, 9)


In [109]:
print(y_train.shape)
print(y_test.shape)

(35000,)
(15000,)


In [110]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.9210666666666667


In [111]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.9236


In [112]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.8654666666666667


In [113]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.9122


In [114]:
from sklearn.model_selection import cross_val_score

In [146]:
def accuracy_score(model,x,y,cv=10,scoring="accuracy"):
    score = cross_val_score(model,x,y,cv=cv,scoring=scoring).mean()
    return score

In [153]:
knn = KNeighborsClassifier(n_neighbors=9)
print(accuracy_score(knn,X_train,y_train,10,'accuracy'))
print(accuracy_score(knn,X_train,y_train,10,'precision'))
print(accuracy_score(knn,X_train,y_train,10,'recall'))
print(accuracy_score(knn,X_train,y_train,10,'f1'))
print(accuracy_score(knn,X_train,y_train,10,'roc_auc'))

0.9193143157224514
0.9202036302765022
0.9989443597046556
0.957958624400068
0.5387512926783338
