# Loan Default Prediction
## Team Members:
* Harish Puvvada
* Vamsi Mohan Ramineedi

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing,metrics 
from IPython.core.display import HTML
pd.set_option("display.max_columns",75)
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import linear_model,svm
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix

In [2]:
df2012_13 = pd.read_csv(os.getenv('FDS')+'LoanStats_2012_to_2013.csv',low_memory=False,skiprows=1)
df2014 = pd.read_csv(os.getenv('FDS')+'LoanStats_2014.csv',low_memory=False,skiprows=1)

## Data Cleaning

In [3]:
dataset = pd.concat([df2012_13, df2014]) #merging 2007 to 2014 datasets
dataset = dataset.iloc[:,2:111]          #removing empty columns
empty_cols = [i for i in range(45,72)] #more empty columns
dataset = dataset.drop(dataset.columns[empty_cols],axis=1)
data_with_loanstatus_sliced = dataset[(dataset['loan_status']=="Fully Paid") | (dataset['loan_status']=="Charged Off")]
di = {"Fully Paid":0, "Charged Off":1}   #converting target variable to boolean
Dataset_withBoolTarget= data_with_loanstatus_sliced.replace({"loan_status": di})

In [4]:
Dataset_withBoolTarget['loan_status'].value_counts()
Dataset_withBoolTarget.shape

(376233, 82)

In [5]:
#print(Dataset_withBoolTarget.shape)
dataset=Dataset_withBoolTarget.dropna(thresh = 340000,axis=1) #340000 is minimum number of non-NA values
#print(x.isnull().sum()) #- to check how many null values in all the columns
print(dataset.shape)

(376233, 74)


In [6]:
del_col_names = ["delinq_2yrs",  "last_pymnt_d", "chargeoff_within_12_mths","delinq_amnt","emp_title", "term", "emp_title", "pymnt_plan","purpose","title", "zip_code", "verification_status", "dti","earliest_cr_line", "initial_list_status", "out_prncp",
"pymnt_plan", "num_tl_90g_dpd_24m", "num_tl_30dpd", "num_tl_120dpd_2m", "num_accts_ever_120_pd", "delinq_amnt", 
"chargeoff_within_12_mths", "total_rec_late_fee", "out_prncp_inv", "issue_d"] #deleting some more columns
dataset = dataset.drop(labels = del_col_names, axis = 1) 
dataset.shape

(376233, 52)

In [7]:
features = ['funded_amnt','emp_length','annual_inc','home_ownership','grade',
            "last_pymnt_amnt", "mort_acc",
            "pub_rec", "int_rate", "open_acc","num_actv_rev_tl",
            "mo_sin_rcnt_rev_tl_op","mo_sin_old_rev_tl_op","bc_util","bc_open_to_buy",
            "avg_cur_bal","acc_open_past_24mths",'loan_status'] #selecting final features #'addr_state''tax_liens',
Final_data = dataset[features] #19 features with target var
Final_data["int_rate"] = Final_data["int_rate"].apply(lambda x:float(x[:-1]) ) #reomving % sign, conv to float  - int_rate column
Final_data= Final_data.reset_index(drop=True)
print(Final_data.shape)

(376233, 18)


## Data Transformation


In [8]:
#Data encoding
Final_data['grade'] = Final_data['grade'].map({'A':7,'B':6,'C':5,'D':4,'E':3,'F':2,'G':1})
Final_data["home_ownership"] = Final_data["home_ownership"].map({"MORTGAGE":6,"RENT":5,"OWN":4,"OTHER":3,"NONE":2,"ANY":1})
Final_data["emp_length"] = Final_data["emp_length"].replace({'years':'','year':'',' ':'','<':'','\+':'','n/a':'0'}, regex = True)
Final_data["emp_length"] = Final_data["emp_length"].apply(lambda x:int(x))
Final_data.shape

(376233, 18)

In [11]:
sns.Final_data['annual_inc']

0          55000.0
1          60000.0
2          39600.0
3         130000.0
4          28000.0
5          81500.0
6         102000.0
7          73000.0
8          33000.0
9          98000.0
10         32760.0
11        325000.0
12         63000.0
13        100000.0
14         40000.0
15         88000.0
16         96500.0
17         26000.0
18         90000.0
19        105000.0
20         25000.0
21         98000.0
22        110000.0
23         65000.0
24         70000.0
25         80000.0
26        295000.0
27         60000.0
28         84000.0
29         70000.0
            ...   
376203      7500.0
376204     45600.0
376205    284481.0
376206    113926.0
376207     50000.0
376208    100000.0
376209    210000.0
376210     48000.0
376211    120000.0
376212     68000.0
376213     72787.0
376214     65000.0
376215    128000.0
376216     65000.0
376217     60000.0
376218     57231.0
376219    133000.0
376220     70000.0
376221     99975.0
376222     25000.0
376223     45000.0
376224     2

## Filling Missing values and Feature scaling 


In [9]:
data_clean.isnull().sum()

NameError: name 'data_clean' is not defined

In [None]:
Final_data.fillna(Final_data.mean(),inplace = True)
#HTML(Final_data.tail().to_html())
Final_data.shape

In [None]:
scl = preprocessing.StandardScaler() #instance of preprocessing
fields = Final_data.columns.values[0:-1] 
data_clean = pd.DataFrame(scl.fit_transform(Final_data[fields]), columns = fields)
data_clean['loan_status'] = Final_data['loan_status']
data_clean['loan_status'].value_counts()

In [None]:
loanstatus_0 = data_clean[data_clean["loan_status"]==0]
loanstatus_1 = data_clean[data_clean["loan_status"]==1]
subset_of_loanstatus_0 = loanstatus_0.sample(n=70000)
data_clean = pd.concat([loanstatus_1, subset_of_loanstatus_0])
data_clean = data_clean.sample(frac=1).reset_index(drop=True)
data_clean.shape

In [None]:
dataViz = data_clean[['funded_amnt','emp_length','annual_inc','home_ownership','grade','last_pymnt_amnt','mort_acc','int_rate','open_acc','loan_status']]

sns.set_context(context='notebook')
fig, ax = plt.subplots(figsize=(8,8)) 
corr = dataViz.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.tril_indices_from(mask)] = True

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

sns.heatmap(corr, cmap=cmap,linewidths=1, vmin=-1, vmax=1, square=True, cbar=True, center=0, ax=ax, mask=mask)

## Algorithms


## Random Forest


In [None]:
X_train, X_test, y_train, y_test = train_test_split(data_clean.iloc[:,:-1], data_clean.iloc[:,-1], test_size=0.33, random_state=42)
randomForest = RandomForestClassifier(criterion = "gini")
randomForest.fit(X_train,y_train)
rfPredict = randomForest.predict(X_test)
rfAccuracy = accuracy_score(y_test,rfPredict)
roc_score = metrics.roc_auc_score(y_test,rfPredict)
print(rfAccuracy,roc_score)

In [15]:
confusion_matrix(y_test,rfPredict)

array([[18745,  4312],
       [ 3807, 18082]])

In [21]:
from sklearn.metrics import precision_recall_fscore_support as prf1
prf1(y_test,rfPredict)

(array([ 0.83119014,  0.80744842]),
 array([ 0.81298521,  0.82607702]),
 array([ 0.82198689,  0.8166565 ]),
 array([23057, 21889], dtype=int64))

In [None]:
fig, ax = plt.subplots()
width=0.35
ax.bar(np.arange(len(fields)), randomForest.feature_importances_, width, color='r')
ax.set_xticks(np.arange(len(randomForest.feature_importances_)))
ax.set_xticklabels(X_train.columns.values,rotation=90)
plt.title('Feature Importance from DT')
ax.set_ylabel('Normalized Gini Importance')

In [None]:
import seaborn as sns
sns.set('talk', 'whitegrid', 'dark', font_scale=1, font='Ricty',rc={"lines.linewidth": 2, 'grid.linestyle': '--'})
def plotAUC(truth, pred, lab):
    fpr, tpr, _ = metrics.roc_curve(truth,pred)
    roc_auc = metrics.auc(fpr, tpr)
    lw = 2
    plt.figure(figsize=(6,5))
    plt.plot(fpr, tpr, color='darkorange',lw=lw, label= lab +'(AUC = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve') #Receiver Operating Characteristic 
    plt.legend(loc="lower right")
plotAUC(y_test,rfPredict, 'Random Forest')
plt.show()

average_precision = average_precision_score(y_test, rfPredict)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))

precision, recall, _ = precision_recall_curve(y_test, rfPredict)
plt.figure(figsize=(6,5))
plt.step(recall, precision, color='b', alpha=0.2,where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))

## Logistic Regression


In [None]:
clf_LR = linear_model.LogisticRegression(C=1e30)
clf_LR.fit(X_train,y_train)
LR_Predict = clf_LR.predict(X_test)
LR_Accuracy = accuracy_score(y_test,LR_Predict.round())
print(LR_Accuracy)
plotAUC(y_test,LR_Predict,'Logistic Regression')
confusion_matrix(y_test,LR_Predict)
plt.show()

average_precision = average_precision_score(y_test, LR_Predict)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))

precision, recall, _ = precision_recall_curve(y_test, LR_Predict)
plt.figure(figsize=(6,5))
plt.step(recall, precision, color='b', alpha=0.2,where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))

In [22]:
confusion_matrix(y_test,LR_Predict.round())

array([[17376,  5681],
       [ 1994, 19895]])

In [23]:
prf1(y_test,LR_Predict.round())

(array([ 0.89705731,  0.7778777 ]),
 array([ 0.75361062,  0.90890402]),
 array([ 0.81910104,  0.83830191]),
 array([23057, 21889], dtype=int64))

## Support Vector Machines(SVM)


In [None]:
# clf_svm = svm.SVC(kernel = "linear")
# clf_svm.fit(X_train,y_train)
# predictions_svm = clf_svm.predict(X_test)
# SVM_Accuracy = accuracy_score(y_test,predictions_svm)
# print(SVM_Accuracy)
# plotAUC(y_test,predictions_svm, 'SVM')
# plt.show()

## Cross Validation


## Things to be done before Initial Presentation

2. Get proper reasoning to fill the NULL values.
3. Get proper reasoning for scaling method that has to be applied.
4. Few data visualizations(loan defaulters according to state, correlation plots etc.) - Doing
5. Pick different combinations of features. - Done
7. Check for answers to potential questions.
8. Have to go through "how the company works"
9. Calculate precision and recall for the applied algo - Done

## extra features - acc_now_delinq, pct_tl_nvr_dlq, num_sats, sub_grade
