# Home Loan Default Prediction

__Columns chosen as features are__

* TD013,AP004,TD014,TD023,AP003
* A few columns were manually removed because they had a lot of missing data and on the remaining dataset Pearson's correlation coefficient was used to choose the features with highest(positive or negative) correlation. 
* The same set of features chosen above is used in all the three machine learning algorithms.
* Machine Learning algorithms used are: 1)Logistic Regression 2)KNN 3)Naive Bayes Classifier

In [1]:
import pandas as pd
import numpy as np
from sklearn import neighbors, preprocessing, model_selection

In [2]:
loandf = pd.read_csv('LoanDefaultData.csv')

In [3]:
import pandas as pd
from sklearn import preprocessing
#feature_loan=loandf[['TD013','AP004','TD014','TD023','AP003','TD024','MB005','PA028']]
#feature_loan=loandf[['AP002','AP003','AP004','CR009']] 
loandf=loandf[['TD013','AP004','TD014','TD023','AP003','loan_default']]
loandf=loandf.dropna(axis=0, how='any')
feature_loan=loandf[['TD013','AP004','TD014','TD023','AP003']]
y=loandf[['loan_default']].values
X=feature_loan

In [4]:
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42,test_size=0.3)

# Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
logmodel = LogisticRegression(solver='lbfgs')
logmodel.fit(Xtrain, ytrain)
predictions = logmodel.predict(Xtrain)
print(classification_report(ytrain, predictions))

  return f(**kwargs)


              precision    recall  f1-score   support

           0       0.80      1.00      0.89     40476
           1       0.46      0.02      0.03     10159

    accuracy                           0.80     50635
   macro avg       0.63      0.51      0.46     50635
weighted avg       0.73      0.80      0.72     50635



In [6]:
from sklearn.metrics import accuracy_score
accuracy_score(ytrain, predictions)

0.7988150488792337

# Naive Bayes

In [7]:
from sklearn.naive_bayes import GaussianNB 
model = GaussianNB()                       
model.fit(Xtrain, ytrain)
y_predicted = model.predict(Xtest)

  return f(**kwargs)


In [8]:
from sklearn.metrics import accuracy_score
accuracy_score(ytest, y_predicted)

0.7680858907013178

In [9]:
from sklearn.metrics import classification_report
print(classification_report(ytest, y_predicted))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86     17387
           1       0.35      0.19      0.24      4315

    accuracy                           0.77     21702
   macro avg       0.58      0.55      0.55     21702
weighted avg       0.72      0.77      0.74     21702



# KNN

In [10]:
knn = neighbors.KNeighborsClassifier(n_neighbors=14)
knn.fit(Xtrain, ytrain)
accuracy = knn.score(Xtest, ytest)
y_pred = knn.predict(Xtest)
from sklearn.metrics import accuracy_score
accuracy_score(ytest, y_pred)

  knn.fit(Xtrain, ytrain)


0.7847663809787117

In [11]:

from sklearn.metrics import classification_report

print(classification_report(ytest,y_pred))


              precision    recall  f1-score   support

           0       0.80      0.97      0.88     17387
           1       0.25      0.04      0.07      4315

    accuracy                           0.78     21702
   macro avg       0.52      0.50      0.47     21702
weighted avg       0.69      0.78      0.72     21702



__Findings__ : Here we observe that F1 score values are not as expected. The relative contribution of precision and recall is unequal. This is because of the imbalanced nature of the dataset.

__Solution__ : One of the ways dataset can be balanced is by using Resampling Techniques. Here as we can see the F1 score for 0 is very high than that of 1, showing non-defaulters are at majority. To solve this imbalance, we do Undersampling.

# Undersampling Majority class

In [12]:
target = loandf[['loan_default']]
X = pd.concat([feature_loan, target], axis=1)
X

Unnamed: 0,TD013,AP004,TD014,TD023,AP003,loan_default
0,14,12,2,9.0,1,1
1,3,12,2,6.0,1,0
2,9,12,1,9.0,4,0
4,2,12,1,3.0,1,0
5,6,12,2,12.0,1,1
...,...,...,...,...,...,...
79994,10,12,4,21.0,3,1
79995,8,12,3,0.0,1,0
79996,2,12,0,3.0,4,0
79997,2,12,2,0.0,1,0


In [13]:
from sklearn.utils import resample

not_default = X[X.loan_default==0]
default = X[X.loan_default==1]

notdefault_downsampled = resample(not_default,
                                replace = False, # sample without replacement
                                n_samples = len(default), # match minority n
                                random_state = 27) # reproducible results
downsampled = pd.concat([notdefault_downsampled, default])
downsampled

Unnamed: 0,TD013,AP004,TD014,TD023,AP003,loan_default
24297,5,12,3,3.0,3,0
76057,5,12,3,0.0,3,0
38531,3,12,1,3.0,1,0
46905,2,12,0,3.0,4,0
59626,2,12,1,0.0,1,0
...,...,...,...,...,...,...
79984,7,12,3,6.0,1,1
79988,10,12,5,6.0,3,1
79990,7,12,3,3.0,1,1
79994,10,12,4,21.0,3,1


In [14]:
target=downsampled['loan_default']
feature=downsampled.drop('loan_default',axis = 1)

In [15]:
# Normalizing the values
xn=(feature-feature.min())/(feature.max()-feature.min())
xn.replace(np.nan, -9999, inplace=True)
xn

Unnamed: 0,TD013,AP004,TD014,TD023,AP003
24297,0.096154,1.0,0.069767,0.1,0.4
76057,0.096154,1.0,0.069767,0.0,0.4
38531,0.057692,1.0,0.023256,0.1,0.0
46905,0.038462,1.0,0.000000,0.1,0.6
59626,0.038462,1.0,0.023256,0.0,0.0
...,...,...,...,...,...
79984,0.134615,1.0,0.069767,0.2,0.0
79988,0.192308,1.0,0.116279,0.2,0.4
79990,0.134615,1.0,0.069767,0.1,0.0
79994,0.192308,1.0,0.093023,0.7,0.4


In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(xn,target,test_size=0.3,random_state=48)


# Logistic Regression

In [17]:
logmodel = LogisticRegression(solver='lbfgs')
logmodel.fit(X_train, y_train)
predictions = logmodel.predict(X_train)

from sklearn.metrics import classification_report
print(classification_report(y_train, predictions))

              precision    recall  f1-score   support

           0       0.61      0.60      0.60     10133
           1       0.60      0.61      0.61     10130

    accuracy                           0.61     20263
   macro avg       0.61      0.61      0.61     20263
weighted avg       0.61      0.61      0.61     20263



# Naive Bayes

In [18]:
from sklearn.naive_bayes import GaussianNB # 1. choose model class

model = GaussianNB()                       # 2. instantiate model
model.fit(X_train, y_train)

GaussianNB()

In [19]:
y_model = model.predict(X_test)
# y_model = model.predict(X_train)

In [20]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_model)
#accuracy_score(y_train, y_model)

0.6039147956246402

In [21]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_model))
# print(classification_report(y_train, y_model))


              precision    recall  f1-score   support

           0       0.62      0.53      0.57      4341
           1       0.59      0.68      0.63      4344

    accuracy                           0.60      8685
   macro avg       0.61      0.60      0.60      8685
weighted avg       0.61      0.60      0.60      8685



# KNN

In [22]:
knn = neighbors.KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
accuracy = knn.score(X_test, y_test)
accuracy


0.5635002878526194

In [23]:
y_pred = knn.predict(X_test)
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.56      0.55      0.56      4341
           1       0.56      0.57      0.57      4344

    accuracy                           0.56      8685
   macro avg       0.56      0.56      0.56      8685
weighted avg       0.56      0.56      0.56      8685



After balancing the dataset we find that, f1 score as well as model has improved for all the three algorithms.
By the choice of the features I have selected here and the choice of K-value, Logistic regression shows comparably better performance for this dataset.