Final Assignment - IBM Machine Learning with Python
by Jorge Virgilio de Almeida

Objective:
Load a historical dataset from previous loan applications, clean the data, and apply different classification algorithm on the data.

In [249]:
#Load Libraries
import itertools
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import pandas as pd
import numpy as np
import matplotlib.ticker as ticker
from sklearn import preprocessing
%matplotlib inline

In [250]:
#Obtain data from internet
path='https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_train.csv'

df=pd.read_csv(path) #we can read it direct form internet

df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,age,education,Gender
0,0,0,PAIDOFF,1000,30,9/8/2016,10/7/2016,45,High School or Below,male
1,2,2,PAIDOFF,1000,30,9/8/2016,10/7/2016,33,Bechalor,female
2,3,3,PAIDOFF,1000,15,9/8/2016,9/22/2016,27,college,male
3,4,4,PAIDOFF,1000,30,9/9/2016,10/8/2016,28,college,female
4,6,6,PAIDOFF,1000,30,9/9/2016,10/8/2016,29,college,male


In [251]:
df.to_csv('loan.csv',index_label=False) #or we can save it first

df_local=pd.read_csv('loan.csv') #and the read it from the local csv file

df_local.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,age,education,Gender
0,0,0,PAIDOFF,1000,30,9/8/2016,10/7/2016,45,High School or Below,male
1,2,2,PAIDOFF,1000,30,9/8/2016,10/7/2016,33,Bechalor,female
2,3,3,PAIDOFF,1000,15,9/8/2016,9/22/2016,27,college,male
3,4,4,PAIDOFF,1000,30,9/9/2016,10/8/2016,28,college,female
4,6,6,PAIDOFF,1000,30,9/9/2016,10/8/2016,29,college,male


For simplicity's sake, let's consider the case data is read directly from internet.

Prior to classfication, we must preprocess the data by:

1) Converting data to the appropriate object format
1) Eliminating columns who are not significant for prediction
2) Converting the categorical features into numerical quantities
4) Creating a normalized set of features to predict the loan status

In [252]:
#Convert effective_date and due_date in data time object
df['effective_date'] = pd.to_datetime(df['effective_date'])
df['due_date'] = pd.to_datetime(df['due_date'])



In [253]:
#Removing useless columns Unnamed: 0.1, Unnamed: 0 and due_date

df= df.drop(['Unnamed: 0.1','Unnamed: 0','due_date'],axis=1)
df.head()

Unnamed: 0,loan_status,Principal,terms,effective_date,age,education,Gender
0,PAIDOFF,1000,30,2016-09-08,45,High School or Below,male
1,PAIDOFF,1000,30,2016-09-08,33,Bechalor,female
2,PAIDOFF,1000,15,2016-09-08,27,college,male
3,PAIDOFF,1000,30,2016-09-09,28,college,female
4,PAIDOFF,1000,30,2016-09-09,29,college,male


In [254]:
#Transforming effective_date and due_date into binary variable weekend

#Convert effective_date into days of the week: 0 to 6, Monday=0, Sunday=6
df['dayofweek'] = df['effective_date'].dt.dayofweek

#Dummy variable for weekend
df['weekend'] = df['dayofweek'].apply(lambda x: 1 if (x>4)  else 0) #Weekend x=5 or x=6
df.head()

#Remove effective_date and dayofweek
df = df.drop(columns=['effective_date','dayofweek'])
df.head()


Unnamed: 0,loan_status,Principal,terms,age,education,Gender,weekend
0,PAIDOFF,1000,30,45,High School or Below,male,0
1,PAIDOFF,1000,30,33,Bechalor,female,0
2,PAIDOFF,1000,15,27,college,male,0
3,PAIDOFF,1000,30,28,college,female,0
4,PAIDOFF,1000,30,29,college,male,0


In [255]:
#Convert categorical features

#Conversion of gender into dummy variables by direct replacement
df['Gender'].replace(to_replace=['male','female'], value=[0,1],inplace=True)

#Conversion of education into numerical values using one hot encoding
df = pd.concat([df,pd.get_dummies(df['education'])], axis=1)
df = df.drop(columns=['education'])
df.head()

Unnamed: 0,loan_status,Principal,terms,age,Gender,weekend,Bechalor,High School or Below,Master or Above,college
0,PAIDOFF,1000,30,45,0,0,0,1,0,0
1,PAIDOFF,1000,30,33,1,0,1,0,0,0
2,PAIDOFF,1000,15,27,0,0,0,0,0,1
3,PAIDOFF,1000,30,28,1,0,0,0,0,1
4,PAIDOFF,1000,30,29,0,0,0,0,0,1


In [256]:
#Set an array of the normalized features for loan status prediction
X = df[['Principal','terms','age','Gender','Bechalor','High School or Below','Master or Above','college']]
X = preprocessing.StandardScaler().fit(X).transform(X)
X


array([[ 0.51578458,  0.92071769,  2.33152555, ...,  1.13639374,
        -0.07624929, -0.86968108],
       [ 0.51578458,  0.92071769,  0.34170148, ..., -0.87997669,
        -0.07624929, -0.86968108],
       [ 0.51578458, -0.95911111, -0.65321055, ..., -0.87997669,
        -0.07624929,  1.14984679],
       ...,
       [-1.31458942, -0.95911111,  1.33661351, ..., -0.87997669,
        -0.07624929,  1.14984679],
       [ 0.51578458,  0.92071769, -0.48739188, ..., -0.87997669,
        -0.07624929,  1.14984679],
       [ 0.51578458,  0.92071769, -0.81902922, ..., -0.87997669,
        -0.07624929,  1.14984679]])

In [257]:
#finally we set an array of loan status
y = df['loan_status'].values
y

array(['PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF',
       'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF',
       'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF',
       'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF',
       'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF',
       'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF',
       'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF',
       'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF',
       'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF',
       'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF',
       'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF',
       'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF',
       'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF',
       'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 'PAIDOFF', 

Now the data is correctly preprocessed, we can apply the different classification algorithms to X in order to create models for the loan status prediction

Classification Algorithm 1: k-Nearest Neighbour

In [258]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (276, 8) (276,)
Test set: (70, 8) (70,)


In [259]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

Ks = 12
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
ConfustionMx = [];
for n in range(1,Ks):
    
    #Model Training and Load Status Prediction 
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
    yhat=neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)

    
    std_acc[n-1]=np.std(yhat==y_test)/np.sqrt(yhat.shape[0])

print("The best accuracy was with", mean_acc.max(), "with k = ", mean_acc.argmax()+1)

The best accuracy was with 0.7714285714285715 with k =  5


In [260]:
k = 5
neigh5 = KNeighborsClassifier(n_neighbors = k).fit(X_train, y_train)
yhat5 = neigh5.predict(X_test)
print("Accuracy: ", metrics.accuracy_score(y_test, yhat5))

Accuracy:  0.7714285714285715


Classification Algorithm 2: Decision Tree

In [261]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

modTree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
modTree.fit(X_train, y_train)
predTree = modTree.predict(X_test)
print("predited: ", predTree[0:5])
print("test set: ", y_test[0:5])
print("Accuracy: ", metrics.accuracy_score(y_test, predTree))

predited:  ['PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF']
test set:  ['PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF']
Accuracy:  0.7857142857142857


Classification Algorithm 3: Support Vector Machine

In [262]:
from sklearn.svm import SVC

clf = SVC(gamma = 'auto')
clf.fit(X_train, y_train)
clf_pred = clf.predict(X_test)
print("Predicted: ", metrics.accuracy_score(y_test, clf_pred))

Predicted:  0.7857142857142857


Classification Algorithm 4: Logistic Regression

In [263]:
# import require libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,y_train)

# predictions and probabilities
LR_yhat = LR.predict(X_test)
yhat_proba = LR.predict_proba(X_test)
print("predicted: ", yhat[0:5])
print("test set: ", y_test[0:5])
print("proba: ", yhat_proba[0:5])
print("Accuracy: ", accuracy_score(y_test, LR_yhat))

predicted:  ['PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'COLLECTION' 'PAIDOFF']
test set:  ['PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF' 'PAIDOFF']
proba:  [[0.4351017  0.5648983 ]
 [0.38272792 0.61727208]
 [0.40002151 0.59997849]
 [0.44830788 0.55169212]
 [0.42327858 0.57672142]]
Accuracy:  0.7857142857142857


Model Evaluation using Test Set

In [264]:
#First we download the test set

path='https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_test.csv'

test_df=pd.read_csv(path) #we can read it direct form internet

test_df.to_csv('loan_test.csv',index_label=False) #or we can save it first

#test_df_local=pd.read_csv('loan_test.csv') #In case local csv is preferred

y_true = test_df['loan_status'].values

In [265]:
algorithm = ['KNN', 'Decision Tree', 'SVM', 'LogisticRegression']

knn = yhat5[0:54]
Dtree = predTree[0:54]
LogReg = LR_yhat[0:54]
svm_pred = clf_pred[0:54]
yhat_proba = yhat_proba[0:54]
model_predictions = [knn, Dtree, svm_pred, LogReg]

scores = []
for i in range(4):
            scores.append([
                metrics.accuracy_score(y_true, model_predictions[i]), #jaccard
                metrics.f1_score(y_true, model_predictions[i], average = 'weighted') #f1-score
               # metrics.log_loss(y_true, yhat_proba[0:54])
            ])

logLoss = [
    'NA',
    'NA',
    'NA',
    metrics.log_loss(y_true, yhat_proba)
]

df_score = pd.DataFrame(scores, index = algorithm, columns = ['Jaccard', 'F1-score'])
df_score['LogLoss'] = logLoss
df_score

Unnamed: 0,Jaccard,F1-score,LogLoss
KNN,0.685185,0.602361,
Decision Tree,0.740741,0.630418,
SVM,0.740741,0.630418,
LogisticRegression,0.740741,0.630418,0.615586
