In [18]:
"""Install relevant libraries"""
import itertools
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import pandas as pd
import numpy as np
import matplotlib.ticker as ticker
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import jaccard_score, f1_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import sklearn.tree as tree
from sklearn.linear_model import LogisticRegression
from sklearn import svm
%matplotlib inline

In [3]:
"""Retreive datasets"""
!wget -O loan_train.csv https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/FinalModule_Coursera/data/loan_train.csv
!wget -O loan_test.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_test.csv

--2022-06-24 06:22:59--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/FinalModule_Coursera/data/loan_train.csv
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23101 (23K) [text/csv]
Saving to: ‘loan_train.csv’


2022-06-24 06:23:00 (95.7 MB/s) - ‘loan_train.csv’ saved [23101/23101]

--2022-06-24 06:23:02--  https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/loan_test.csv
Resolving s3-api.us-geo.objectstorage.softlayer.net (s3-api.us-geo.objectstorage.softlayer.net)... 67.228.254.196
Connecting to s3-api.us-geo.objectstorage.softlayer.net (s3-api.us-

In [4]:
"""Load in and clean data for further use"""
def dataLoader(csvfile): 
    """Takes csvfile and processes it in the manner previously used in the test case"""
    print("Loading: {}".format(csvfile))
    df = pd.read_csv(csvfile)
    df['due_date'] = pd.to_datetime(df['due_date'])
    df['effective_date'] = pd.to_datetime(df['effective_date'])
    df['dayofweek'] = df['effective_date'].dt.dayofweek
    df['weekend'] = df['dayofweek'].apply(lambda x: 1 if (x>3)  else 0)
    df.groupby(['Gender'])['loan_status'].value_counts(normalize=True)
    df['Gender'].replace(to_replace=['male','female'], value=[0,1],inplace=True)
    df.groupby(['education'])['loan_status'].value_counts(normalize=True)
    Feature = df[['Principal','terms','age','Gender','weekend']]
    Feature = pd.concat([Feature,pd.get_dummies(df['education'])], axis=1)
    Feature.drop(['Master or Above'], axis = 1,inplace=True)
    X = Feature
    y = df['loan_status'].values
    X= preprocessing.StandardScaler().fit(X).transform(X)
    print("Finished Loading: {}".format(csvfile))
    return df, X, y

train_df, train_X, train_y = dataLoader('loan_train.csv')
test_df, test_X, test_y = dataLoader('loan_test.csv')
    

Loading: loan_train.csv
Finished Loading: loan_train.csv
Loading: loan_test.csv
Finished Loading: loan_test.csv


In [7]:
"""K Nearest Neighbour - data split"""
X_train_k, X_test_k, y_train_k, y_test_k = train_test_split(train_X, train_y, test_size = 0.33, random_state = 4)


Train set sizes: (231, 8) and (231,)
Test set sizes: (115, 8) and (115,)


In [8]:
"""K Nearest Neighbour - identifying best k"""

Ks = 10
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))

for n in range(1,Ks):
    
    #Train Model and Predict  
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train_k,y_train_k)
    yhat=neigh.predict(X_test_k)
    mean_acc[n-1] = metrics.accuracy_score(y_test_k, yhat)

    
    std_acc[n-1]=np.std(yhat==y_test_k)/np.sqrt(yhat.shape[0])

print("Highest accuracy occurs at k = {}".format(mean_acc.argmax()))


Highest accuracy occurs at k = 6


In [9]:
"""K nearest neighbour - best k on test set"""
kbest = mean_acc.argmax()
neigh = KNeighborsClassifier(n_neighbors = kbest).fit(test_X,test_y)
yhat_KNN = neigh.predict(test_X)

In [10]:
"""Decision Tree"""

Tree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
Tree.fit(train_X, train_y)
yhat_Tree = Tree.predict(test_X)

In [13]:
"""Support Vector Machine"""
clf = svm.SVC(kernel='rbf')
clf.fit(train_X, train_y) 
yhat_SVM = clf.predict(test_X)

In [14]:
"""Logistic Regression"""
LR = LogisticRegression(C=0.01, solver='liblinear').fit(train_X, train_y)
yhat_LR = LR.predict(test_X)

In [17]:
"""Evaluation for the different methods using various metrics"""
for (yh, method) in zip((yhat_KNN, yhat_Tree, yhat_SVM, yhat_LR), ('KNN', 'Tree', 'SVM', 'LR')): 
    j = jaccard_score(test_y, yh, pos_label = "COLLECTION")
    f = f1_score(test_y, yh, pos_label = "COLLECTION")
    llty = np.zeros_like(test_y, dtype = float)
    posMask = test_y == 'COLLECTION'
    llty[posMask] = 1
    llyhkn = np.zeros_like(yh, dtype = float)
    posMask = yh == 'COLLECTION'
    llyhkn[posMask] = 1
    l = log_loss(llty, llyhkn)
    print("Method \t Jaccard \t F1 Score \t Log Loss \n")
    print("{} \t {} \t {} \t {} \n".format(method, j, f, l))

Method 	 Jaccard 	 F1 Score 	 Log Loss 

KNN 	 0.47619047619047616 	 0.6451612903225806 	 7.03578032455609 

Method 	 Jaccard 	 F1 Score 	 Log Loss 

Tree 	 0.2 	 0.3333333333333333 	 7.675298450673303 

Method 	 Jaccard 	 F1 Score 	 Log Loss 

SVM 	 0.0 	 0.0 	 9.594119361501672 

Method 	 Jaccard 	 F1 Score 	 Log Loss 

LR 	 0.0 	 0.0 	 8.954497583865733 



In [None]:
"KNN is the best method"