In [1]:
import pandas as pd

# Read the loan training data from disk, then display its (rows, columns) size.
loan_df = pd.read_csv("Loan_Train.csv")
loan_df.shape

(614, 13)

In [2]:
# Remove the Loan_ID column.
# The result is rebound rather than mutated in place, and errors="ignore"
# makes the cell idempotent: re-running it once Loan_ID is already gone is a
# no-op instead of a KeyError.
loan_df = loan_df.drop(columns=["Loan_ID"], errors="ignore")
loan_df.shape

(614, 12)

In [3]:
# Preview the first five rows of the frame.
loan_df.head(n=5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# Remove every row that contains at least one missing value (this mutates
# loan_df itself), then display the remaining (rows, columns) size.
loan_df.dropna(axis=0, how="any", inplace=True)
loan_df.shape

(480, 12)

In [5]:
# One-hot encode the categorical columns listed below; every other column is
# carried over as-is.
categorical_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
]
loan_df_dummies = pd.get_dummies(data=loan_df, columns=categorical_columns)
loan_df_dummies.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Gender_Female,Gender_Male,Married_No,Married_Yes,...,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
1,4583,1508.0,128.0,360.0,1.0,N,0,1,0,1,...,1,0,0,1,0,1,0,1,0,0
2,3000,0.0,66.0,360.0,1.0,Y,0,1,0,1,...,0,0,0,1,0,0,1,0,0,1
3,2583,2358.0,120.0,360.0,1.0,Y,0,1,0,1,...,0,0,0,0,1,1,0,0,0,1
4,6000,0.0,141.0,360.0,1.0,Y,0,1,1,0,...,0,0,0,1,0,1,0,0,0,1
5,5417,4196.0,267.0,360.0,1.0,Y,0,1,0,1,...,0,1,0,1,0,0,1,0,0,1


In [6]:
# Split the data into a training and test set, where the Loan_Status column is the target.
# Divide the data set into train and test in 80:20 ratio

import sklearn
from sklearn.model_selection import train_test_split
# Separate the Loan_Status label from the predictor columns.
features = loan_df_dummies.drop(columns='Loan_Status')
target = loan_df_dummies['Loan_Status']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
(features_train, features_test,
 target_train, target_test) = train_test_split(
    features, target, test_size=0.2, random_state=1
)

# Report the size of the full data set and of each partition.
for label, part in [
    ("shape of original dataset :", loan_df_dummies),
    ("shape of input - training set", features_train),
    ("shape of output - training set", target_train),
    ("shape of input - testing set", features_test),
    ("shape of output - testing set", target_test),
]:
    print(label, part.shape)

shape of original dataset : (480, 21)
shape of input - training set (384, 20)
shape of output - training set (384,)
shape of input - testing set (96, 20)
shape of output - testing set (96,)


In [7]:
# Load libraries
import numpy as np
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Baseline pipeline: rescale every feature to the [0, 1] range, then classify
# with k-nearest neighbours using k=2 (n_jobs=-1 lets the neighbour search
# run in parallel).
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=2)
pipeline_steps = [("scale", minmax_scale), ("classifier", knn)]
pipe = Pipeline(steps=pipeline_steps)

In [8]:
# Train the baseline pipeline on the training split, then display its mean
# accuracy on the held-out test split (fit returns the pipeline itself).
pipe.fit(X=features_train, y=target_train).score(X=features_test, y=target_test)

0.5729166666666666

In [9]:
# Candidate neighbourhood sizes k = 1..10 for the "classifier" step of the pipeline.
search_space = [{"classifier__n_neighbors": list(range(1, 11))}]

# 5-fold cross-validated grid search over those candidates, fitted on the
# training split.
classifier = GridSearchCV(estimator=pipe, param_grid=search_space, cv=5, verbose=0)
classifier.fit(features_train, target_train)

# Neighbourhood size (k) selected by the search.
classifier.best_params_["classifier__n_neighbors"]

9

In [10]:
# Mean accuracy of the grid search's refit best KNN pipeline on the held-out test split.
classifier.score(X=features_test, y=target_test)

0.65625

In [11]:
# include logistic regression and random forest models  

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Candidate estimators for the pipeline's "classifier" step. Each one is
# wrapped in a one-element list because every value in a GridSearchCV
# parameter grid must be a sequence of options to try.
# random_state is pinned on the two stochastic estimators (random forest, and
# logistic regression with the liblinear solver) so that re-running the search
# reproduces the same winning model and the same scores; the seed matches the
# one used for the train/test split.
knn = [KNeighborsClassifier(n_jobs=-1)]
rf = [RandomForestClassifier(n_jobs=-1, random_state=1)]
logit = [LogisticRegression(solver='liblinear', max_iter=3000, random_state=1)]

# One sub-grid per model family; GridSearchCV tries every combination inside
# each sub-grid and swaps the chosen estimator into the "classifier" step.
search_space2 = [
    # Logistic regression: L1 vs L2 penalty, C on a log scale from 1 to 10,000.
    {
        "classifier": logit,
        "classifier__penalty": ['l1', 'l2'],
        "classifier__C": np.logspace(0, 4, 10),
    },
    # Random forest: number of trees and number of features tried per split.
    {
        "classifier": rf,
        "classifier__n_estimators": [10, 100, 1000],
        "classifier__max_features": [1, 2, 3],
    },
    # K-nearest neighbours: neighbourhood size k = 1..10.
    {
        "classifier": knn,
        "classifier__n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    },
]

# Create grid search (5-fold cross-validation over all three sub-grids)
gridsearch = GridSearchCV(pipe, search_space2, cv=5, verbose=0)

# Fit grid search on the training split; fit returns the fitted search object
best_model = gridsearch.fit(features_train, target_train)

# View the estimator chosen for the "classifier" step of the best pipeline
best_model.best_estimator_.get_params()["classifier"]

In [12]:
# Mean accuracy of the best pipeline found by the multi-model search, on the held-out test split.
best_model.score(X=features_test, y=target_test)

0.7395833333333334

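Of the three model families compared (logistic regression, random forest and k-nearest neighbours), logistic regression performed best. The selected model uses L1 regularization, which penalizes the sum of the absolute values of the weights, with the regularization strength C chosen by the grid search. It reaches an accuracy of almost 74% on the test set.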