## In this exercise, you will work with the Loan_Train.csv dataset which can be downloaded from this link: Loan Approval Data Set. 

In [370]:
#import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split #used to split data into training/test sets
from sklearn.metrics import plot_confusion_matrix
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from yellowbrick.classifier import ConfusionMatrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

## 1. Import the dataset and ensure that it loaded properly.

In [371]:
#load the data into pandas dataframe
df = pd.read_csv('Loan_Train.csv') 
df.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [372]:
# check shape of df
df.shape

(614, 13)

In [373]:
# Create a new column that converts the class to a coded class of 1 and 0 (Y=1 and N=0) for target in model
df['Loan_Status_Coded'] = df['Loan_Status'].replace(to_replace=["Y","N"], value=[1,0])

In [374]:
# View dataframe
df.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Loan_Status_Coded
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,1
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,1
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,1


In [375]:
# check shape of df to ensure columns added
df.shape

(614, 14)

## 2. Prepare the data for modeling by performing the following steps:

### Drop the column “Load_ID.”

In [376]:
df = df.drop(['Loan_ID'], axis=1)
df.head(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Loan_Status_Coded
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y,1
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,0
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,1
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,1
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,1


In [377]:
# check shape of df to ensure columns dropped
df.shape

(614, 13)

### Drop any rows with missing data.

In [378]:
# drop rows with missing data using dropna() function  
df = df.dropna()

In [379]:
# check shape of df to ensure rows dropped
df.shape

(480, 13)

In [380]:
# check columns
df.head(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Loan_Status_Coded
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,0
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,1
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,1
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,1
5,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y,1


### Convert the categorical features into dummy variables.

In [381]:
# Summarize the categorical columns in the dataframe
catCols = df.select_dtypes("object").columns
catCols= list(set(catCols))
catCols

['Gender',
 'Married',
 'Self_Employed',
 'Property_Area',
 'Education',
 'Loan_Status',
 'Dependents']

In [382]:
# Create dummy variables for the origin column
df = pd.concat([df.drop(['Self_Employed',
 'Property_Area',
 'Dependents',
 'Gender',
 'Married',
 'Education',
 'Loan_Status'], axis=1), pd.get_dummies(df[['Self_Employed',
 'Property_Area',
 'Dependents',
 'Gender',
 'Married',
 'Education',
 'Loan_Status']])], axis=1)

In [383]:
# View the dataframe to check
df.head(5)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status_Coded,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,...,Dependents_2,Dependents_3+,Gender_Female,Gender_Male,Married_No,Married_Yes,Education_Graduate,Education_Not Graduate,Loan_Status_N,Loan_Status_Y
1,4583,1508.0,128.0,360.0,1.0,0,1,0,1,0,...,0,0,0,1,0,1,1,0,1,0
2,3000,0.0,66.0,360.0,1.0,1,0,1,0,0,...,0,0,0,1,0,1,1,0,0,1
3,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,...,0,0,0,1,0,1,0,1,0,1
4,6000,0.0,141.0,360.0,1.0,1,1,0,0,0,...,0,0,0,1,1,0,1,0,0,1
5,5417,4196.0,267.0,360.0,1.0,1,0,1,0,0,...,1,0,0,1,0,1,1,0,0,1


In [384]:
# check shape of df to ensure columns added
df.shape

(480, 23)

In [385]:
# look at column type of each column
df.dtypes

ApplicantIncome              int64
CoapplicantIncome          float64
LoanAmount                 float64
Loan_Amount_Term           float64
Credit_History             float64
Loan_Status_Coded            int64
Self_Employed_No             uint8
Self_Employed_Yes            uint8
Property_Area_Rural          uint8
Property_Area_Semiurban      uint8
Property_Area_Urban          uint8
Dependents_0                 uint8
Dependents_1                 uint8
Dependents_2                 uint8
Dependents_3+                uint8
Gender_Female                uint8
Gender_Male                  uint8
Married_No                   uint8
Married_Yes                  uint8
Education_Graduate           uint8
Education_Not Graduate       uint8
Loan_Status_N                uint8
Loan_Status_Y                uint8
dtype: object

## 3. Split the data into a training and test set, where the “Loan_Status” column is the target.

In [386]:
# split the data into a training and test set
X = df.drop(['Loan_Status_Coded', 'Loan_Status_N', 'Loan_Status_Y'], axis = 1) # drop the target variables related to Loan_Status
# get the target
y = df['Loan_Status_Coded'] # (Y=1 and N=0)
#split the data into training and test sets (80% Training/20% Test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

#reset indices in the training and test sets to prevent pandas slicing warnings
X_train = X_train.reset_index(drop = True) #drop + True drops the previous index
X_test = X_test.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

In [387]:
# show the sizes of the training and test sets
print(X_train.shape)
print(X_test.shape)
# for Loan Status: (Y=1 and N=0)
print(y_train.value_counts())
print(y_test.value_counts())
#view the training features
X_train.head()

(384, 20)
(96, 20)
1    269
0    115
Name: Loan_Status_Coded, dtype: int64
1    63
0    33
Name: Loan_Status_Coded, dtype: int64


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Gender_Female,Gender_Male,Married_No,Married_Yes,Education_Graduate,Education_Not Graduate
0,5819,5000.0,120.0,360.0,1.0,1,0,1,0,0,0,0,1,0,0,1,0,1,1,0
1,1800,2934.0,93.0,360.0,0.0,1,0,0,0,1,1,0,0,0,0,1,0,1,0,1
2,4166,0.0,116.0,360.0,0.0,1,0,0,1,0,1,0,0,0,1,0,1,0,1,0
3,4124,0.0,115.0,360.0,1.0,1,0,0,1,0,1,0,0,0,1,0,1,0,1,0
4,1993,1625.0,113.0,180.0,1.0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,1


## 4. Create a pipeline with a min-max scaler and a KNN classifier (see section 15.3 in the Machine Learning with Python Cookbook).

In [388]:
# load libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing

In [389]:
# Create minmax scaler
minmax_scale=preprocessing.MinMaxScaler(feature_range=(0,1))

In [390]:
# Create KNN Classifier
knn=KNeighborsClassifier(n_neighbors=5, n_jobs=-1)

In [391]:
# Create pipeline
pipe= Pipeline([("scaler", minmax_scale), ("knn", knn)])

## 5. Fit a default KNN classifier to the data with this pipeline. Report the model accuracy on the test set. Note: Fitting a pipeline model works just like fitting a regular model.

In [392]:
# Fit a default KNN classifier to the data with the pipeline
pipe.fit(X_train, y_train) 

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('knn', KNeighborsClassifier(n_jobs=-1))])

In [393]:
# Predict Output
y_pred = pipe.predict(X_test)
y_pred

array([1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1], dtype=int64)

In [394]:
# Report model accuracy on the test set

In [395]:
pipe.score(X_test,y_test)

0.6354166666666666

## 6. Create a search space for your KNN classifier where your “n_neighbors” parameter varies from 1 to 10. (see section 15.3 in the Machine Learning with Python Cookbook).

In [396]:
# Create space of candidate values
search_space = [{"knn__n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}]

## 7. Fit a grid search with your pipeline, search space, and 5-fold cross-validation to find the best value for the “n_neighbors” parameter.

In [397]:
# Create standardizer
standardizer = StandardScaler()

In [398]:
# Standardize features
X_standardized = standardizer.fit_transform(X)

In [399]:
# Create grid search
classifier = GridSearchCV(pipe, search_space, cv=5, verbose=0).fit(X_standardized, y)

In [400]:
# Best neighborhood size (k)
classifier.best_estimator_.get_params()["knn__n_neighbors"]

3

## 8. Find the accuracy of the grid search best model on the test set. Note: It is possible that this will not be an improvement over the default model, but likely it will be.

In [401]:
print("KNN tuned hpyerparameters :(best parameters) ",classifier.best_params_)
print("KNN accuracy :",classifier.best_score_)

KNN tuned hpyerparameters :(best parameters)  {'knn__n_neighbors': 3}
KNN accuracy : 0.7229166666666667


## 9. Now, repeat steps 6 and 7 with the same pipeline, but expand your search space to include logistic regression and random forest models with the hyperparameter values in section 12.3 of the Machine Learning with Python Cookbook.

In [418]:
# Re-create pipeline with KNN hyperparameters
pipe= Pipeline([('standardizer', StandardScaler()), ("classifier", KNeighborsClassifier(n_neighbors=5, n_jobs=-1))])

In [419]:
# Expand search_space to include logistic regression and random forest models
# Create dictionary with candidate learning algorithms and their hyperparameters
search_space=[
    {'classifier' : [KNeighborsClassifier()], 
     'classifier__n_neighbors' : [1,2,3,4,5,6,7,8,9,10]},
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(0, 4, 10),
    'classifier__solver' : ['liblinear']},
    {'classifier' : [RandomForestClassifier()],
    'classifier__n_estimators' : [10,100,1000],
    'classifier__max_features' : [1, 2, 3]}
]


In [420]:
# Create grid search
gridsearch=GridSearchCV(pipe, search_space, cv = 5, verbose=0)

In [421]:
# Fit grid search
best_model=gridsearch.fit(X_train, y_train)

## 10. What are the best model and hyperparameters found in the grid search? Find the accuracy of this model on the test set.

In [422]:
# Best neighborhood size (k)
best_model.best_estimator_.get_params()
## The best model is the Logistic Regression Model with the hyperparameters listed below

{'memory': None,
 'steps': [('standardizer', StandardScaler()),
  ('classifier', LogisticRegression(penalty='l1', solver='liblinear'))],
 'verbose': False,
 'standardizer': StandardScaler(),
 'classifier': LogisticRegression(penalty='l1', solver='liblinear'),
 'standardizer__copy': True,
 'standardizer__with_mean': True,
 'standardizer__with_std': True,
 'classifier__C': 1.0,
 'classifier__class_weight': None,
 'classifier__dual': False,
 'classifier__fit_intercept': True,
 'classifier__intercept_scaling': 1,
 'classifier__l1_ratio': None,
 'classifier__max_iter': 100,
 'classifier__multi_class': 'auto',
 'classifier__n_jobs': None,
 'classifier__penalty': 'l1',
 'classifier__random_state': None,
 'classifier__solver': 'liblinear',
 'classifier__tol': 0.0001,
 'classifier__verbose': 0,
 'classifier__warm_start': False}

In [423]:
# Fit grid search
best_model=gridsearch.fit(X_test, y_test)

In [424]:
print("Best Model accuracy:",best_model.best_score_)

Best Model accuracy: 0.7605263157894737


## 11. Summarize your results.

#### The Logistic Regression with the hyperparameters above is the best model to use for this dataset.