In [1]:
# 9.2 Exercise
# Best Model Selection and Hyperparameter Tuning
## Justin Wisniewski

In [2]:
# Load necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Read in the Loan Approval dataset
df=pd.read_csv("Loan_train.csv")

In [4]:
# Display the first 5 rows of data
df.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
# Dropping Loan_ID column
df = df.drop(['Loan_ID'], axis=1)
df.shape

(614, 12)

In [6]:
df.dropna(inplace=True)
df.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [7]:
# Convert the categorical columns to dummy variables
df = pd.get_dummies(df, drop_first=True)

In [8]:
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
1,4583,1508.0,128.0,360.0,1.0,1,1,1,0,0,0,0,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,1,0,0,0,0,1,0,1,1
3,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,1,0,0,1,1
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1
5,5417,4196.0,267.0,360.0,1.0,1,1,0,1,0,0,1,0,1,1


In [9]:
# Split the data into a training and test set, where the Loan_Status column is the target
X = df.drop(['Loan_Status_Y'], axis=1)
y = df['Loan_Status_Y']

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [11]:
# Create a pipeline with a min-max scaler and a KNN classifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

pipe = Pipeline([('scaler', MinMaxScaler()), ('knn', KNeighborsClassifier())])
pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', MinMaxScaler()), ('knn', KNeighborsClassifier())])

In [12]:
# Accuracy of the model
print(pipe.score(X_test, y_test))

0.7222222222222222


In [13]:
# Create a search space for your KNN classifier where your “n_neighbors” parameter varies from 1 to 10
# Use the GridSearchCV class to find the best value for your “n_neighbors” parameter
from sklearn.model_selection import GridSearchCV
pipe = Pipeline([('scaler', MinMaxScaler()), ('knn', KNeighborsClassifier())])
param_grid = {'knn__n_neighbors': range(1, 11)}
classifier = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, verbose=1).fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [14]:
# Best value for n_neighbors
classifier.best_params_

{'knn__n_neighbors': 3}

In [15]:
# Accuracy of the grid search best model on the test set.
classifier.best_score_

0.7679543459174715

In [16]:
# Repeat steps 6 and 7 with the same pipeline, but expand your search space to include logistic regression and random forest models with the hyperparameter.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

pipe = Pipeline([('scaler', MinMaxScaler()), ('classifier', KNeighborsClassifier())])

search_space = [{
    "classifier": [LogisticRegression()],
    "classifier__penalty": ['l2'],
    "classifier__C": np.logspace(0, 4, 10)
}, {
    "classifier": [RandomForestClassifier()],
    "classifier__n_estimators": [10, 100, 1000],
    "classifier__max_features": [1, 2, 3]
}, {
    "classifier": [KNeighborsClassifier()],
    "classifier__n_neighbors": range(1, 11),
    "classifier__weights": ['uniform', 'distance']
}]
gridsearch = GridSearchCV(pipe, search_space, cv=5, verbose=0, n_jobs=-1)
gridsearch.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', MinMaxScaler()),
                                       ('classifier', KNeighborsClassifier())]),
             n_jobs=-1,
             param_grid=[{'classifier': [LogisticRegression()],
                          'classifier__C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]),
                          'classifier__penalty': ['l2']},
                         {'classifier': [RandomForestClassifier(max_features=3,
                                                                n_estimators=1000)],
                          'classifier__max_features': [1, 2, 3],
                          'classifier__n_estimators': [10, 100, 1000]},
                         {'classifier': [KNeighborsClassifier()],
                          'classifier__n_neighbors': range(1, 11),
                          

In [17]:
# Best model from grid search
best_model = gridsearch.fit(X_test, y_test)
best_model.best_params_

{'classifier': LogisticRegression(),
 'classifier__C': 1.0,
 'classifier__penalty': 'l2'}