# Model Selection & Hyperparameter Tuning
#### Joshua Greenert
#### DSC550-T301 Data Mining
#### 10/24/2022

In [50]:
# Import the dataset and ensure that it loaded properly
import pandas as pd
import numpy as np

df_loan = pd.read_csv('Loan_Train.csv')
df_loan.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [51]:
# Prepare the data for modeling by performing the following steps
#  Drop the column “Load_ID.”
df_loan = df_loan.drop('Loan_ID', axis = 1)

#  Drop any rows with missing data.
df_loan = df_loan.dropna(how = "any")

#  Convert the categorical features into dummy variables.
categorical_columns = df_loan.select_dtypes( include ='object').columns
df_loan_new = pd.get_dummies(df_loan, columns = categorical_columns)

In [52]:
# Split the data into a training and test set, where the “Loan_Status” column is the target.
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_loan_new)

# Set the target and the features.
target_test = test.Loan_Status_Y
features_test = test.drop(['Loan_Status_Y'],axis=1).values

# Set the ones for train set.
target_train = train.Loan_Status_Y
features_train = train.drop(['Loan_Status_Y'],axis=1).values

In [62]:
# Create a pipeline with a min-max scaler and a KNN classifier (see section 15.3 in the Machine Learning with Python Cookbook).
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Create a pipeline.
pipe = Pipeline([('scaler', MinMaxScaler()),('knn', KNeighborsClassifier())])

# Fit the data to the pipe 
pipe.fit(features_train, target_train)

# Find the score for the training set.
print('Training set score: ' + str(pipe.score(features_train, target_train)))

Training set score: 0.9555555555555556


In [63]:
# Fit a default KNN classifier to the data with this pipeline. Report the model accuracy on the test set. 
# Note: Fitting a pipeline model works just like fitting a regular model.
pipe.fit(features_test, target_test)

# Find the score for the test set.
print('Test set score: ' + str(pipe.score(features_test, target_test)))

Test set score: 0.8916666666666667


In [64]:
# Create a search space for your KNN classifier where your “n_neighbors” parameter varies from 1 to 10. 
# (see section 15.3 in the Machine Learning with Python Cookbook).
# Create a space of candidate values.
search_space = [{"knn__n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}]

In [65]:
# Fit a grid search with your pipeline, search space, and 5-fold cross-validation to find the best value for
# the “n_neighbors” parameter.
from sklearn.model_selection import GridSearchCV

# set the knn
knn = KNeighborsClassifier(n_neighbors = 5, n_jobs = -1)
pipe = Pipeline([('scaler', MinMaxScaler()),('knn', knn)])

# Create a grid search.
classifier = GridSearchCV(pipe, search_space, cv = 5, verbose = 0).fit(features_test, target_test)

In [66]:
# Find the accuracy of the grid search best model on the test set. 
# Note: It is possible that this will not be an improvement over the default model, but likely it will be.
best_result = classifier.best_score_
best_result

0.8916666666666666

In [11]:
# Now, repeat steps 6 and 7 with the same pipeline, but expand your search space to include logistic regression 
# and random forest models with the hyperparameter values in section 12.3 of the Machine Learning with Python Cookbook.


In [12]:
# What are the best model and hyperparameters found in the grid search? Find the accuracy of this model on the test set.

In [None]:
# Summarize your results.