## Author: Joshua Ewer {-}

In [15]:
import warnings
warnings.filterwarnings('ignore')
import string
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

### Prepare data for modeling {-}

In [4]:
loan = pd.read_csv('Loan_Train.csv')
loan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [5]:
loan = loan.drop(columns=['Loan_ID'])

# Drop any rows with missing data
loan = loan.dropna()

In [6]:
loan = pd.get_dummies(loan, columns=['Gender', 'Married', 'Education', 'Dependents', 'Self_Employed', 'Property_Area'])
loan['Loan_Status'] = loan['Loan_Status'].map({'Y': True, 'N': False})
loan['Loan_Status'] = loan['Loan_Status'].astype(bool)

In [7]:
loan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 480 entries, 1 to 613
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ApplicantIncome          480 non-null    int64  
 1   CoapplicantIncome        480 non-null    float64
 2   LoanAmount               480 non-null    float64
 3   Loan_Amount_Term         480 non-null    float64
 4   Credit_History           480 non-null    float64
 5   Loan_Status              480 non-null    bool   
 6   Gender_Female            480 non-null    bool   
 7   Gender_Male              480 non-null    bool   
 8   Married_No               480 non-null    bool   
 9   Married_Yes              480 non-null    bool   
 10  Education_Graduate       480 non-null    bool   
 11  Education_Not Graduate   480 non-null    bool   
 12  Dependents_0             480 non-null    bool   
 13  Dependents_1             480 non-null    bool   
 14  Dependents_2             480 no

In [8]:
# Split into training/test set
x = loan.drop(columns=['Loan_Status']) 
y = loan['Loan_Status']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [9]:
# Set up pipeline
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5))  
])

In [10]:
# Fit and report accuracy
pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)

print(f'Accuracy is: {accuracy_score(y_test, y_pred)}')

Accuracy is: 0.78125


In [11]:
search_space = {
    'knn__n_neighbors': range(1, 11)
}

In [12]:
grid_search = GridSearchCV(pipeline, search_space, cv=5)
grid_search.fit(x_train, y_train)

In [13]:
best_estimator = grid_search.best_estimator_

# Get the best model and show its accuracy
y_pred = best_estimator.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Best n_neighbors: {grid_search.best_params_["knn__n_neighbors"]}')
print(f'Accuracy: {accuracy}')

Best n_neighbors: 9
Accuracy: 0.75


In [17]:
# Repeat steps for logicstic regression and random forest
pipeline = Pipeline([("classifier", RandomForestClassifier())])
# Define the parameter grid for GridSearchCV
search_space = [
    {'classifier': [LogisticRegression(max_iter=500, solver='liblinear')], 'classifier__penalty': ['l1', 'l2'], 'classifier__C': np.logspace(0, 4, 10)},
    {'classifier': [RandomForestClassifier()], 'classifier__n_estimators': [10, 100, 1000], 'classifier__max_features': [1, 2, 3]}
]

In [19]:
grid_search = GridSearchCV(pipeline, search_space, cv=5, verbose=0)
grid_search.fit(x_train, y_train)

In [20]:
best_model = grid_search.best_estimator_
print('Best model:', best_model)

# Test and report accuracy
y_pred = best_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')

Best model: Pipeline(steps=[('classifier',
                 LogisticRegression(C=2.7825594022071245, max_iter=500,
                                    penalty='l1', solver='liblinear'))])
Accuracy: 0.8229166666666666


### Summary {-}
After trying KNN, Logistic Regression, and Random Forest, the best model selected by sklearn is the LogisticRegression.  The "C" value in the selected model controls the regularization strength.  Too low or too high of a C score can point to under or overfitting a model, but this score shows strong regularization without underfitting the model.  The python cookbook gave me the hyper parameter settings for each of the models, but the choice of the liblinear solver makes sense since it was primarily designed for binary classification problems.