# Best Model Selection and Hyperparameter Tuning

In [2]:
import warnings
# Suppress all warnings
warnings.filterwarnings("ignore")

**Data Source:** 

https://www.kaggle.com/datasets/granjithkumar/loan-approval-data-set
<br>
About Dataset
Finance companies deal with several kinds of home loans and may have a presence across urban, semi-urban, and rural areas. A customer first applies for a home loan, and the company then validates the customer's eligibility for it.
The company wants to automate the loan eligibility process (in real time) based on the customer details provided in the online application form. These details include Gender, Marital Status, Education, Number of Dependents, Income, Loan Amount, Credit History, and others. To automate this process, a data set is provided to identify the customer segments that are eligible for a loan so that the company can specifically target these customers.

The task is to automate this loan eligibility process by predicting the loan status, i.e., whether a loan is approved (Y/N).
 

## 1. Import the dataset and ensure that it loaded properly.

In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import matplotlib.dates as mdates
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
import warnings

# Read the loan-application training data from the working directory.
df = pd.read_csv('Loan_Train.csv')

# Preview the first 3 rows to confirm the file loaded as expected.
print("First 3 rows:", df.head(3), sep="\n")

First 3 rows:
    Loan_ID Gender Married Dependents Education Self_Employed  \
0  LP001002   Male      No          0  Graduate            No   
1  LP001003   Male     Yes          1  Graduate            No   
2  LP001005   Male     Yes          0  Graduate           Yes   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0         Urban           Y  


In [6]:
# Display the dimensions of the dataframe as a (rows, columns) tuple
df.shape

(614, 13)

In [7]:
# Look at the dtype of each column to see which features are numeric and which are object-typed
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

### Drop the column “Loan_ID”

In [9]:
import matplotlib.pyplot as plt
import seaborn as sns

# 2. Drop Unnecessary Features
# Loan_ID is an identifier column, so it is removed before modelling.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the column is already gone no
# longer raises a KeyError.
df = df.drop(columns=['Loan_ID'], errors='ignore')

# Display the dimensions of the dataframe
df.shape

(614, 12)

## Check for missing data before dropping incomplete rows

In [11]:
# Tally the null entries in every column (isna is the canonical name for isnull)
print("\nMissing values per column:\n")
print(df.isna().sum())


Missing values per column:

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


## Drop rows with missing data

In [13]:
# Keep only fully populated rows: any row holding at least one null is discarded
df = df.dropna(axis=0, how='any')
# Show the (rows, columns) shape that remains after the drop
df.shape

(480, 12)

## Convert categorical features into dummy variables

In [15]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each categorical so the dummies are not perfectly collinear.
df = pd.get_dummies(data=df, drop_first=True)
# Preview the first 3 rows of the encoded frame
print("First 3 rows:", df.head(3), sep="\n")

First 3 rows:
   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   

   Credit_History  Gender_Male  Married_Yes  Dependents_1  Dependents_2  \
1             1.0         True         True          True         False   
2             1.0         True         True         False         False   
3             1.0         True         True         False         False   

   Dependents_3+  Education_Not Graduate  Self_Employed_Yes  \
1          False                   False              False   
2          False                   False               True   
3          False                    True              False   

   Property_Area_Semiurban  Property_Area_Urban  Loan_Status_Y  
1                    False                False          False  
2                

## Split the data into a training and test set, where the “Loan_Status” column is the target.

In [17]:

# Step 4: Split the data
# Separate the dummy-encoded target column from the predictors.
y = df['Loan_Status_Y']
X = df.drop(columns=['Loan_Status_Y'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Confirm that no null values remain in the encoded frame
print("\nMissing values per column:\n")
print(df.isna().sum())


Missing values per column:

ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
Gender_Male                0
Married_Yes                0
Dependents_1               0
Dependents_2               0
Dependents_3+              0
Education_Not Graduate     0
Self_Employed_Yes          0
Property_Area_Semiurban    0
Property_Area_Urban        0
Loan_Status_Y              0
dtype: int64


##  Create a pipeline with a min-max scaler and a KNN classifier

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 5: Create a pipeline with MinMaxScaler and default KNN
# KNN classifies by the distance between points, so features on large scales
# (e.g., incomes) would otherwise dominate those on small scales (e.g., 0/1 dummies).
knn_pipeline = Pipeline(
    steps=[
        ('scaler', MinMaxScaler()),       # rescales each feature to the [0, 1] range
        ('knn', KNeighborsClassifier()),  # KNN classifier with default hyperparameters
    ]
)



## Report the model accuracy on the test set. 
## Note: Fitting a pipeline model works just like fitting a regular model.

In [21]:
# Fit and evaluate the default KNN model

# Fit the scaler and the classifier on the training split only.
knn_pipeline.fit(X_train, y_train)

# Predict the encoded Loan_Status label for every held-out row.
y_pred = knn_pipeline.predict(X_test)

# Accuracy = share of test rows whose predicted label matches the true label.
default_knn_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Default KNN Accuracy:", default_knn_accuracy)



Default KNN Accuracy: 0.78125


In [22]:
# The default KNN model (n_neighbors=5) correctly predicted the loan status for 78.1% of the test rows using min-max scaled features.
# The KNN model does reasonably well without any tuning; it may be improved by adjusting n_neighbors or by trying different models.

##	Create a search space for your KNN classifier where your “n_neighbors” parameter varies from 1 to 10.
##                       (see section 15.3 in the Machine Learning with Python Cookbook).

In [24]:
# Search space for the KNN step: candidate neighbour counts 1 through 10.
# The 'knn__' prefix routes the parameter to the pipeline step named 'knn'.
knn_param_grid = {'knn__n_neighbors': [k for k in range(1, 11)]}

## Fit a grid search with your pipeline, search space, and 5-fold cross-validation to find the best value for the “n_neighbors” parameter.

In [26]:
# Exhaustive search over knn_param_grid, scoring each candidate n_neighbors
# with 5-fold cross-validation on the training split.
knn_grid = GridSearchCV(estimator=knn_pipeline, param_grid=knn_param_grid, cv=5)
knn_grid.fit(X_train, y_train)


## Find the accuracy of the grid search best model on the test set. Note: It is possible that this will not be an improvement over the default model, but likely it will be.

In [28]:
# Step 8: Evaluate best KNN model
# best_estimator_ is the pipeline with the highest mean cross-validation score.
best_knn = knn_grid.best_estimator_
# Score that pipeline on the held-out test split.
knn_grid_accuracy = accuracy_score(y_true=y_test, y_pred=best_knn.predict(X_test))
print("Best KNN Accuracy:", knn_grid_accuracy)
print("Best KNN n_neighbors:", knn_grid.best_params_)



Best KNN Accuracy: 0.7916666666666666
Best KNN n_neighbors: {'knn__n_neighbors': 3}


## Now, repeat steps 6 and 7 with the same pipeline, but expand your search space to include logistic regression and random forest models with the hyperparameter values

In [None]:
# Step 9: Expanded search space for KNN, LogisticRegression, and RandomForest
pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('clf', KNeighborsClassifier())  # placeholder; each grid below swaps in its own 'clf'
])

# param_grid is a list of dicts, one per model family, so GridSearchCV performs
# model selection and hyperparameter tuning in a single search.
# 'clf' replaces the pipeline step named 'clf'; 'clf__<param>' varies <param>
# on whichever estimator currently occupies that step.
param_grid = [
    # KNN: n_neighbors from 1 to 10.
    {'clf': [KNeighborsClassifier()],
     'clf__n_neighbors': list(range(1, 11))},

    # Logistic regression: 10 values of C spaced logarithmically from 1 to 10,000.
    {'clf': [LogisticRegression(max_iter=500)],
     'clf__C': np.logspace(0, 4, 10)},

    # Random forest: random_state is fixed so the cross-validation scores, and
    # therefore the selected best model, are reproducible across runs; this
    # matches the seed used for the stand-alone forest later in the notebook.
    {'clf': [RandomForestClassifier(random_state=42)],
     'clf__n_estimators': [10, 100, 1000],  # number of trees in the forest
     'clf__max_depth': [None, 5, 10]}       # cap on tree depth; None leaves depth unlimited
]

grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)



## 	What are the best model and hyperparameters found in the grid search? Find the accuracy of this model on the test set.

In [None]:
# Step 10: Best model and accuracy
# The winning pipeline across all three model families.
best_model = grid.best_estimator_
print("Best Model Parameters:", grid.best_params_)
print("Best Model Type:", type(best_model.named_steps['clf']).__name__)

# Score the winning pipeline on the held-out test split.
final_accuracy = accuracy_score(y_true=y_test, y_pred=best_model.predict(X_test))
print("Best Model Accuracy:", final_accuracy)

LogisticRegression outperformed KNN and RandomForest in the grid search based on cross-validation accuracy.

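The chosen C value (7.743, taken from np.logspace(0, 4, 10)) is the inverse of the regularization strength. It is weaker regularization than scikit-learn's default of C=1, yet it lies near the strongly regularized end of the searched range (1 to 10,000), which indicates that moderate regularization worked best on the dataset.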

A simpler, interpretable model like Logistic Regression is sufficient.


In [None]:
# Build Random Forest pipeline and fit to training data
rf_pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('clf', RandomForestClassifier(random_state=42))  # fixed seed for a repeatable forest
])
rf_pipeline.fit(X_train, y_train)
rf_preds = rf_pipeline.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_preds)

# Compare against the KNN results computed earlier. The previous bare
# `knn_accuracy, rf_accuracy` expression referenced a name that is never
# defined (NameError on a fresh run), and a mid-cell expression would not be
# displayed anyway, so the scores are printed explicitly.
print("Default KNN Accuracy:", default_knn_accuracy)
print("Tuned KNN Accuracy:", knn_grid_accuracy)
print("Random Forest Accuracy:", rf_accuracy)


# Get feature importances
# Pull the fitted forest out of the pipeline and read its per-feature importance scores.
rf_model = rf_pipeline.named_steps['clf']
feature_importances = rf_model.feature_importances_

# Label each score with the matching training column name.
feature_names = X_train.columns

# Tabulate the scores, largest first.
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Plot the feature importances as a horizontal bar chart.
# hue='Feature' with legend=False colours each bar from the palette without drawing a legend.
plt.figure(figsize=(10, 6))
sns.barplot(
    data=importance_df,
    x='Importance',
    y='Feature',
    hue='Feature',
    palette='viridis',
    legend=False,
)
plt.title('Feature Importances from Random Forest')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

Model interpretation:

Credit History, Loan Amount, and Applicant Income matter the most when the random forest predicts loan status.
