#Data Preprocessing & Cleaning

Import Libraries

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import joblib

Load Dataset

In [31]:
df = pd.read_csv("/content/Loan_Data (1).csv")

Dataset Overview

In [32]:
print("Initial Dataset Overview:")
print(df.head())
print(df.info())
print(df.tail())

Initial Dataset Overview:
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural         

Handle Missing Values (Imputation)

Drop 'Loan_ID' column

Drop rows with missing values for critical columns

In [33]:
print("\nHandling Missing Values...")
df = df.drop('Loan_ID', axis=1)
df = df.dropna(subset=['Gender', 'Dependents', 'Loan_Amount_Term'])
print(f"Dataset Shape after Dropping NaN: {df.shape}")


Handling Missing Values...
Dataset Shape after Dropping NaN: (573, 12)


 Fill Missing 'Self_Employed' and 'Credit_History' with Mode (most frequent value)

In [34]:
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace=True)
df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)


 Clean up the 'Dependents' column, replacing '3+' with '4'

In [35]:
df['Dependents'].replace('3+', '4', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Dependents'].replace('3+', '4', inplace=True)


In [None]:
print("\nMissing Values Check:")
print(df.isnull().sum())


Missing Values Check:
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


Handle Categorical Data (Encoding)

In [36]:
encoding = {
    'Gender': {'Male': 1, 'Female': 0},
    'Married': {'Yes': 1, 'No': 0},
    'Dependents': {'0': 0, '1': 1, '2': 2, '4': 4},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
    'Property_Area': {'Rural': 0, 'Semiurban': 2, 'Urban': 1},
    'Loan_Status': {'Y': 1, 'N': 0}
}
df.replace(encoding, inplace=True)

  df.replace(encoding, inplace=True)


Verify Encoding

In [37]:
print("\nDataset after Encoding:")
print(df.head())


Dataset after Encoding:
   Gender  Married  Dependents  Education  Self_Employed  ApplicantIncome  \
0       1        0           0          1              0             5849   
1       1        1           1          1              0             4583   
2       1        1           0          1              1             3000   
3       1        1           0          0              0             2583   
4       1        0           0          1              0             6000   

   CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \
0                0.0         NaN             360.0             1.0   
1             1508.0       128.0             360.0             1.0   
2                0.0        66.0             360.0             1.0   
3             2358.0       120.0             360.0             1.0   
4                0.0       141.0             360.0             1.0   

   Property_Area  Loan_Status  
0              1            1  
1              0           

 Handling Numerical Data

Fill 'LoanAmount' and 'Loan_Amount_Term' with their median values


In [38]:
df['LoanAmount'].fillna(df['LoanAmount'].median(), inplace=True)
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['LoanAmount'].fillna(df['LoanAmount'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].median(), inplace=True)


verify dataset after handling numerical data

In [40]:
print("\nDataset after numerical data handling")
print(df.head())


Dataset after numerical data handling
   Gender  Married  Dependents  Education  Self_Employed  ApplicantIncome  \
0       1        0           0          1              0             5849   
1       1        1           1          1              0             4583   
2       1        1           0          1              1             3000   
3       1        1           0          0              0             2583   
4       1        0           0          1              0             6000   

   CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \
0                0.0       128.0             360.0             1.0   
1             1508.0       128.0             360.0             1.0   
2                0.0        66.0             360.0             1.0   
3             2358.0       120.0             360.0             1.0   
4                0.0       141.0             360.0             1.0   

   Property_Area  Loan_Status  
0              1            1  
1            

Split Features and Target

In [41]:
X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']

 Standardize Numerical Features

In [42]:
scaler = StandardScaler()
num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
X[num_cols] = scaler.fit_transform(X[num_cols])

Dataset Overview After Preprocessing

In [43]:
print("\nProcessed Data Overview:")
print(X.head())


Processed Data Overview:
   Gender  Married  Dependents  Education  Self_Employed  ApplicantIncome  \
0       1        0           0          1              0         0.088156   
1       1        1           1          1              0        -0.126269   
2       1        1           0          1              1        -0.394385   
3       1        1           0          0              0        -0.465013   
4       1        0           0          1              0         0.113731   

   CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \
0          -0.547390   -0.210425          0.281358             1.0   
1          -0.044475   -0.210425          0.281358             1.0   
2          -0.547390   -0.961362          0.281358             1.0   
3           0.238998   -0.307320          0.281358             1.0   
4          -0.547390   -0.052971          0.281358             1.0   

   Property_Area  
0              1  
1              0  
2              1  
3             

#Model Training & Evaluation

 Function to Evaluate a Model with Accuracy & Cross-validation


In [44]:
def evaluate_model(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cross_val = cross_val_score(model, X, y, cv=5)
    avg_cross_val = np.mean(cross_val)
    print(f"{model.__class__.__name__} - Accuracy: {accuracy:.2f}, Cross-Val-Score: {avg_cross_val:.2f}")
    return avg_cross_val

List of Models

In [45]:
models = [
    LogisticRegression(random_state=42),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    LGBMClassifier(random_state=42)
]

 Evaluate All Models

In [46]:
model_scores = {model.__class__.__name__: evaluate_model(model, X, y) for model in models}


LogisticRegression - Accuracy: 0.81, Cross-Val-Score: 0.80
DecisionTreeClassifier - Accuracy: 0.73, Cross-Val-Score: 0.72
RandomForestClassifier - Accuracy: 0.78, Cross-Val-Score: 0.78
[LightGBM] [Info] Number of positive: 319, number of negative: 139
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000538 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 357
[LightGBM] [Info] Number of data points in the train set: 458, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.696507 -> initscore=0.830717
[LightGBM] [Info] Start training from score 0.830717
[LightGBM] [Info] Number of positive: 318, number of negative: 140
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000119 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can 

#Hyperparameter Tuning Using RandomizedSearchCV

Hyperparameter Tuning Function for Models

In [47]:
def tune_model(model, param_grid, X, y):
    tuner = RandomizedSearchCV(model, param_grid, cv=5, n_iter=20, verbose=1, random_state=42)
    tuner.fit(X, y)
    print(f"\nBest Score for {model.__class__.__name__}: {tuner.best_score_:.2f}")
    print(f"Best Parameters for {model.__class__.__name__}: {tuner.best_params_}")
    return tuner.best_estimator_

Hyperparameter Grids for Each Model

In [48]:
log_reg_grid = {'C': np.logspace(-4, 4, 20), "solver": ["liblinear"]}
rf_grid = {
    'n_estimators': np.arange(10, 1000, 10),
    'max_features': ['log2', 'sqrt'],
    'max_depth': [None, 3, 5, 10, 20],
    'min_samples_split': [2, 5, 20],
    'min_samples_leaf': [1, 2, 5]
}
dt_grid = {
    'max_depth': [None, 3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}
lgbm_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [-1, 3, 5, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2]
}

Tune Models

In [49]:
best_log_reg = tune_model(LogisticRegression(), log_reg_grid, X, y)
best_rf = tune_model(RandomForestClassifier(), rf_grid, X, y)
best_dt = tune_model(DecisionTreeClassifier(), dt_grid, X, y)
best_lgbm = tune_model(LGBMClassifier(), lgbm_grid, X, y)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Number of positive: 318, number of negative: 140
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000108 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 358
[LightGBM] [Info] Number of data points in the train set: 458, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.694323 -> initscore=0.820409
[LightGBM] [Info] Start training from score 0.820409
[LightGBM] [Info] Number of positive: 319, number of negative: 140
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000111 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 361
[LightGBM] [Info] Number of data points in the train set: 45

Save the Best Model (Random Forest in this case)

In [50]:
final_model = best_rf
joblib.dump(final_model, 'loan_status_eligibility_predictor.pkl')
joblib.dump(scaler, 'feature_scaler.pkl')

['feature_scaler.pkl']