In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier


In [3]:
df = pd.read_csv("loan.csv")
df.columns = df.columns.str.strip()
df.head()


Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [4]:
df.shape
df.isnull().sum()


loan_id                     0
no_of_dependents            0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64

In [5]:
df.fillna(df.mode().iloc[0], inplace=True)


In [6]:
le = LabelEncoder()
for col in df.columns:
    df[col] = le.fit_transform(df[col])


In [7]:
X = df.drop('loan_status', axis=1)
y = df['loan_status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

rf_base = RandomForestClassifier()
rf_base.fit(X_train, y_train)

pred_base = rf_base.predict(X_test)
print("Baseline Accuracy:", accuracy_score(y_test, pred_base))


Baseline Accuracy: 0.9800936768149883


In [8]:
df['total_assets'] = (
    df['bank_asset_value'] +
    df['luxury_assets_value'] +
    df['commercial_assets_value'] +
    df['residential_assets_value']
)


In [9]:
X = df.drop('loan_status', axis=1)
y = df['loan_status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [10]:
params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 20]
}

grid = GridSearchCV(RandomForestClassifier(), params, cv=5)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
pred_best = best_model.predict(X_test)

print("Improved Accuracy:", accuracy_score(y_test, pred_best))


Improved Accuracy: 0.9800936768149883


In [11]:
print("Baseline Accuracy:", accuracy_score(y_test, pred_base))
print("Improved Accuracy:", accuracy_score(y_test, pred_best))


Baseline Accuracy: 0.9800936768149883
Improved Accuracy: 0.9800936768149883


In [13]:
df.to_csv("improved_loan_data.csv", index=False)
