In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load dataset
data = pd.read_csv(r'C:\Users\krishna devda\OneDrive\Desktop\Loan_prediction_model\loan_prediction (1).csv')

# Drop unnecessary columns
data.drop(['Loan_ID', 'Dependents'], axis=1, inplace=True)

# Handling missing values
data.fillna({
    'Gender': data['Gender'].mode()[0],
    'Married': data['Married'].mode()[0],
    'Self_Employed': data['Self_Employed'].mode()[0],
    'LoanAmount': data['LoanAmount'].median(),
    'Loan_Amount_Term': data['Loan_Amount_Term'].mode()[0],
    'Credit_History': data['Credit_History'].mode()[0]
}, inplace=True)

# Encoding categorical variables
data['Gender'] = data['Gender'].map({'Male': 1, 'Female': 0})
data['Married'] = data['Married'].map({'Yes': 1, 'No': 0})
data['Education'] = data['Education'].map({'Graduate': 1, 'Not Graduate': 0})
data['Self_Employed'] = data['Self_Employed'].map({'Yes': 1, 'No': 0})
data['Credit_History'] = data['Credit_History'].astype(int)
data = pd.get_dummies(data, columns=['Property_Area'])

# Feature Engineering
data['TotalIncome'] = data['ApplicantIncome'] + data['CoapplicantIncome']
data['LoanAmountLog'] = np.log1p(data['LoanAmount'])  # Log transformation

# Selecting features
X = data[['Gender', 'Married', 'Education', 'TotalIncome', 'LoanAmountLog', 
          'Loan_Amount_Term', 'Credit_History', 'Property_Area_Rural', 
          'Property_Area_Semiurban', 'Property_Area_Urban']]

y = data['Loan_Status'].map({'Y': 1, 'N': 0})

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Handling Imbalanced Data
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model Training (Random Forest)
model = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=10)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Confusion Matrix: \n{conf_matrix}")
print(f"Classification Report: \n{class_report}")


Accuracy: 0.8293
Confusion Matrix: 
[[25 13]
 [ 8 77]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.76      0.66      0.70        38
           1       0.86      0.91      0.88        85

    accuracy                           0.83       123
   macro avg       0.81      0.78      0.79       123
weighted avg       0.83      0.83      0.83       123



In [2]:
import pickle
pickle.dump(model, open('loan_prediction.pkl', 'wb'))
