In [76]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler,LabelEncoder

In [77]:
# Location of the loan-approval CSV. The original machine-specific path stays as
# the default, but it can be overridden by setting the LOAN_DATA_PATH environment
# variable so the notebook can run on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "LOAN_DATA_PATH",
    r"C:\Users\STRADIGI\Downloads\loan_approval_dataset.csv",
)
df = pd.read_csv(DATA_PATH)

In [78]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(4269, 13)

In [79]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

np.int64(0)

In [80]:
# Missing-value count for every column.
df.isna().sum()

loan_id                      0
 no_of_dependents            0
 education                   0
 self_employed               0
 income_annum                0
 loan_amount                 0
 loan_term                   0
 cibil_score                 0
 residential_assets_value    0
 commercial_assets_value     0
 luxury_assets_value         0
 bank_asset_value            0
 loan_status                 0
dtype: int64

In [81]:
# Remove the row identifier so it is not used as a model feature.
# errors='ignore' keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising KeyError.
df = df.drop(columns='loan_id', errors='ignore')

In [82]:
# Inspect the raw column labels (the recorded output shows a leading space on each).
df.columns

Index([' no_of_dependents', ' education', ' self_employed', ' income_annum',
       ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [83]:
# Normalise the headers: trim surrounding whitespace, turn inner spaces into
# underscores and drop any non-breaking spaces.
df.columns = df.columns.str.strip().str.replace(' ','_').str.replace('\xa0','')

# The stray whitespace is not limited to the headers: the recorded prediction
# output prints the class labels as ' Approved' / ' Rejected'. Trim every text
# column as well so that labels and categories compare equal to their plain
# spellings (e.g. 'Approved') everywhere downstream.
for text_col in df.select_dtypes(include='object').columns:
    df[text_col] = df[text_col].str.strip()

In [84]:
# Trim surrounding whitespace from the column labels once more; the labels were
# already stripped in the previous cell, so this is only a safeguard.
df.columns = df.columns.str.strip()

In [85]:
# Cleaned column labels as a plain Python list.
list(df.columns)

['no_of_dependents',
 'education',
 'self_employed',
 'income_annum',
 'loan_amount',
 'loan_term',
 'cibil_score',
 'residential_assets_value',
 'commercial_assets_value',
 'luxury_assets_value',
 'bank_asset_value',
 'loan_status']

In [86]:
# Sub-frame holding only the integer and floating-point columns.
num_col = df.select_dtypes(include=['int','float'])

In [87]:
# Display the numeric sub-frame.
num_col

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,2,9600000,29900000,12,778,2400000,17600000,22700000,8000000
1,0,4100000,12200000,8,417,2700000,2200000,8800000,3300000
2,3,9100000,29700000,20,506,7100000,4500000,33300000,12800000
3,3,8200000,30700000,8,467,18200000,3300000,23300000,7900000
4,5,9800000,24200000,20,382,12400000,8200000,29400000,5000000
...,...,...,...,...,...,...,...,...,...
4264,5,1000000,2300000,12,317,2800000,500000,3300000,800000
4265,0,3300000,11300000,20,559,4200000,2900000,11000000,1900000
4266,2,6500000,23900000,18,457,1200000,12400000,18100000,7300000
4267,1,4100000,12800000,8,780,8200000,700000,14100000,5800000


In [88]:
# Sub-frame holding only the object-dtype (text) columns, target included.
cat_col = df.select_dtypes(include='object')

In [89]:
# Display the text sub-frame.
cat_col

Unnamed: 0,education,self_employed,loan_status
0,Graduate,No,Approved
1,Not Graduate,Yes,Rejected
2,Graduate,No,Rejected
3,Graduate,No,Rejected
4,Not Graduate,Yes,Rejected
...,...,...,...
4264,Graduate,Yes,Rejected
4265,Not Graduate,Yes,Approved
4266,Not Graduate,No,Rejected
4267,Not Graduate,No,Approved


In [99]:
# Separate the target (loan_status) from the feature matrix.
y = df['loan_status']
X = df.drop('loan_status', axis=1)

In [100]:
# Feature names grouped by dtype; the preprocessing step routes each group to
# its own transformer.
num_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(X.select_dtypes(include=['object']).columns)

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

Numeric columns: ['no_of_dependents', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value']
Categorical columns: ['education', 'self_employed']


In [101]:
from sklearn.impute import SimpleImputer

# Numeric features: fill gaps with the column median.
num_transformer = SimpleImputer(strategy='median')

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored rather than raising.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [105]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,precision_score,f1_score

# Hold out 20% of the rows for evaluation (fixed seed, no stratification).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Per-dtype preprocessing: median imputation for numeric features,
# most-frequent imputation followed by one-hot encoding for categorical ones.
num_transformer = SimpleImputer(strategy='median')
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
])

# Preprocessing and the random forest chained into one estimator.
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42)),
])

# Train on the training split, then score the held-out split.
rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9824355971896955


In [107]:
# Confusion matrix for the held-out split; scikit-learn lays it out with true
# classes as rows and predicted classes as columns.
print(confusion_matrix(y_test, y_pred))

[[529   7]
 [  8 310]]


In [110]:
# --- Step 1: Import libraries ---
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, roc_auc_score, classification_report
)

# Trim surrounding whitespace from the column labels.
df.columns = df.columns.str.strip()

# Target and feature matrix.
y = df['loan_status']
X = df.drop('loan_status', axis=1)

# Feature names grouped by dtype.
num_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(X.select_dtypes(include=['object']).columns)

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

# Preprocessing: median imputation for numeric features; most-frequent
# imputation followed by one-hot encoding for categorical features.
num_transformer = SimpleImputer(strategy='median')
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
])

# Preprocessing and the random forest chained into one estimator.
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42)),
])

# Stratified 80/20 split so both subsets keep the class balance of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit on the training split and predict the held-out split.
rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_test)

# Headline metrics; precision, recall and F1 are support-weighted averages.
print("\n‚úÖ Model Performance Metrics")
print("-" * 40)
print("Accuracy :", accuracy_score(y_test, y_pred))
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1-score :", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred, average='weighted'))

print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ROC AUC is reported only for a two-class target; the score passed in is the
# predicted probability of the second entry of the model's classes_.
if y.nunique(dropna=False) == 2:
    y_prob = rf_pipeline.predict_proba(X_test)[:, 1]
    print("\nROC AUC:", roc_auc_score(y_test, y_prob))


Numeric columns: ['no_of_dependents', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value']
Categorical columns: ['education', 'self_employed']

‚úÖ Model Performance Metrics
----------------------------------------
Accuracy : 0.9812646370023419
Precision: 0.9814337811513422
Recall   : 0.9812646370023419
F1-score : 0.981204731501346

Classification Report:
               precision    recall  f1-score   support

    Approved       0.98      0.99      0.99       531
    Rejected       0.99      0.96      0.97       323

    accuracy                           0.98       854
   macro avg       0.98      0.98      0.98       854
weighted avg       0.98      0.98      0.98       854


Confusion Matrix:
 [[528   3]
 [ 13 310]]

ROC AUC: 0.9970002273880113


In [118]:
import pandas as pd
import pickle

# Serialise the fitted pipeline (preprocessing + model) to disk.
with open("loan_model.pkl", "wb") as file:
    file.write(pickle.dumps(rf_pipeline))

print("‚úÖ Model saved successfully as 'loan_model.pkl'")

# ------------------ LOAD MODEL ------------------
# Read the file written above back into a fresh object. Unpickling executes
# code embedded in the file, so only do this with files you produced yourself.
with open("loan_model.pkl", "rb") as file:
    loaded_model = pickle.loads(file.read())

print("‚úÖ Model loaded successfully\n")

‚úÖ Model saved successfully as 'loan_model.pkl'
‚úÖ Model loaded successfully



In [119]:
# ------------------ PREDICTION ------------------
# Example applicant (likely to be rejected)
new_applicant = pd.DataFrame([{
    'no_of_dependents': 4,
    'income_annum': 200000,          # low income
    'loan_amount': 800000,           # high loan request
    'loan_term': 10,
    'cibil_score': 500,              # poor CIBIL score
    'residential_assets_value': 50000,
    'commercial_assets_value': 0,
    'luxury_assets_value': 0,
    'bank_asset_value': 20000,
    'education': 'Not Graduate',
    'self_employed': 'Yes'
}])

# Map the hand-typed categorical values onto the exact category strings the
# one-hot encoder was fitted on, ignoring surrounding whitespace. The encoder
# uses handle_unknown='ignore', so a value that differs from the training
# spelling only by a stray space would otherwise be silently encoded as
# all zeros. Values with no whitespace-insensitive match are left unchanged.
fitted_preprocessor = loaded_model.named_steps['preprocessor']
for step_name, fitted_step, step_cols in fitted_preprocessor.transformers_:
    if step_name != 'cat':
        continue
    trained_categories = fitted_step.named_steps['encoder'].categories_
    for cat_col_name, cats in zip(step_cols, trained_categories):
        canonical = {str(cat).strip(): cat for cat in cats}
        new_applicant[cat_col_name] = [
            canonical.get(str(value).strip(), value)
            for value in new_applicant[cat_col_name]
        ]

# Predict using loaded model
model = loaded_model.named_steps['model']
classes = model.classes_
probs = loaded_model.predict_proba(new_applicant)[0]

# Show probabilities for each class (labels trimmed for display)
for cls, prob in zip(classes, probs):
    print(f"{str(cls).strip()} probability: {prob*100:.2f}%")

# Get predicted label and probability
pred_label = loaded_model.predict(new_applicant)[0]
pred_proba = probs[list(classes).index(pred_label)]

# The class labels can carry whitespace from the CSV (the recorded output
# prints ' Rejected'), so compare and display the trimmed label. Without this
# the == 'Approved' check below can never succeed.
pred_status = str(pred_label).strip()

print("\nPredicted Loan Status:", pred_status)
print("Prediction Confidence:", round(pred_proba*100, 2), "%")

# ------------------ IF-ELSE WITH POLITE FEEDBACK ------------------
if pred_status == 'Approved':
    print("\n‚úÖ Congratulations! Your loan application has been approved.")
    print("Thank you for choosing our service. We wish you success with your financial goals.")
else:
    print("\n‚ùå We‚Äôre sorry to inform you that your loan application has been rejected at this stage.")
    print("Please consider improving your CIBIL score or income before reapplying. "
          "Our team appreciates your effort and wishes you all the best in your future applications.")

# ------------------ SHORT POLITE CONCLUSION ------------------
print("\nüìã Conclusion:")
print("The Random Forest‚Äìbased loan prediction system achieved high accuracy (~98%), "
      "providing fair, transparent, and efficient assessments for loan approvals. "
      "It helps financial institutions make quick and data-driven lending decisions.")

 Approved probability: 7.00%
 Rejected probability: 93.00%

Predicted Loan Status:  Rejected
Prediction Confidence: 93.0 %

‚ùå We‚Äôre sorry to inform you that your loan application has been rejected at this stage.
Please consider improving your CIBIL score or income before reapplying. Our team appreciates your effort and wishes you all the best in your future applications.

üìã Conclusion:
The Random Forest‚Äìbased loan prediction system achieved high accuracy (~98%), providing fair, transparent, and efficient assessments for loan approvals. It helps financial institutions make quick and data-driven lending decisions.
