<a href="https://colab.research.google.com/github/leburik12/machine_learning_jupyte_notebooks/blob/main/Loan_Eligibility_Prediction_using_Logistic_Regression_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Mount Google Drive and load dataset
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/ml_data/loan-test.csv'
df = pd.read_csv(file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
# Impute Loan_Amount_Term with mode (most common value = 360.0)
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0])

# Impute LoanAmount with median (robust to outliers)
median_load_amount = df['LoanAmount'].median()
df['LoanAmount'] = df['LoanAmount'].fillna(median_load_amount)

# Impute Dependents (categorical) by sampling from observed distribution
dependents_dist = df['Dependents'].dropna().value_counts(normalize=True)
missing_dep = df[df['Dependents'].isna()].index
df.loc[missing_dep, 'Dependents'] = np.random.choice(dependents_dist.index, size=len(missing_dep), p=dependents_dist.values)

# Impute Gender
gender_dist = df['Gender'].dropna().value_counts(normalize=True)
missing_gender = df[df['Gender'].isna()].index
df.loc[missing_gender, 'Gender'] = np.random.choice(gender_dist.index, size=len(missing_gender), p=gender_dist.values)

# Impute Self_Employed
self_emp_dist = df['Self_Employed'].dropna().value_counts(normalize=True)
missing_self_emp = df[df['Self_Employed'].isna()].index
df.loc[missing_self_emp, 'Self_Employed'] = np.random.choice(self_emp_dist.index, size=len(missing_self_emp), p=self_emp_dist.values)

# Impute Credit_History
credit_dist = df['Credit_History'].dropna().value_counts(normalize=True)
missing_credit = df[df['Credit_History'].isna()].index
df.loc[missing_credit, 'Credit_History'] = np.random.choice(credit_dist.index, size=len(missing_credit), p=credit_dist.values)

# Impute Loan_Status
loan_status_dist = df['Loan_Status'].dropna().value_counts(normalize=True)
missing_status = df[df['Loan_Status'].isna()].index
df.loc[missing_status, 'Loan_Status'] = np.random.choice(loan_status_dist.index, size=len(missing_status), p=loan_status_dist.values)


In [18]:
# Label encode binary columns
labelencoder = LabelEncoder()
binary_cols = ['Gender', 'Married', 'Education', 'Self_Employed']
for col in binary_cols:
    df[col] = labelencoder.fit_transform(df[col])

In [19]:
# One-hot encode Property_Area
df_dummies = pd.get_dummies(df, columns=['Property_Area'], prefix='PropArea')
# Convert boolean columns to 0/1
for col in df_dummies.columns:
    if df_dummies[col].dtype == bool:
        df_dummies[col] = df_dummies[col].astype(int)

In [20]:
# Use RobustScaler to handle skewness/outliers
scaler = RobustScaler()
numeric_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
df_dummies[numeric_cols] = scaler.fit_transform(df_dummies[numeric_cols])

In [21]:
# Fix Dependents: convert '3+' to 3
df_dummies['Dependents'] = df_dummies['Dependents'].replace('3+', 3).astype(float)

In [22]:
def split_data(df, target='Loan_Status', test_size=0.2, random_state=42):
    X = df.drop(columns=[target, 'Loan_ID'])
    y = df[target]
    return train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)

X_train, X_test, y_train, y_test = split_data(df_dummies)

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Base Logistic Regression
log_regression = LogisticRegression(max_iter=1000, random_state=42)
log_regression.fit(X_train, y_train)
y_pred = log_regression.predict(X_test)

# Evaluate
accuracy  = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall    = recall_score(y_test, y_pred)
f1        = f1_score(y_test, y_pred)

print(" Logistic Regression Model Performance:")
print(f" Accuracy : {accuracy:.4f}")
print(f" Precision: {precision:.4f}")
print(f" Recall   : {recall:.4f}")
print(f" F1 Score : {f1:.4f}")


 Logistic Regression Model Performance:
 Accuracy : 0.9865
 Precision: 1.0000
 Recall   : 0.9836
 F1 Score : 0.9917


In [24]:
# Initialize Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)


# Fit on training data
dt_model.fit(X_train, y_train)

# Predict on test data
y_pred_dt = dt_model.predict(X_test)

# Evaluation
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)

print("🌳 Decision Tree Performance:")
print(f" Accuracy : {accuracy_dt:.4f}")
print(f" Precision: {precision_dt:.4f}")
print(f" Recall   : {recall_dt:.4f}")
print(f" F1 Score : {f1_dt:.4f}")


🌳 Decision Tree Performance:
 Accuracy : 0.9595
 Precision: 0.9677
 Recall   : 0.9836
 F1 Score : 0.9756


In [25]:
# Initialize Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit on training data
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred_rf = rf_model.predict(X_test)

# Evaluation
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print(" Random Forest Performance:")
print(f" Accuracy : {accuracy_rf:.4f}")
print(f" Precision: {precision_rf:.4f}")
print(f" Recall   : {recall_rf:.4f}")
print(f" F1 Score : {f1_rf:.4f}")

 Random Forest Performance:
 Accuracy : 0.9730
 Precision: 0.9836
 Recall   : 0.9836
 F1 Score : 0.9836
