# Loan Approval Prediction — Final (v2)

Corrected end-to-end pipeline for local Jupyter/VS Code:

- Smart Tkinter file picker with fallbacks
- Raw-file cleaning (removes leading row numbers)
- Robust target detection and mapping to 0/1
- Preprocessing pipelines (compatible with scikit-learn >=1.4)
- Train/test split with safeguards
- Model training (Logistic Regression, Random Forest) and evaluation
- Model saving (`best_loan_model.joblib`) and simple fairness check

Run cells from top to bottom. If anything fails, rerun the smart loader cell first.

In [None]:
# Standard imports
import os, re
from pathlib import Path
from io import StringIO
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix

import warnings
# Suppress every warning category for the rest of the session. This keeps the
# notebook output short, but it also hides deprecation and convergence
# warnings raised by the libraries used in later cells.
warnings.filterwarnings('ignore')
# Apply seaborn's "whitegrid" theme to all plots drawn after this point.
sns.set_style('whitegrid')
# Visible confirmation that the import cell ran to completion.
print('Libraries loaded')

In [None]:
# Smart dataset loader + cleaning (handles leading row numbers)
#
# Resolution order for the dataset path:
#   1. a file picked in the Tkinter dialog (when a GUI is available),
#   2. the path stored in PERSIST_PATH_FILE by a previous run,
#   3. DEFAULT_PATH.
# The resolved path is written back to PERSIST_PATH_FILE after a successful read.
from pathlib import Path
from io import StringIO

DEFAULT_PATH = "/mnt/data/loan_approval_dataset.csv"  # edit if needed
PERSIST_PATH_FILE = "dataset_path.txt"

# Matches a row number followed by whitespace at the start of a line.
# Compiled once instead of being re-parsed for every line of the file.
_LEADING_ROW_NUMBER = re.compile(r'^\s*\d+\s+')

# determine path: a non-empty persisted path wins over the default
path = Path(DEFAULT_PATH)
if Path(PERSIST_PATH_FILE).exists():
    p = Path(PERSIST_PATH_FILE).read_text().strip()
    if p:
        path = Path(p)

# Try Tkinter picker to allow re-selection when running interactively
try:
    import tkinter as tk
    from tkinter import filedialog
    root = tk.Tk()
    try:
        root.withdraw()
        root.attributes("-topmost", True)
        chosen = filedialog.askopenfilename(title="Select your loan approval dataset (CSV)", filetypes=[("CSV Files","*.csv"),("All Files","*.*")])
    finally:
        # Always release the hidden root window, even if the dialog raised.
        root.destroy()
    if chosen:
        path = Path(chosen)
except Exception as exc:
    # Best-effort only: environments without a GUI (or without tkinter) fall
    # back to the persisted/default path, but say so instead of failing silently.
    print('File picker unavailable, using stored/default path:', exc)

if not path.exists():
    raise FileNotFoundError(f"Dataset not found at: {path.resolve()}")

print('Reading raw file:', path.resolve())
# 'utf-8-sig' decodes plain UTF-8 and additionally drops a leading byte-order
# mark, which would otherwise sit in front of the first line and stop the
# row-number pattern from matching there.
raw_text = path.read_text(encoding='utf-8-sig', errors='replace')

# Remove leading row numbers like '1 ' at start of each line
cleaned_lines = [_LEADING_ROW_NUMBER.sub('', line) for line in raw_text.splitlines()]
cleaned_text = "\n".join(cleaned_lines)

# Read CSV
df_raw = pd.read_csv(StringIO(cleaned_text), skipinitialspace=True)
# normalize column names
df_raw.columns = [c.strip().lower() for c in df_raw.columns]

print('Raw read shape:', df_raw.shape)
display(df_raw.head())

# Persist chosen path
try:
    with open(PERSIST_PATH_FILE, 'w', encoding='utf-8') as f:
        f.write(str(path.resolve()))
    print('Saved dataset path to', PERSIST_PATH_FILE)
except Exception as e:
    # Failing to remember the path must not abort the analysis.
    print('Could not persist path:', e)

# Expose df_raw for next cells
df = df_raw.copy()

In [None]:
# Target detection and robust mapping to 0/1
#
# Step 1: look for a well-known target column name, in priority order.
candidate_targets = ['loan_status','loanstatus','status','approved','is_approved']
target_col = next((name for name in candidate_targets if name in df.columns), None)

# Step 2: otherwise fall back to the first column holding exactly two distinct values.
if target_col is None:
    target_col = next((col for col in df.columns if df[col].nunique() == 2), None)

# Step 3: give up with a message that lists what was available.
if target_col is None:
    raise RuntimeError('Could not detect a target column. Columns: ' + ','.join(df.columns))

print('Using target column:', target_col)
print('Raw target value counts:')
raw_target_counts = df[target_col].value_counts(dropna=False)
print(raw_target_counts.head(20))

# Lower-cased spellings recognised as the positive / negative class.
_APPROVED_TOKENS = frozenset({'approved', 'approve', 'a', 'y', 'yes', '1', 'true'})
_REJECTED_TOKENS = frozenset({'rejected', 'reject', 'r', 'n', 'no', '0', 'false'})


def map_target(v):
    """Map a raw target value to 1 (approved), 0 (rejected) or NaN.

    The value is converted to a stripped, lower-cased string and compared
    against the known approved/rejected spellings. Anything else is parsed
    as a number so that forms such as ``1.0`` or ``'0.0'`` are still mapped.

    Args:
        v: Raw cell value (string, number, bool or missing).

    Returns:
        ``1`` or ``0`` for recognised values, ``np.nan`` for missing or
        unrecognised ones.
    """
    if pd.isna(v):
        return np.nan
    s = str(v).strip().lower()
    if s in _APPROVED_TOKENS:
        return 1
    if s in _REJECTED_TOKENS:
        return 0
    try:
        nv = float(s)
    except ValueError:
        # Not numeric either. Only ValueError is caught so that interrupts
        # and unrelated errors are no longer silently swallowed.
        return np.nan
    if nv == 1.0:
        return 1
    if nv == 0.0:
        return 0
    return np.nan

# Encode the detected target column as 1 / 0 / NaN in a new 'target' column.
df['target'] = df[target_col].apply(map_target)
print('\nMapped target counts (including NaN):')
mapped_counts = df['target'].value_counts(dropna=False)
print(mapped_counts)

# Keep only the rows whose target could be mapped, and report how many were lost.
before = len(df)
df = df.dropna(subset=['target']).reset_index(drop=True)
after = len(df)
dropped = before - after
print(f'Dropped {dropped} rows with unmapped target. Remaining rows: {after}')

# Labels as integers; features are every column except the raw and encoded target.
y = df['target'].astype(int)
X = df.drop(columns=[target_col, 'target'])
print('Final X shape:', X.shape, 'y shape:', y.shape)

In [None]:
# Clean column names (strip) and basic type conversions
# Every feature name is trimmed and lower-cased, then the dtypes and the
# first rows are shown for a quick visual check.
X.columns = X.columns.map(lambda name: name.strip().lower())
print(X.dtypes)
display(X.head())

In [None]:
# Feature engineering: create debt-to-income if applicable
# The ratio is only added when both source columns exist.
required_cols = {'income_annum', 'loan_amount'}
if required_cols <= set(X.columns):
    # A zero income is turned into NaN first so the division yields NaN there.
    safe_income = X['income_annum'].replace(0, np.nan)
    X['dti_estimate'] = X['loan_amount'].div(safe_income)
    print('Created dti_estimate')

In [None]:
# Robust train/test split
#
# Produces X_train / X_test / y_train / y_test from X and y. Stratification is
# used when it is feasible and switched off (with a message) when it is not.
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

n_samples = len(X)
print('Total samples:', n_samples)
if n_samples == 0:
    raise ValueError('No samples available after processing target.')
if n_samples < 2:
    # A single row cannot be divided into a non-empty train and test part.
    raise ValueError('Need at least 2 samples to build a train/test split.')

class_counts = y.value_counts()
print('Class counts:', class_counts.to_dict())

use_stratify = True
if class_counts.min() < 2 or len(class_counts) != 2:
    print('Disabling stratify due to small/odd class distribution.')
    use_stratify = False

test_size = 0.2
if n_samples < 10:
    # Use an absolute row count so the test part never rounds down to zero.
    test_size = max(1, int(np.floor(0.2 * n_samples)))
    print('Small dataset — using test size (count):', test_size)

# A stratified split needs at least one row per class on both sides; with a
# tiny test count it would raise instead of splitting, so fall back to a
# plain shuffle split in that case.
n_test = test_size if isinstance(test_size, int) else int(np.ceil(test_size * n_samples))
n_train = n_samples - n_test
if use_stratify and min(n_train, n_test) < len(class_counts):
    print('Disabling stratify: split too small to contain every class.')
    use_stratify = False

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=42,
    stratify=y if use_stratify else None,
)

print('\nTrain/Test shapes:')
print('X_train:', X_train.shape, '| X_test:', X_test.shape)
print('y_train:', y_train.shape, '| y_test:', y_test.shape)

print('\nTrain class distribution:')
print(pd.Series(y_train).value_counts().to_dict())
print('Test class distribution:')
print(pd.Series(y_test).value_counts().to_dict())

In [None]:
# Preprocessing pipelines (numeric + categorical)
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Partition the feature names by dtype: numbers on one side,
# object / category / bool columns on the other.
num_cols = list(X.select_dtypes(include=[np.number]).columns)
cat_cols = list(X.select_dtypes(include=['object','category','bool']).columns)

# Numeric branch: fill gaps with the median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch; columns in neither list are dropped.
column_routes = [
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='drop')

print('Numeric cols:', num_cols)
print('Categorical cols:', cat_cols)

In [None]:
# Train models: Logistic Regression and Random Forest
from sklearn.base import clone

# Each pipeline receives its own unfitted copy of the preprocessor. Pipeline
# does not copy its steps, so passing the same object to both would make the
# two models share (and refit) a single transformer instance.
models = {
    'logreg': Pipeline([
        ('pre', clone(preprocessor)),
        ('clf', LogisticRegression(max_iter=1000, random_state=42)),
    ]),
    'rf': Pipeline([
        ('pre', clone(preprocessor)),
        ('clf', RandomForestClassifier(n_estimators=200, random_state=42)),
    ]),
}

# Fit every pipeline on the training split.
for name, pipe in models.items():
    print('Training', name)
    pipe.fit(X_train, y_train)
    print('Done')

In [None]:
# Evaluate models
#
# For every fitted pipeline: accuracy, F1 and (when defined) ROC AUC on the
# test split, a classification report, and a confusion-matrix heatmap.
# `results[name]` keeps the metrics together with the pipeline itself.
results = {}

# ROC AUC needs both classes among the held-out labels; on very small
# datasets the test split may contain only one of them.
auc_defined = len(np.unique(y_test)) > 1

for name, pipe in models.items():
    preds = pipe.predict(X_test)

    probs = None
    if hasattr(pipe, 'predict_proba'):
        proba = pipe.predict_proba(X_test)
        # A model fitted on a single class returns one column only, so
        # there is no second column to read the positive-class score from.
        if proba.shape[1] > 1:
            probs = proba[:, 1]

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    auc = roc_auc_score(y_test, probs) if (probs is not None and auc_defined) else None
    print(f"\n{name} - acc: {acc:.4f}, f1: {f1:.4f}, auc: {auc}")
    print(classification_report(y_test, preds))

    cm = confusion_matrix(y_test, preds)
    # Start a fresh figure so consecutive heatmaps never draw over each other.
    plt.figure()
    sns.heatmap(cm, annot=True, fmt='d')
    plt.title(f"{name} - Confusion Matrix")
    plt.show()

    results[name] = {'acc':acc,'f1':f1,'auc':auc,'model':pipe}

In [None]:
# Select best model by F1 then AUC and save
def _ranking_key(model_name):
    """Sort key: F1 first, then AUC, with a missing AUC counted as 0."""
    entry = results[model_name]
    auc_value = entry['auc']
    return (entry['f1'], 0 if auc_value is None else auc_value)


best_name = max(results, key=_ranking_key)
best_model = results[best_name]['model']
print('Best model:', best_name, results[best_name])

# Store the pipeline together with the feature names and the target column name.
artifact = {
    'model': best_model,
    'features': X.columns.tolist(),
    'target_col': target_col,
}
joblib.dump(artifact, 'best_loan_model.joblib')
print('Saved best model to best_loan_model.joblib')

In [None]:
# Simple fairness check
#
# Compares the predicted approval rate across the groups of the first
# protected attribute found in the test features and reports the ratio of
# the lowest to the highest group rate (disparate impact).
protected = None
for candidate in ['gender','sex','age','married']:
    if candidate in X_test.columns:
        protected = candidate
        break

if protected:
    preds = best_model.predict(X_test)
    df_fair = X_test.reset_index(drop=True).copy()
    df_fair['pred'] = preds

    groups = df_fair[protected]
    # A numeric attribute with many distinct values (e.g. age in years) would
    # yield one tiny group per value and a meaningless min/max ratio, so it
    # is bucketed into quantile bins before the rates are computed.
    is_continuous = (
        pd.api.types.is_numeric_dtype(groups)
        and not pd.api.types.is_bool_dtype(groups)
        and groups.nunique() > 10
    )
    if is_continuous:
        groups = pd.qcut(groups, q=4, duplicates='drop')

    rates = df_fair.groupby(groups, observed=True)['pred'].mean()
    print('Approval rates by', protected)
    print(rates)
    # The small epsilon avoids a division by zero when no group is approved.
    di = rates.min()/(rates.max()+1e-9)
    print('Disparate impact:', di)
else:
    print('No common protected attribute present (gender/sex/age/married)')

In [None]:
# Demo prediction
# Draw one reproducible row from the test features and score it.
sample = X_test.sample(n=1, random_state=1)
print('Sample:')
display(sample.head())

predicted_label = best_model.predict(sample)[0]
print('Predicted approval:', predicted_label)

# Report the score of the second probability column when the model offers one.
if hasattr(best_model, 'predict_proba'):
    approval_probability = best_model.predict_proba(sample)[0, 1]
    print('Approval probability:', approval_probability)