In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Use imblearn Pipeline + SMOTE
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE


In [None]:
# Step 2: Load Data
# The raw export is read from a path relative to the current working directory.
RAW_DATA_FILE = "raw-data(1).csv"
loan = pd.read_csv(RAW_DATA_FILE)


In [None]:
# Step 2a: Rename Columns
# (raw header, descriptive name) pairs for the generic V1..V32 column headers.
column_pairs = [
    ("V1", "customer_id"),
    ("V2", "gender"),
    ("V3", "married"),
    ("V4", "age"),
    ("V5", "dependents"),
    ("V6", "income"),
    ("V7", "loan_amount"),
    ("V8", "loan_term"),
    ("V9", "credit_amount"),
    ("V10", "region_code"),
    ("V11", "employment_status"),
    ("V12", "education_level"),
    ("V13", "housing_status"),
    ("V14", "purpose_of_loan"),
    ("V15", "credit_history_length"),
    ("V16", "num_open_accounts"),
    ("V17", "num_credit_cards"),
    ("V18", "num_loans"),
    ("V19", "previous_defaults"),
    ("V20", "current_default_flag"),
    ("V21", "asset_value"),
    ("V22", "collateral_value"),
    ("V23", "other_income"),
    ("V24", "coapplicant_income"),
    ("V25", "total_obligations"),
    ("V26", "num_late_payments"),
    ("V27", "num_missed_payments"),
    ("V28", "public_records_flag"),
    ("V29", "num_inquiries"),
    ("V30", "utilization_rate"),
    ("V31", "risk_tier"),
    ("V32", "default_flag"),
]
rename_dict = dict(column_pairs)

# Headers that do not appear in the mapping are left untouched by rename().
loan = loan.rename(rename_dict, axis="columns")

# Show the first rows to confirm the new headers.
print(loan.head())


In [None]:
# Inspect the frame: column names, dtypes and non-null counts.
loan.info()

In [None]:
# Number of missing values in each column.
loan.isna().sum()

In [None]:
# Step 4: Data Cleaning
# Rows without a label cannot be used for supervised training.
loan = loan.dropna(subset=["default_flag"])

# The target is excluded by filtering instead of list.remove(), which raised
# ValueError whenever default_flag was not stored with a numeric dtype.
num_cols = [
    col for col in loan.select_dtypes(include=np.number).columns
    if col != "default_flag"
]
cat_cols = [
    col for col in loan.select_dtypes(include="object").columns
    if col != "default_flag"
]

# Numeric columns are filled with their median. Columns whose median is NaN
# (every value missing) are skipped because there is nothing to fill with.
fill_values = loan[num_cols].median().dropna().to_dict()

# Categorical columns are filled with their most frequent value. mode() is
# empty when every value is missing, so such a column is left as-is rather
# than failing on an index lookup.
for col in cat_cols:
    modes = loan[col].mode()
    if not modes.empty:
        fill_values[col] = modes.iloc[0]

# NOTE(review): these statistics are computed on the whole frame, i.e. before
# the train/validation split in Step 6, so validation rows contribute to them.
loan = loan.fillna(fill_values)


In [None]:
# Step 5: Encoding + Scaling
# Split the frame into the label and the features; the identifier column is
# removed from the features together with the label.
y = loan["default_flag"]
X = loan.drop(columns=["default_flag", "customer_id"])

# Columns are routed to a transformer according to their dtype.
num_cols = list(X.select_dtypes(include=np.number).columns)
cat_cols = list(X.select_dtypes(include="object").columns)

scaler = StandardScaler()
encoder = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")

# Numeric columns are standardised, object columns are one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", scaler, num_cols),
        ("cat", encoder, cat_cols),
    ]
)


In [None]:
# Step 6: Train/Test Split
# 80/20 split, stratified on the label, with a fixed seed for repeatability.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)


In [None]:
# Step 7: Models + Pipelines (with SMOTE inside)
# Every stochastic estimator is seeded so that a Restart & Run All reproduces
# the same scores (the decision tree previously had no random_state).
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Pipeline does not copy its steps, so passing `preprocessor` directly would
# make all four pipelines share one ColumnTransformer object: refitting any
# pipeline would silently change the preprocessing used by the others.
# clone() gives each pipeline its own unfitted copy with identical parameters.
pipelines = {
    name: Pipeline([
        ("preprocessor", clone(preprocessor)),
        ("smote", SMOTE(random_state=42)),   # <-- SMOTE inside pipeline
        ("model", model)
    ])
    for name, model in models.items()
}


In [None]:
# Step 8: Training & Evaluation
# Fit each candidate on the training split and score it on the validation split.
results = []
for name, pipe in pipelines.items():
    # A single fit call runs every pipeline step: preprocessing, SMOTE, classifier.
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_valid)
    # Column 1 of predict_proba is used as the score for ROC-AUC.
    y_proba = pipe.predict_proba(X_valid)[:, 1]

    acc = accuracy_score(y_valid, y_pred)
    roc_auc = roc_auc_score(y_valid, y_proba)
    results.append([name, acc, roc_auc])

    # Per-model report printed as the loop runs.
    print(f"--- {name} ---")
    report = classification_report(y_valid, y_pred)
    print(report)
    matrix = confusion_matrix(y_valid, y_pred)
    print("Confusion Matrix:\n", matrix)
    print("\n")

# One row per model for a side-by-side comparison.
results_df = pd.DataFrame(data=results, columns=["Model", "Accuracy", "ROC-AUC"])
print(results_df)


In [None]:
# Step 9: ROC Curve
# One curve per fitted pipeline, all drawn on a single Axes.
fig, ax = plt.subplots(figsize=(8, 6))
for name, pipe in pipelines.items():
    y_proba = pipe.predict_proba(X_valid)[:, 1]
    fpr, tpr, _ = roc_curve(y_valid, y_proba)
    auc_value = roc_auc_score(y_valid, y_proba)
    ax.plot(fpr, tpr, label=f"{name} (AUC={auc_value:.3f})")

# Dashed diagonal for visual reference.
ax.plot([0, 1], [0, 1], "k--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve Comparison")
ax.legend()
plt.show()


In [None]:
# Step 10: Final Model Training on Full Data + Test Evaluation
# A fresh clone is refit on all labelled rows (train + validation), as the
# heading states; the original refit on X_train only. Cloning also leaves the
# pipelines fitted in Step 8 untouched, so earlier cells stay reproducible.
best_model = clone(pipelines["Random Forest"])   # pick best model based on Step 8 results
best_model.fit(X, y)                              # pipeline handles preprocessing + SMOTE

# Load test data
test_df = pd.read_csv("test.csv")   # replace with your actual test file

# Give the test frame the same column names as the training frame. rename()
# ignores keys that are absent, so this is a no-op if the file already uses
# the descriptive names.
test_df = test_df.rename(columns=rename_dict)

# Rows without a label cannot be scored.
test_df = test_df.dropna(subset=["default_flag"])

# The training frame was imputed in Step 4 but the test frame never was.
# Fill its gaps with statistics taken from the training features only.
test_fill_values = X[num_cols].median().dropna().to_dict()
for col in cat_cols:
    train_modes = X[col].mode()
    if not train_modes.empty:
        test_fill_values[col] = train_modes.iloc[0]

# Select exactly the training feature columns (this drops the ID and target,
# and raises a KeyError naming any feature the test file is missing).
X_test = test_df[X.columns].fillna(test_fill_values)
y_test = test_df["default_flag"]

# Evaluate
y_test_pred = best_model.predict(X_test)
y_test_proba = best_model.predict_proba(X_test)[:, 1]

print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test ROC-AUC:", roc_auc_score(y_test, y_test_proba))
print(classification_report(y_test, y_test_pred))
