preprocessing

In [None]:
import numpy as np
import pandas as pd

# 1) Keep raw frames from EDA
# train_df, test_df already exist; nothing in this cell mutates them
# (drop() and map() both return new objects).

# 2) Define target and ID column names
TARGET = "Loan_Status"
ID_COL = "Loan_ID"

# 3) Split features/target from RAW (no edits)
# errors="ignore" lets the drop succeed even if a listed column is absent.
X_raw = train_df.drop(columns=[TARGET, ID_COL], errors="ignore")
y_raw = train_df[TARGET].map({'Y': 1, 'N': 0})  # encode target only

# Series.map turns every label that is not a key of the mapping (including
# missing values) into NaN. Fail here with a clear message rather than later
# inside the stratified split or the model fit.
if y_raw.isna().any():
    bad_labels = train_df.loc[y_raw.isna(), TARGET].unique().tolist()
    raise ValueError(
        f"Unexpected or missing values in {TARGET!r}: {bad_labels}; "
        "expected only 'Y' or 'N'."
    )

# 4) Identify column types from RAW (no editing)
# Numeric dtypes go to the numeric list; everything else is treated as categorical.
numeric_cols = X_raw.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X_raw.select_dtypes(exclude=[np.number]).columns.tolist()

numeric_cols, categorical_cols


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# 5) Hold out 20% of the RAW rows for validation.
# Stratifying on the target keeps the class ratio the same in both parts;
# the raw frames themselves are not modified.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_raw,
    y_raw,
    stratify=y_raw,
    test_size=0.2,
    random_state=42,
)

# 6) Column-wise preprocessing
# Numeric columns: fill gaps with the median, then standardise.
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array, ignoring categories not seen during fit.
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Route each column list to its pipeline; any column in neither list is dropped.
preprocess = ColumnTransformer(
    [
        ("num", num_pipe, numeric_cols),
        ("cat", cat_pipe, categorical_cols),
    ],
    remainder="drop",
)

# 7) Example: make a modeling-ready matrix
# Fit on the training part only, then apply the fitted transform to validation.
X_train = preprocess.fit_transform(X_train_raw)
X_val = preprocess.transform(X_val_raw)

(X_train.shape, X_val.shape)


baseline models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

def evaluate(name, y_true, y_pred):
    """Print a metric summary for one set of binary predictions.

    Prints accuracy, precision/recall/F1 computed with ``average="binary"``,
    the confusion matrix, and scikit-learn's per-class classification report.

    Args:
        name: Heading printed above the metrics (e.g. the model name).
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        None. All output goes to stdout.
    """
    acc = accuracy_score(y_true, y_pred)
    # zero_division=0 reports 0 for an undefined score instead of warning.
    p, r, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary", zero_division=0
    )
    print(f"\n{name}")
    print(f"Accuracy: {acc:.3f}  Precision: {p:.3f}  Recall: {r:.3f}  F1: {f1:.3f}")
    print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))
    # Use the same zero_division policy as the summary line above, so both
    # lines treat degenerate predictions the same way and neither emits an
    # UndefinedMetricWarning.
    print(classification_report(y_true, y_pred, digits=3, zero_division=0))

# Pipeline does not copy its steps. Passing the same `preprocess` object to
# both pipelines would make them share one transformer, so fitting either
# model would refit the preprocessing used by the other. Give each pipeline
# its own unfitted copy instead.
from sklearn.base import clone

# 8) Logistic Regression (good, interpretable baseline)
# class_weight="balanced" reweights classes inversely to their frequency.
logit = Pipeline(steps=[
    ("pre", clone(preprocess)),
    ("clf", LogisticRegression(max_iter=2000, class_weight="balanced"))
])
logit.fit(X_train_raw, y_train)
y_pred_lr = logit.predict(X_val_raw)
evaluate("Logistic Regression", y_val, y_pred_lr)

# 9) Random Forest (often stronger out-of-the-box)
rf = Pipeline(steps=[
    ("pre", clone(preprocess)),
    ("clf", RandomForestClassifier(
        n_estimators=300, max_depth=None, random_state=42, class_weight="balanced"
    ))
])
rf.fit(X_train_raw, y_train)
y_pred_rf = rf.predict(X_val_raw)
evaluate("Random Forest", y_val, y_pred_rf)


predict

In [None]:
# Build test feature frame from RAW test_df (test_df itself is not modified;
# errors="ignore" tolerates a missing ID column).
X_test_raw = test_df.drop(columns=[ID_COL], errors="ignore")

# Use your best pipeline (e.g., rf) to generate predictions
test_pred = rf.predict(X_test_raw)                 # 0/1 labels

# predict_proba columns follow the order of the fitted classes_. Look up the
# column for label 1 (the encoding of 'Y') instead of assuming it is index 1.
pos_col = list(rf.classes_).index(1)
test_pred_proba = rf.predict_proba(X_test_raw)[:, pos_col]  # approval probability

# If you want a submission-style frame:
out = pd.DataFrame({
    ID_COL: test_df[ID_COL],
    "Loan_Status_Pred": np.where(test_pred == 1, "Y", "N"),
    "Approval_Prob": test_pred_proba
})
out.head()
