Made this more beginner-friendly; I still need to read through it fully.

In [None]:
import numpy as np
import pandas as pd

# --- Constants shared by the cleaning helpers below
# Deterministic target mapping (NO LabelEncoder)
# 1 = "bad outcome", 0 = "good outcome"
TARGET_MAP = {
    "Defaulted": 1, "Charged Off": 1, "Charged-off": 1, "Delinquent": 1,
    "Fully Paid": 0, "Paid off": 0, "Current": 0, "Approved": 0
}
TARGET_COL = "Loan Status"
ID_COLS = ["Loan ID", "Customer ID"]  # not predictive
CATEGORICAL_FILL_COLS = ["Term", "Home Ownership", "Years in current job", "Purpose"]
ZERO_FILL_COLS = ["Bankruptcies", "Tax Liens"]
# 100,000,000 sometimes appears as a bogus "Current Loan Amount".
# NOTE(review): confirm the exact placeholder value against the raw CSV.
LOAN_AMOUNT_SENTINEL = 100_000_000
# Missing delinquency is treated as "none on record" -> a deliberately high value.
NO_DELINQUENCY_MONTHS = 999


def numeric_feature_columns(df):
    """Return the names of the numeric columns of *df*, excluding the target."""
    numeric = df.select_dtypes(include=[np.number]).columns
    return [c for c in numeric if c != TARGET_COL]


def clean_loans(df):
    """Return a cleaned copy of the raw loan frame; the input is not modified.

    Steps, in order:
      1. Drop the ID columns (missing ones are ignored).
      2. Keep only rows whose "Loan Status" is a key of TARGET_MAP and
         replace the label with its 0/1 code.
      3. Turn the sentinel "Current Loan Amount" into NaN.
      4. Fill the special-case columns (999 for "Months since last
         delinquent", 0 for "Bankruptcies" / "Tax Liens").
      5. Fill the listed categorical columns with their mode.
      6. Fill every remaining numeric NaN with the column median.

    Step 4 has to run before step 6: those columns are numeric, so a median
    fill that ran first would leave no NaN for the special values to replace.

    NOTE: modes and medians are computed over all rows passed in. When this is
    called before a train/test split, the test rows contribute to the fill
    values.

    Raises KeyError if *df* has no "Loan Status" column.
    """
    df = df.drop(columns=ID_COLS, errors="ignore")

    # Boolean indexing followed by .copy() gives an independent frame, so the
    # assignments below never touch the caller's data.
    df = df[df[TARGET_COL].isin(list(TARGET_MAP))].copy()
    df[TARGET_COL] = df[TARGET_COL].map(TARGET_MAP).astype(int)

    if "Current Loan Amount" in df.columns:
        amount = df["Current Loan Amount"]
        # mask() replaces matching entries with NaN and upcasts integer
        # columns to float when needed.
        df["Current Loan Amount"] = amount.mask(amount == LOAN_AMOUNT_SENTINEL)

    # Special cases first (see docstring).
    if "Months since last delinquent" in df.columns:
        df["Months since last delinquent"] = (
            df["Months since last delinquent"].fillna(NO_DELINQUENCY_MONTHS)
        )
    for col in ZERO_FILL_COLS:
        if col in df.columns:
            df[col] = df[col].fillna(0)

    # Categorical: fill with the mode. A column with no non-null value has an
    # empty mode; leave it alone instead of failing on the lookup.
    for col in CATEGORICAL_FILL_COLS:
        if col in df.columns:
            modes = df[col].mode()
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    # Numerical: everything still missing gets the column median.
    for col in numeric_feature_columns(df):
        df[col] = df[col].fillna(df[col].median())

    return df


def add_ratio_features(df):
    """Return a copy of *df* with the "DTI" and "CreditUtilization" ratios added.

    Each ratio is added only when both of its source columns are present.
    The 1e-9 term keeps the denominator away from an exact zero.
    """
    df = df.copy()
    if {"Monthly Debt", "Annual Income"}.issubset(df.columns):
        df["DTI"] = df["Monthly Debt"] / (df["Annual Income"] / 12.0 + 1e-9)
    if {"Current Credit Balance", "Maximum Open Credit"}.issubset(df.columns):
        df["CreditUtilization"] = df["Current Credit Balance"] / (df["Maximum Open Credit"] + 1e-9)
    return df


# In a notebook cell or when run as a script, __name__ is "__main__", so the
# pipeline below still executes and leaves its variables at module level.
# Importing this file from other code only defines the helpers above.
if __name__ == "__main__":
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    # --- 0) Load data
    df = pd.read_csv("credit_train.csv")

    # --- 1-4) Drop IDs, map the target, clean the sentinel, fill missing values
    df = clean_loans(df)
    num_cols = numeric_feature_columns(df)

    # --- 5) Minimal, high-value features
    df = add_ratio_features(df)

    # --- 6) One-hot encode categoricals (simple + readable)
    cat_cols = df.select_dtypes(include="object").columns.tolist()
    Xy = pd.get_dummies(df, columns=cat_cols, drop_first=True)  # avoids dummy trap

    # --- 7) Train/test split
    y = Xy["Loan Status"].values
    X = Xy.drop(columns=["Loan Status"])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    # Explicit copies: the splits are modified in place below, and assigning
    # into a frame derived from another frame can raise SettingWithCopyWarning.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # --- 8) Scale numerics for LogReg/KNN (trees don't need it)
    # The scaler is fitted on the training split only and reused on the test split.
    num_cols_after_dummies = X.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler()
    X_train[num_cols_after_dummies] = scaler.fit_transform(X_train[num_cols_after_dummies])
    X_test[num_cols_after_dummies] = scaler.transform(X_test[num_cols_after_dummies])

    # Save clean splits for modeling notebook / script
    X_train.to_csv("X_train_clean.csv", index=False)
    X_test.to_csv("X_test_clean.csv", index=False)
    pd.Series(y_train).to_csv("y_train_clean.csv", index=False, header=["Loan Status"])
    pd.Series(y_test).to_csv("y_test_clean.csv", index=False, header=["Loan Status"])

    print("✅ Preprocessing done. Files written: X_train_clean.csv, X_test_clean.csv, y_train_clean.csv, y_test_clean.csv")

