In [1]:
# ============================================================
# Stage 1: Accepted vs Rejected (Data Prep + Persisted Sample)
# ============================================================

from pathlib import Path
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

# ----------------------------
# Paths
# ----------------------------
DATA_DIR = Path("../data")                # raw CSVs live here (gitignored)
SAMPLE_DIR = Path("../data_sample")       # lightweight samples live here
SAMPLE_DIR.mkdir(parents=True, exist_ok=True)

ACCEPTED_CSV = DATA_DIR / "accepted_2007_to_2018Q4.csv"
REJECTED_CSV = DATA_DIR / "rejected_2007_to_2018Q4.csv"

SAMPLE_PARQUET = SAMPLE_DIR / "stage1_accept_reject_sample.parquet"

# ----------------------------
# Config
# ----------------------------
SAMPLE_N = 500_000     # adjust: 200_000 (fast), 500_000 (good), 1_000_000 (heavier)
RANDOM_STATE = 42

# ----------------------------
# Helpers
# ----------------------------
def strip_pct(x):
    """Convert strings like '13.56%' to float 13.56; return NaN if not convertible."""
    if pd.isna(x):
        return np.nan
    if isinstance(x, str):
        s = x.strip()
        if s.endswith("%"):
            s = s[:-1].strip()
        try:
            return float(s)
        except Exception:
            return np.nan
    return x

def safe_drop(df, cols):
    cols = [c for c in cols if c in df.columns]
    if cols:
        return df.drop(columns=cols)
    return df

# ----------------------------
# Load or build sample
# ----------------------------
if SAMPLE_PARQUET.exists():
    print(f"✅ Loading existing sample: {SAMPLE_PARQUET}")
    df1 = pd.read_parquet(SAMPLE_PARQUET)
else:
    print("⏳ No sample found. Building sample from raw CSVs (one-time cost).")

    # 1) Load raw files
    acc = pd.read_csv(ACCEPTED_CSV, low_memory=False)
    rej = pd.read_csv(REJECTED_CSV, low_memory=False)

    # 2) Add labels
    acc["is_accepted"] = 1
    rej["is_accepted"] = 0

    # 3) Intersection columns (IMPORTANT: do NOT duplicate label)
    common = sorted(set(acc.columns).intersection(set(rej.columns)))
    keep_cols = [c for c in common if c != "is_accepted"] + ["is_accepted"]

    acc_s1 = acc[keep_cols].copy()
    rej_s1 = rej[keep_cols].copy()

    # Release memory
    del acc, rej

    # 4) Concatenate
    df_full = pd.concat([acc_s1, rej_s1], ignore_index=True)

    # 5) Ensure no duplicate columns remain
    df_full = df_full.loc[:, ~df_full.columns.duplicated()].copy()
    assert df_full["is_accepted"].ndim == 1, "❌ is_accepted is not 1-D (duplicate column names still exist)."

    # 6) Optional: drop "UI-unfriendly" / PII-ish / free-text columns if present
    DROP_COLS = [
        "zip_code", "addr_state", "state",
        "emp_title", "title", "desc",
        "url", "id", "member_id"
    ]
    df_full = safe_drop(df_full, DROP_COLS)

    # 7) Basic type cleaning (only if columns exist)
    for col in ["dti", "revol_util", "int_rate"]:
        if col in df_full.columns:
            df_full[col] = df_full[col].apply(strip_pct)

    # term "36 months" -> 36
    if "term" in df_full.columns:
        df_full["term"] = (
            df_full["term"]
            .astype(str)
            .str.extract(r"(\d+)")
            .astype(float)
        )

    # Force label dtype
    df_full["is_accepted"] = pd.to_numeric(df_full["is_accepted"], errors="coerce").astype("int8")

    # 8) Stratified sampling WITHOUT sklearn (fast and memory-friendly)
    df_pos = df_full[df_full["is_accepted"] == 1]
    df_neg = df_full[df_full["is_accepted"] == 0]

    pos_ratio = len(df_pos) / len(df_full)
    n_pos = int(SAMPLE_N * pos_ratio)
    n_neg = SAMPLE_N - n_pos

    print(f"Sampling {SAMPLE_N:,} rows total -> {n_pos:,} accepted, {n_neg:,} rejected")

    df_pos_s = df_pos.sample(n=min(n_pos, len(df_pos)), random_state=RANDOM_STATE)
    df_neg_s = df_neg.sample(n=min(n_neg, len(df_neg)), random_state=RANDOM_STATE)

    df1 = pd.concat([df_pos_s, df_neg_s], ignore_index=True).sample(frac=1.0, random_state=RANDOM_STATE).reset_index(drop=True)

    # Release memory
    del df_full, df_pos, df_neg, df_pos_s, df_neg_s

    # 9) Save sample as Parquet for fast reloads
    # Requires pyarrow or fastparquet:
    # pip install pyarrow
    df1.to_parquet(SAMPLE_PARQUET, index=False)
    print(f"✅ Saved sample to: {SAMPLE_PARQUET}")

# ----------------------------
# Sanity checks
# ----------------------------
print("\n--- Sample Summary ---")
print("Shape:", df1.shape)

dupes = df1.columns[df1.columns.duplicated()].tolist()
print("Duplicate columns:", dupes)

print("\nLabel distribution:")
print(df1["is_accepted"].value_counts())
print(df1["is_accepted"].value_counts(normalize=True))

print("\nDtypes head:")
display(df1.dtypes.head(30))

print("\nMissingness (top 20):")
missing = (df1.isna().mean().sort_values(ascending=False).head(20))
display(missing)


⏳ No sample found. Building sample from raw CSVs (one-time cost).
Sampling 500,000 rows total -> 37,792 accepted, 462,208 rejected
✅ Saved sample to: ..\data_sample\stage1_accept_reject_sample.parquet

--- Sample Summary ---
Shape: (500000, 1)
Duplicate columns: []

Label distribution:
is_accepted
0    462208
1     37792
Name: count, dtype: int64
is_accepted
0    0.924416
1    0.075584
Name: proportion, dtype: float64

Dtypes head:


is_accepted    int8
dtype: object


Missingness (top 20):


is_accepted    0.0
dtype: float64

In [None]:
# ============================================================
# Stage 1 Consolidated Training Cell (with Feature Selection)
# - Uses df1 sample already loaded (500k)
# - Train/Val/Test split
# - Preprocess: impute + scale numeric, impute + one-hot categorical
# - Feature selection: L1 Logistic Regression via SelectFromModel
# - Model candidates: Logistic Regression, Calibrated Linear SVM
# - Select best by Validation ROC-AUC
# - Evaluate on Test
# - Save:
#     ../artifacts/stage1_pipeline.pkl
#     ../artifacts/stage1_metrics.json
#     ../artifacts/metadata.json
# ============================================================

from pathlib import Path
import json
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    classification_report, confusion_matrix
)

# ----------------------------
# Config
# ----------------------------
RANDOM_STATE = 42
ART_DIR = Path("../artifacts")
ART_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------
# Safety: ensure no duplicate cols and label is 1-D numeric
# ----------------------------
df1 = df1.loc[:, ~df1.columns.duplicated()].copy()
assert "is_accepted" in df1.columns, "df1 must contain is_accepted"

# If somehow duplicated label slipped in earlier, keep first
if isinstance(df1["is_accepted"], pd.DataFrame):
    df1["is_accepted"] = df1["is_accepted"].iloc[:, 0]

df1["is_accepted"] = pd.to_numeric(df1["is_accepted"], errors="coerce").astype("int8")

# ----------------------------
# X/y
# ----------------------------
y = df1["is_accepted"].copy()
X = df1.drop(columns=["is_accepted"]).copy()

# Drop columns that are all-missing (helps preprocessing stability)
all_nan_cols = [c for c in X.columns if X[c].isna().all()]
if all_nan_cols:
    print(f"Dropping {len(all_nan_cols)} all-NaN columns (sample):", all_nan_cols[:10])
    X = X.drop(columns=all_nan_cols)

# Identify numeric/categorical columns
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

print(f"Numeric cols: {len(num_cols)} | Categorical cols: {len(cat_cols)} | Total: {X.shape[1]}")

# ----------------------------
# Split (stratified)
# ----------------------------
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=RANDOM_STATE, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=RANDOM_STATE, stratify=y_temp
)

print("Splits:", X_train.shape, X_val.shape, X_test.shape)
print("Train pos rate:", float(y_train.mean()))

# ----------------------------
# Preprocess (sparse-friendly)
# ----------------------------
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False)),  # must be False for sparse
])

categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ],
    remainder="drop",
    sparse_threshold=0.3
)

# ----------------------------
# Feature Selection (L1 Logistic Regression)
# - Runs AFTER preprocessing (so it can select among one-hot columns)
# - Uses only TRAIN data inside fit
# ----------------------------
selector = SelectFromModel(
    estimator=LogisticRegression(
        penalty="l1",
        solver="saga",
        class_weight="balanced",
        max_iter=500,
        C=0.5,         # smaller -> more aggressive selection
        n_jobs=-1
    ),
    threshold="median"  # keep features above median abs coef
)

# ----------------------------
# Candidate models
# ----------------------------
candidates = {
    "logreg": LogisticRegression(
        max_iter=400,
        solver="lbfgs",
        class_weight="balanced",
        n_jobs=-1
    ),
    "calibrated_svm": CalibratedClassifierCV(
        estimator=LinearSVC(class_weight="balanced", random_state=RANDOM_STATE),
        method="sigmoid",
        cv=3
    )
}

# ----------------------------
# Train + Validate + pick best
# ----------------------------
results = {}
best_name = None
best_auc = -1.0
best_pipe = None

for name, clf in candidates.items():
    print(f"\n=== Training: {name} (with L1 feature selection) ===")

    pipe = Pipeline(steps=[
        ("prep", preprocess),
        ("select", selector),
        ("model", clf)
    ])

    pipe.fit(X_train, y_train)

    val_proba = pipe.predict_proba(X_val)[:, 1]
    val_auc = roc_auc_score(y_val, val_proba)
    val_ap = average_precision_score(y_val, val_proba)

    # How many transformed features were kept?
    support = pipe.named_steps["select"].get_support()
    kept = int(support.sum())
    total = int(support.shape[0])

    results[name] = {
        "val_roc_auc": float(val_auc),
        "val_pr_auc": float(val_ap),
        "selected_features_kept": kept,
        "selected_features_total": total,
        "selected_features_ratio": float(kept / total) if total else None
    }

    print(f"Val ROC-AUC: {val_auc:.6f} | Val PR-AUC: {val_ap:.6f}")
    print(f"Selected {kept}/{total} transformed features ({kept/total:.1%})")

    if val_auc > best_auc:
        best_auc = val_auc
        best_name = name
        best_pipe = pipe

print(f"\n✅ Best model: {best_name} | Val ROC-AUC: {best_auc:.6f}")

# ----------------------------
# Final test evaluation (best model)
# ----------------------------
test_proba = best_pipe.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, test_proba)
test_ap = average_precision_score(y_test, test_proba)

threshold = 0.5
test_pred = (test_proba >= threshold).astype(int)

cm = confusion_matrix(y_test, test_pred)
report = classification_report(y_test, test_pred, digits=4)

print("\n=== Test Metrics (Best Model) ===")
print(f"Test ROC-AUC: {test_auc:.6f}")
print(f"Test PR-AUC : {test_ap:.6f}")
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", report)

results["best"] = {
    "name": best_name,
    "threshold": threshold,
    "test_roc_auc": float(test_auc),
    "test_pr_auc": float(test_ap),
    "confusion_matrix": cm.tolist(),
    "classification_report": report
}

# ----------------------------
# Save artifacts
# ----------------------------
MODEL_PATH = ART_DIR / "stage1_pipeline.pkl"
METRICS_PATH = ART_DIR / "stage1_metrics.json"
META_PATH = ART_DIR / "metadata.json"

joblib.dump(best_pipe, MODEL_PATH)
with open(METRICS_PATH, "w") as f:
    json.dump(results, f, indent=2)

print(f"\n✅ Saved Stage 1 pipeline to: {MODEL_PATH}")
print(f"✅ Saved Stage 1 metrics to : {METRICS_PATH}")

# ----------------------------
# UI metadata from TRAIN ONLY (no leakage)
# numeric: min/max/p1/p99/recommended_min/recommended_max
# categorical: top categories
# ----------------------------
def numeric_meta(series: pd.Series):
    s = pd.to_numeric(series, errors="coerce").dropna()
    if len(s) == 0:
        return None
    return {
        "min": float(s.min()),
        "max": float(s.max()),
        "p1": float(np.percentile(s, 1)),
        "p99": float(np.percentile(s, 99)),
        "recommended_min": float(np.percentile(s, 10)),
        "recommended_max": float(np.percentile(s, 90)),
    }

meta = {"numeric": {}, "categorical": {}}

for c in X_train.columns:
    if c in num_cols:
        m = numeric_meta(X_train[c])
        if m:
            meta["numeric"][c] = m
    else:
        vals = (
            X_train[c]
            .dropna()
            .astype(str)
            .value_counts()
            .head(30)
            .index
            .tolist()
        )
        meta["categorical"][c] = vals

with open(META_PATH, "w") as f:
    json.dump(meta, f, indent=2)

print(f"✅ Saved UI metadata to     : {META_PATH}")


Numeric cols: 0 | Categorical cols: 0 | Total: 0
Splits: (350000, 0) (75000, 0) (75000, 0)
Train pos rate: 0.07558285714285715

=== Training: logreg (with L1 feature selection) ===




ValueError: Found array with 0 feature(s) (shape=(350000, 0)) while a minimum of 1 is required by LogisticRegression.