## Domain-Specific Dataset Exploration


In [5]:
# adult_classical_vs_nn.py
# Classical ML (LogReg + RandomForest) vs Feed-forward Neural Net on UCI Adult (Census Income)

import os, io, time, urllib.request, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers

# -----------------------------
# Dataset (UCI Adult / Census Income)
# Source: UCI ML Repo (adult.data, adult.test)
# https://archive.ics.uci.edu/dataset/2/adult
# -----------------------------
COLS = [
    "age","workclass","fnlwgt","education","education-num","marital-status",
    "occupation","relationship","race","sex","capital-gain","capital-loss",
    "hours-per-week","native-country","income"
]
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
TEST_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"

def fetch_csv(url, names, skiprows=0):
    raw = urllib.request.urlopen(url).read()
    return pd.read_csv(io.BytesIO(raw), header=None, names=names, skiprows=skiprows, na_values=["?"," ?"])

df_train = fetch_csv(DATA_URL, COLS, skiprows=0)
df_test  = fetch_csv(TEST_URL, COLS, skiprows=1)  # adult.test has a header comment line

# Clean labels: remove trailing periods in test labels and strip spaces
df_train["income"] = df_train["income"].astype(str).str.strip()
df_test["income"]  = df_test["income"].astype(str).str.replace(".", "", regex=False).str.strip()

# Concatenate & shuffle once (we’ll re-split with stratification)
df = pd.concat([df_train, df_test], ignore_index=True)
# Replace NaN (from '?') with 'Unknown' for categorical columns
for c in df.columns:
    if df[c].dtype == object:
        df[c] = df[c].fillna("Unknown").str.strip()

# Target to binary
df["income"] = (df["income"] == ">50K").astype(int)

# Features / target
X = df.drop(columns=["income"])
y = df["income"].values

# Identify column types
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

# Time-ordered split isn’t relevant here; we do a stratified split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# -----------------------------
# Preprocess: OneHot(cats) + Scale(nums)
# -----------------------------

ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
pre = ColumnTransformer(
    transformers=[
        ("cat", ohe, cat_cols),
        ("num", StandardScaler(), num_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

# Fit on train only
pre.fit(X_train)
Xtr = pre.transform(X_train)
Xva = pre.transform(X_val)
Xte = pre.transform(X_test)

def metrics(y_true, y_prob, thr=0.5):
    y_pred = (y_prob >= thr).astype(int)
    out = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "auroc": roc_auc_score(y_true, y_prob),
    }
    return out

def show(name, train_m, test_m, tsec):
    print(f"\n=== {name} ===")
    print(f"Train time: {tsec:.2f}s")
    print("Train:", {k: round(v,4) for k,v in train_m.items()})
    print("Test :", {k: round(v,4) for k,v in test_m.items()})

results = {}

# -----------------------------
# Classical 1: Logistic Regression (balanced)
# -----------------------------
logreg = LogisticRegression(max_iter=2000, solver="lbfgs", class_weight="balanced")
t0 = time.perf_counter()
logreg.fit(Xtr, y_train)
t1 = time.perf_counter()
train_prob = logreg.predict_proba(Xtr)[:,1]
test_prob  = logreg.predict_proba(Xte)[:,1]
res_tr = metrics(y_train, train_prob)
res_te = metrics(y_test,  test_prob)
show("Logistic Regression", res_tr, res_te, t1 - t0)
results["logreg"] = {"train": res_tr, "test": res_te, "time_s": t1 - t0}

# -----------------------------
# Classical 2: Random Forest
# -----------------------------
rf = RandomForestClassifier(
    n_estimators=400, max_depth=None, random_state=42, n_jobs=-1, class_weight="balanced_subsample"
)
t0 = time.perf_counter()
rf.fit(Xtr, y_train)
t1 = time.perf_counter()
train_prob = rf.predict_proba(Xtr)[:,1]
test_prob  = rf.predict_proba(Xte)[:,1]
res_tr = metrics(y_train, train_prob)
res_te = metrics(y_test,  test_prob)
show("Random Forest", res_tr, res_te, t1 - t0)
results["random_forest"] = {"train": res_tr, "test": res_te, "time_s": t1 - t0}

# -----------------------------
# Neural Network (Feed-forward MLP)
# -----------------------------
def build_mlp(input_dim, width=(256,128), dropout=0.3, l2=1e-4):
    reg = regularizers.l2(l2) if l2 else None
    inp = keras.Input(shape=(input_dim,))
    x = inp
    for w in width:
        x = layers.Dense(w, kernel_regularizer=reg)(x)
        x = layers.ReLU()(x)
        x = layers.Dropout(dropout)(x)
    out = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inp, out)
    model.compile(optimizer=keras.optimizers.Adam(1e-3),
                  loss="binary_crossentropy",
                  metrics=[keras.metrics.AUC(name="auc")])
    return model

mlp = build_mlp(Xtr.shape[1], width=(256,128), dropout=0.3, l2=1e-4)
es = keras.callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=6, restore_best_weights=True)

t0 = time.perf_counter()
hist = mlp.fit(
    Xtr, y_train.astype("float32"),
    validation_data=(Xva, y_val.astype("float32")),
    epochs=50, batch_size=512, verbose=0, callbacks=[es]
)
t1 = time.perf_counter()

train_prob = mlp.predict(Xtr, verbose=0).ravel()
test_prob  = mlp.predict(Xte, verbose=0).ravel()
res_tr = metrics(y_train, train_prob)
res_te = metrics(y_test,  test_prob)
best_epoch = int(np.argmax(hist.history["val_auc"]) + 1)

show("Neural Net (MLP)", res_tr, res_te, t1 - t0)
print(f"Best val AUC epoch: {best_epoch}")
results["mlp"] = {"train": res_tr, "test": res_te, "time_s": t1 - t0, "best_epoch": best_epoch}

# -----------------------------
# Save & plot quick comparison
# -----------------------------
os.makedirs("outputs", exist_ok=True)
pd.Series(results).to_json("outputs/results.json")
print("\nSaved: outputs/results.json")

# Bar plot: Test AUROC & F1
labels = ["LogReg","RandomForest","MLP"]
aucs = [results["logreg"]["test"]["auroc"], results["random_forest"]["test"]["auroc"], results["mlp"]["test"]["auroc"]]
f1s  = [results["logreg"]["test"]["f1"],   results["random_forest"]["test"]["f1"],   results["mlp"]["test"]["f1"]]

plt.figure()
x = np.arange(len(labels))
w = 0.35
plt.bar(x - w/2, aucs, width=w, label="AUROC")
plt.bar(x + w/2, f1s,  width=w, label="F1")
plt.xticks(x, labels)
plt.title("Adult Income — Test AUROC & F1")
plt.ylabel("Score")
plt.ylim(0,1)
plt.legend()
plt.tight_layout()
plt.savefig("outputs/test_scores.png", dpi=160)
print("Saved: outputs/test_scores.png")


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'