# 03 – Modeling

In [7]:
import json
import joblib
import pandas as pd
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# --- Paths (Portuguese subject) ---
DATA_PATH = Path("../data/student-por.csv")
ART = Path("../artifacts/portuguese")
ART.mkdir(parents=True, exist_ok=True)

# --- Load UCI data (semicolon-delimited). If misread, re-read with sep=';' ---
df = pd.read_csv(DATA_PATH, sep=";")
if df.shape[1] == 1 and ";" in df.columns[0]:
    # Defensive fallback if it got read as one giant column
    df = pd.read_csv(DATA_PATH, sep=";")

# Normalize column names and ensure numeric G3
df.columns = [c.strip() for c in df.columns]
if "G3" not in df.columns:
    raise ValueError(f"'G3' not found. Columns are: {list(df.columns)}. "
                     "Make sure you're using the original UCI CSV for Portuguese.")

df["G3"] = pd.to_numeric(df["G3"], errors="coerce")

# --- Create binary target (UCI rule: pass if G3 >= 10) ---
if "passed" not in df.columns:
    df["passed"] = (df["G3"].fillna(-1) >= 10).astype(int)

target = "passed"

# --- Features/labels; DROP G3 to avoid leakage in a mid-course model ---
X = df.drop(columns=[target]).copy()
if "G3" in X.columns:
    X = X.drop(columns=["G3"])   # critical leakage fix

y = df[target].copy()

# --- Sanity check: stratified split needs both classes ---
vc = y.value_counts()
if len(vc) < 2:
    raise ValueError(
        f"Only one class present in target: {vc.to_dict()}. "
        "Check delimiter and that G3 has valid values."
    )

# --- Column typing ---
categorical_cols = [c for c in X.columns if X[c].dtype == "object"]
numeric_cols = [c for c in X.columns if c not in categorical_cols]

# --- Preprocess: scale numeric + one-hot categorical ---
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ],
    remainder="drop",
)

# --- Models ---
models = {
    "logreg": LogisticRegression(max_iter=500, class_weight="balanced"),
    "dtree": DecisionTreeClassifier(random_state=42),
    "rf": RandomForestClassifier(
        n_estimators=300, random_state=42, class_weight="balanced", n_jobs=-1
    ),
    "xgb": XGBClassifier(
        n_estimators=300, learning_rate=0.08, max_depth=5,
        subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0,
        random_state=42, eval_metric="logloss", n_jobs=-1
    ),
}

# --- Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Train & select best (by accuracy; switch to F1 if you prefer) ---
results = []
best_name, best_score, best_pipeline = None, -1.0, None

for name, clf in models.items():
    pipe = Pipeline(steps=[("pre", preprocessor), ("clf", clf)])
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="binary", zero_division=0
    )

    results.append((name, acc, prec, rec, f1))
    if acc > best_score:
        best_score = acc
        best_name = name
        best_pipeline = pipe

print("Model Results (name, acc, prec, rec, f1):")
for row in results:
    print(row)

print(f"\nBest model: {best_name} (acc={best_score:.3f})")
print("\nClassification report for best model:")
print(classification_report(y_test, best_pipeline.predict(X_test), zero_division=0))

# --- Save artifacts (Portuguese folder) ---
model_path = ART / "best_model.joblib"
joblib.dump(best_pipeline, model_path)
print(f"Saved best model → {model_path}")

# --- Save feature schema for the app (based on X columns, so G3 won't appear) ---
schema = []
for col in X.columns:
    typ = "numeric" if col in numeric_cols else "categorical"
    cats = None
    if typ == "categorical":
        cats = sorted(df[col].dropna().astype(str).unique().tolist())
        if len(cats) > 30:
            cats = cats[:30]  # keep UI snappy
    schema.append({"name": col, "type": typ, "categories": cats})

schema_path = ART / "feature_schema.json"
with open(schema_path, "w", encoding="utf-8") as f:
    json.dump(schema, f, ensure_ascii=False, indent=2)
print(f"Saved schema → {schema_path}")


Model Results (name, acc, prec, rec, f1):
('logreg', 0.8769230769230769, 0.9519230769230769, 0.9, 0.9252336448598131)
('dtree', 0.8538461538461538, 0.9174311926605505, 0.9090909090909091, 0.91324200913242)
('rf', 0.8846153846153846, 0.9130434782608695, 0.9545454545454546, 0.9333333333333333)
('xgb', 0.9076923076923077, 0.9375, 0.9545454545454546, 0.9459459459459459)

Best model: xgb (acc=0.908)

Classification report for best model:
              precision    recall  f1-score   support

           0       0.72      0.65      0.68        20
           1       0.94      0.95      0.95       110

    accuracy                           0.91       130
   macro avg       0.83      0.80      0.82       130
weighted avg       0.90      0.91      0.91       130

Saved best model → ..\artifacts\portuguese\best_model.joblib
Saved schema → ..\artifacts\portuguese\feature_schema.json


In [5]:
print(df.shape)
print(sorted(df.columns.tolist())[:15], "...")  # peek at columns
df[['G1','G2','G3','passed']].head()


(649, 34)
['Dalc', 'Fedu', 'Fjob', 'G1', 'G2', 'G3', 'Medu', 'Mjob', 'Pstatus', 'Walc', 'absences', 'activities', 'address', 'age', 'failures'] ...


Unnamed: 0,G1,G2,G3,passed
0,0,11,11,1
1,9,11,11,1
2,12,13,12,1
3,14,14,14,1
4,11,13,13,1
