# 03 – Modeling

In [8]:
# ===========================
# 03_Modeling.ipynb (Unified)
# ===========================
import json
from pathlib import Path

import joblib
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# -----------------
# Configuration
# -----------------
SUBJECT = "mat"          # <-- set to "mat" or "por"
MID_COURSE = True        # True = drop G3 (no final-grade leakage). False = include G3.

# One row per subject: (CSV file to load, folder that receives the artifacts).
# Indexing with an unsupported SUBJECT raises KeyError.
_PATHS_BY_SUBJECT = {
    "mat": ("../data/student-mat.csv", "../artifacts/math"),
    "por": ("../data/student-por.csv", "../artifacts/portuguese"),
}
DATA_FILE, ART_DIR = _PATHS_BY_SUBJECT[SUBJECT]

# -----------------
# Load data
# -----------------
# First attempt assumes a semicolon-separated file.
df = pd.read_csv(DATA_FILE, sep=";")
if df.shape[1] == 1:
    # Everything landed in a single column, so ";" was not the real delimiter.
    # The previous fallback re-read the file with the very same sep=";" and
    # therefore could never recover; let the Python engine sniff the
    # delimiter from the file instead.
    df = pd.read_csv(DATA_FILE, sep=None, engine="python")

# Strip stray whitespace from header names before looking columns up by name.
df.columns = [c.strip() for c in df.columns]
if "G3" not in df.columns:
    raise ValueError(f"'G3' not in columns. Found: {list(df.columns)}")

# Entries that cannot be parsed as numbers become NaN instead of raising.
df["G3"] = pd.to_numeric(df["G3"], errors="coerce")

# -----------------
# Create target ('passed')
# -----------------
if "passed" not in df.columns:
    # A missing G3 means the outcome is unknown. The previous fillna(-1)
    # silently labelled such students as failed, injecting wrong labels into
    # the training data; drop those rows instead and say how many were lost.
    n_missing = int(df["G3"].isna().sum())
    if n_missing:
        print(f"Dropping {n_missing} row(s) with missing G3 before creating 'passed'.")
        df = df.dropna(subset=["G3"]).copy()
    # Binary label: 1 when the final grade G3 is at least 10, else 0.
    df["passed"] = (df["G3"] >= 10).astype(int)

target = "passed"
# X keeps every column except the label; G3 handling is decided further down.
X = df.drop(columns=[target]).copy()
y = df[target].copy()

# -----------------
# Anti-leakage for MID_COURSE model
# -----------------
# The mid-course variant must not see the final grade G3.
if MID_COURSE:
    # errors="ignore" keeps this a no-op when G3 is not among the features.
    X = X.drop(columns=["G3"], errors="ignore")

# sanity check: stratified split needs both classes
vc = y.value_counts()
if vc.size < 2:
    raise ValueError(f"Only one class in target: {vc.to_dict()}")

# -----------------
# Preprocessing
# -----------------
# Classify columns by "is it numeric?" rather than by dtype == "object".
# The object test misses text columns stored with a `category` or dedicated
# string dtype; those would be routed to StandardScaler, which cannot handle
# them. Everything that is not numeric is one-hot encoded.
numeric_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
_numeric_set = set(numeric_cols)
categorical_cols = [c for c in X.columns if c not in _numeric_set]

preprocessor = ColumnTransformer(
    transformers=[
        # Standardise numeric features.
        ("num", StandardScaler(), numeric_cols),
        # handle_unknown="ignore": categories unseen during fit do not raise
        # at transform time.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ],
    remainder="drop",
)

# -----------------
# Models to try
# -----------------
# Keyword sets shared by several estimators, spelled out once.
_seeded = {"random_state": 42}
_ensemble = {"n_estimators": 300, "n_jobs": -1, **_seeded}

# Insertion order matters: it is the order in which candidates are trained
# and reported further down.
models = {}
models["logreg"] = LogisticRegression(class_weight="balanced", max_iter=500)
models["dtree"] = DecisionTreeClassifier(**_seeded)
models["rf"] = RandomForestClassifier(class_weight="balanced", **_ensemble)
models["xgb"] = XGBClassifier(
    learning_rate=0.08,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    eval_metric="logloss",
    **_ensemble,
)

# -----------------
# Train / evaluate
# -----------------
# 80/20 split, stratified on the label so both classes appear in each part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

results = []
best_name, best_score, best_pipeline = None, -1.0, None

for name, clf in models.items():
    # clone(): give every pipeline its own unfitted copy of the preprocessor.
    # Sharing the single instance meant each fit() also refit the transformer
    # inside every previously built pipeline, including the one kept as best.
    pipe = Pipeline(steps=[("pre", clone(preprocessor)), ("clf", clf)])
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # average="binary": precision/recall/F1 are reported for class 1 only.
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="binary", zero_division=0
    )

    results.append((name, acc, prec, rec, f1))
    # Strict ">" keeps the earliest candidate when accuracies tie.
    # NOTE(review): the winner is chosen on the same test split that is then
    # reported, so the printed score is optimistic — consider selecting with
    # cross-validation on the training split.
    if acc > best_score:
        best_score = acc
        best_name = name
        best_pipeline = pipe

print("Model Results (name, acc, prec, rec, f1):")
for row in results:
    print(row)

print(f"\nBest model: {best_name} (acc={best_score:.3f})")
print("\nClassification report for best model:")
print(classification_report(y_test, best_pipeline.predict(X_test), zero_division=0))

# -----------------
# Save artifacts
# -----------------
ART = Path(ART_DIR)
ART.mkdir(parents=True, exist_ok=True)

# Persist the whole fitted pipeline (preprocessing + classifier) as one file.
model_path = ART / "best_model.joblib"
joblib.dump(best_pipeline, model_path)
print(f"Saved best model → {model_path}")

# Schema used by the Streamlit app (reflects MID_COURSE setting)
_numeric_names = set(numeric_cols)
schema = []
for col in X.columns:
    if col in _numeric_names:
        entry = {"name": col, "type": "numeric", "categories": None}
    else:
        # Distinct non-null values as strings, sorted; at most the first 30
        # are written to the schema.
        labels = sorted(df[col].dropna().astype(str).unique().tolist())
        entry = {"name": col, "type": "categorical", "categories": labels[:30]}
    schema.append(entry)

schema_path = ART / "feature_schema.json"
schema_path.write_text(
    json.dumps(schema, ensure_ascii=False, indent=2), encoding="utf-8"
)
print(f"Saved schema → {schema_path}")

print("\nDone! Subject =", SUBJECT, "| Mid-course =", MID_COURSE)
print("Features used:", list(X.columns))


Model Results (name, acc, prec, rec, f1):
('logreg', 0.8607594936708861, 0.9565217391304348, 0.8301886792452831, 0.8888888888888888)
('dtree', 0.8607594936708861, 0.9772727272727273, 0.8113207547169812, 0.8865979381443299)
('rf', 0.8734177215189873, 0.9574468085106383, 0.8490566037735849, 0.9)
('xgb', 0.8987341772151899, 0.9787234042553191, 0.8679245283018868, 0.92)

Best model: xgb (acc=0.899)

Classification report for best model:
              precision    recall  f1-score   support

           0       0.78      0.96      0.86        26
           1       0.98      0.87      0.92        53

    accuracy                           0.90        79
   macro avg       0.88      0.91      0.89        79
weighted avg       0.91      0.90      0.90        79

Saved best model → ..\artifacts\math\best_model.joblib
Saved schema → ..\artifacts\math\feature_schema.json

Done! Subject = mat | Mid-course = True
Features used: ['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu'

Columns in your DataFrame:
 ['school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reason;guardian;traveltime;studytime;failures;schoolsup;famsup;paid;activities;nursery;higher;internet;romantic;famrel;freetime;goout;Dalc;Walc;health;absences;G1;G2;G3']
Detected final-grade candidates: ['school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reason;guardian;traveltime;studytime;failures;schoolsup;famsup;paid;activities;nursery;higher;internet;romantic;famrel;freetime;goout;Dalc;Walc;health;absences;G1;G2;G3']
Target 'passed' created. Value counts:
passed
0    395
Name: count, dtype: int64
