# 03 – Modeling

In [6]:
import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Output directory for the trained model and the feature schema.
ART = Path("../artifacts/portuguese")
ART.mkdir(parents=True, exist_ok=True)

# Input dataset. The previous "fallback" re-assigned the very same path, so it
# could never help; fail early with the resolved location instead.
DATA_PATH = Path('../data/student-por.csv')
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Dataset not found: {DATA_PATH.resolve()}")

# UCI files use semicolon delimiter -> read once, with the right separator
# (the earlier comma-delimited read was immediately overwritten).
df = pd.read_csv(DATA_PATH, sep=';')

# Normalize column names (strip spaces) and ensure G3 numeric
df.columns = [c.strip() for c in df.columns]
if 'G3' in df.columns:
    df['G3'] = pd.to_numeric(df['G3'], errors='coerce')

# Binary target: 1 when the final grade is at least 10, else 0.
# NOTE: a grade coerced to NaN compares False and is therefore labeled 0.
target = 'passed'
if target not in df.columns:
    if 'final_grade' in df.columns:
        df[target] = (df['final_grade'] >= 10).astype(int)
    elif 'G3' in df.columns:   # <-- UCI dataset final grade
        df[target] = (df['G3'] >= 10).astype(int)
    else:
        raise ValueError("Dataset must include 'passed', 'final_grade', or 'G3' to create it.")

# Start with all features except the target
X = df.drop(columns=[target])

# 🚫 Prevent leakage: drop every column the target can be derived from.
# 'passed' is computed from either 'final_grade' or 'G3', so both must go;
# errors='ignore' makes this a no-op for whichever column is absent.
X = X.drop(columns=["G3", "final_grade"], errors="ignore")

y = df[target]

print("Features used for training:", X.columns.tolist())
print("Target:", target)


# Identify column types.
# Anything that is not numeric (object, pandas string dtype, category, ...) is
# one-hot encoded. Checking only for dtype == 'object' would send string- or
# category-typed columns to StandardScaler, which cannot handle them.
numeric_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
categorical_cols = [c for c in X.columns if c not in numeric_cols]

# Preprocess: scale numeric features, one-hot encode categorical ones.
# handle_unknown='ignore' lets prediction proceed on categories unseen in training.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ]
)

# Candidate classifiers, keyed by the short name used in the results table.
# Insertion order is the order in which they are trained and reported.
models = dict(
    logreg=LogisticRegression(max_iter=200, class_weight='balanced'),
    dtree=DecisionTreeClassifier(random_state=42),
    rf=RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        class_weight='balanced',
    ),
    xgb=XGBClassifier(
        n_estimators=300,
        learning_rate=0.08,
        max_depth=5,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        random_state=42,
        eval_metric='logloss',
        n_jobs=-1,
    ),
)

# Stratified 80/20 hold-out split so both classes keep their proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# One (name, acc, prec, rec, f1) row per model, plus the best pipeline so far.
results = []
best_name, best_score, best_pipeline = None, -1.0, None

for name, clf in models.items():
    # Clone the preprocessor so each pipeline owns its own fitted transformer.
    # Sharing one instance means every later fit() silently refits the
    # transformer inside the pipelines trained before it (including the one
    # that ends up being saved).
    pipe = Pipeline(steps=[('pre', clone(preprocessor)), ('clf', clf)])
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # Binary metrics are reported for the positive class (label 1).
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary', zero_division=0
    )

    results.append((name, acc, prec, rec, f1))
    # Strict '>' keeps the earliest model when accuracies tie.
    # NOTE(review): the winner is picked by accuracy on the same test split
    # that is reported below, so the reported score is optimistic.
    if acc > best_score:
        best_score = acc
        best_name = name
        best_pipeline = pipe

print("Model Results (name, acc, prec, rec, f1):")
for row in results:
    print(row)

print(f"Best model: {best_name} (acc={best_score:.3f})")
print("\nClassification report for best model:")
print(classification_report(y_test, best_pipeline.predict(X_test), zero_division=0))

# Save artifacts into ART (already defined at the top of this cell); mkdir is
# idempotent and only guards against the directory having been removed.
ART.mkdir(exist_ok=True, parents=True)
model_path = ART / 'best_model.joblib'
joblib.dump(best_pipeline, model_path)
# Report the real destination rather than a hard-coded (and wrong) path.
print(f"Saved best model → {model_path}")

# Save schema for the app to render inputs: one entry per feature with its
# type and, for categorical features, the sorted list of observed values.
schema = []
for col in X.columns:
    if col in numeric_cols:
        schema.append({"name": col, "type": "numeric", "categories": None})
        continue
    # Limit very high-cardinality to avoid huge dropdowns: keep the first 30
    # values in sorted order (a no-op when there are 30 or fewer).
    cats = sorted(df[col].dropna().unique().tolist())[:30]
    schema.append({"name": col, "type": "categorical", "categories": cats})

schema_path = ART / 'feature_schema.json'
with open(schema_path, 'w', encoding='utf-8') as f:
    json.dump(schema, f, ensure_ascii=False, indent=2)
print(f"Saved schema → {schema_path}")


Features used for training: ['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2']
Target: passed
Model Results (name, acc, prec, rec, f1):
('logreg', 0.8769230769230769, 0.9519230769230769, 0.9, 0.9252336448598131)
('dtree', 0.8538461538461538, 0.9174311926605505, 0.9090909090909091, 0.91324200913242)
('rf', 0.8846153846153846, 0.9130434782608695, 0.9545454545454546, 0.9333333333333333)
('xgb', 0.9076923076923077, 0.9375, 0.9545454545454546, 0.9459459459459459)
Best model: xgb (acc=0.908)

Classification report for best model:
              precision    recall  f1-score   support

           0       0.72      0.65      0.68        20
           1       0.94      0.95      0.95       110

    accuracy                       

In [5]:
# Sanity check: dataset dimensions, the first 15 column names in alphabetical
# order, and the grade columns next to the derived target.
print(df.shape)
print(sorted(df.columns)[:15], "...")
df.loc[:, ['G1', 'G2', 'G3', 'passed']].head()


(649, 34)
['Dalc', 'Fedu', 'Fjob', 'G1', 'G2', 'G3', 'Medu', 'Mjob', 'Pstatus', 'Walc', 'absences', 'activities', 'address', 'age', 'failures'] ...


Unnamed: 0,G1,G2,G3,passed
0,0,11,11,1
1,9,11,11,1
2,12,13,12,1
3,14,14,14,1
4,11,13,13,1
