# 04 — Supervised Learning

Train baseline classifiers:
- Logistic Regression
- Decision Tree
- Random Forest
- SVM

Evaluate: Accuracy, Precision, Recall, F1, ROC/AUC.
Save metrics to `results/evaluation_metrics.txt`.

In [None]:
import numpy as np, pandas as pd, joblib
from pathlib import Path
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# Fixed seed shared by every stochastic estimator so runs are reproducible.
RANDOM_STATE = 42

train = pd.read_csv("../data/processed/train.csv")
test = pd.read_csv("../data/processed/test.csv")

# The label column is looked up by name: the first candidate present in the
# training split wins.
target = next((t for t in ["target","num","condition","disease"] if t in train.columns), None)
if target is None:
    # Without this guard, drop(columns=[None]) fails with an obscure KeyError.
    raise ValueError(
        "No target column found in train.csv; expected one of "
        "'target', 'num', 'condition', 'disease'. "
        f"Available columns: {list(train.columns)}"
    )
if target not in test.columns:
    raise ValueError(f"Target column {target!r} is missing from test.csv")

X_train, y_train = train.drop(columns=[target]), train[target]
X_test, y_test = test.drop(columns=[target]), test[target]

# joblib.load unpickles arbitrary objects: only load artifacts produced by
# this project.
preprocessor = joblib.load("../models/preprocessor.pkl")
# NOTE(review): the loaded preprocessor is re-fitted on the training split
# here rather than reused as saved — confirm this is intended.
Xtr = preprocessor.fit_transform(X_train)
Xte = preprocessor.transform(X_test)

# Densify sparse outputs; each matrix is checked on its own so a dense/sparse
# mismatch between the two splits cannot raise AttributeError.
if hasattr(Xtr, "toarray"):
    Xtr = Xtr.toarray()
if hasattr(Xte, "toarray"):
    Xte = Xte.toarray()

# Baseline classifiers keyed by display name. Every estimator receives the
# same seed through a shared keyword dict.
_seed_kwargs = {"random_state": RANDOM_STATE}
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000, **_seed_kwargs),
    "Decision Tree": DecisionTreeClassifier(**_seed_kwargs),
    "Random Forest": RandomForestClassifier(n_estimators=300, **_seed_kwargs),
    # probability=True is what makes predict_proba available on SVC.
    "SVM (RBF)": SVC(probability=True, **_seed_kwargs),
}

# Fit every model, score it on the held-out split and overlay its ROC curve.
# Each entry of `metrics` is [name, accuracy, precision, recall, F1, AUC].
metrics = []
plt.figure()
for name, clf in models.items():
    clf.fit(Xtr, y_train)
    y_pred = clf.predict(Xte)
    # Column 1 of predict_proba is taken as the positive-class score
    # (assumes a binary target — TODO confirm). Models without
    # predict_proba get no AUC and no ROC curve.
    y_prob = clf.predict_proba(Xte)[:, 1] if hasattr(clf, "predict_proba") else None

    acc = accuracy_score(y_test, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary")
    auc = roc_auc_score(y_test, y_prob) if y_prob is not None else float("nan")

    metrics.append([name, acc, prec, rec, f1, auc])

    if y_prob is not None:
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        # Single f-string: the previous f-string + .format() combination
        # broke on any model name containing '{' or '}'.
        plt.plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})")

# Diagonal reference line for comparison with the model curves.
plt.plot([0, 1], [0, 1], linestyle="--")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curves")
plt.legend()
plt.show()

# Tabulate the per-model scores, best AUC first, and echo them to the output.
metrics_df = pd.DataFrame(metrics, columns=["Model","Accuracy","Precision","Recall","F1","AUC"])
metrics_df.sort_values("AUC", ascending=False, inplace=True)
print(metrics_df)

# Append to the shared report so earlier sections are preserved.
results_path = Path("../results/evaluation_metrics.txt")
# Create the results directory on first run instead of failing in open().
results_path.parent.mkdir(parents=True, exist_ok=True)
with results_path.open("a", encoding="utf-8") as f:
    # Real newlines: the previous "\\n" wrote a literal backslash + 'n'.
    f.write("\n=== Baseline Models ===\n")
    f.write(metrics_df.to_string(index=False))
    # Terminate the table so the next appended section starts on its own line.
    f.write("\n")