In [5]:
#!/usr/bin/env python3
"""
Model selection & evaluation pipeline (Classification: Multinomial Naive Bayes; Clustering: K-Means)

- Loads two-column CSV: text,label (0=ham, 1=spam)
- Generates all visuals into outputs/analyze_2/
- Designed to COEXIST with your existing data_analyze.py (do NOT replace it).

Run:
    python data_analyze/model_selection_eval.py 

Notes:
- By default, auto-detects a processed dataset under outputs/processed/*.processed.csv
  (tries emails_merged.processed.csv, text_spam.processed.csv, emails.processed.csv, email_spam.processed.csv).
- You can override with: DATA_CSV=/full/path/to/your.csv python data_analyze/model_selection_eval.py
- Uses a token_pattern that PRESERVES digits.
"""
from __future__ import annotations
from pathlib import Path
import os
import sys
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, learning_curve
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, roc_auc_score,
    average_precision_score, confusion_matrix, ConfusionMatrixDisplay,
    roc_curve, auc, RocCurveDisplay,
    precision_recall_curve, PrecisionRecallDisplay,
    classification_report, silhouette_score, silhouette_samples,
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

# -------------------------
# Config & paths
# -------------------------
ROOT = Path.cwd().resolve()
OUT  = ROOT / "outputs" / "analyze_2"
OUT.mkdir(parents=True, exist_ok=True)

# Auto-detect a processed 2-column CSV (text,label). Prefer merged, then others.
CANDIDATES = [
    ROOT / "outputs" / "processed" / "emails_merged.processed.csv",
    ROOT / "outputs" / "processed" / "text_spam.processed.csv",
    ROOT / "outputs" / "processed" / "emails.processed.csv",
    ROOT / "outputs" / "processed" / "email_spam.processed.csv",
]
DATA = Path(os.getenv("DATA_CSV", ""))
if not str(DATA):
    for c in CANDIDATES:
        if c.exists():
            DATA = c
            break

# -------------------------
# Utils
# -------------------------

def savefig(fig, name: str):
    OUT.mkdir(parents=True, exist_ok=True)
    fig.tight_layout()
    path = OUT / name
    fig.savefig(path, bbox_inches="tight")
    plt.close(fig)
    print(f"[saved] {path}")

# -------------------------
# Load data
# -------------------------
if not DATA or not Path(DATA).exists():
    sys.exit(
        "ERROR: Could not find a processed dataset. Run data_processing/data_processor.py first"
        "or set DATA_CSV=/full/path/to/your 2-column CSV (text,label)."
        f"Checked: {[str(p) for p in CANDIDATES]}"
    )

df = pd.read_csv(DATA)
if not {"text","label"}.issubset(df.columns):
    sys.exit("ERROR: expected columns: text,label")

print("Loaded:", df.shape)
print(df["label"].value_counts(dropna=False))

# -------------------------
# Vectorize (keep digits)
# -------------------------
vectorizer = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1,2),
    min_df=2,
    token_pattern=r"(?u)\b[a-zA-Z0-9]{2,}\b",  # keeps numbers
)

X = vectorizer.fit_transform(df["text"].astype(str))
y = df["label"].astype(int).values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------
# Classification bake-off
# -------------------------
models = {
    "MNB": MultinomialNB(),
    "LogReg": LogisticRegression(max_iter=200),
    "SVM(calib)": CalibratedClassifierCV(LinearSVC(), method="sigmoid", cv=5),
}

rows = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    p,r,f1,_ = precision_recall_fscore_support(y_test, y_pred, average="binary", zero_division=0)

    metrics = {"model": name, "accuracy":acc, "precision":p, "recall":r, "f1":f1}

    # get probabilities or decision function for ROC/PR
    y_score = None
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:,1]
    else:
        try:
            y_score = model.decision_function(X_test)
        except Exception:
            y_score = None
    if y_score is not None:
        metrics["roc_auc"] = roc_auc_score(y_test, y_score)
        metrics["avg_precision"] = average_precision_score(y_test, y_score)
    rows.append(metrics)

metrics_df = pd.DataFrame(rows).sort_values("f1", ascending=False)
metrics_csv = OUT/"clf_metrics.csv"
metrics_df.to_csv(metrics_csv, index=False)
print("[saved]", metrics_csv)
print(metrics_df)

# Choose MNB as final classifier for plots
clf = models["MNB"].fit(X_train, y_train)
y_hat = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:,1]

# Confusion matrix
fig, ax = plt.subplots(figsize=(4,4), dpi=140)
ConfusionMatrixDisplay.from_predictions(y_test, y_hat, cmap="Blues", ax=ax)
ax.set_title("MNB Confusion Matrix")
savefig(fig, "clf_mnb_confmat.png")

# ROC
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
fig, ax = plt.subplots(figsize=(4.5,4.5), dpi=140)
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name="MNB").plot(ax=ax)
ax.set_title(f"MNB ROC (AUC={roc_auc:.3f})")
savefig(fig, "clf_mnb_roc.png")

# Precision-Recall
prec, rec, _ = precision_recall_curve(y_test, y_prob)
ap = average_precision_score(y_test, y_prob)
fig, ax = plt.subplots(figsize=(4.5,4.5), dpi=140)
PrecisionRecallDisplay(precision=prec, recall=rec, average_precision=ap, estimator_name="MNB").plot(ax=ax)
ax.set_title(f"MNB Precision–Recall (AP={ap:.3f})")
savefig(fig, "clf_mnb_pr.png")

# Learning curve (F1)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
sizes, train_scores, val_scores = learning_curve(clf, X, y, cv=cv, scoring="f1", train_sizes=np.linspace(0.1,1.0,8), n_jobs=-1)
fig, ax = plt.subplots(figsize=(5,4), dpi=140)
ax.plot(sizes, train_scores.mean(axis=1), marker="o", label="train")
ax.plot(sizes, val_scores.mean(axis=1), marker="o", label="cv")
ax.set_xlabel("Training samples"); ax.set_ylabel("F1"); ax.set_title("MNB Learning Curve"); ax.legend()
savefig(fig, "clf_mnb_learning_curve.png")

# Top indicative tokens (log-odds)
feat_names = np.array(vectorizer.get_feature_names_out())
logprob = clf.feature_log_prob_
logodds = (logprob[1] - logprob[0])

def barh_tokens(idxs, title, fname):
    toks = feat_names[idxs]
    vals = logodds[idxs]
    order = np.argsort(vals)
    fig, ax = plt.subplots(figsize=(6,5), dpi=140)
    ax.barh(range(len(idxs)), vals[order])
    ax.set_yticks(range(len(idxs)))
    ax.set_yticklabels(toks[order])
    ax.set_title(title)
    ax.set_xlabel("Log-odds (spam vs ham)")
    savefig(fig, fname)

barh_tokens(np.argsort(logodds)[-20:], "Top Indicative Tokens — Spam", "clf_mnb_top_spam_tokens.png")
barh_tokens(np.argsort(logodds)[:20],  "Top Indicative Tokens — Ham",  "clf_mnb_top_ham_tokens.png")

# Calibration
prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10, strategy="quantile")
fig, ax = plt.subplots(figsize=(4.5,4.5), dpi=140)
ax.plot([0,1],[0,1], linestyle="--")
ax.plot(prob_pred, prob_true, marker="o")
ax.set_title("MNB Probability Calibration")
ax.set_xlabel("Predicted probability"); ax.set_ylabel("Observed frequency")
savefig(fig, "clf_mnb_calibration.png")

# -------------------------
# K-Means model selection & visuals
# -------------------------
svd2 = TruncatedSVD(n_components=2, random_state=42)
X2 = svd2.fit_transform(X)

ks = list(range(2, 11))
inertias, silhouettes = [], []
for k in ks:
    km = KMeans(n_clusters=k, n_init="auto", random_state=42)
    labels_k = km.fit_predict(X)
    inertias.append(km.inertia_)
    silhouettes.append(silhouette_score(X, labels_k))

pd.DataFrame({"k":ks,"inertia":inertias,"silhouette":silhouettes}).to_csv(OUT/"kmeans_selection.csv", index=False)
print("[saved]", OUT/"kmeans_selection.csv")

# Elbow
fig, ax = plt.subplots(figsize=(5,4), dpi=140)
ax.plot(ks, inertias, marker="o")
ax.set_xlabel("k"); ax.set_ylabel("Inertia"); ax.set_title("K-Means Elbow")
savefig(fig, "km_elbow.png")

# Silhouette vs k
fig, ax = plt.subplots(figsize=(5,4), dpi=140)
ax.plot(ks, silhouettes, marker="o")
ax.set_xlabel("k"); ax.set_ylabel("Silhouette"); ax.set_title("K-Means Silhouette vs k")
savefig(fig, "km_silhouette_vs_k.png")

best_k = ks[int(np.argmax(silhouettes))]
print("Chosen k:", best_k)

km_final = KMeans(n_clusters=best_k, n_init="auto", random_state=42)
labels = km_final.fit_predict(X)
np.savetxt(OUT/"km_labels.txt", labels, fmt="%d")
print("[saved]", OUT/"km_labels.txt")

# 2D scatter
fig, ax = plt.subplots(figsize=(5,4), dpi=140)
scatter = ax.scatter(X2[:,0], X2[:,1], c=labels, s=6, alpha=0.6)
ax.set_title(f"K-Means (k={best_k}) — 2D SVD Projection")
ax.set_xlabel("SVD1"); ax.set_ylabel("SVD2")
savefig(fig, "km_scatter_svd.png")

# Silhouette diagram
sil_samples = silhouette_samples(X, labels)
y_lower = 10
fig, ax = plt.subplots(figsize=(6,5), dpi=140)
for i in range(best_k):
    vals = sil_samples[labels==i]
    vals.sort()
    y_upper = y_lower + vals.shape[0]
    ax.fill_betweenx(np.arange(y_lower, y_upper), 0, vals, alpha=0.7)
    ax.text(-0.05, (y_lower + y_upper)/2, str(i))
    y_lower = y_upper + 10
ax.axvline(silhouette_score(X, labels), color="black", linestyle="--")
ax.set_title(f"Silhouette Plot (k={best_k})")
ax.set_xlabel("Silhouette coefficient"); ax.set_ylabel("Cluster")
savefig(fig, "km_silhouette_plot.png")

# Cluster sizes
sizes = pd.Series(labels).value_counts().sort_index()
fig, ax = plt.subplots(figsize=(5,4), dpi=140)
sizes.plot(kind="bar", ax=ax)
ax.set_xlabel("Cluster"); ax.set_ylabel("Count"); ax.set_title(f"Cluster Sizes (k={best_k})")
savefig(fig, "km_cluster_sizes.png")

# Top terms per cluster
feat_names = np.array(vectorizer.get_feature_names_out())
centroids = km_final.cluster_centers_
rows = []
for c in range(best_k):
    top_idx = np.argsort(centroids[c])[-12:][::-1]
    rows.append({"cluster": c, "top_terms": ", ".join(feat_names[top_idx])})

top_terms_df = pd.DataFrame(rows)
top_terms_df.to_csv(OUT/"km_top_terms.csv", index=False)
print("[saved]", OUT/"km_top_terms.csv")

print("Done. Figures are in outputs/analyze_2/")


IsADirectoryError: [Errno 21] Is a directory: '.'