# MindLens-AI — 02: Model Training

Train a Logistic Regression baseline with 5-fold cross-validation. Includes GPU check, ROC/PR curves, error analysis, and optional DistilBERT fine-tuning.

In [None]:
# Setup
import sys, os
sys.path.insert(0, os.path.abspath(".."))

import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    RocCurveDisplay, PrecisionRecallDisplay,
    confusion_matrix, ConfusionMatrixDisplay,
)

from src.features import build_tfidf
from src.model import train_baseline, save_model, get_device
from src.evaluation import evaluate_single, cross_validate_model, error_analysis

# Plot styling shared by every figure in this notebook
sns.set_theme(style="whitegrid", palette="muted")

# --- GPU Check ---
# `device` is whatever the project helper selects; the CUDA details printed
# below are informational only.
device = get_device()
if torch.cuda.is_available():
    print(f"GPU detected: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
    # The device-properties object exposes `total_memory` (in bytes);
    # `total_mem` does not exist and raised AttributeError on GPU machines.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU memory: {gpu_props.total_memory / 1e9:.1f} GB")
else:
    print("⚠ No GPU detected — Logistic Regression runs on CPU (fast), DistilBERT will be slow.")

print("Setup complete ✓")

## 1. Load Processed Data & Build Features

In [None]:
# Read the cleaned dataset produced by notebook 01
df = pd.read_csv("../data/processed/primary_clean.csv")
print(f"Loaded {len(df)} samples")

texts = df["text"].values
labels = df["label"].values

# TF-IDF features (max_features=5000) plus the fitted vectorizer
# NOTE(review): build_tfidf receives every text before the split below, so the
# vectorizer presumably also sees the test documents — confirm this is intended.
vectorizer, X = build_tfidf(texts, max_features=5000)
feature_names = vectorizer.get_feature_names_out()
print(f"TF-IDF matrix: {X.shape}")

# 80/20 stratified split. Row positions are split together with the features
# so the raw texts of each partition can be looked up afterwards.
row_positions = np.arange(len(labels))
(
    X_train, X_test,
    y_train, y_test,
    idx_train, idx_test,
) = train_test_split(
    X, labels, row_positions,
    test_size=0.2, stratify=labels, random_state=42,
)
texts_test = texts[idx_test]
print(f"Train: {X_train.shape[0]}  |  Test: {X_test.shape[0]}")

## 2. Cross-Validation (5-Fold Stratified)

In [None]:
# Cross-validate a class-weighted Logistic Regression on the full feature
# matrix (n_splits=5); the result is displayed as the cell output.
lr_kwargs = dict(
    class_weight="balanced",
    max_iter=1000,
    solver="lbfgs",
    random_state=42,
)
cv_results = cross_validate_model(
    LogisticRegression,
    X, labels,
    n_splits=5,
    model_kwargs=lr_kwargs,
)
cv_results

## 3. Train Final Model on Train Set

In [None]:
# Train baseline on train split
model = train_baseline(X_train, y_train, max_iter=1000)

# Evaluate on held-out test set
metrics = evaluate_single(model, X_test, y_test)
print("=== Test Set Results ===")
print(metrics["classification_report"])

# The AUC entries are read with .get(), i.e. they may be absent. Applying the
# ":.4f" format spec to an "N/A" fallback string raises ValueError, so only
# real values are formatted as floats; a missing metric prints "N/A".
for metric_title, metric_key in (("ROC-AUC: ", "roc_auc"), ("PR-AUC:  ", "pr_auc")):
    metric_value = metrics.get(metric_key)
    if metric_value is None:
        print(f"{metric_title}N/A")
    else:
        print(f"{metric_title}{metric_value:.4f}")

## 4. Confusion Matrix, ROC Curve & PR Curve

In [None]:
# One row of three diagnostic panels for the test split
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
ax_cm, ax_roc, ax_pr = axes

# Left panel: confusion matrix taken from the evaluation metrics
cm = metrics["confusion_matrix"]
cm_display = ConfusionMatrixDisplay(cm, display_labels=["No Risk", "Risk"])
cm_display.plot(ax=ax_cm, cmap="Blues")
ax_cm.set_title("Confusion Matrix")

# Middle panel: ROC curve with the chance diagonal drawn on top
RocCurveDisplay.from_estimator(model, X_test, y_test, ax=ax_roc)
ax_roc.set_title("ROC Curve")
ax_roc.plot([0, 1], [0, 1], "k--", alpha=0.5)

# Right panel: precision-recall curve
PrecisionRecallDisplay.from_estimator(model, X_test, y_test, ax=ax_pr)
ax_pr.set_title("Precision-Recall Curve")

plt.tight_layout()
plt.show()

## 5. Error Analysis (Top 20 FP & FN)

In [None]:
def _print_error_cases(errors_df, tag, max_chars=200):
    """Print the confidence and a text preview for each row of an error frame.

    Args:
        errors_df: DataFrame with at least ``confidence`` and ``text`` columns.
        tag: Short prefix for the case label, e.g. ``"FP"`` or ``"FN"``.
        max_chars: Maximum number of text characters shown per case.

    Cases are numbered by position (1-based) instead of by index label, so the
    numbering stays 1..n whatever index ``errors_df`` carries. The ellipsis is
    appended only when the text was actually truncated.
    """
    for rank, (_, row) in enumerate(errors_df.iterrows(), start=1):
        text = row["text"]
        suffix = "..." if len(text) > max_chars else ""
        print(f"\n[{tag}-{rank}] conf={row['confidence']:.3f}")
        print(f"  {text[:max_chars]}{suffix}")


fp_df, fn_df = error_analysis(model, X_test, y_test, texts_test, n=20)

print("=== Top 20 False Positives (predicted Risk but actually No Risk) ===")
_print_error_cases(fp_df, "FP")

print("\n\n=== Top 20 False Negatives (predicted No Risk but actually Risk) ===")
_print_error_cases(fn_df, "FN")

## 6. Save Model Artifacts

In [None]:
# Save model + vectorizer for downstream notebooks
save_model(model, vectorizer, "../data/processed/model_artifacts.joblib")

# Also save the train/test split for reproducibility.
# joblib is imported in the setup cell together with the other dependencies,
# so this cell no longer carries its own import.
split_artifacts = {
    "X_test": X_test, "y_test": y_test, "texts_test": texts_test,
    "X_train": X_train, "y_train": y_train,
    "feature_names": list(feature_names),
}
joblib.dump(split_artifacts, "../data/processed/test_split.joblib")
print("All artifacts saved ✓")

## 7. (Optional) DistilBERT Fine-tuning on GPU

In [None]:
# Requires: GPU with ≥4 GB VRAM, transformers, torch

from src.model import train_distilbert

# Raw texts of each partition, selected with the row positions from the split
text_column = df["text"]
texts_train_raw = text_column.iloc[idx_train].values
texts_test_raw = text_column.iloc[idx_test].values

# NOTE(review): the held-out test split is passed as the validation set here,
# so the evaluation below is computed on the same data used for validation.
distilbert_options = dict(
    epochs=3,
    batch_size=16,
    output_dir="../data/processed/distilbert_model",
)
trainer, tokenizer = train_distilbert(
    train_texts=texts_train_raw,
    train_labels=y_train,
    val_texts=texts_test_raw,
    val_labels=y_test,
    **distilbert_options,
)

# Evaluate DistilBERT
eval_results = trainer.evaluate()
print("DistilBERT results:", eval_results)

print("Model training notebook complete ✓")