
# Symptom2Disease: MLP + Gradient Boosting Stacked Ensemble (with SHAP)

This notebook trains an **ensemble disease diagnosis model** on natural-language symptom descriptions using the **Symptom2Disease** dataset.

**Pipeline**  
- Text preprocessing with **TF‑IDF (1–2 grams)**  
- **Stacking**: `MLPClassifier` + `GradientBoostingClassifier` → `LogisticRegression` (meta)  
- **Baselines**: Multinomial Logistic Regression  
- **Validation & Tuning**: Stratified 5‑fold `RandomizedSearchCV`, run only when `MODE = "FULL"` (the default `FAST` mode fits the pipeline once, without tuning)  
- **Metrics**: Accuracy, Macro Precision/Recall/F1, **Top‑k Accuracy**  
- **Explainability**: Global **SHAP** (TreeExplainer) over a standalone Gradient Boosting model retrained on the same TF‑IDF features (not the fitted branch inside the stack)  

> **Data**: expects `/mnt/data/Symptom2Disease.csv` with columns: `text` (symptoms) and `label` (disease).


In [None]:

# =========================
# Setup & Configuration
# =========================
import warnings, os, json, random
warnings.filterwarnings("ignore")

from collections import Counter

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap

from sklearn.base import clone
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# Runtime switches
SEED          = 42
DATA_PATH     = "/mnt/data/Symptom2Disease.csv"  # change if needed
TEXT_COL      = "text"
LABEL_COL     = "label"

# Choose a mode:
# - 'FAST' trains quickly with fewer features/iters (recommended for first run)
# - 'FULL' enables randomized search tuning (slower)
MODE          = "FAST"   # "FAST" or "FULL"

# Fail fast on a typo. Later cells test MODE with different comparisons
# (`== "FAST"` for the vectorizer, `== "FULL"` for tuning), so an unknown value
# would otherwise silently mix FULL-sized features with the untuned FAST path.
if MODE not in ("FAST", "FULL"):
    raise ValueError(f'MODE must be "FAST" or "FULL", got {MODE!r}')

# Both sizes are fractions of the *whole* dataset: the split cell rescales
# VAL_SIZE by 1 / (1 - TEST_SIZE) before carving validation out of the remainder.
TEST_SIZE     = 0.15
VAL_SIZE      = 0.15
if not (0 < TEST_SIZE < 1 and 0 < VAL_SIZE < 1 - TEST_SIZE):
    raise ValueError(
        f"TEST_SIZE ({TEST_SIZE}) and VAL_SIZE ({VAL_SIZE}) must be positive "
        "and leave some data for training"
    )
TOP_K         = 3

# Seed both global RNGs used by this notebook (stdlib `random` and NumPy).
random.seed(SEED)
np.random.seed(SEED)

print("MODE:", MODE)
print("Expecting CSV at:", DATA_PATH)


In [None]:

# =========================
# Load & Preview
# =========================
df = pd.read_csv(DATA_PATH)
assert {TEXT_COL, LABEL_COL} <= set(df.columns), f"Columns not found. Found: {list(df.columns)}"

# Basic cleaning, as one chain:
#   1. drop rows missing either the text or the label,
#   2. normalise the text (collapse whitespace runs, trim, lower-case),
#   3. drop exact (text, label) duplicates and renumber the rows.
df = (
    df.dropna(subset=[TEXT_COL, LABEL_COL])
      .assign(**{
          TEXT_COL: lambda frame: (
              frame[TEXT_COL]
              .astype(str)
              .str.replace(r"\s+", " ", regex=True)
              .str.strip()
              .str.lower()
          )
      })
      .drop_duplicates(subset=[TEXT_COL, LABEL_COL])
      .reset_index(drop=True)
)

display(df.head(10))
print(f"Rows: {df.shape[0]} | Unique diseases: {df[LABEL_COL].nunique()}")
print("Class distribution (top 10):")
print(df[LABEL_COL].value_counts().head(10))


In [None]:

# =========================
# Split: Train / Val / Test
# =========================
# Encode disease names as integer ids; `le` is kept to map ids back to names.
le = LabelEncoder()
le.fit(df[LABEL_COL].values)
y = le.transform(df[LABEL_COL].values)
X = df[TEXT_COL].values

# Step 1: hold out the test set (stratified by class).
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=y,
)

# Step 2: carve validation out of the remainder. The ratio is rescaled so the
# validation set is VAL_SIZE of the original data rather than of the remainder.
val_ratio = VAL_SIZE / (1 - TEST_SIZE)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=val_ratio,
    random_state=SEED,
    stratify=y_trainval,
)

print("Splits → train: {}, val: {}, test: {}".format(len(X_train), len(X_val), len(X_test)))
print("Example classes:", le.classes_[:10])


In [None]:

# =========================
# Vectorizer
# =========================
# Word uni+bi-gram TF-IDF. FAST mode uses a tighter vocabulary (higher min_df,
# lower max_df, smaller feature cap); every other mode gets the larger settings.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3 if MODE == "FAST" else 2,
    max_df=0.9 if MODE == "FAST" else 0.95,
    max_features=30000 if MODE == "FAST" else 100000,
    sublinear_tf=True,
)


### Baseline: Multinomial Logistic Regression

In [None]:

# Baseline: TF-IDF -> class-balanced logistic regression.
#
# The vectorizer is cloned so the baseline owns its own copy. The ensemble
# pipeline defined later is built from the same `tfidf` object; if both shared
# one instance, any later fit of the ensemble (in particular the train+val
# refit) would replace the vocabulary underneath this already-fitted
# classifier and leave `baseline` unusable.
#
# `multi_class="multinomial"` is intentionally not passed: the argument is
# deprecated since scikit-learn 1.5, and with the "saga" solver on a
# multi-class target the default already fits a multinomial model.
baseline = Pipeline([
    ("tfidf", clone(tfidf)),
    ("clf", LogisticRegression(max_iter=2000, n_jobs=-1, class_weight="balanced",
                               solver="saga", random_state=SEED))
])
baseline.fit(X_train, y_train)

# Score on the validation split only; the test split stays untouched here.
y_val_pred_base = baseline.predict(X_val)
print("Baseline (VAL) — Accuracy:", accuracy_score(y_val, y_val_pred_base))
print("Baseline (VAL) — Macro F1:", f1_score(y_val, y_val_pred_base, average="macro"))


### Stacked Ensemble: MLP + Gradient Boosting → Logistic Regression (meta)

In [None]:

# Base learner 1: feed-forward network (smaller and shorter-trained in FAST mode).
mlp = MLPClassifier(
    hidden_layer_sizes=(128, 32) if MODE == "FAST" else (256,64),
    activation="relu",
    solver="adam",
    learning_rate="adaptive",
    alpha=1e-4,
    batch_size=128,
    early_stopping=True,
    n_iter_no_change=5 if MODE == "FAST" else 10,
    max_iter=80 if MODE == "FAST" else 120,
    random_state=SEED
)

# Base learner 2: gradient-boosted trees.
gb = GradientBoostingClassifier(
    n_estimators=120 if MODE == "FAST" else 200,
    learning_rate=0.08,
    max_depth=3,
    subsample=0.9,
    random_state=SEED
)

# Meta learner trained on the base learners' outputs.
# `multi_class="multinomial"` is omitted: it is deprecated since scikit-learn
# 1.5 and, with lbfgs on a multi-class target, is what the default does anyway.
meta = LogisticRegression(max_iter=1500, class_weight="balanced",
                          solver="lbfgs", random_state=SEED)

stack = StackingClassifier(
    estimators=[("mlp", mlp), ("gb", gb)],
    final_estimator=meta,
    n_jobs=-1
)

# clone(): the ensemble gets its own vectorizer instead of sharing the `tfidf`
# object with the baseline pipeline, so fitting one pipeline can never change
# the vocabulary under the other's already-fitted classifier.
pipe = Pipeline([("tfidf", clone(tfidf)), ("clf", stack)])
pipe


### Train (with optional tuning)

In [None]:

if MODE != "FULL":
    # No tuning: fit the pipeline once with its configured hyper-parameters.
    best = pipe.fit(X_train, y_train)
else:
    # Randomized search over MLP, GB and TF-IDF settings, scored by macro-F1
    # under stratified 5-fold CV on the training split.
    param_dist = {
        # MLP branch
        "clf__mlp__hidden_layer_sizes": [(256,64), (512,128), (256,128,64)],
        "clf__mlp__alpha": [1e-5, 1e-4, 1e-3],
        "clf__mlp__learning_rate_init": [1e-3, 5e-4, 1e-4],
        # Gradient-boosting branch
        "clf__gb__n_estimators": [150, 200, 300],
        "clf__gb__learning_rate": [0.05, 0.08, 0.1],
        "clf__gb__max_depth": [2, 3, 4],
        # Vectorizer
        "tfidf__min_df": [2, 3, 5],
        "tfidf__max_df": [0.85, 0.9, 0.95],
    }
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    search = RandomizedSearchCV(
        pipe,
        param_dist,
        n_iter=15,
        scoring="f1_macro",
        cv=cv,
        n_jobs=-1,
        verbose=1,
        random_state=SEED,
    )
    search.fit(X_train, y_train)
    best = search.best_estimator_
    print("Best params:", search.best_params_)
    print("Best CV macro‑F1:", search.best_score_)

best


### Evaluation Helpers

In [None]:

def topk_accuracy(model, X, y_true, k=3):
    """Fraction of samples whose true label is among the model's k most probable classes.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``. If it also exposes
        ``classes_``, probability columns are translated to labels through it;
        otherwise column ``j`` is taken to mean label ``j``.
    X : samples, passed straight to ``model.predict_proba``.
    y_true : array-like of true labels, one per sample, in the same label space
        as ``model.classes_``.
    k : number of top-ranked classes that count as a hit (default 3).

    Returns
    -------
    float
        Hit rate in [0, 1]; ``nan`` when ``y_true`` is empty.
    """
    y_true = np.asarray(y_true)
    probs = np.asarray(model.predict_proba(X))

    # Column positions of the k largest probabilities in each row.
    topk_cols = np.argsort(-probs, axis=1)[:, :k]

    # predict_proba columns follow model.classes_, which need not be 0..n-1
    # (e.g. a class absent from the training data), so translate positions to
    # labels before comparing with y_true.
    classes = getattr(model, "classes_", None)
    topk_labels = topk_cols if classes is None else np.asarray(classes)[topk_cols]

    if y_true.size == 0:
        return float("nan")
    hits = (topk_labels == y_true[:, None]).any(axis=1)
    return float(hits.mean())

def report_set(name, y_true, y_pred):
    """Print accuracy and macro-averaged precision, recall and F1 for one split.

    Parameters
    ----------
    name : label prefixed to every printed line (e.g. "VAL", "TEST").
    y_true : true labels.
    y_pred : predicted labels, aligned with ``y_true``.

    All three macro metrics use ``zero_division=0`` so a class that is never
    predicted (or never present) contributes 0 in the same way to each of
    them, rather than F1 alone going through the warn-and-default path.
    """
    macro = dict(average="macro", zero_division=0)
    print(f"{name} — Accuracy:", accuracy_score(y_true, y_pred))
    print(f"{name} — Macro Precision:", precision_score(y_true, y_pred, **macro))
    print(f"{name} — Macro Recall:", recall_score(y_true, y_pred, **macro))
    print(f"{name} — Macro F1:", f1_score(y_true, y_pred, **macro))


### Validation Metrics

In [None]:

# Score the fitted model on the validation split: hard-label metrics first,
# then top-k accuracy computed from the predicted probabilities.
y_val_pred = best.predict(X_val)
report_set("VAL", y_val, y_val_pred)
print(
    f"VAL — Top-{TOP_K} Accuracy:",
    topk_accuracy(best, X_val, y_val, k=TOP_K),
)


### Final Test Evaluation

In [None]:

# Refit on train+val before the one-off test evaluation.
#
# The refit is done on a clone rather than in place. `best` may be the very
# same object as `pipe` (when no search is run) and its vectorizer may be
# shared with other pipelines, so `best.fit(...)` would silently overwrite
# their fitted state. clone() copies the hyper-parameters (including
# random_state) into fresh unfitted estimators, so the fitted model is the
# same one an in-place refit would give, and re-running this cell is idempotent.
# From here on `best` refers to the train+val model.
best = clone(best).fit(
    np.concatenate([X_train, X_val]),
    np.concatenate([y_train, y_val]),
)
y_test_pred = best.predict(X_test)
report_set("TEST", y_test, y_test_pred)
print(f"TEST — Top-{TOP_K} Accuracy:", topk_accuracy(best, X_test, y_test, k=TOP_K))


### Confusion Matrix (Top 20 classes)

In [None]:

from collections import Counter
# Restrict the matrix to the (up to) 20 classes most frequent in the test set.
cnt = Counter(y_test)
top_classes = [c for c,_ in cnt.most_common(20)]
mask = np.isin(y_test, top_classes)
# Rows are filtered by TRUE label only; with `labels=top_classes`, predictions
# that fall outside the selected classes are left out of the matrix.
cm = confusion_matrix(y_test[mask], y_test_pred[mask], labels=top_classes)

# Create the axes explicitly and hand them to the display. Without `ax=`,
# ConfusionMatrixDisplay.plot() opens a figure of its own, so a preceding
# plt.figure(figsize=...) would only leave an empty extra figure and the
# requested size would never reach the actual plot.
fig, ax = plt.subplots(figsize=(10, 8))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.inverse_transform(top_classes))
disp.plot(ax=ax, values_format='d', xticks_rotation=90, cmap=None)  # do not set explicit colors
ax.set_title("Confusion Matrix (Top 20 classes)")
fig.tight_layout()
plt.show()


### SHAP Explainability (Global Terms from a standalone Gradient Boosting model)

In [None]:

# Train a standalone GB pipeline for explainability using the same TF-IDF settings
gb = GradientBoostingClassifier(
    n_estimators= (150 if MODE == "FAST" else 200),
    learning_rate=0.08,
    max_depth=3,
    subsample=0.9,
    random_state=SEED
)
# clone(): take the vectorizer's hyper-parameters but not the object itself.
# Reusing best's own vectorizer would make gb_pipe.fit() refit it in place,
# changing the vocabulary under the already-trained ensemble that is saved later.
gb_pipe = Pipeline([("tfidf", clone(best.named_steps["tfidf"])), ("clf", gb)])
gb_pipe.fit(np.concatenate([X_train, X_val]), np.concatenate([y_train, y_val]))

# Sample a manageable number for SHAP
S = min(150, len(X_test))
idx = np.random.choice(len(X_test), size=S, replace=False)
X_sample = list(np.array(X_test)[idx])

tfidf_model = gb_pipe.named_steps["tfidf"]
X_exp = tfidf_model.transform(X_sample)
# Densify the sparse TF-IDF matrix before handing it to the explainer.
if hasattr(X_exp, "toarray"):
    X_exp = X_exp.toarray()

# TreeExplainer for GradientBoosting
# NOTE(review): some shap releases appear to reject multiclass
# GradientBoostingClassifier models in TreeExplainer. If this cell prints
# "skipped", check that against the installed shap version first.
try:
    explainer = shap.TreeExplainer(gb_pipe.named_steps["clf"])
    shap_values = explainer.shap_values(X_exp)

    # Aggregate mean |SHAP| per feature, averaging over samples and (if
    # multiclass) over classes. Depending on the shap version the multiclass
    # result is either a list of (n_samples, n_features) arrays or one array
    # with a trailing class axis; both are reduced to a 1-D per-feature vector.
    if isinstance(shap_values, list):
        mean_abs = np.mean([np.abs(sv).mean(axis=0) for sv in shap_values], axis=0)
    else:
        mean_abs = np.abs(np.asarray(shap_values)).mean(axis=0)
        if mean_abs.ndim == 2:
            # (n_features, n_classes) -> (n_features,)
            mean_abs = mean_abs.mean(axis=1)

    feature_names = tfidf_model.get_feature_names_out()
    order = np.argsort(-mean_abs)[:20]

    # Simple horizontal bar chart
    terms = feature_names[order]
    vals = mean_abs[order]

    plt.figure(figsize=(8,6))
    y_pos = np.arange(len(terms))
    plt.barh(y_pos, vals)   # no explicit colors
    plt.yticks(y_pos, terms)
    plt.gca().invert_yaxis()  # most important term at the top
    plt.title("Top SHAP Terms (Global Importance)")
    plt.xlabel("Mean |SHAP value|")
    plt.tight_layout()
    plt.show()
except Exception as e:
    # Best-effort: explainability is optional, so report the reason and continue.
    print("SHAP computation skipped due to:", e)


### Save Artifacts

In [None]:

# Define every artifact path once so the files written and the message printed
# cannot drift apart, and make sure the target directory exists first.
ARTIFACT_DIR = "/mnt/data"
model_path = f"{ARTIFACT_DIR}/diagnosis_ensemble.joblib"
encoder_path = f"{ARTIFACT_DIR}/label_encoder.joblib"
classes_path = f"{ARTIFACT_DIR}/model_classes.json"

os.makedirs(ARTIFACT_DIR, exist_ok=True)

joblib.dump(best, model_path)       # fitted pipeline (vectorizer + ensemble)
joblib.dump(le, encoder_path)       # maps integer class ids back to disease names
with open(classes_path, "w", encoding="utf-8") as f:
    # Plain-JSON copy of the class names, readable without joblib or scikit-learn.
    json.dump(le.classes_.tolist(), f, indent=2)
print(f"Saved: {model_path}, {encoder_path}, {classes_path}")


### Inference Helper (Top‑k predictions)

In [None]:

def predict_symptoms(texts, model=best, label_encoder=le, top_k=3):
    """Return the ``top_k`` most probable diseases for each symptom description.

    Parameters
    ----------
    texts : str or iterable of str
        One description or several. A bare string is treated as a single
        document rather than being passed on as-is.
    model : fitted classifier exposing ``predict_proba`` (defaults to ``best``
        as it was bound when this function was defined).
    label_encoder : encoder whose ``inverse_transform`` maps the model's class
        ids back to disease names (defaults to ``le``).
    top_k : number of candidates to return per description.

    Returns
    -------
    list[list[dict]]
        One inner list per input text, ordered from most to least probable;
        each entry is ``{"label": <disease name>, "prob": <float>}``.
    """
    if isinstance(texts, str):
        texts = [texts]

    probs = np.asarray(model.predict_proba(texts))
    # predict_proba columns follow model.classes_; translate column positions
    # to class ids through it instead of assuming column j is class id j.
    classes = np.asarray(getattr(model, "classes_", np.arange(probs.shape[1])))

    out = []
    for row in probs:
        top = np.argsort(-row)[:top_k]
        names = label_encoder.inverse_transform(classes[top])
        out.append([{"label": name, "prob": float(row[col])} for name, col in zip(names, top)])
    return out

# Smoke-test the helper on two free-text symptom lists and pretty-print the
# top-k candidates for each as JSON.
examples = [
    "fever, dry cough, shortness of breath, fatigue",
    "abdominal pain, nausea, vomiting, low appetite",
]
preds = predict_symptoms(texts=examples, top_k=TOP_K)
print(json.dumps(preds, indent=2))


### Dependencies

In [None]:

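# If needed, install these into the kernel's environment. Prefer %pip over !pip so the
# packages land in the interpreter this notebook is actually running on:
# %pip install -U scikit-learn pandas numpy matplotlib shap joblib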
