In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder


In [None]:
from pathlib import Path

# Location of the raw dataset, relative to the notebook's working directory.
DATA_PATH = Path('..', 'data', 'raw', 'mental_health_dataset.csv')

# Read the CSV into a DataFrame and show it as the cell output.
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,Student_ID,Age,Gender,GPA,Stress_Level,Anxiety_Score,Depression_Score,Daily_Reflections,Sleep_Hours,Steps_Per_Day,Mood_Description,Sentiment_Score,Mental_Health_Status
0,1,23,Other,2.52,5,20,6,Onto foreign do environmental anyone every nea...,6.8,4166,Happy,0.0000,2
1,2,19,Male,2.74,5,3,7,Party but others visit admit industry country ...,5.1,4949,Tired,0.4678,2
2,3,21,Female,3.53,5,11,24,Religious sure wait do chance decade according...,8.3,7632,Sad,0.5106,2
3,4,18,Male,2.04,4,15,14,A task effect entire coach join series.,8.2,5548,Sad,0.2960,2
4,5,19,Other,2.87,1,2,4,Knowledge several camera wait week write quali...,5.9,3698,Happy,0.4588,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,496,20,Other,3.34,4,0,21,Land floor page trade social away animal cut e...,5.4,8725,Motivated,-0.2732,2
496,497,18,Female,3.22,2,7,3,Almost wide majority technology positive parti...,4.5,3692,Anxious,0.7269,0
497,498,23,Other,2.86,4,17,1,Property answer method call law dream maybe mo...,8.2,6000,Motivated,0.6461,2
498,499,18,Female,2.45,4,14,0,Care can now outside real rest that perform.,6.0,5013,Stressed,0.4939,1


In [4]:
# Remove the row identifier and the free-text reflections column before modelling.
# The result is assigned back to `df`, so without errors="ignore" a second run
# of this cell would raise KeyError because the columns are already gone.
df = df.drop(
    columns=["Student_ID", "Daily_Reflections"],
    errors="ignore",
)

In [5]:
# Display the frame so the remaining columns can be checked by eye.
df

Unnamed: 0,Age,Gender,GPA,Stress_Level,Anxiety_Score,Depression_Score,Sleep_Hours,Steps_Per_Day,Mood_Description,Sentiment_Score,Mental_Health_Status
0,23,Other,2.52,5,20,6,6.8,4166,Happy,0.0000,2
1,19,Male,2.74,5,3,7,5.1,4949,Tired,0.4678,2
2,21,Female,3.53,5,11,24,8.3,7632,Sad,0.5106,2
3,18,Male,2.04,4,15,14,8.2,5548,Sad,0.2960,2
4,19,Other,2.87,1,2,4,5.9,3698,Happy,0.4588,0
...,...,...,...,...,...,...,...,...,...,...,...
495,20,Other,3.34,4,0,21,5.4,8725,Motivated,-0.2732,2
496,18,Female,3.22,2,7,3,4.5,3692,Anxious,0.7269,0
497,23,Other,2.86,4,17,1,8.2,6000,Motivated,0.6461,2
498,18,Female,2.45,4,14,0,6.0,5013,Stressed,0.4939,1


In [6]:
# Separate the label vector from the feature matrix.
y = df["Mental_Health_Status"]
X = df.drop(columns=[y.name])

print('X shape:', X.shape)
display(y.value_counts())

X shape: (500, 10)


Mental_Health_Status
2    341
1    137
0     22
Name: count, dtype: int64

In [7]:
# Quick data sanity checks: dimensions, column names, missing values, class balance
print('Shape:', df.shape)
print('\nColumns:', df.columns.tolist())

print('\nMissing values per column (top 20):')
missing_per_column = df.isna().sum()
display(missing_per_column.sort_values(ascending=False).head(20))

print('\nLabel distribution:')
label_column = df['Mental_Health_Status']
label_share = label_column.value_counts(normalize=True, dropna=False)
# Absolute counts first, then the same distribution as percentages.
display(label_column.value_counts(dropna=False))
display((label_share * 100).round(2))

Shape: (500, 11)

Columns: ['Age', 'Gender', 'GPA', 'Stress_Level', 'Anxiety_Score', 'Depression_Score', 'Sleep_Hours', 'Steps_Per_Day', 'Mood_Description', 'Sentiment_Score', 'Mental_Health_Status']

Missing values per column (top 20):


Age                     0
Gender                  0
GPA                     0
Stress_Level            0
Anxiety_Score           0
Depression_Score        0
Sleep_Hours             0
Steps_Per_Day           0
Mood_Description        0
Sentiment_Score         0
Mental_Health_Status    0
dtype: int64


Label distribution:


Mental_Health_Status
2    341
1    137
0     22
Name: count, dtype: int64

Mental_Health_Status
2    68.2
1    27.4
0     4.4
Name: proportion, dtype: float64

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Column groups: the two listed columns are one-hot encoded,
# every other feature column goes through the numeric branch.
categorical_cols = ["Gender", "Mood_Description"]
numeric_cols = [name for name in X.columns if name not in categorical_cols]

# Numeric branch: fill gaps with the median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# Categories unseen during fit are ignored at transform time instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its branch; columns in neither group are dropped.
preprocess = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

In [9]:
from sklearn.model_selection import train_test_split

# 80/20 holdout split, stratified on the label and seeded for repeatability.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print('Train size:', X_train.shape, ' Test size:', X_test.shape)

Train size: (400, 10)  Test size: (100, 10)


In [10]:
# Print the shape of each split, one per line.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(400, 10)
(100, 10)
(400,)
(100,)


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    classification_report,
)

# Pipeline: preprocess -> model.
# The estimator given here is only a placeholder: every entry of param_grid
# below replaces the "model" step with its own estimator.
pipe = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", LogisticRegression(max_iter=5000, class_weight="balanced")),
    ]
)

# Try a couple strong baselines and tune them
param_grid = [
    {
        # The saga solver shuffles the data, so it is seeded like the other
        # estimators in this notebook to keep the search reproducible.
        "model": [
            LogisticRegression(
                max_iter=5000,
                class_weight="balanced",
                solver="saga",
                random_state=42,
            )
        ],
        "model__C": [0.1, 0.3, 1.0, 3.0, 10.0],
        "model__penalty": ["l1", "l2"],
    },
    {
        "model": [
            RandomForestClassifier(
                random_state=42,
                class_weight="balanced_subsample",
            )
        ],
        "model__n_estimators": [300, 600],
        "model__max_depth": [None, 8, 16],
        "model__min_samples_split": [2, 5],
        "model__min_samples_leaf": [1, 2],
    },
]

# Shared CV splitter; later cells reuse `cv` so all searches see the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Use balanced_accuracy so the model doesn't win by ignoring the rare class
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

search.fit(X_train, y_train)
best_model = search.best_estimator_
print('Best CV balanced_accuracy:', search.best_score_)
print('Best params:', search.best_params_)

# Holdout evaluation
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
bacc = balanced_accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average="macro")
weighted_f1 = f1_score(y_test, y_pred, average="weighted")

print('\nHoldout Accuracy:', acc)
print('Holdout Balanced Accuracy:', bacc)
print('Holdout Macro F1:', macro_f1)
print('Holdout Weighted F1:', weighted_f1)
print('\nClassification Report:\n', classification_report(y_test, y_pred))

Fitting 5 folds for each of 34 candidates, totalling 170 fits
Best CV balanced_accuracy: 0.8303030303030303
Best params: {'model': RandomForestClassifier(class_weight='balanced_subsample', random_state=42), 'model__max_depth': 8, 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 300}

Holdout Accuracy: 0.96
Holdout Balanced Accuracy: 0.7876543209876544
Holdout Macro F1: 0.8309002433090025
Holdout Weighted F1: 0.9543222106360792

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.40      0.57         5
           1       0.90      0.96      0.93        27
           2       0.99      1.00      0.99        68

    accuracy                           0.96       100
   macro avg       0.96      0.79      0.83       100
weighted avg       0.96      0.96      0.95       100



In [13]:
import joblib
from pathlib import Path

MODEL_PATH = Path('..') / 'models' / 'mental_health_model.pkl'
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (a no-op when it is already there).
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, MODEL_PATH)

['..\\models\\mental_health_model.pkl']

# Alternative model: ExtraTrees

In [14]:
# Train an alternative model (ExtraTrees) using the SAME preprocessing and split above
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    classification_report,
)
from pathlib import Path
import joblib

# Safety check: make sure the earlier cells were run.
# "cv" is listed because the grid search below reuses the shared CV splitter.
required_vars = ["preprocess", "cv", "X_train", "X_test", "y_train", "y_test"]
missing = [v for v in required_vars if v not in globals()]
if missing:
    raise NameError(f"Run the cells above first. Missing: {missing}")

alt_pipe = Pipeline(
    steps=[
        ("preprocess", preprocess),
        (
            "model",
            ExtraTreesClassifier(
                random_state=42,
                class_weight="balanced",
                n_jobs=-1,
            ),
        ),
    ]
)

alt_param_grid = {
    "model__n_estimators": [500, 800],
    "model__max_depth": [None, 12, 20],
    "model__min_samples_split": [2, 5],
    "model__min_samples_leaf": [1, 2],
    "model__max_features": ["sqrt", "log2"],
}

alt_search = GridSearchCV(
    estimator=alt_pipe,
    param_grid=alt_param_grid,
    scoring="balanced_accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

alt_search.fit(X_train, y_train)
alt_best_model = alt_search.best_estimator_

print("ExtraTrees best CV balanced_accuracy:", alt_search.best_score_)
print("ExtraTrees best params:", alt_search.best_params_)

# Holdout evaluation (same metrics as above)
alt_pred = alt_best_model.predict(X_test)
alt_acc = accuracy_score(y_test, alt_pred)
alt_bacc = balanced_accuracy_score(y_test, alt_pred)
alt_macro_f1 = f1_score(y_test, alt_pred, average="macro")
alt_weighted_f1 = f1_score(y_test, alt_pred, average="weighted")

print("\nExtraTrees Holdout Accuracy:", alt_acc)
print("ExtraTrees Holdout Balanced Accuracy:", alt_bacc)
print("ExtraTrees Holdout Macro F1:", alt_macro_f1)
print("ExtraTrees Holdout Weighted F1:", alt_weighted_f1)
print("\nExtraTrees Classification Report:\n", classification_report(y_test, alt_pred))

# Save this alternative model too.
# The target folder is created first because joblib.dump does not create it.
ALT_MODEL_PATH = Path('..') / 'models' / 'mental_health_model_extratrees.pkl'
ALT_MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(alt_best_model, ALT_MODEL_PATH)
print("\nSaved ExtraTrees model to:", ALT_MODEL_PATH)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
ExtraTrees best CV balanced_accuracy: 0.81989898989899
ExtraTrees best params: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 2, 'model__min_samples_split': 5, 'model__n_estimators': 800}

ExtraTrees Holdout Accuracy: 0.9
ExtraTrees Holdout Balanced Accuracy: 0.6815904139433551
ExtraTrees Holdout Macro F1: 0.6825784868759649
ExtraTrees Holdout Weighted F1: 0.8956497683993825

ExtraTrees Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.20      0.25         5
           1       0.77      0.89      0.83        27
           2       0.98      0.96      0.97        68

    accuracy                           0.90       100
   macro avg       0.70      0.68      0.68       100
weighted avg       0.90      0.90      0.90       100


Saved ExtraTrees model to: ..\models\mental_health_model_extratrees.pkl


# Another model: calibrated LinearSVC

#### This new section trains LinearSVC (SVM) + CalibratedClassifierCV (good for sparse one-hot features), evaluates it with the same metrics, and saves it to mental_health_model_linearsvc.pkl

In [15]:
# Another model (Linear SVM) that often performs well on sparse one-hot features
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    classification_report,
)
from pathlib import Path
import joblib

# Safety check: make sure the earlier cells were run
required_vars = ["preprocess", "cv", "X_train", "X_test", "y_train", "y_test"]
missing = [v for v in required_vars if v not in globals()]
if missing:
    raise NameError(f"Run the cells above first. Missing: {missing}")

# LinearSVC is strong for high-dimensional sparse data; calibration adds predict_proba-like behavior
svm_base = LinearSVC(class_weight="balanced", random_state=42)
svm = CalibratedClassifierCV(estimator=svm_base, method="sigmoid", cv=3)

svm_pipe = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", svm),
    ]
)

svm_param_grid = {
    # CalibratedClassifierCV exposes the base estimator via estimator__...
    "model__estimator__C": [0.1, 0.3, 1.0, 3.0, 10.0],
}

svm_search = GridSearchCV(
    estimator=svm_pipe,
    param_grid=svm_param_grid,
    scoring="balanced_accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

svm_search.fit(X_train, y_train)
svm_best_model = svm_search.best_estimator_

print("LinearSVC best CV balanced_accuracy:", svm_search.best_score_)
print("LinearSVC best params:", svm_search.best_params_)

# Holdout evaluation.
# zero_division=0 keeps the reported numbers the same as the default (ill-defined
# scores are reported as 0) but silences the UndefinedMetricWarning raised when
# a class is never predicted.
svm_pred = svm_best_model.predict(X_test)
svm_acc = accuracy_score(y_test, svm_pred)
svm_bacc = balanced_accuracy_score(y_test, svm_pred)
svm_macro_f1 = f1_score(y_test, svm_pred, average="macro", zero_division=0)
svm_weighted_f1 = f1_score(y_test, svm_pred, average="weighted", zero_division=0)

print("\nLinearSVC Holdout Accuracy:", svm_acc)
print("LinearSVC Holdout Balanced Accuracy:", svm_bacc)
print("LinearSVC Holdout Macro F1:", svm_macro_f1)
print("LinearSVC Holdout Weighted F1:", svm_weighted_f1)
print(
    "\nLinearSVC Classification Report:\n",
    classification_report(y_test, svm_pred, zero_division=0),
)

# The target folder is created first because joblib.dump does not create it.
SVM_MODEL_PATH = Path('..') / 'models' / 'mental_health_model_linearsvc.pkl'
SVM_MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(svm_best_model, SVM_MODEL_PATH)
print("\nSaved LinearSVC model to:", SVM_MODEL_PATH)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
LinearSVC best CV balanced_accuracy: 0.5150392817059484
LinearSVC best params: {'model__estimator__C': 0.1}

LinearSVC Holdout Accuracy: 0.81
LinearSVC Holdout Balanced Accuracy: 0.5161583151779231
LinearSVC Holdout Macro F1: 0.5100762527233115
LinearSVC Holdout Weighted F1: 0.7833006535947713

LinearSVC Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.67      0.59      0.63        27
           2       0.86      0.96      0.90        68

    accuracy                           0.81       100
   macro avg       0.51      0.52      0.51       100
weighted avg       0.76      0.81      0.78       100


Saved LinearSVC model to: ..\models\mental_health_model_linearsvc.pkl


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


###### Note: this SVM model did not improve results on the current split (its balanced accuracy was lower than both ExtraTrees and the earlier grid search), but it's now available as an additional model option without touching the earlier pipeline/code.

# More models to try for better accuracy: Decision Tree and KNN

In [16]:
# Try more algorithms that can work well for this multi-class target
# (Uses the SAME preprocess, cv, and train/test split defined above)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    classification_report,
 )
from pathlib import Path
import joblib

# Fail fast with a readable message when the upstream cells have not been run.
required_vars = ["preprocess", "cv", "X_train", "X_test", "y_train", "y_test"]
missing = list(filter(lambda v: v not in globals(), required_vars))
if missing:
    raise NameError(f"Run the cells above first. Missing: {missing}")

def evaluate_and_save(model_name: str, estimator, param_grid: dict, out_name: str):
    """Tune an estimator with a grid search, score it on the holdout split, and save it.

    The estimator becomes the "model" step of a Pipeline behind the shared
    `preprocess` transformer. It is tuned with GridSearchCV (balanced accuracy,
    shared `cv` splitter) on X_train / y_train. The refit best pipeline is then
    scored on X_test / y_test, printed, and pickled with joblib.

    Relies on these notebook globals: Pipeline, preprocess, cv, X_train,
    X_test, y_train, y_test.

    Parameters
    ----------
    model_name : str
        Label used in the printed report and in the returned record.
    estimator
        Unfitted estimator used as the "model" pipeline step.
    param_grid : dict
        Grid passed to GridSearchCV; keys address the estimator as "model__<param>".
    out_name : str
        File name, inside ../models, for the saved best pipeline.

    Returns
    -------
    dict
        Keys: "model", "cv_balanced_accuracy", "holdout_accuracy",
        "holdout_balanced_accuracy", "holdout_macro_f1", "holdout_weighted_f1"
        and "path" (the output path as a string).
    """
    pipe_local = Pipeline(
        steps=[
            ("preprocess", preprocess),
            ("model", estimator),
        ]
    )
    search_local = GridSearchCV(
        estimator=pipe_local,
        param_grid=param_grid,
        scoring="balanced_accuracy",
        cv=cv,
        n_jobs=-1,
        verbose=1,
    )
    search_local.fit(X_train, y_train)
    best_local = search_local.best_estimator_

    # zero_division=0 reports ill-defined scores as 0, exactly like the default,
    # but without the UndefinedMetricWarning raised when a class is never predicted.
    pred = best_local.predict(X_test)
    acc_ = accuracy_score(y_test, pred)
    bacc_ = balanced_accuracy_score(y_test, pred)
    macro_f1_ = f1_score(y_test, pred, average="macro", zero_division=0)
    weighted_f1_ = f1_score(y_test, pred, average="weighted", zero_division=0)

    print("\n==", model_name, "==")
    print("Best CV balanced_accuracy:", search_local.best_score_)
    print("Best params:", search_local.best_params_)
    print("Holdout Accuracy:", acc_)
    print("Holdout Balanced Accuracy:", bacc_)
    print("Holdout Macro F1:", macro_f1_)
    print("Holdout Weighted F1:", weighted_f1_)
    print("\nClassification Report:\n", classification_report(y_test, pred, zero_division=0))

    out_path = Path('..') / 'models' / out_name
    # joblib.dump does not create missing directories; make sure the folder exists.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(best_local, out_path)
    print("Saved model to:", out_path)
    return {
        "model": model_name,
        "cv_balanced_accuracy": float(search_local.best_score_),
        "holdout_accuracy": float(acc_),
        "holdout_balanced_accuracy": float(bacc_),
        "holdout_macro_f1": float(macro_f1_),
        "holdout_weighted_f1": float(weighted_f1_),
        "path": str(out_path),
    }

# 1) Decision Tree (fast baseline; can capture nonlinear interactions)
dt = DecisionTreeClassifier(random_state=42, class_weight="balanced")
dt_grid = {
    "model__max_depth": [None, 6, 10, 16],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
}

# 2) KNN (works on sparse output if we force brute search)
knn = KNeighborsClassifier()
knn_grid = {
    "model__n_neighbors": [3, 5, 7, 11, 15],
    "model__weights": ["uniform", "distance"],
    "model__metric": ["minkowski"],
    "model__p": [1, 2],
    "model__algorithm": ["brute"],
}

# Evaluate the candidates in order; each call tunes, reports and saves one model.
candidates = [
    ("DecisionTree", dt, dt_grid, "mental_health_model_decisiontree.pkl"),
    ("KNN", knn, knn_grid, "mental_health_model_knn.pkl"),
]
results = [evaluate_and_save(*candidate) for candidate in candidates]

# Summary table (sorted by balanced accuracy)
results_df = pd.DataFrame(results).sort_values("holdout_balanced_accuracy", ascending=False)
print("\nSummary (higher balanced_accuracy is better for imbalanced classes):")
display(results_df)

Fitting 5 folds for each of 36 candidates, totalling 180 fits

== DecisionTree ==
Best CV balanced_accuracy: 0.9698989898989898
Best params: {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2}
Holdout Accuracy: 1.0
Holdout Balanced Accuracy: 1.0
Holdout Macro F1: 1.0
Holdout Weighted F1: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00        27
           2       1.00      1.00      1.00        68

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

Saved model to: ..\models\mental_health_model_decisiontree.pkl
Fitting 5 folds for each of 20 candidates, totalling 100 fits

== KNN ==
Best CV balanced_accuracy: 0.6338496071829406
Best params: {'model__algorithm': 'brute', 'model__metric': 'minkowski', 'model__n_

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,model,cv_balanced_accuracy,holdout_accuracy,holdout_balanced_accuracy,holdout_macro_f1,holdout_weighted_f1,path
0,DecisionTree,0.969899,1.0,1.0,1.0,1.0,..\models\mental_health_model_decisiontree.pkl
1,KNN,0.63385,0.78,0.546115,0.511628,0.770388,..\models\mental_health_model_knn.pkl


In [17]:
# Sanity-check: why did DecisionTree get 1.0 accuracy?
# This does NOT change any earlier code; it just checks for leakage/duplicates and re-evaluates saved models.
import numpy as np
import pandas as pd
from pathlib import Path
import joblib
from sklearn.metrics import classification_report, confusion_matrix

# Fail fast with a readable message if the data/split cells were not run.
required_vars = ["df", "X", "y", "X_train", "X_test", "y_train", "y_test"]
missing = [v for v in required_vars if v not in globals()]
if missing:
    raise NameError(f"Run the cells above first. Missing: {missing}")

print("Rows:", df.shape)
print("Train/Test sizes:", X_train.shape, X_test.shape)

# 1) Check exact duplicate rows (features) in the whole dataset
# keep=False marks every member of a duplicate group, so the sum counts all of them.
dup_all = df.duplicated(subset=X.columns.tolist(), keep=False).sum()
print("\nDuplicate feature rows in full dataset:", int(dup_all))

# 2) Check if any identical feature rows appear in BOTH train and test
# One hash per row; index=False leaves the row index out of the hash so only
# the feature values are compared.
train_hash = pd.util.hash_pandas_object(X_train, index=False)
test_hash = pd.util.hash_pandas_object(X_test, index=False)
overlap = np.intersect1d(train_hash.values, test_hash.values)
print("Duplicate feature rows overlapping Train‚ÜîTest:", int(len(overlap)))

# 3) If overlapping duplicates exist, check if their labels are always the same (leakage risk)
if len(overlap) > 0:
    # Row labels of the train/test rows whose hash appears on both sides.
    train_idx = train_hash[train_hash.isin(overlap)].index
    test_idx = test_hash[test_hash.isin(overlap)].index
    train_pairs = pd.DataFrame({"row_hash": train_hash.loc[train_idx].values, "y": y_train.loc[train_idx].values})
    test_pairs = pd.DataFrame({"row_hash": test_hash.loc[test_idx].values, "y": y_test.loc[test_idx].values})
    # Pair every train row with every test row that shares its hash, then compare labels.
    merged = train_pairs.merge(test_pairs, on="row_hash", suffixes=("_train", "_test"))
    disagree = (merged["y_train"] != merged["y_test"]).sum()
    print("Overlapping duplicates with different labels:", int(disagree))
else:
    print("No exact Train‚ÜîTest duplicate feature rows detected.")

# 4) Quick check: does any categorical value map 1-to-1 to the target? (strong signal / possible leakage)
# A maximum of 1 would mean each category value only ever occurs with one label.
for col in [c for c in X.columns if df[c].dtype == "object"]:
    nunique_max = df.groupby(col)["Mental_Health_Status"].nunique().max()
    print(f"\nMax #target-labels per '{col}' category:", int(nunique_max))
    display(df.groupby(col)["Mental_Health_Status"].nunique().sort_values(ascending=False).head(10))

# 5) Re-evaluate the SAVED decision tree model on the same holdout (no warnings)
MODELS_DIR = Path('..') / 'models'
dt_path = MODELS_DIR / 'mental_health_model_decisiontree.pkl'
knn_path = MODELS_DIR / 'mental_health_model_knn.pkl'

if dt_path.exists():
    dt_model = joblib.load(dt_path)
    dt_pred = dt_model.predict(X_test)
    print("\n[Saved DecisionTree] classification report (zero_division=0):")
    print(classification_report(y_test, dt_pred, zero_division=0))
    print("Confusion matrix:")
    # NOTE(review): the row/column labels come from y.unique(); this assumes every
    # class in y also appears in y_test or dt_pred, otherwise the shapes differ - confirm.
    display(pd.DataFrame(confusion_matrix(y_test, dt_pred), index=sorted(y.unique()), columns=sorted(y.unique())))
    # The saved object is expected to be a Pipeline; report the size of its tree step.
    if hasattr(dt_model, "named_steps") and "model" in dt_model.named_steps:
        model_obj = dt_model.named_steps["model"]
        if hasattr(model_obj, "get_depth"):
            print("Tree depth:", model_obj.get_depth(), " | leaves:", model_obj.get_n_leaves())
else:
    print("\nDecisionTree model file not found:", dt_path)

# Same re-evaluation for the saved KNN model, when its file is present.
if knn_path.exists():
    knn_model = joblib.load(knn_path)
    knn_pred = knn_model.predict(X_test)
    print("\n[Saved KNN] classification report (zero_division=0):")
    print(classification_report(y_test, knn_pred, zero_division=0))
else:
    print("\nKNN model file not found:", knn_path)

Rows: (500, 11)
Train/Test sizes: (400, 10) (100, 10)

Duplicate feature rows in full dataset: 0
Duplicate feature rows overlapping Train‚ÜîTest: 0
No exact Train‚ÜîTest duplicate feature rows detected.

Max #target-labels per 'Gender' category: 3


Gender
Female    3
Male      3
Other     3
Name: Mental_Health_Status, dtype: int64


Max #target-labels per 'Mood_Description' category: 3


Mood_Description
Anxious      3
Happy        3
Motivated    3
Relaxed      3
Sad          3
Stressed     3
Tired        3
Name: Mental_Health_Status, dtype: int64


[Saved DecisionTree] classification report (zero_division=0):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00        27
           2       1.00      1.00      1.00        68

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

Confusion matrix:


Unnamed: 0,0,1,2
0,5,0,0
1,0,27,0
2,0,0,68


Tree depth: 5  | leaves: 10

[Saved KNN] classification report (zero_division=0):
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.56      0.81      0.67        27
           2       0.92      0.82      0.87        68

    accuracy                           0.78       100
   macro avg       0.49      0.55      0.51       100
weighted avg       0.78      0.78      0.77       100



# Ensemble method

In [18]:
# Ensemble Method: Voting Classifier combining all 5 trained models
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    classification_report,
)
from pathlib import Path
import joblib

# Safety check: the split variables must exist before any ensemble is built.
required_vars = ["X_train", "X_test", "y_train", "y_test"]
missing = [v for v in required_vars if v not in globals()]
if missing:
    raise NameError(f"Run the cells above first. Missing: {missing}")

MODELS_DIR = Path('..') / 'models'

# Ensemble member name -> file name of the pipeline saved by the earlier cells.
model_files = {
    'decision_tree': 'mental_health_model_decisiontree.pkl',
    'random_forest': 'mental_health_model.pkl',
    'extra_trees': 'mental_health_model_extratrees.pkl',
    'knn': 'mental_health_model_knn.pkl',
    'linear_svc': 'mental_health_model_linearsvc.pkl',
}
model_paths = {
    member: MODELS_DIR / filename for member, filename in model_files.items()
}

# Load every model whose file exists; report the missing ones and move on.
loaded_models = []
for name, path in model_paths.items():
    if not path.exists():
        print(f"‚úó Model not found: {path.name}")
        continue
    model = joblib.load(path)
    loaded_models.append((name, model))
    print(f"‚úì Loaded {name} from {path.name}")

if len(loaded_models) < 2:
    raise ValueError("Need at least 2 models for ensemble. Train more models first.")

print(f"\nCreating ensemble with {len(loaded_models)} models...")

‚úì Loaded decision_tree from mental_health_model_decisiontree.pkl
‚úì Loaded random_forest from mental_health_model.pkl
‚úì Loaded extra_trees from mental_health_model_extratrees.pkl
‚úì Loaded knn from mental_health_model_knn.pkl
‚úì Loaded linear_svc from mental_health_model_linearsvc.pkl

Creating ensemble with 5 models...


In [19]:
# Create Voting Classifier (hard voting - majority vote)
# The estimators are the (name, pipeline) pairs loaded from disk in the previous cell.
ensemble_hard = VotingClassifier(
    estimators=loaded_models,
    voting='hard',  # Majority vote
    n_jobs=-1
)

# Fit the ensemble.
# NOTE(review): VotingClassifier.fit clones each estimator and refits it on
# X_train, so the already-fitted state of the loaded models is not reused and
# this step retrains every member - confirm against the scikit-learn docs.
print("Fitting hard voting ensemble...")
ensemble_hard.fit(X_train, y_train)

# Evaluate on test set
y_pred_hard = ensemble_hard.predict(X_test)
acc_hard = accuracy_score(y_test, y_pred_hard)
bacc_hard = balanced_accuracy_score(y_test, y_pred_hard)
macro_f1_hard = f1_score(y_test, y_pred_hard, average="macro")
weighted_f1_hard = f1_score(y_test, y_pred_hard, average="weighted")

print("\n=== Hard Voting Ensemble Results ===")
print(f"Holdout Accuracy: {acc_hard:.4f}")
print(f"Holdout Balanced Accuracy: {bacc_hard:.4f}")
print(f"Holdout Macro F1: {macro_f1_hard:.4f}")
print(f"Holdout Weighted F1: {weighted_f1_hard:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_hard))

# Save the hard voting ensemble
ENSEMBLE_HARD_PATH = MODELS_DIR / 'mental_health_model_ensemble_hard.pkl'
joblib.dump(ensemble_hard, ENSEMBLE_HARD_PATH)
print(f"\n‚úì Saved hard voting ensemble to: {ENSEMBLE_HARD_PATH}")

Fitting hard voting ensemble...

=== Hard Voting Ensemble Results ===
Holdout Accuracy: 0.9500
Holdout Balanced Accuracy: 0.7210
Holdout Macro F1: 0.7461
Holdout Weighted F1: 0.9380

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.20      0.33         5
           1       0.87      0.96      0.91        27
           2       0.99      1.00      0.99        68

    accuracy                           0.95       100
   macro avg       0.95      0.72      0.75       100
weighted avg       0.95      0.95      0.94       100


‚úì Saved hard voting ensemble to: ..\models\mental_health_model_ensemble_hard.pkl


In [20]:
# Create Soft Voting Classifier (weighted by predicted probabilities)
# Note: All models must support predict_proba for soft voting
# The whole build/fit/evaluate/save sequence sits inside one try block, so a
# failure at any step is reported by the except branch and the notebook continues.
# When that happens the *_soft metric variables are not (re)assigned by this run.
try:
    ensemble_soft = VotingClassifier(
        estimators=loaded_models,
        voting='soft',  # Weighted by probabilities
        n_jobs=-1
    )
    
    print("Fitting soft voting ensemble...")
    ensemble_soft.fit(X_train, y_train)
    
    # Evaluate on test set
    y_pred_soft = ensemble_soft.predict(X_test)
    acc_soft = accuracy_score(y_test, y_pred_soft)
    bacc_soft = balanced_accuracy_score(y_test, y_pred_soft)
    macro_f1_soft = f1_score(y_test, y_pred_soft, average="macro")
    weighted_f1_soft = f1_score(y_test, y_pred_soft, average="weighted")
    
    print("\n=== Soft Voting Ensemble Results ===")
    print(f"Holdout Accuracy: {acc_soft:.4f}")
    print(f"Holdout Balanced Accuracy: {bacc_soft:.4f}")
    print(f"Holdout Macro F1: {macro_f1_soft:.4f}")
    print(f"Holdout Weighted F1: {weighted_f1_soft:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_soft))
    
    # Save the soft voting ensemble
    ENSEMBLE_SOFT_PATH = MODELS_DIR / 'mental_health_model_ensemble_soft.pkl'
    joblib.dump(ensemble_soft, ENSEMBLE_SOFT_PATH)
    print(f"\n‚úì Saved soft voting ensemble to: {ENSEMBLE_SOFT_PATH}")
    
except Exception as e:
    # Deliberate best-effort: report the error and fall back to hard voting only.
    print(f"\n‚úó Soft voting failed: {e}")
    print("Some models may not support predict_proba. Using hard voting only.")

Fitting soft voting ensemble...

=== Soft Voting Ensemble Results ===
Holdout Accuracy: 0.9700
Holdout Balanced Accuracy: 0.8543
Holdout Macro F1: 0.8961
Holdout Weighted F1: 0.9678

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.60      0.75         5
           1       0.93      0.96      0.95        27
           2       0.99      1.00      0.99        68

    accuracy                           0.97       100
   macro avg       0.97      0.85      0.90       100
weighted avg       0.97      0.97      0.97       100


‚úì Saved soft voting ensemble to: ..\models\mental_health_model_ensemble_soft.pkl


In [21]:
# Compare all models including ensemble
import pandas as pd

comparison_results = []

# Individual models: score each loaded pipeline on the holdout split.
for name, model in loaded_models:
    pred = model.predict(X_test)
    comparison_results.append({
        'Model': name,
        'Type': 'Individual',
        'Accuracy': accuracy_score(y_test, pred),
        'Balanced_Accuracy': balanced_accuracy_score(y_test, pred),
        'Macro_F1': f1_score(y_test, pred, average='macro'),
        'Weighted_F1': f1_score(y_test, pred, average='weighted'),
    })

# Hard voting ensemble (metrics computed in the hard-voting cell)
comparison_results.append({
    'Model': 'ensemble_hard',
    'Type': 'Ensemble',
    'Accuracy': acc_hard,
    'Balanced_Accuracy': bacc_hard,
    'Macro_F1': macro_f1_hard,
    'Weighted_F1': weighted_f1_hard,
})

# Soft voting ensemble (if it worked)
if 'acc_soft' in locals():
    comparison_results.append({
        'Model': 'ensemble_soft',
        'Type': 'Ensemble',
        'Accuracy': acc_soft,
        'Balanced_Accuracy': bacc_soft,
        'Macro_F1': macro_f1_soft,
        'Weighted_F1': weighted_f1_soft,
    })

comparison_df = pd.DataFrame(comparison_results)
comparison_df = comparison_df.sort_values('Balanced_Accuracy', ascending=False)

print("\n" + "="*80)
print("MODEL COMPARISON (sorted by Balanced Accuracy)")
print("="*80)
display(comparison_df)

# Highlight the best model.
# The winning table row is stored as `best_row`. Reusing the name `best_model`
# would overwrite the fitted pipeline from the first grid search with a pandas
# Series, so re-running the save cell would pickle the wrong object.
best_row = comparison_df.iloc[0]
print(f"\nüèÜ Best Model: {best_row['Model']} ({best_row['Type']})")
print(f"   Balanced Accuracy: {best_row['Balanced_Accuracy']:.4f}")
print(f"   Accuracy: {best_row['Accuracy']:.4f}")


MODEL COMPARISON (sorted by Balanced Accuracy)


Unnamed: 0,Model,Type,Accuracy,Balanced_Accuracy,Macro_F1,Weighted_F1
0,decision_tree,Individual,1.0,1.0,1.0,1.0
6,ensemble_soft,Ensemble,0.97,0.854321,0.896052,0.967809
1,random_forest,Individual,0.96,0.787654,0.8309,0.954322
5,ensemble_hard,Ensemble,0.95,0.720988,0.746105,0.938019
2,extra_trees,Individual,0.9,0.68159,0.682578,0.89565
3,knn,Individual,0.78,0.546115,0.511628,0.770388
4,linear_svc,Individual,0.81,0.516158,0.510076,0.783301



üèÜ Best Model: decision_tree (Individual)
   Balanced Accuracy: 1.0000
   Accuracy: 1.0000


In [22]:
# Analyze voting patterns - see where models agree/disagree
import numpy as np

banner = "=" * 80
print("\n" + banner)
print("VOTING PATTERN ANALYSIS")
print(banner)

# Holdout predictions of every loaded model, keyed by model name
all_predictions = {name: model.predict(X_test) for name, model in loaded_models}

# One column per model, plus the hard-voting result and the true label
pred_df = pd.DataFrame(all_predictions).assign(
    ensemble_hard=y_pred_hard,
    actual=y_test.values,
)

# Calculate agreement rate
def calculate_agreement(row):
    """Return True when every loaded model predicted the same class for this row."""
    distinct_votes = {row[name] for name, _ in loaded_models}
    return len(distinct_votes) == 1  # exactly one distinct vote means all agree

pred_df['all_agree'] = pred_df.apply(calculate_agreement, axis=1)
unanimous = pred_df['all_agree']

agreement_rate = unanimous.mean()
print(f"\nAll models agree on {agreement_rate*100:.1f}% of test cases")
print(f"Models disagree on {(1-agreement_rate)*100:.1f}% of test cases")

# Hard-voting accuracy, split by whether the base models were unanimous
ensemble_hit = pred_df['ensemble_hard'] == pred_df['actual']
agree_correct = ensemble_hit[unanimous].mean()
disagree_correct = ensemble_hit[~unanimous].mean()

print(f"\nWhen all models agree: {agree_correct*100:.1f}% correct")
print(f"When models disagree: {disagree_correct*100:.1f}% correct")

# Show the first few rows where at least one model voted differently
disagreement_cases = pred_df[~unanimous].head(5)
if not disagreement_cases.empty:
    print(f"\nExample cases where models disagree:")
    shown_columns = [name for name, _ in loaded_models] + ['ensemble_hard', 'actual']
    display(disagreement_cases[shown_columns])


VOTING PATTERN ANALYSIS

All models agree on 67.0% of test cases
Models disagree on 33.0% of test cases

When all models agree: 100.0% correct
When models disagree: 84.8% correct

Example cases where models disagree:


Unnamed: 0,decision_tree,random_forest,extra_trees,knn,linear_svc,ensemble_hard,actual
0,2,2,2,1,2,2,2
17,1,1,1,1,2,1,1
19,1,2,2,2,2,2,1
21,2,2,2,1,1,2,2
22,2,2,2,1,2,2,2


## Summary

The ensemble method combines predictions from all 5 trained models:

1. **Hard Voting**: Each model votes for a class, majority wins
2. **Soft Voting**: The models' predicted class probabilities are averaged and the class with the highest average wins (requires every model to support `predict_proba`)

### Benefits of Ensemble:
- Reduces overfitting by averaging predictions
- More robust to individual model weaknesses
- Often achieves better generalization

### When to Use:
- Use ensemble when you want the most reliable prediction
- Use individual models when you want to see different perspectives
- The comparison table above shows which approach works best for this dataset: on this holdout split the single decision tree scored highest, ahead of both ensembles