In [12]:
import pandas as pd
from sklearn.metrics import accuracy_score
from transformers import pipeline

# Load a few test samples
df = pd.read_csv("../data/mental_health_dataset.csv").sample(n=200, random_state=42)  # using 10–20 rows for quick check

# Load model from Hugging Face
model_name = "lishaangral/roberta-mental-health"
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="lishaangral/roberta-mental-health",
    tokenizer="lishaangral/roberta-mental-health",
    framework="pt"  # Force PyTorch backend
)

# Make predictions
preds = classifier(
    list(df["statement"]),
    truncation=True,
    max_length=512
)
pred_labels = [p["label"] for p in preds]

# Evaluate
true_labels = list(df["status"])
accuracy = accuracy_score(true_labels, pred_labels)
print(f"Quick Accuracy on {len(df)} samples: {accuracy:.1%}")

Device set to use cpu


Quick Accuracy on 200 samples: 91.5%


In [14]:
from sklearn.metrics import classification_report

# Generate full classification report
report = classification_report(true_labels, pred_labels, digits=3)
print(report)


                      precision    recall  f1-score   support

             Anxiety      0.917     1.000     0.957        11
             Bipolar      1.000     0.833     0.909         6
          Depression      0.897     0.897     0.897        68
              Normal      1.000     0.985     0.992        66
Personality disorder      1.000     1.000     1.000         4
              Stress      0.727     0.889     0.800         9
            Suicidal      0.829     0.806     0.817        36

            accuracy                          0.915       200
           macro avg      0.910     0.916     0.910       200
        weighted avg      0.917     0.915     0.915       200



In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from datasets import load_dataset
from transformers import pipeline
import numpy as np

# Load GoEmotions test data
test = load_dataset("go_emotions", split="test")

# Load multi-label classifier pipeline
model_name2 = "lishaangral/roberta-mental-health-v2"
classifier = pipeline(
    "text-classification",
    model=model_name2,
    tokenizer=model_name2,
    framework="pt",
    return_all_scores=True,
    truncation=True,
    max_length=512,
    top_k=None  # return scores for all labels
)


Device set to use cpu


In [35]:
# Get predictions
sampled_test = test.shuffle(seed=42).select(range(1000))
pred_outputs = classifier(list(sampled_test["text"]))

# Convert predictions to binary using threshold
threshold = 0.5
pred_labels_bin = []
for scores in pred_outputs:
    labels = [1 if s["score"] > threshold else 0 for s in scores]
    pred_labels_bin.append(labels)

# Convert true labels to binary format (GoEmotions uses list of label indices)
num_labels = len(pred_outputs[0])  # should be 28
true_labels_bin = []
for example_labels in sampled_test["labels"]:
    label_vec = [0] * num_labels
    for idx in example_labels:
        label_vec[idx] = 1
    true_labels_bin.append(label_vec)

# Convert to numpy arrays
y_true = np.array(true_labels_bin)
y_pred = np.array(pred_labels_bin)

# Evaluation
f1 = f1_score(y_true, y_pred, average="micro")
precision = precision_score(y_true, y_pred, average="micro")
recall = recall_score(y_true, y_pred, average="micro")
accuracy_per_label = (y_true == y_pred).mean()

print(f"F1 Score (micro): {f1:.4f}")
print(f"Precision (micro): {precision:.4f}")
print(f"Recall (micro): {recall:.4f}")
print(f"Per-label Accuracy: {accuracy_per_label:.4f}")


F1 Score (micro): 0.0921
Precision (micro): 0.1103
Recall (micro): 0.0791
Per-label Accuracy: 0.9359


In [38]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Load model and tokenizer
model_name = "lishaangral/roberta-mental-health-v2"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model.eval()

# Tokenize test texts
inputs = tokenizer(
    list(sampled_test["text"]),
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=128
)

# Predict logits
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.sigmoid(logits)
    preds = (probs > 0.5).int().cpu().numpy()

# Prepare true labels
# Number of labels
num_labels = model.config.num_labels  # Should be 28 for GoEmotions

# Convert list of label indices to binary multi-hot encoded matrix
true_labels = np.zeros((len(sampled_test), num_labels), dtype=int)
for i, label_list in enumerate(sampled_test["labels"]):
    for label in label_list:
        true_labels[i][label] = 1

# Micro-averaged scores
f1_micro = f1_score(true_labels, preds, average="micro")
precision_micro = precision_score(true_labels, preds, average="micro")
recall_micro = recall_score(true_labels, preds, average="micro")

print(f"\n=== Micro-Averaged Scores ===")
print(f"F1 (micro):     {f1_micro:.4f}")
print(f"Precision:      {precision_micro:.4f}")
print(f"Recall:         {recall_micro:.4f}")

# Per-label metrics
f1_per_label = f1_score(true_labels, preds, average=None)
precision_per_label = precision_score(true_labels, preds, average=None)
recall_per_label = recall_score(true_labels, preds, average=None)
accuracy_per_label = (preds == true_labels).mean(axis=0)

print(f"\n=== Per-Label Metrics ===")
for i in range(len(f1_per_label)):
    print(f"Label {i:2d}:  Acc: {accuracy_per_label[i]:.4f} | F1: {f1_per_label[i]:.4f} | Prec: {precision_per_label[i]:.4f} | Recall: {recall_per_label[i]:.4f}")



=== Micro-Averaged Scores ===
F1 (micro):     0.5972
Precision:      0.7152
Recall:         0.5126

=== Per-Label Metrics ===
Label  0:  Acc: 0.9430 | F1: 0.7077 | Prec: 0.6571 | Recall: 0.7667
Label  1:  Acc: 0.9800 | F1: 0.7959 | Prec: 0.7959 | Recall: 0.7959
Label  2:  Acc: 0.9730 | F1: 0.4255 | Prec: 0.4762 | Recall: 0.3846
Label  3:  Acc: 0.9570 | F1: 0.2712 | Prec: 0.8000 | Recall: 0.1633
Label  4:  Acc: 0.9400 | F1: 0.4000 | Prec: 0.6250 | Recall: 0.2941
Label  5:  Acc: 0.9840 | F1: 0.5000 | Prec: 0.7273 | Recall: 0.3810
Label  6:  Acc: 0.9820 | F1: 0.5714 | Prec: 0.7059 | Recall: 0.4800
Label  7:  Acc: 0.9590 | F1: 0.5393 | Prec: 0.7059 | Recall: 0.4364
Label  8:  Acc: 0.9830 | F1: 0.2609 | Prec: 0.3750 | Recall: 0.2000
Label  9:  Acc: 0.9710 | F1: 0.1212 | Prec: 0.5000 | Recall: 0.0690
Label 10:  Acc: 0.9450 | F1: 0.2667 | Prec: 0.4167 | Recall: 0.1961
Label 11:  Acc: 0.9820 | F1: 0.4706 | Prec: 0.7273 | Recall: 0.3478
Label 12:  Acc: 0.9920 | F1: 0.5556 | Prec: 1.0000 | Reca

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
