In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import os
import json

In [24]:
df = pd.read_csv('Combined data.csv')
df = df.drop(columns=['Unnamed: 0'])
df = df.dropna()
df.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [25]:
df['status'].value_counts()

status
Normal                  16343
Depression              15404
Suicidal                10652
Anxiety                  3841
Bipolar                  2777
Stress                   2587
Personality disorder     1077
Name: count, dtype: int64

In [31]:
X_train, X_test, y_train, y_test = train_test_split(df['statement'], df['status'], test_size=0.2, random_state=42)
pipeline = Pipeline([
    ('vec', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

                      precision    recall  f1-score   support

             Anxiety       0.84      0.74      0.78       755
             Bipolar       0.88      0.66      0.76       527
          Depression       0.67      0.74      0.70      3016
              Normal       0.84      0.94      0.89      3308
Personality disorder       0.87      0.42      0.57       237
              Stress       0.75      0.43      0.54       536
            Suicidal       0.69      0.65      0.67      2158

            accuracy                           0.76     10537
           macro avg       0.79      0.65      0.70     10537
        weighted avg       0.76      0.76      0.75     10537



In [None]:
feature_names = pipeline.named_steps['vec'].get_feature_names_out()
coefs = pipeline.named_steps['clf'].coef_
class_labels = pipeline.named_steps['clf'].classes_

import numpy as np

for i, classe in enumerate(class_labels):
    top10_idx = np.argsort(coefs[i])[-10:]
    top10_terms = feature_names[top10_idx]
    top10_weights = coefs[i][top10_idx]
    print(f"\nClasse '{classe}' — Top 10 termos:")
    for termo, peso in zip(top10_terms[::-1], top10_weights[::-1]):
        print(f"  {termo:20s} → {peso:.4f}")



Classe 'Anxiety' — Top 10 termos:
  anxiety              → 8.8858
  restless             → 8.4171
  anxious              → 6.6096
  worried              → 6.0306
  nervous              → 5.8620
  worry                → 4.5810
  restlessness         → 3.7713
  health               → 3.7705
  symptoms             → 3.6948
  cancer               → 3.5671

Classe 'Bipolar' — Top 10 termos:
  bipolar              → 12.7955
  manic                → 7.6727
  mania                → 5.6517
  meds                 → 5.5900
  lamictal             → 5.5448
  episode              → 5.4103
  lithium              → 4.7613
  hypomanic            → 4.5150
  seroquel             → 4.1023
  ve                   → 4.0036

Classe 'Depression' — Top 10 termos:
  depression           → 13.6120
  wa                   → 8.1917
  depressed            → 5.9729
  doe                  → 4.2057
  ha                   → 3.9086
  http                 → 3.7170
  pression             → 3.5481
  na                   → 3

In [33]:
# pega o máximo absoluto do coeficiente em qualquer classe
max_abs = np.max(np.abs(coefs), axis=0)
top_overall_idx = np.argsort(max_abs)[-10:]
print("\nTop 10 termos mais discriminativos (por magnitude):")
for idx in top_overall_idx[::-1]:
    print(f"  {feature_names[idx]:20s} → {max_abs[idx]:.4f}")



Top 10 termos mais discriminativos (por magnitude):
  depression           → 13.6120
  bipolar              → 12.7955
  don                  → 12.2364
  avpd                 → 12.0696
  ve                   → 9.4356
  stress               → 9.1797
  anxiety              → 8.8858
  restless             → 8.4171
  wa                   → 8.1917
  manic                → 7.6727
