In [199]:
#importing necessary libraries
import pandas as pd  #data analysis
import numpy as np  #numerical operations and arrays
import re  #regular expressions for text cleaning
from nltk.corpus import stopwords  #stop words for preprocessing
from nltk.tokenize import word_tokenize  #tokenize text into words
from nltk.stem import WordNetLemmatizer  #lemmatizes words to base form
from sklearn.feature_extraction.text import CountVectorizer  #bag of words representation
from sklearn.model_selection import train_test_split  #splits data into training and testing sets
from sklearn.naive_bayes import MultinomialNB  #naive bayes for classification
from sklearn.linear_model import LogisticRegression  #logistic regression for classification
from sklearn.metrics import classification_report  #model evaluation

In [200]:
import kagglehub  #tool for downloading datasets directly from kaggle

In [201]:
#download the latest version of the dataset; the returned value is the local folder that holds the files
path = kagglehub.dataset_download("suchintikasarkar/sentiment-analysis-for-mental-health")
print("Path to dataset files:", path)  #show where the dataset was stored locally

Path to dataset files: /root/.cache/kagglehub/datasets/suchintikasarkar/sentiment-analysis-for-mental-health/versions/1


In [202]:
#NOTE(review): this import would normally sit with the other imports in the first cell
import os
print(os.listdir(path))  #list the files in the downloaded dataset folder to find the csv file name

['Combined Data.csv']


In [203]:
#load the dataset from the downloaded path
#'Combined Data.csv' is the only file reported by the directory listing of `path`
dataset_path = f"{path}/Combined Data.csv"  #full path to the dataset csv file
df = pd.read_csv(dataset_path)  #read the csv file into a pandas dataframe
print("Dataset columns:", df.columns)  #print column names to verify the dataset structure (debug aid)

Dataset columns: Index(['Unnamed: 0', 'statement', 'status'], dtype='object')


In [204]:
#collect the distinct values of the 'status' column, i.e. the original multi-class labels
unique_classes = df['status'].unique()
print("Unique classes in the dataset:", unique_classes)

Unique classes in the dataset: ['Anxiety' 'Normal' 'Depression' 'Suicidal' 'Stress' 'Bipolar'
 'Personality disorder']


In [205]:
#fill missing statements with empty strings and force the column to str
#(no text cleaning or label mapping happens in this cell; those are done in separate cells)
df['statement'] = df['statement'].fillna('').astype(str)  #replace missing values with '' and cast to str
print("Dataset loaded and preprocessed.")

Dataset loaded and preprocessed.


In [225]:
#text preprocessing
stop_words = set(stopwords.words('english'))  #built once so each token lookup is a cheap set membership test
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize one statement for the bag-of-words model.

    Steps, in order: strip every character that is neither a word character
    nor whitespace, lowercase, tokenize with word_tokenize, drop English
    stopwords, lemmatize each remaining token.

    Args:
        text: the raw statement as a str.

    Returns:
        The surviving lemmatized tokens joined by single spaces ('' when no
        token survives).
    """
    #NOTE(review): punctuation is stripped before stopword removal, so contractions such as "don't"
    #become "dont" and may no longer match the stopword list — confirm this is intended
    text = re.sub(r'[^\w\s]', '', text)  #remove punctuation and special characters
    text = text.lower()  #convert to lowercase
    tokens = word_tokenize(text)  #tokenize text
    #use the precomputed set; calling stopwords.words('english') here would rebuild the list for every token
    tokens = [word for word in tokens if word not in stop_words]  #remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  #lemmatize tokens
    return ' '.join(tokens)

In [207]:
#collapse the multi-class status labels into two classes (normal, abnormal)
def map_to_binary(label):
    """Return 'abnormal' for any of the six listed condition labels, otherwise 'normal'."""
    abnormal_statuses = (
        'Anxiety',
        'Bipolar',
        'Depression',
        'Personality disorder',
        'Stress',
        'Suicidal',
    )
    if label in abnormal_statuses:
        return 'abnormal'
    #everything else, including labels that are not recognised, falls through to 'normal'
    return 'normal'

In [208]:
#derive the binary target: run map_to_binary over every 'status' value and store it as 'binary_status'
df['binary_status'] = df['status'].apply(map_to_binary)

In [209]:
#clean every statement with preprocess_text and keep the result in a new 'cleaned_text' column
#(the 'statement' column itself is not modified here)
df['cleaned_text'] = df['statement'].apply(preprocess_text)

In [217]:
# Bag of words representation
vectorizer = CountVectorizer() #convert text to a bag of words representation
#NOTE(review): the vocabulary is fitted on the whole corpus before the train/test split, so words that occur only in test rows are part of it — consider fitting on the training split only
X = vectorizer.fit_transform(df['cleaned_text']) #fit and transform the cleaned text to vectors
y = df['binary_status'] #target labels taken from the binary mapping ('normal' / 'abnormal')

In [218]:
#ensure binary labels only: anything that is not exactly 'normal' is relabelled 'abnormal'
y = y.apply(lambda status: 'abnormal' if status != 'normal' else 'normal')

In [219]:
#hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [220]:
#train naive bayes model (MultinomialNB is already imported with the other imports in the first cell)
nb_model = MultinomialNB()  #initialize naive bayes model
nb_model.fit(X_train, y_train)  #train the model
nb_predictions = nb_model.predict(X_test)  #predict on test data
#sep="" stops print from inserting a space after the "\n", which pushed the report's header row one column to the right
print("Naive Bayes Classification Report:\n", classification_report(y_test, nb_predictions), sep="")  #evaluate performance

Naive Bayes Classification Report:
               precision    recall  f1-score   support

    abnormal       0.82      0.97      0.89      7282
      normal       0.90      0.55      0.68      3327

    accuracy                           0.84     10609
   macro avg       0.86      0.76      0.79     10609
weighted avg       0.85      0.84      0.83     10609



In [221]:
#train logistic regression model
lr_model = LogisticRegression(max_iter=1000)  #initialize logistic regression model (1000 max iterations)
lr_model.fit(X_train, y_train)  #train the model
lr_predictions = lr_model.predict(X_test)  #predict on test data
#sep="" stops print from inserting a space after the "\n", which pushed the report's header row one column to the right
print("Logistic Regression Classification Report:\n", classification_report(y_test, lr_predictions), sep="")  #evaluate performance

Logistic Regression Classification Report:
               precision    recall  f1-score   support

    abnormal       0.97      0.94      0.96      7282
      normal       0.88      0.94      0.91      3327

    accuracy                           0.94     10609
   macro avg       0.93      0.94      0.93     10609
weighted avg       0.94      0.94      0.94     10609



In [222]:
#NOTE(review): this import would normally sit with the other imports in the first cell
from sklearn.metrics import accuracy_score
print("Test Accuracy (Naive Bayes):", accuracy_score(y_test, nb_predictions))  #accuracy of the naive bayes predictions on the test set
print("Test Accuracy (Logistic Regression):", accuracy_score(y_test, lr_predictions))  #accuracy of the logistic regression predictions on the test set

Test Accuracy (Naive Bayes): 0.838062022810821
Test Accuracy (Logistic Regression): 0.9405221981336601


In [224]:
#classify a handful of unseen posts with the trained logistic regression model
new_posts = [
    "Had a great day with friends!",
    "I feel so stressed and anxious lately.",
    "I'm nervous about the future",
    "What is the point of living anymore",
]
#run the posts through the same cleaning function and the already-fitted vectorizer before predicting
new_posts_cleaned = list(map(preprocess_text, new_posts))
new_posts_vectorized = vectorizer.transform(new_posts_cleaned)
new_predictions = lr_model.predict(new_posts_vectorized)
print("Predictions for new posts:", new_predictions)

Predictions for new posts: ['normal' 'abnormal' 'abnormal' 'abnormal']
