## Imports

In [1]:
# Imports.
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import nltk

# Fetch the NLTK corpora that the text preprocessing relies on: the English
# stop-word list (read via stopwords.words('english')) and WordNet (used by
# WordNetLemmatizer). nltk.download reports when a package is already
# up to date, so re-running this cell is cheap.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Gagana Methmal
[nltk_data]     (GM)\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Gagana Methmal
[nltk_data]     (GM)\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Text Preprocessing

In [2]:
# Function to Preprocess.
def preprocess_text(text):
    """Normalise a raw statement into a space-separated string of lemmas.

    Processing steps, in order:
      1. Replace every character that is not an ASCII letter with a space.
      2. Lower-case the result and split it on whitespace.
      3. Drop tokens found in NLTK's English stop-word list.
      4. Lemmatize the remaining tokens with WordNetLemmatizer (default
         settings) and join them with single spaces.

    Args:
        text (str): The raw statement to clean.

    Returns:
        str: The cleaned statement; an empty string when no token survives.
    """
    # The lemmatizer and the stop-word set do not depend on the input, but
    # building them re-reads the stop-word corpus each time. Because this
    # function is applied row by row, build them once and cache them on the
    # function object instead of recreating them on every call.
    tools = getattr(preprocess_text, '_tools', None)
    if tools is None:
        tools = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        preprocess_text._tools = tools
    lemmatizer, stop_words = tools

    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)

## Loading Dataset 

In [3]:
# Read the labelled statements into a DataFrame and show which columns it has.
dataset_file = 'Mental Health Sentiments.csv'
df = pd.read_csv(dataset_file)
print("Columns in the dataset:", df.columns)

Columns in the dataset: Index(['ID', 'Statement', 'Status'], dtype='object')


## Data Preprocessing 

In [4]:
# Clean every statement with preprocess_text. Missing values are replaced by
# a placeholder string first, so the cleaner always receives text.
filled_statements = df['Statement'].fillna('Missing Statement')
df['Statement'] = filled_statements.apply(preprocess_text)

## Encoding Labels

In [5]:
# Map the Status labels to integer codes. The fitted encoder is kept so that
# predicted codes can be converted back to label names later on.
# NOTE: the column is overwritten in place, so re-running this cell on its own
# would fit the encoder on the integer codes rather than the original labels.
label_encoder = LabelEncoder()
label_encoder.fit(df['Status'])
df['Status'] = label_encoder.transform(df['Status'])

## Checking class balance before SMOTE

In [6]:
# Count how many rows carry each encoded Status value before any resampling.
status_counts = df['Status'].value_counts()
print("\nClass Distribution Before Balancing:")
print(status_counts)


Class Distribution Before Balancing:
Status
3    16351
2    15404
6    10652
0     3888
1     2877
5     2669
4     1201
7        3
Name: count, dtype: int64


## Handling Imbalanced Dataset with SMOTE

In [7]:
# Vectorise the cleaned statements (unigrams and bigrams, TF-IDF weighted) and
# oversample the minority classes with SMOTE.
# NOTE(review): both the vectorizer fit and the oversampling run on the full
# dataset, before the train/test split performed in a later cell. The held-out
# set therefore contains synthetic rows and shares fitted statistics with the
# training data, so the reported scores are likely optimistic. Consider
# splitting first and resampling only the training portion.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(df['Statement'])
y = df['Status']

# k_neighbors is set to 2 here; presumably lowered because the rarest class
# has very few rows -- confirm against the class distribution.
smote = SMOTE(random_state=42, k_neighbors=2)
try:
    X_resampled, y_resampled = smote.fit_resample(X, y)
except ValueError as err:
    # Report the failure with a hint, then let the error propagate so the
    # notebook does not continue with undefined resampled data.
    print("SMOTE Error:", err)
    print("Adjusting the dataset or resampling parameters may be required.")
    raise

print("\nClass Distribution After Balancing:")
resampled_counts = pd.Series(y_resampled).value_counts()
print(resampled_counts)

KeyboardInterrupt: 

## Splitting for Training and Testing

In [8]:
# Hold out 20% of the resampled rows for evaluation; the fixed random_state
# keeps the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
)

## Hyperparameter Tuning

In [9]:
# Tune logistic regression with a 5-fold cross-validated grid search over the
# regularisation strength C and the solver, scored by accuracy.
param_grid = dict(
    C=[0.1, 1, 10],
    solver=['lbfgs', 'liblinear'],
    penalty=['l2'],
)
base_estimator = LogisticRegression(max_iter=1000)
grid_search = GridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# Keep the estimator returned for the best-scoring parameter combination.
best_model = grid_search.best_estimator_
print("\nBest Parameters:", grid_search.best_params_)


Best Parameters: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}


## Evaluate Model

In [10]:
# Score the tuned model on the held-out split: overall accuracy, per-class
# precision/recall/F1, and the confusion matrix.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy of the model:", accuracy * 100, "%")

print("\nClassification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

print("\nConfusion Matrix:")
confusion = confusion_matrix(y_test, y_pred)
print(confusion)


Accuracy of the model: 89.78671355400964 %

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      3250
           1       0.99      0.95      0.97      3338
           2       0.83      0.68      0.75      3256
           3       0.87      0.90      0.88      3302
           4       1.00      0.90      0.95      3239
           5       0.97      0.97      0.97      3204
           6       0.74      0.81      0.77      3313
           7       0.85      1.00      0.92      3260

    accuracy                           0.90     26162
   macro avg       0.90      0.90      0.90     26162
weighted avg       0.90      0.90      0.90     26162


Confusion Matrix:
[[3169    1   11   17    1    5    5   41]
 [   8 3175   12   10    0    2    5  126]
 [  80   30 2223  150    8   44  721    0]
 [  22    3   70 2971    0   30  205    1]
 [   0    0    0    0 2928    0    0  311]
 [   3    2    6    7    1 3094    4   87]
 [  

## Predict for real-time use

In [11]:
# Classify a single free-text statement: clean it with the same preprocessing
# function, vectorise it with the already-fitted vectorizer, then ask the
# model for both the predicted class and the per-class probabilities.
custom_input = ["I'm feeling overwhelmed."]
custom_input_preprocessed = list(map(preprocess_text, custom_input))
custom_input_transformed = vectorizer.transform(custom_input_preprocessed)
custom_prediction = best_model.predict(custom_input_transformed)
custom_probabilities = best_model.predict_proba(custom_input_transformed)

# Convert the predicted integer code back to its original label name.
predicted_label = label_encoder.inverse_transform(custom_prediction)
print("\nPredicted Class for the User Input:", predicted_label)
print("\nConfidence Scores for Each Class:", custom_probabilities)


Predicted Class for the User Input: ['Anxiety']

Confidence Scores for Each Class: [[3.64426242e-01 4.88491776e-03 4.81330080e-02 9.73443731e-02
  4.49041501e-03 1.90853826e-01 2.89820173e-01 4.70453513e-05]]


In [14]:
import joblib
import os

# Persist the three artifacts needed for inference (model, fitted vectorizer,
# label encoder) into the server's model directory, creating it if missing.
model_dir = 'server/app/models/'
os.makedirs(model_dir, exist_ok=True)

# Build the three destination paths inside model_dir.
model_path, vectorizer_path, label_encoder_path = (
    os.path.join(model_dir, file_name)
    for file_name in ('mental_health_model.pkl', 'vectorizer.pkl', 'label_encoder.pkl')
)

print(f"Saving model to: {model_path}")
print(f"Saving vectorizer to: {vectorizer_path}")
print(f"Saving label encoder to: {label_encoder_path}")

# Write each object with joblib; the last call is left as a bare expression
# so its return value is shown as the cell output.
joblib.dump(best_model, model_path)
joblib.dump(vectorizer, vectorizer_path)
joblib.dump(label_encoder, label_encoder_path)


Saving model to: server/app/models/mental_health_model.pkl
Saving vectorizer to: server/app/models/vectorizer.pkl
Saving label encoder to: server/app/models/label_encoder.pkl


['server/app/models/label_encoder.pkl']