In [1]:
import pickle
from pathlib import Path

import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load the processed dataset that holds the raw/cleaned statements, the
# mental-health status label and the detected emotion for each row.
data_path = "../data/processed/emotion_mental_health.csv"
df = pd.read_csv(data_path)

# Fail fast with a clear message if the file lacks a column that the
# feature-engineering cells index into, instead of a bare KeyError later on.
missing_columns = sorted({"cleaned_statement", "emotion", "status"} - set(df.columns))
if missing_columns:
    raise KeyError(f"{data_path} is missing required column(s): {missing_columns}")


In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)


Unnamed: 0.1,Unnamed: 0,statement,status,cleaned_statement,emotion
0,0,oh my gosh,Anxiety,oh gosh,surprise
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety,trouble sleeping confused mind restless heart ...,fear
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,wrong back dear forward doubt stay restless re...,fear
3,3,I've shifted my focus to something else but I'...,Anxiety,ive shifted focus something else im still worried,fear
4,4,"I'm restless and restless, it's been a month n...",Anxiety,im restless restless month boy mean,fear


In [3]:
# Dimensions of the loaded frame, displayed as (rows, columns).
n_rows, n_cols = df.shape
n_rows, n_cols


(18441, 5)

In [4]:
# Class balance of the target: number of rows per `status` value.
status_counts = df['status'].value_counts()
status_counts


status
Anxiety                 3000
Depression              3000
Normal                  3000
Suicidal                3000
Bipolar                 2777
Stress                  2587
Personality disorder    1077
Name: count, dtype: int64

In [5]:
# TF-IDF over unigrams and bigrams, with the vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2)
)

# pandas parses empty CSV fields as NaN, and TfidfVectorizer raises
# ValueError on NaN documents. Replace missing text with an empty string
# (an all-zero TF-IDF row) rather than dropping rows, so X_text stays
# row-aligned with the emotion and status columns taken from the same frame.
cleaned_text = df['cleaned_statement'].fillna("")

X_text = tfidf.fit_transform(cleaned_text)
X_text.shape


(18441, 5000)

In [6]:
# Map each emotion string to an integer code and shape the codes as a single
# column so they can be stacked next to the TF-IDF matrix.
# NOTE(review): one integer column implies an ordering among emotions;
# consider one-hot encoding if the downstream model is sensitive to that.
emotion_encoder = LabelEncoder()
emotion_codes = emotion_encoder.fit_transform(df['emotion'])
emotion_encoded = emotion_codes.reshape(len(emotion_codes), 1)


In [7]:
# Append the emotion code as one extra column after the TF-IDF columns.
# Without `format`, scipy chooses the result format itself (COO for mixed
# sparse/dense blocks — TODO confirm on the installed version), and COO does
# not support row indexing. Request CSR so the saved matrix can be sliced
# directly by whatever loads it.
X_features = hstack([X_text, emotion_encoded], format="csr")
X_features.shape


(18441, 5001)

In [8]:
# Encode the target `status` strings as integer class ids; the fitted encoder
# is kept so the ids can be mapped back to their names.
status_column = df['status']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(status_column)


In [10]:
# Persist the fitted transformers and the assembled feature/label arrays
# under ../models, one pickle file per object.
models_dir = Path("../models")
# On a fresh checkout the directory may not exist yet; open(..., "wb") would
# then raise FileNotFoundError, so create it first.
models_dir.mkdir(parents=True, exist_ok=True)

artifacts = {
    "tfidf_vectorizer.pkl": tfidf,
    "emotion_encoder.pkl": emotion_encoder,
    "label_encoder.pkl": label_encoder,
    "X_features.pkl": X_features,
    "y_labels.pkl": y,
}

for filename, artifact in artifacts.items():
    with open(models_dir / filename, "wb") as f:
        pickle.dump(artifact, f)

print("Feature engineering completed successfully")


Feature engineering completed successfully
