In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import time

preprocessed_data = pd.read_csv('processed_emotions_dataset.csv',index_col=0)
preprocessed_data['preprocessed_text_split'] = preprocessed_data['preprocessed_text'].str.split()
preprocessed_data = preprocessed_data.dropna()
print(preprocessed_data)

                                                     text    label  \
0           i just feel really helpless and heavy hearted     fear   
1       ive enjoyed being able to slouch about relax a...  sadness   
2       i gave up my internship with the dmrg and am f...     fear   
3                              i dont know i feel so lost  sadness   
4       i am a kindergarten teacher and i am thoroughl...     fear   
...                                                   ...      ...   
420236  i feel blessed to be able to see that we didn ...      joy   
420237  i think another reason i love concerts is it i...      joy   
420238  i usually take on to more protein when i start...  sadness   
420239  i feel that rich people will never understand ...      joy   
420240  i feel slightly naughty holding this cd seeing...     love   

                                        preprocessed_text  \
0                        feel realli helpless heavi heart   
1       ive enjoy abl slouch relax un

In [None]:
label_counts = preprocessed_data['label'].value_counts()
print(label_counts)


label
joy         141636
sadness     121755
anger        57883
fear         48281
love         35126
surprise     15544
Name: count, dtype: int64


In [3]:
print(preprocessed_data.head())

                                                text    label  \
0      i just feel really helpless and heavy hearted     fear   
1  ive enjoyed being able to slouch about relax a...  sadness   
2  i gave up my internship with the dmrg and am f...     fear   
3                         i dont know i feel so lost  sadness   
4  i am a kindergarten teacher and i am thoroughl...     fear   

                                   preprocessed_text  \
0                   feel realli helpless heavi heart   
1  ive enjoy abl slouch relax unwind frankli need...   
2               gave internship dmrg feel distraught   
3                                dont know feel lost   
4  kindergarten teacher thoroughli weari job take...   

                             preprocessed_text_split  
0             [feel, realli, helpless, heavi, heart]  
1  [ive, enjoy, abl, slouch, relax, unwind, frank...  
2         [gave, internship, dmrg, feel, distraught]  
3                           [dont, know, feel, lost]

In [None]:
w2v_model = Word2Vec(sentences=preprocessed_data['preprocessed_text_split'], vector_size=100, window=5, min_count=1, sg=1, seed=42)

In [None]:
def document_vector(words):
    words = [word for word in words if word in w2v_model.wv]
    if len(words) == 0:
        return np.zeros(100) 
    return np.mean(w2v_model.wv[words], axis=0)


In [None]:
preprocessed_data['doc_vector'] = preprocessed_data['preprocessed_text_split'].apply(document_vector)

In [None]:
X = np.vstack(preprocessed_data['doc_vector'].values)
y = preprocessed_data['label']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       anger       0.38      0.40      0.39     11629
        fear       0.33      0.36      0.34      9576
         joy       0.59      0.59      0.59     28247
        love       0.22      0.22      0.22      6877
     sadness       0.55      0.52      0.53     24583
    surprise       0.16      0.17      0.16      3133

    accuracy                           0.47     84045
   macro avg       0.37      0.37      0.37     84045
weighted avg       0.47      0.47      0.47     84045



In [None]:
predicted_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(predicted_df.head())

          Actual Predicted
412679      fear  surprise
346836   sadness   sadness
80692        joy       joy
292510      love       joy
238292  surprise   sadness
