In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import time
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from pprint import pprint

# Load the pre-processed emotions dataset, add a token-list column obtained by
# whitespace-splitting 'preprocessed_text', then drop every row that still
# contains a missing value in any column.
preprocessed_data = (
    pd.read_csv('processed_emotions_dataset_2.csv', index_col=0)
    .assign(
        preprocessed_text_split=lambda frame: frame['preprocessed_text'].str.split()
    )
    .dropna()
)
print(preprocessed_data.head())

                                                     text  label  \
0           i just feel really helpless and heavy hearted      4   
1       ive enjoyed being able to slouch about relax a...      0   
2       i gave up my internship with the dmrg and am f...      4   
3                              i dont know i feel so lost      0   
4       i am a kindergarten teacher and i am thoroughl...      4   
...                                                   ...    ...   
420236  i feel blessed to be able to see that we didn ...      1   
420237  i think another reason i love concerts is it i...      1   
420238  i usually take on to more protein when i start...      0   
420239  i feel that rich people will never understand ...      1   
420240  i feel slightly naughty holding this cd seeing...      2   

                                        preprocessed_text  \
0                        feel realli helpless heavi heart   
1       ive enjoy abl slouch relax unwind frankli need...   


In [None]:

# Number of rows per emotion label, most frequent label first.
label_counts = preprocessed_data['label'].value_counts(sort=True, ascending=False)
print(label_counts)


label
1    141636
0    121755
3     57883
4     48281
2     35126
5     15544
Name: count, dtype: int64


In [None]:

# Train skip-gram (sg=1) Word2Vec embeddings of size 100 on the token lists.
# NOTE(review): gensim documents that a fixed `seed` only yields fully
# reproducible vectors when training is limited to one worker thread
# (workers=1) -- confirm whether exact reproducibility matters here.
w2v_model = Word2Vec(
    sentences=preprocessed_data['preprocessed_text_split'],
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
    seed=42,
)

In [None]:

def document_vector(words, model=None):
    """Average the Word2Vec embeddings of a document's in-vocabulary tokens.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document.
    model : optional
        Object exposing a ``wv`` attribute that supports ``token in wv``,
        ``wv[list_of_tokens]`` and ``wv.vector_size`` (e.g. a trained gensim
        ``Word2Vec``). Defaults to the module-level ``w2v_model``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``wv.vector_size``: the element-wise mean of the
        known tokens' vectors, or all zeros when no token is in the vocabulary.
    """
    keyed_vectors = (w2v_model if model is None else model).wv
    known_words = [word for word in words if word in keyed_vectors]
    if not known_words:
        # Size the fallback from the model rather than hard-coding 100, and
        # keep it float32: a float64 zero vector would make np.vstack promote
        # the whole feature matrix to float64, which a float32 nn.Linear
        # rejects. (gensim word vectors are float32.)
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    # TODO: integrate LDA here?
    return np.mean(keyed_vectors[known_words], axis=0)


In [None]:

# Attach one averaged embedding per row, computed from that row's token list.
preprocessed_data['doc_vector'] = (
    preprocessed_data['preprocessed_text_split'].apply(document_vector)
)

In [None]:

# Stack the per-document vectors row-wise into a 2-D feature matrix and take
# the 'label' column as the classification target.
X = np.vstack(preprocessed_data['doc_vector'].to_numpy())
y = preprocessed_data['label']

In [None]:

# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset

class CustomDataset(Dataset):
    """Map-style dataset that pairs each sample with the label at the same index.

    Parameters
    ----------
    data : indexable, sized
        Samples; ``data[i]`` is returned as the first element of item ``i``.
    labels : indexable, sized
        Targets; ``labels[i]`` is returned as the second element of item ``i``.

    Raises
    ------
    ValueError
        If ``data`` and ``labels`` do not have the same length.
    """

    def __init__(self, data, labels):
        # Fail fast: the dataset length is taken from `data`, so a length
        # mismatch would otherwise only show up as an IndexError (or silently
        # ignored labels) in the middle of an epoch.
        if len(data) != len(labels):
            raise ValueError(
                "data and labels must have the same length, "
                f"got {len(data)} and {len(labels)}"
            )
        self.data = data
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        return self.data[idx], self.labels[idx]

# torch.from_numpy keeps the NumPy dtype, so cast explicitly: features must be
# float32 to match the default dtype of nn.Linear weights, and class targets
# must be int64 for nn.CrossEntropyLoss.
dataset_train = CustomDataset(
    torch.from_numpy(X_train).float(),
    torch.as_tensor(y_train.to_numpy(), dtype=torch.long),
)

# Mini-batches of 16 samples, reshuffled at the start of every epoch.
dataloader_train = DataLoader(dataset_train, batch_size=16, shuffle=True)


In [None]:
import torch.nn as nn

class FF_Net(nn.Module):
    """Fully connected classifier: Linear -> ReLU blocks followed by a Linear output layer.

    Parameters
    ----------
    input_dim : int, default 100
        Number of input features per sample.
    hidden_dims : sequence of int, default (48, 12)
        Width of each hidden layer, in order. May be empty.
    num_classes : int, default 6
        Number of output logits.

    With the defaults the layer stack is identical to
    ``Linear(100, 48) -> ReLU -> Linear(48, 12) -> ReLU -> Linear(12, 6)``,
    so the ``state_dict`` keys are unchanged.
    """

    def __init__(self, input_dim=100, hidden_dims=(48, 12), num_classes=6):
        super().__init__()
        layers = []
        in_features = input_dim
        for out_features in hidden_dims:
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
            in_features = out_features
        # Final layer emits raw logits; no activation is applied here.
        layers.append(nn.Linear(in_features, num_classes))
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the output of the layer stack for input batch ``x``."""
        return self.linear_relu_stack(x)


# Seed PyTorch's global RNG before building the model so that weight
# initialisation is repeatable, consistent with the seed of 42 used for
# Word2Vec and the train/test split.
torch.manual_seed(42)

feedforward_net = FF_Net()

# Multi-class cross-entropy computed on the raw logits produced by the network.
criterion = nn.CrossEntropyLoss()

optimizer_ffn = torch.optim.Adam(feedforward_net.parameters(), lr=0.001)


In [None]:
loss_ffn = []  # one entry per epoch: the sum of that epoch's per-batch losses
num_epochs_ffn = 20

# Put the network in training mode explicitly, in case it was switched to
# eval mode earlier in the session and this cell is being re-run.
feedforward_net.train()

for epoch in range(num_epochs_ffn):
    running_loss_ffn = 0.0

    for inputs, labels in dataloader_train:
        # Clear gradients left over from the previous optimisation step.
        optimizer_ffn.zero_grad()

        outputs = feedforward_net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer_ffn.step()
        running_loss_ffn += loss.item()

    # The reported value is the sum over batches, not a per-batch average.
    print(f"Training loss: {running_loss_ffn}")
    loss_ffn.append(running_loss_ffn)

print('Finished Training')

# Persist only the learned parameters (state_dict), not the module object.
torch.save(feedforward_net.state_dict(), 'ffn.pth')

Training loss: 18390.042159244418
Training loss: 15822.920703321695
Training loss: 14997.72575198114
Training loss: 14552.11619593203
Training loss: 14247.098839044571
Training loss: 14010.443231776357
Training loss: 13838.617416538298
Training loss: 13693.30406486243
Training loss: 13570.959481112659
Training loss: 13485.723673276603
Training loss: 13403.920531377196
Training loss: 13328.965324550867
Training loss: 13265.332359328866
Training loss: 13223.544266559184
Training loss: 13158.249276332557
Training loss: 13113.720120027661
Training loss: 13082.880478098989
Training loss: 13039.268180586398
Training loss: 13001.805877000093
Training loss: 12974.138691589236
Finished Training


In [None]:
# Evaluate on the held-out split: inference mode, no gradient tracking.
feedforward_net.eval()
with torch.no_grad():
    # Cast to float32 so the input dtype matches the network's weights.
    inputs = torch.from_numpy(X_test).float()
    output = feedforward_net(inputs)
    # Predicted class = index of the largest logit in each row.
    y_pred = output.argmax(1)
    y_true = torch.tensor(y_test.to_list())
    # Hand scikit-learn plain NumPy arrays instead of relying on implicit
    # tensor-to-array coercion; y_pred itself stays a tensor for later cells.
    print(classification_report(y_true.numpy(), y_pred.numpy()))

              precision    recall  f1-score   support

           0       0.76      0.83      0.79     24583
           1       0.81      0.83      0.82     28247
           2       0.72      0.48      0.58      6877
           3       0.72      0.70      0.71     11629
           4       0.70      0.69      0.69      9576
           5       0.60      0.56      0.58      3133

    accuracy                           0.76     84045
   macro avg       0.72      0.68      0.70     84045
weighted avg       0.76      0.76      0.75     84045



In [None]:

# Side-by-side table of true and predicted labels for the held-out rows;
# only the first five rows are printed.
predicted_df = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
print(predicted_df.head(n=5))

        Actual  Predicted
412679       4          4
346836       0          0
80692        1          1
292510       2          1
238292       5          5
