In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load the preprocessed emotions dataset, add a whitespace-tokenized copy of
# 'preprocessed_text', then drop every row that has a missing value in any column.
preprocessed_data = (
    pd.read_csv('processed_emotions_dataset_2.csv', index_col=0)
    .assign(preprocessed_text_split=lambda frame: frame['preprocessed_text'].str.split())
    .dropna()
)
print(preprocessed_data.head())

                                                text  label  \
0      i just feel really helpless and heavy hearted      4   
1  ive enjoyed being able to slouch about relax a...      0   
2  i gave up my internship with the dmrg and am f...      4   
3                         i dont know i feel so lost      0   
4  i am a kindergarten teacher and i am thoroughl...      4   

                                   preprocessed_text  \
0                   feel realli helpless heavi heart   
1  ive enjoy abl slouch relax unwind frankli need...   
2               gave internship dmrg feel distraught   
3                                dont know feel lost   
4  kindergarten teacher thoroughli weari job take...   

                             preprocessed_text_split  
0             [feel, realli, helpless, heavi, heart]  
1  [ive, enjoy, abl, slouch, relax, unwind, frank...  
2         [gave, internship, dmrg, feel, distraught]  
3                           [dont, know, feel, lost]  
4  [kinde

In [2]:
# Train a skip-gram (sg=1) Word2Vec model on the tokenized text data.
# min_count=1 keeps every token, including those that occur only once.
# workers=1 is required for `seed` to give a repeatable run: with several
# worker threads the order in which training batches are consumed depends on
# OS thread scheduling. (Across interpreter launches, PYTHONHASHSEED must
# also be fixed for full reproducibility.)
w2v_model = Word2Vec(
    sentences=preprocessed_data['preprocessed_text_split'],
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
    seed=42,
    workers=1,
)

In [3]:
# Function to compute the mean of word vectors for each document
def document_vector(words, model=None):
    """Return the mean word embedding of one tokenized document.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document.
    model : optional
        Object exposing a gensim-style ``wv`` keyed-vector attribute
        (membership test, list indexing, ``vector_size``). When omitted, the
        notebook-level ``w2v_model`` is used.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``model.wv.vector_size``: the element-wise mean
        of the vectors of all in-vocabulary tokens, or a float32 zero vector
        when no token is in the vocabulary.
    """
    if model is None:
        model = w2v_model
    keyed_vectors = model.wv

    # Filter words to only those in the Word2Vec vocabulary
    known_words = [word for word in words if word in keyed_vectors]
    if not known_words:
        # Take the size from the model instead of hard-coding it, and use
        # float32 (the dtype gensim stores vectors in) so that a single empty
        # document does not upcast the stacked feature matrix to float64.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(keyed_vectors[known_words], axis=0)


In [4]:
# Embed every tokenized document and keep the resulting vector in a new column
preprocessed_data['doc_vector'] = preprocessed_data['preprocessed_text_split'].map(document_vector)

In [5]:
# Labels come straight from the 'label' column; features are the per-document
# vectors stacked row-wise into a single 2-D array.
y = preprocessed_data['label']
X = np.vstack(preprocessed_data['doc_vector'].to_numpy())

In [6]:
# Hold out 20% of the rows as the test set; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
import torch
from torch.utils.data import DataLoader, Dataset

class CustomDataset(Dataset):
    """Map-style dataset pairing each sample with its label by position.

    Parameters
    ----------
    data : indexable, sized
        Samples; ``data[i]`` is the i-th sample.
    labels : indexable, sized
        Labels; ``labels[i]`` is the label of ``data[i]``.

    Raises
    ------
    ValueError
        If ``data`` and ``labels`` do not have the same length.
    """

    def __init__(self, data, labels):
        # Fail fast on misaligned inputs: without this check a shorter
        # `labels` only surfaces as an IndexError mid-training, and a longer
        # one is silently truncated because __len__ follows `data`.
        if len(data) != len(labels):
            raise ValueError(
                f"data and labels must have the same length, "
                f"got {len(data)} and {len(labels)}"
            )
        self.data = data
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        return self.data[idx], self.labels[idx]

# Wrap the training split as tensors and serve it in shuffled mini-batches of 16
dataset_train = CustomDataset(
    torch.from_numpy(X_train),
    torch.tensor(y_train.to_list()),
)

dataloader_train = DataLoader(dataset_train, shuffle=True, batch_size=16)


In [8]:
import torch.nn as nn

class FF_Net(nn.Module):
    """Feed-forward classifier: Linear+ReLU hidden layers, then a linear output layer.

    The output layer has no activation, so ``forward`` returns raw scores
    (one per class) for each input row.

    Parameters
    ----------
    input_dim : int, default 100
        Number of input features per sample.
    hidden_dims : sequence of int, default (48, 12)
        Width of each hidden layer, in order.
    num_classes : int, default 6
        Number of output scores per sample.

    With the defaults the layer stack is 100 -> 48 -> 12 -> 6 and the
    ``state_dict`` keys are ``linear_relu_stack.{0,2,4}.{weight,bias}``.
    """

    def __init__(self, input_dim=100, hidden_dims=(48, 12), num_classes=6):
        super().__init__()
        layers = []
        in_features = input_dim
        for width in hidden_dims:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        # Final projection to class scores (no activation)
        layers.append(nn.Linear(in_features, num_classes))
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the class scores."""
        return self.linear_relu_stack(x)
        

# Seed PyTorch's global RNG before any weights are created, matching the
# seed of 42 used for Word2Vec and the train/test split. This makes the
# weight initialisation repeatable; a DataLoader created without its own
# generator also draws its shuffle seed from this global RNG.
torch.manual_seed(42)

feedforward_net = FF_Net()

# Cross-entropy over the network's raw class scores and integer class labels
criterion = nn.CrossEntropyLoss()

optimizer_ffn = torch.optim.Adam(feedforward_net.parameters(), lr=0.001)


In [9]:
# Train the feed-forward network. For every epoch the losses of all
# mini-batches are summed (not averaged) and that total is printed and
# appended to loss_ffn.
num_epochs_ffn = 20
loss_ffn = []

for epoch in range(num_epochs_ffn):  # one full pass over the training data per iteration
    running_loss_ffn = 0.0

    # Each batch yielded by the loader unpacks into (inputs, labels)
    for inputs, labels in dataloader_train:
        # Clear gradients left over from the previous step
        optimizer_ffn.zero_grad()

        # Forward pass, loss, backward pass, parameter update
        outputs = feedforward_net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer_ffn.step()

        running_loss_ffn += loss.item()

    loss_ffn.append(running_loss_ffn)
    print(f"Training loss: {running_loss_ffn}")

print('Finished Training')

torch.save(feedforward_net.state_dict(), 'ffn.pth')  # Saves model file (upload with submission)

Training loss: 18468.57271951437
Training loss: 15734.032868422568
Training loss: 14932.137249425054
Training loss: 14445.858680851758
Training loss: 14128.484894528985
Training loss: 13890.129611760378
Training loss: 13727.898867569864
Training loss: 13566.754456803203
Training loss: 13439.284829229116
Training loss: 13353.361218616366
Training loss: 13273.478071521968
Training loss: 13191.23686401546
Training loss: 13125.144766561687
Training loss: 13078.516708016396
Training loss: 13031.13001601398
Training loss: 12975.229168433696
Training loss: 12939.79942690581
Training loss: 12908.740641139448
Training loss: 12876.379453741014
Training loss: 12834.73438629508
Finished Training


In [10]:
# Evaluate the trained network on the held-out test set.
# eval() switches mode-dependent layers (e.g. Dropout, BatchNorm) to inference
# behaviour; the current architecture has none, so this only guards against
# such layers being added later.
feedforward_net.eval()
with torch.no_grad():  # no gradients are needed for inference
    inputs = torch.from_numpy(X_test)
    output = feedforward_net(inputs)
    # Predicted class = index of the highest score in each row
    y_pred = output.argmax(1)
    y_true = torch.tensor(y_test.to_list())
    # Hand scikit-learn plain NumPy arrays instead of relying on implicit
    # tensor conversion, and use y_true, which was computed but never used.
    print(classification_report(y_true.numpy(), y_pred.numpy()))

              precision    recall  f1-score   support

           0       0.83      0.74      0.78     24583
           1       0.77      0.86      0.82     28247
           2       0.65      0.58      0.61      6877
           3       0.68      0.76      0.72     11629
           4       0.72      0.66      0.69      9576
           5       0.64      0.51      0.57      3133

    accuracy                           0.75     84045
   macro avg       0.71      0.69      0.70     84045
weighted avg       0.75      0.75      0.75     84045



In [11]:
# Preview the first few test rows: true label next to the model's prediction
predicted_df = pd.DataFrame(
    {
        'Actual': y_test,
        'Predicted': y_pred,
    }
)
print(predicted_df.head())

        Actual  Predicted
412679       4          5
346836       0          0
80692        1          1
292510       2          1
238292       5          5
