In [None]:
import pandas as pd
import numpy as np
# Model
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, Dataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Words
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Integer class id assigned to each emotion label in the 'Emotion' column.
EMOTION_TO_ID = {
    'sadness': 0,
    'anger': 1,
    'love': 2,
    'fear': 3,
    'happy': 4,
    'surprise': 5,
}

# 'tokenized_text' is dropped right after loading; only 'Text' and 'Emotion' are used below.
data = pd.read_csv('the_most_correct.csv').drop(columns='tokenized_text')
X = data['Text']
y = data['Emotion'].map(EMOTION_TO_ID)

# Series.map turns every label missing from the mapping into NaN (and the
# column into float). Fail here with a clear message instead of letting the
# NaNs surface later as an obscure error in the loss function.
if y.isna().any():
    unknown = sorted(data.loc[y.isna(), 'Emotion'].astype(str).unique())
    raise ValueError(f'Unmapped emotion labels: {unknown}')
y = y.astype('int64')
X

0                                  i didnt feel humiliated
1        i can go from feeling so hopeless to so damned...
2         im grabbing a minute to post i feel greedy wrong
3        i am ever feeling nostalgic about the fireplac...
4                                     i am feeling grouchy
                               ...                        
21454                 melissa stared at her friend in dism
21455    successive state elections have seen the gover...
21456                 vincent was irritated but not dismay
21457    kendall-hume turned back to face the dismayed ...
21458                      i am dismayed , but not surpris
Name: Text, Length: 21459, dtype: object

In [None]:
y

0        0
1        0
2        1
3        2
4        1
        ..
21454    3
21455    3
21456    3
21457    3
21458    3
Name: Emotion, Length: 21459, dtype: int64

In [None]:
# English stop words, built once: rebuilding this set inside remove_stopwords
# would repeat the same work for every row passed through X.apply.
STOP_WORDS = frozenset(stopwords.words('english'))


def remove_stopwords(text):
    """Return `text` with English stop words removed.

    The text is split with NLTK's `word_tokenize`; tokens whose lowercase form
    is in `STOP_WORDS` are dropped and the remaining tokens are re-joined with
    single spaces.
    """
    return ' '.join(
        token for token in word_tokenize(text) if token.lower() not in STOP_WORDS
    )


X = X.apply(remove_stopwords)

# Bag-of-words vocabulary is learned from the stop-word-filtered texts.
vectorizer = CountVectorizer()
vectorizer.fit(X)

# The vocabulary size is used as the input width of the model's first layer.
vocab_size = len(vectorizer.vocabulary_)
vocab_size

19085

In [None]:
class TextDataset(Dataset):
    """Map-style dataset yielding (bag-of-words vector, label) pairs.

    Args:
        texts: sequence of raw text strings (list, NumPy array or pandas Series).
        labels: sequence of labels aligned with `texts`.
        vectorizer: fitted object whose `transform([text])` returns something
            with a `.toarray()` method (e.g. a fitted `CountVectorizer`).
    """

    def __init__(self, texts, labels, vectorizer):
        # Materialise as plain lists so that __getitem__ is always positional.
        # Indexing a pandas Series with an int is label-based, which raises
        # KeyError (or returns the wrong row) once the index has been shuffled
        # or filtered, e.g. after a train/test split on Series.
        self.texts = list(texts)
        self.labels = list(labels)
        self.vectorizer = vectorizer

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Vectorize the `idx`-th text on the fly and return it with its label.

        Returns:
            (features, label): `features` is a 1-D float32 tensor holding the
            dense row produced by the vectorizer; `label` is returned unchanged.
        """
        dense_row = self.vectorizer.transform([self.texts[idx]]).toarray()[0]
        # Explicit dtype instead of the legacy torch.Tensor(...) constructor.
        features = torch.tensor(dense_row, dtype=torch.float32)
        return features, self.labels[idx]

In [None]:
# One seed for every source of randomness in this notebook. Seeding torch's
# global RNG here makes both the DataLoader shuffling and any model weight
# initialisation that runs afterwards repeatable on a fresh kernel.
SEED = 42
torch.manual_seed(SEED)

# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y.values, test_size=0.2, random_state=SEED
)

train_dataset = TextDataset(X_train, y_train, vectorizer)
test_dataset = TextDataset(X_test, y_test, vectorizer)

# DataLoaders handle batching; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
class NeuralNet(nn.Module):
    """Feed-forward classifier: Linear -> Softmax(dim=1) -> Linear.

    Args:
        input_size: number of input features per sample.
        hidden_size: width of the hidden layer.
        output_size: number of output scores per sample.

    NOTE(review): the hidden activation is a softmax over the hidden units
    rather than a more conventional ReLU/tanh -- confirm this is intended.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.softmax = nn.Softmax(dim=1)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the raw output of `fc2` (no activation is applied after it)."""
        hidden = self.softmax(self.fc1(x))
        return self.fc2(hidden)

# vocab_size input features -> 12 hidden units -> 6 outputs, placed on `device`.
model = NeuralNet(vocab_size, 12, 6).to(device)

In [None]:
def train_model(model, criterion, optimizer, train_loader, num_epochs):
    """Train `model` in place for `num_epochs` passes over `train_loader`.

    Args:
        model: the `nn.Module` to optimise.
        criterion: loss callable invoked as `criterion(outputs, labels)`.
        optimizer: optimizer already bound to `model`'s parameters.
        train_loader: iterable yielding `(inputs, labels)` tensor batches.
        num_epochs: number of full passes over `train_loader`.

    Prints the mean per-batch loss after every epoch (`nan` when the loader
    yielded no batches). Returns None.
    """
    # Use the device the model already lives on instead of a notebook-level
    # global, matching how evalModel resolves its device.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        # Count batches while iterating: this avoids relying on
        # len(train_loader), which iterable-style loaders do not provide, and
        # lets the empty case be handled without a ZeroDivisionError.
        num_batches = 0
        for inputs, labels in train_loader:
            inputs = inputs.to(target_device)
            labels = labels.to(target_device)
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1
        mean_loss = running_loss / num_batches if num_batches else float('nan')
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {mean_loss}')

In [None]:
def evalModel(model,testloader):
    """Print the classification accuracy of `model` over `testloader`.

    Args:
        model: the `nn.Module` to evaluate; it is switched to eval mode.
        testloader: iterable yielding `(inputs, labels)` tensor batches.

    The predicted class of each sample is the index of its largest output
    score. Prints `nan` as the accuracy when the loader yields no samples.
    Returns None.
    """
    correct = 0
    total = 0

    # Set model to evaluation mode
    model.eval()

    # Evaluate on whichever device the model's parameters already live on.
    device = next(model.parameters()).device

    with torch.no_grad():
        for inputs, labels in testloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            predicted = model(inputs).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Guard against an empty loader instead of raising ZeroDivisionError.
    accuracy = correct / total if total else float('nan')
    print(f'Accuracy on the test set: {accuracy:.3f}')

In [None]:
# Training setup: number of passes over the training data, the loss applied to
# the model's outputs, and an Adam optimizer over all model parameters.
epochs = 10
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
# Fit on the training batches, then report accuracy on the held-out batches.
train_model(model, criterion, optimizer, train_loader, num_epochs=epochs)
evalModel(model, test_loader)

  return self._call_impl(*args, **kwargs)


Epoch 1/10, Loss: 0.8516002047567421
Epoch 2/10, Loss: 0.2560795852697462
Epoch 3/10, Loss: 0.14165190855893484
Epoch 4/10, Loss: 0.10042342752433502
Epoch 5/10, Loss: 0.07808699159521022
Epoch 6/10, Loss: 0.06531359147274379
Epoch 7/10, Loss: 0.05918412831256501
Epoch 8/10, Loss: 0.05411385391755958
Epoch 9/10, Loss: 0.044986995890626345
Epoch 10/10, Loss: 0.04355707082239727
Accuracy on the test set: 0.842
