In [1]:
import spacy
import torch
from datasets import load_dataset, Dataset, concatenate_datasets
from torch import nn

In [2]:
def determine_device():
    if torch.cuda.is_available():
        return 'cuda'
    elif torch.backends.mps.is_available():
        return 'mps'
    else:
        return 'cpu'


device = determine_device()
print(f'Device is {device}')

Device is mps


In [3]:
nlp = spacy.load('en_core_web_sm')
dataset = load_dataset('Sp1786/multiclass-sentiment-analysis-dataset')
train_dataset: Dataset = dataset['train']
validation_dataset: Dataset = dataset['validation']
test_dataset: Dataset = dataset['test']

corpus = concatenate_datasets([train_dataset, validation_dataset])['text']
vocabulary = sorted(set(''.join(corpus)))

# TODO: cleanup data to only have English letters
char_to_i = {u: i for i, u in enumerate(vocabulary)}


def encode_x(char_to_i, message):
    return torch.tensor([char_to_i[char] for char in message])


def encode_y(label):
    vector = torch.zeros(3)
    vector[label] = 1
    return vector


train_messages = [encode_x(char_to_i, message) for message in train_dataset['text']]
train_labels = [encode_y(label) for label in train_dataset['label']]

print(len(train_messages))
print(len(train_labels))
print(max([len(message) for message in train_messages]))
print(len(vocabulary))


31232
31232
2176
591


In [4]:
def create_batch(xs, ys, batch_size):
    # TODO: replace back to random once the data is cleaned up
    random_indices = range(0, batch_size)
    return nn.utils.rnn.pad_sequence([xs[i] for i in random_indices], batch_first=True), torch.stack([ys[i] for i in random_indices])

In [5]:
# See https://docs.pytorch.org/docs/stable/generated/torch.nn.GRU.html.
class Model(nn.Module):

    def __init__(self, vocabulary_size, embedding_dim, hidden_size):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=embedding_dim)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, 3)

    def forward(self, sequence, hidden):
        # (batch_size, sequence_length) -> (batch_size, sequence_length, embedding_dim)
        embedded = self.embedding(sequence)
        # (batch_size, sequence_length, embedding_dim)
        # -> (batch_size, sequence_length, hidden_size), (num_layers, batch_size, hidden_size)
        prediction, hidden = self.rnn(embedded, hidden)
        # See https://docs.pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html#creating-the-network.
        return self.linear(hidden[0]), hidden

In [7]:
# TODO: try different batch sizes
batch_size = 64
loss_fn = nn.CrossEntropyLoss(reduction="mean")

embedding_dim = 256
hidden_size = 1024
num_layers = 2
model = Model(len(vocabulary), embedding_dim, hidden_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

number_of_epoches = 100
min_validation_loss = float('inf')
for epoch in range(number_of_epoches):
    x_batch, y_batch = create_batch(train_messages, train_labels, batch_size)
    h0 = torch.zeros(num_layers, batch_size, hidden_size)
    prediction, _ = model(x_batch.to(device), h0.to(device))
    loss = loss_fn(prediction, y_batch.to(device))

    loss.backward()
    optimizer.step()
    model.zero_grad()

    if epoch % 1 == 0 or epoch == number_of_epoches - 1:
        print(f'Epoch {epoch}, train loss {loss.item()}')

Epoch 0, train loss 1.1091721057891846
Epoch 1, train loss 9.116984367370605
Epoch 2, train loss 1.900390386581421
Epoch 3, train loss 5.665891170501709
Epoch 4, train loss 2.328508138656616
Epoch 5, train loss 2.052778482437134
Epoch 6, train loss 1.8010810613632202
Epoch 7, train loss 1.884670615196228
Epoch 8, train loss 1.210036039352417
Epoch 9, train loss 1.5176289081573486
Epoch 10, train loss 1.455209493637085
Epoch 11, train loss 1.4329092502593994
Epoch 12, train loss 1.0979852676391602
Epoch 13, train loss 1.1786409616470337
Epoch 14, train loss 1.231377124786377
Epoch 15, train loss 1.0844902992248535
Epoch 16, train loss 0.9691324234008789
Epoch 17, train loss 0.9969683885574341
Epoch 18, train loss 0.9168793559074402
Epoch 19, train loss 0.7745076417922974
Epoch 20, train loss 0.7864413261413574
Epoch 21, train loss 0.710435152053833
Epoch 22, train loss 0.6658961772918701
Epoch 23, train loss 0.5408737659454346
Epoch 24, train loss 0.5052280426025391
Epoch 25, train loss

KeyboardInterrupt: 