In [5]:
import os

import numpy as np
import spacy
import torch
from datasets import load_dataset, Dataset
from torch import nn
from torch.nn import functional as F
from tqdm import tqdm

In [6]:
def determine_device():
    if torch.cuda.is_available():
        return 'cuda'
    elif torch.backends.mps.is_available():
        return 'mps'
    else:
        return 'cpu'


device = determine_device()
print(f'Device is {device}')

Device is mps


In [24]:
def process(example):
    example['text'] = example['text'].strip()
    example['length'] = len(example['text'])
    return example


nlp = spacy.load('en_core_web_sm')


def tokenize_messages(messages):
    tokens = set()
    tokenized_messages = []
    for message in tqdm(messages):
        doc = nlp(message)
        tokenized_message = [token.text.lower() for token in doc]
        tokens.update(tokenized_message)
        tokenized_messages.append(tokenized_message)
    return list(tokens), tokenized_messages


def encode_x(token_to_index, tokens):
    return torch.tensor([token_to_index[token] for token in tokens if token in token_to_index])


def encode_y(label):
    vector = torch.zeros(3)
    vector[label] = 1
    return vector


dataset = load_dataset('Sp1786/multiclass-sentiment-analysis-dataset')
train_dataset: Dataset = dataset['train'].filter(lambda it: len(it['text']) <= 128).map(process).sort('length')
validation_dataset: Dataset = dataset['validation'].filter(lambda it: len(it['text']) <= 128).map(process).sort('length')
test_dataset: Dataset = dataset['test'].filter(lambda it: it['text'] is not None and len(it['text']) <= 128).map(process).sort('length')

train_tokens, train_tokenized_messages = tokenize_messages(train_dataset['text'])
validation_tokens, validation_tokenized_messages = tokenize_messages(validation_dataset['text'])
test_tokens, test_tokenized_messages = tokenize_messages(test_dataset['text'])

Filter:   0%|          | 0/31232 [00:00<?, ? examples/s]

Map:   0%|          | 0/25913 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5205 [00:00<?, ? examples/s]

Map:   0%|          | 0/4279 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5206 [00:00<?, ? examples/s]

Map:   0%|          | 0/4341 [00:00<?, ? examples/s]

100%|██████████| 25913/25913 [00:50<00:00, 513.29it/s]
100%|██████████| 4279/4279 [00:08<00:00, 510.22it/s]
100%|██████████| 4341/4341 [00:08<00:00, 516.51it/s]


In [25]:
# the full list of tokens is sorted to ensure that the encoding of the messages stays the same between the Jupiter Notebook reloads, so that the saved models could be loaded and used for inference
tokens = sorted(set(train_tokens + validation_tokens + test_tokens))
print(tokens[:20])

vocabulary = {token: index for index, token in enumerate(tokens)}
print(len(vocabulary))

token_to_index = {u: i for i, u in enumerate(vocabulary)}

['\t ', '\n', ' ', '  ', '   ', '    ', '     ', '      ', '       ', '        ', '             ', '              ', '               ', '                ', '                                                                                              ', '!', '"', '#', '$', '%']
29886


In [28]:
train_messages = [encode_x(token_to_index, tokens) for tokens in train_tokenized_messages]
train_labels = [encode_y(label) for label in train_dataset['label']]
validation_messages = [encode_x(token_to_index, tokens) for tokens in validation_tokenized_messages]
validation_labels = [encode_y(label) for label in validation_dataset['label']]
test_messages = [encode_x(token_to_index, tokens) for tokens in test_tokenized_messages]
test_labels = [encode_y(label) for label in test_dataset['label']]
print(len(train_messages))
print(len(train_labels))
print(len(validation_messages))
print(len(validation_labels))
print(len(test_messages))
print(len(test_labels))

25913
25913
4279
4279
4341
4341


In [29]:
index = 20000
print(train_dataset['text'][index], train_tokenized_messages[index])

At least he`s in breakthrough performance tho. I just wanted him nominated in his own category ['at', 'least', 'he`s', 'in', 'breakthrough', 'performance', 'tho', '.', 'i', 'just', 'wanted', 'him', 'nominated', 'in', 'his', 'own', 'category']


In [30]:
def create_batch(xs, ys, batch_size, padding_value):
    index = np.random.choice(len(xs) - batch_size + 1)
    indices = range(index, index + batch_size)
    return nn.utils.rnn.pad_sequence([xs[i] for i in indices], batch_first=True,
                                     padding_value=padding_value).int(), torch.stack(
        [ys[i] for i in indices])

In [31]:
def estimate_loss(model, h0, iterations, validation_xs, validation_ys, batch_size, padding_value):
    model.eval()
    loses = torch.zeros(iterations)
    for i in range(iterations):
        validation_x_batch, validation_y_batch = create_batch(validation_xs, validation_ys, batch_size, padding_value)
        validation_prediction, _ = model(validation_x_batch.to(device), h0.to(device))
        validation_loss = F.cross_entropy(validation_prediction, validation_y_batch.to(device), reduction='mean')
        loses[i] = validation_loss.item()
    model.train()
    return loses.mean()

In [32]:
# See https://docs.pytorch.org/docs/stable/generated/torch.nn.GRU.html.
class Model(nn.Module):

    def __init__(self, vocabulary_size, embedding_dim, hidden_size):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=embedding_dim)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, 3)

    def forward(self, sequence, hidden):
        # (batch_size, sequence_length) -> (batch_size, sequence_length, embedding_dim)
        embedded = self.embedding(sequence)
        # (batch_size, sequence_length, embedding_dim)
        # -> (batch_size, sequence_length, hidden_size), (num_layers, batch_size, hidden_size)
        prediction, hidden = self.rnn(embedded, hidden)
        # See https://docs.pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html#creating-the-network.
        return self.linear(prediction[:, -1]), hidden

In [33]:
model_dir = os.path.join(os.getcwd(), "models")
os.makedirs(model_dir, exist_ok=True)

batch_size = 256
loss_fn = nn.CrossEntropyLoss(reduction="mean")

embedding_dim = 512
hidden_size = 1024
num_layers = 2
model = Model(len(vocabulary), embedding_dim, hidden_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

number_of_epoches = 1000
min_validation_loss = float('inf')
for epoch in range(number_of_epoches):
    x_batch, y_batch = create_batch(train_messages, train_labels, batch_size, len(vocabulary))
    h0 = torch.zeros(num_layers, batch_size, hidden_size)
    prediction, _ = model(x_batch.to(device), h0.to(device))
    loss = loss_fn(prediction, y_batch.to(device))

    loss.backward()
    optimizer.step()
    model.zero_grad()

    if epoch % 10 == 0 or epoch == number_of_epoches - 1:
        validation_loss = estimate_loss(model, torch.zeros(num_layers, batch_size, hidden_size), 32,
                                        validation_messages, validation_labels, batch_size, len(vocabulary))
        print(f'Epoch {epoch}, train loss {loss.item()}, validation loss {validation_loss.item()}')
        if validation_loss < min_validation_loss:
            model_file_name = os.path.join(model_dir, "model_" + str(epoch) + ".pt")
            torch.save(model.state_dict(), model_file_name)
            print("Model has been saved as", model_file_name)
            min_validation_loss = validation_loss


Epoch 0, train loss 1.095672607421875, validation loss 1.983828067779541
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_0.pt
Epoch 10, train loss 1.1603114604949951, validation loss 1.1238842010498047
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_10.pt
Epoch 20, train loss 1.1029226779937744, validation loss 1.11034095287323
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_20.pt
Epoch 30, train loss 1.0730981826782227, validation loss 1.1077896356582642
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_30.pt
Epoch 40, train loss 1.1564178466796875, validation loss 1.0270905494689941
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_40.pt
Epoch 50, train loss 1.070112705230713, validation loss 1.0367727279663086
Epoch 60, train loss 0.9878355264663

KeyboardInterrupt: 

In [34]:
model.load_state_dict(
    torch.load(os.path.join(os.getcwd(), "models", 'model_220.pt'), map_location=torch.device(device)))

<All keys matched successfully>

In [35]:
# Measuring the model performance
correct = 0
total = 0
for index in range(len(test_dataset)):
    h0 = torch.zeros(num_layers, 1, hidden_size)
    message = test_dataset['text'][index]
    label = test_dataset['label'][index]
    encoded_message = encode_x(token_to_index, test_tokenized_messages[index])
    encoded_label = encode_y(label)
    x_batch, y_batch = create_batch([encoded_message], [encoded_label], 1, len(vocabulary))
    prediction, _ = model(x_batch.to(device), h0.to(device))
    distribution = torch.nn.functional.softmax(prediction, dim=-1)
    labels = ['negative', 'neutral', 'positive']
    if labels[label] == labels[torch.argmax(distribution)]:
        correct += 1
    total += 1
    if index % 100 == 0:
        print(f'Finished test {index}, accuracy is {correct / total}')

print(correct, total)

Finished test 0, accuracy is 1.0
Finished test 100, accuracy is 0.5148514851485149
Finished test 200, accuracy is 0.6069651741293532
Finished test 300, accuracy is 0.6212624584717608
Finished test 400, accuracy is 0.6533665835411472
Finished test 500, accuracy is 0.6467065868263473
Finished test 600, accuracy is 0.653910149750416
Finished test 700, accuracy is 0.6576319543509273
Finished test 800, accuracy is 0.66167290886392


KeyboardInterrupt: 

In [None]:
h0 = torch.zeros(num_layers, 1, hidden_size)
message = "I hated you"  # "I never hated you"
_, tokenized_messages = tokenize_messages([message])
encoded_message = encode_x(token_to_index, tokenized_messages[0])
x_batch, _ = create_batch([encoded_message], [torch.tensor([0, 0, 0])], 1, len(vocabulary))
prediction, _ = model(x_batch.to(device), h0.to(device))
distribution = torch.nn.functional.softmax(prediction, dim=-1)
print(distribution)
labels = ['negative', 'neutral', 'positive']
print(labels[torch.argmax(distribution)])