In [141]:
import os
import re

import numpy as np
import spacy
import torch
from datasets import load_dataset, Dataset
from torch import nn
from torch.nn import functional as F
from tqdm import tqdm

In [55]:
def determine_device():
    if torch.cuda.is_available():
        return 'cuda'
    elif torch.backends.mps.is_available():
        return 'mps'
    else:
        return 'cpu'


device = determine_device()
print(f'Device is {device}')

Device is mps


In [66]:
def process(example):
    example['text'] = example['text'].strip()
    example['length'] = len(example['text'])
    return example


def tokenize_messages(messages):
    tokens = set()
    tokenized_messages = []
    for message in tqdm(messages):
        doc = nlp(message)
        tokenized_message = [token.text.lower() for token in doc]
        tokens.update(tokenized_message)
        tokenized_messages.append(tokenized_message)
    return list(tokens), tokenized_messages


def encode_x(token_to_index, tokens):
    return torch.tensor([token_to_index[token] for token in tokens if token in token_to_index])


# Returns a scalar with the class index (0, 1 or 2).
def encode_y(label):
    return torch.tensor(label)

In [118]:
nlp = spacy.load('en_core_web_sm')

max_length = float('inf')
dataset = load_dataset('Sp1786/multiclass-sentiment-analysis-dataset')
train_dataset: Dataset = dataset['train'].filter(lambda it: len(it['text']) <= max_length).map(process).sort('length')
validation_dataset: Dataset = dataset['validation'].filter(lambda it: len(it['text']) <= max_length).map(process).sort(
    'length')
test_dataset: Dataset = dataset['test'].filter(lambda it: it['text'] is not None and len(it['text']) <= max_length).map(
    process).sort('length')

train_tokens, train_tokenized_messages = tokenize_messages(train_dataset['text'])
validation_tokens, validation_tokenized_messages = tokenize_messages(validation_dataset['text'])
test_tokens, test_tokenized_messages = tokenize_messages(test_dataset['text'])

Connection closed unexpectedly!


Filter:   0%|          | 0/31232 [00:00<?, ? examples/s]

Map:   0%|          | 0/31232 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5205 [00:00<?, ? examples/s]

Map:   0%|          | 0/5205 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5206 [00:00<?, ? examples/s]

Map:   0%|          | 0/5205 [00:00<?, ? examples/s]

100%|██████████| 31232/31232 [01:19<00:00, 390.56it/s]
100%|██████████| 5205/5205 [00:13<00:00, 393.82it/s]
100%|██████████| 5205/5205 [00:12<00:00, 400.64it/s]


In [119]:
# the full list of tokens is sorted to ensure that the encoding of the messages stays the same between the Jupiter Notebook reloads, so that the saved models could be loaded and used for inference
tokens = sorted(set(train_tokens + validation_tokens + test_tokens))
print(tokens[:20])

vocabulary = {token: index for index, token in enumerate(tokens)}
print(len(vocabulary))

token_to_index = {u: i for i, u in enumerate(vocabulary)}

['\t ', '\n', '\n\n', ' ', '  ', '   ', '    ', '     ', '      ', '       ', '        ', '             ', '              ', '               ', '                ', '                                           ', '                                                                                              ', '!', '"', '"-']
36633


In [120]:
train_messages = [encode_x(token_to_index, tokens) for tokens in train_tokenized_messages]
train_labels = [encode_y(label) for label in train_dataset['label']]
validation_messages = [encode_x(token_to_index, tokens) for tokens in validation_tokenized_messages]
validation_labels = [encode_y(label) for label in validation_dataset['label']]
test_messages = [encode_x(token_to_index, tokens) for tokens in test_tokenized_messages]
test_labels = [encode_y(label) for label in test_dataset['label']]
print(len(train_messages))
print(len(train_labels))
print(len(validation_messages))
print(len(validation_labels))
print(len(test_messages))
print(len(test_labels))

31232
31232
5205
5205
5205
5205


In [121]:
index = 20000
print(train_dataset['text'][index], train_tokenized_messages[index])

At least he`s in breakthrough performance tho. I just wanted him nominated in his own category ['at', 'least', 'he`s', 'in', 'breakthrough', 'performance', 'tho', '.', 'i', 'just', 'wanted', 'him', 'nominated', 'in', 'his', 'own', 'category']


In [122]:
def create_batch(xs, ys, batch_size, padding_value):
    index = np.random.choice(len(xs) - batch_size + 1)
    indices = range(index, index + batch_size)
    return nn.utils.rnn.pad_sequence([xs[i] for i in indices], batch_first=True,
                                     padding_value=padding_value).int(), torch.stack(
        [ys[i] for i in indices])

In [123]:
def estimate_loss(model, h0, iterations, validation_xs, validation_ys, batch_size, padding_value):
    model.eval()
    loses = torch.zeros(iterations)
    for i in range(iterations):
        validation_x_batch, validation_y_batch = create_batch(validation_xs, validation_ys, batch_size, padding_value)
        validation_prediction, _ = model(validation_x_batch.to(device), h0.to(device))
        validation_loss = F.nll_loss(validation_prediction, validation_y_batch.to(device))
        loses[i] = validation_loss.item()
    model.train()
    return loses.mean()

In [133]:
# See https://docs.pytorch.org/docs/stable/generated/torch.nn.GRU.html.
class Model(nn.Module):

    def __init__(self, vocabulary_size, embedding_dim, hidden_size, num_layers):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=embedding_dim)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, 3)
        self.log_softmax = nn.LogSoftmax(
            dim=1)  # The negative log likelihood loss expects log-probabilities of each class.

    def forward(self, sequence, hidden):
        # (batch_size, sequence_length) -> (batch_size, sequence_length, embedding_dim)
        embedded = self.embedding(sequence)
        # (batch_size, sequence_length, embedding_dim)
        # -> (batch_size, sequence_length, hidden_size), (num_layers, batch_size, hidden_size)
        prediction, hidden = self.rnn(embedded, hidden)
        # See https://docs.pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html#creating-the-network.
        linear_prediction = self.linear(prediction[:, -1])
        return self.log_softmax(linear_prediction), hidden

In [131]:
model_dir = os.path.join(os.getcwd(), "models")
os.makedirs(model_dir, exist_ok=True)

batch_size = 256
# See https://discuss.pytorch.org/t/difference-between-cross-entropy-loss-or-log-likelihood-loss/38816/2.
loss_fn = nn.NLLLoss()

patience = 30
min_validation_loss = float('inf')
for num_layers in range(1, 5):
    for embedding_dim_power in range(12):
        for hidden_size_power in range(12):
            embedding_dim = 2 ** embedding_dim_power
            hidden_size = 2 ** hidden_size_power
            model = Model(len(vocabulary), embedding_dim, hidden_size, num_layers).to(device)
            model_min_validation_loss = float('inf')
            optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

            number_of_epoches = 5000
            model_validation_loses = []

            current_patience = patience
            for epoch in range(number_of_epoches):
                x_batch, y_batch = create_batch(train_messages, train_labels, batch_size, len(vocabulary))
                h0 = torch.zeros(num_layers, batch_size, hidden_size)
                prediction, _ = model(x_batch.to(device), h0.to(device))
                loss = loss_fn(prediction, y_batch.to(device))

                loss.backward()
                optimizer.step()
                model.zero_grad()

                if epoch % 10 == 0 or epoch == number_of_epoches - 1:
                    if current_patience == 0:
                        break
                    validation_loss = estimate_loss(model, torch.zeros(num_layers, batch_size, hidden_size), 32,
                                                    validation_messages, validation_labels, batch_size, len(vocabulary))
                    model_validation_loses.append(validation_loss.item())
                    mean_validation_loss = torch.tensor(model_validation_loses[-8:]).mean()
                    print(
                        f'Epoch {epoch}, current patience {current_patience}, model mean validation loss {mean_validation_loss}, embedding dim {embedding_dim}, hidden size {hidden_size}, num layers {num_layers}, train loss {loss.item()}, validation loss {validation_loss.item()}')
                    if validation_loss < min_validation_loss:
                        model_file_name = os.path.join(model_dir,
                                                       f"model_{validation_loss}_{embedding_dim}_{hidden_size}_{num_layers}_{epoch}.pt")
                        torch.save(model.state_dict(), model_file_name)
                        print("Model has been saved as", model_file_name)
                        min_validation_loss = validation_loss

                    if mean_validation_loss < model_min_validation_loss:
                        model_min_validation_loss = mean_validation_loss
                        current_patience = patience
                    else:
                        current_patience -= 1



Epoch 0, current patience 30, model mean validation loss 1.1073980331420898, embedding dim 1, hidden size 1, num layers 1, train loss 1.1160489320755005, validation loss 1.1073980331420898
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_1.1073980331420898_1_1_1_0.pt
Epoch 10, current patience 30, model mean validation loss 1.1028841733932495, embedding dim 1, hidden size 1, num layers 1, train loss 1.0996456146240234, validation loss 1.0983703136444092
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_1.0983703136444092_1_1_1_10.pt
Epoch 20, current patience 30, model mean validation loss 1.100232720375061, embedding dim 1, hidden size 1, num layers 1, train loss 1.0947827100753784, validation loss 1.0949299335479736
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_1.0949299335479736_1_1_1_20.pt
Epoch 30, current patience 30, model mean validat

RuntimeError: MPS backend out of memory (MPS allocated: 9.68 GB, other allocations: 36.07 GB, max allowed: 45.90 GB). Tried to allocate 286.20 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [149]:
def parse_model_name(model_name):
    match = re.match(r'model_[^_]+_(\d+)_(\d+)_(\d+)_\d+\.pt', model_name)
    if not match:
        raise ValueError('Incorrect model name format')
    return int(match.group(1)), int(match.group(2)), int(match.group(3))


model_name = 'model_0.7129881978034973_32_32_1_530.pt'
embedding_dim, hidden_size, num_layers = parse_model_name(model_name)
model = Model(len(vocabulary), embedding_dim, hidden_size, num_layers).to(device)
model.load_state_dict(
    torch.load(os.path.join(os.getcwd(), "models", model_name), map_location=torch.device(device)))

<All keys matched successfully>

In [151]:
# Measuring the model performance
model.eval()
correct = 0
total = 0
for index in range(len(test_dataset)):
    h0 = torch.zeros(num_layers, 1, hidden_size)
    message = test_dataset['text'][index]
    label = test_dataset['label'][index]
    encoded_message = encode_x(token_to_index, test_tokenized_messages[index])
    encoded_label = encode_y(label)
    x_batch, y_batch = create_batch([encoded_message], [encoded_label], 1, len(vocabulary))
    prediction, _ = model(x_batch.to(device), h0.to(device))
    # https://docs.pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html#creating-the-network
    _, top_i = torch.topk(prediction, k=1)
    labels = ['negative', 'neutral', 'positive']
    if labels[label] == labels[top_i[0].item()]:
        correct += 1
    total += 1
    if index % 100 == 0:
        print(f'Finished test {index}, accuracy is {correct / total}')

print(correct, total)
model.train()

Finished test 0, accuracy is 0.0
Finished test 100, accuracy is 0.5148514851485149
Finished test 200, accuracy is 0.6019900497512438
Finished test 300, accuracy is 0.6312292358803987
Finished test 400, accuracy is 0.6733167082294265
Finished test 500, accuracy is 0.6866267465069861
Finished test 600, accuracy is 0.7038269550748752
Finished test 700, accuracy is 0.703281027104137
Finished test 800, accuracy is 0.6991260923845194
Finished test 900, accuracy is 0.6970033296337403
Finished test 1000, accuracy is 0.6873126873126874


KeyboardInterrupt: 

In [160]:
model.eval()
h0 = torch.zeros(num_layers, 1, hidden_size)
message = "I'm sick of that"  # "I never hated you", "I do not hate you"
_, tokenized_messages = tokenize_messages([message])
encoded_message = encode_x(token_to_index, tokenized_messages[0])
x_batch, _ = create_batch([encoded_message], [torch.tensor([0, 0, 0])], 1, len(vocabulary))
prediction, _ = model(x_batch.to(device), h0.to(device))
print(prediction)
_, top_i = torch.topk(prediction, k=1)
labels = ['negative', 'neutral', 'positive']
print(labels[top_i[0].item()])
model.train()

100%|██████████| 1/1 [00:00<00:00, 162.89it/s]

tensor([[-0.5146, -1.1322, -2.5265]], device='mps:0',
       grad_fn=<LogSoftmaxBackward0>)
negative





Model(
  (embedding): Embedding(36633, 32)
  (rnn): GRU(32, 32, batch_first=True)
  (linear): Linear(in_features=32, out_features=3, bias=True)
  (log_softmax): LogSoftmax(dim=1)
)