In [234]:
import os

import numpy as np
import spacy
import torch
from datasets import load_dataset, Dataset
from torch import nn
from torch.nn import functional as F

In [184]:
def determine_device():
    if torch.cuda.is_available():
        return 'cuda'
    elif torch.backends.mps.is_available():
        return 'mps'
    else:
        return 'cpu'


device = determine_device()
print(f'Device is {device}')

Device is mps


In [185]:
def process(example):
    example['text'] = example['text'].strip()
    example['length'] = len(example['text'])
    return example


nlp = spacy.load('en_core_web_sm')


def tokenize_messages(messages):
    tokens = set()
    tokenized_messages = []
    for message in messages:
        doc = nlp(message)
        tokenized_message = [token.text.lower() for token in doc]
        tokens.update(tokenized_message)
        tokenized_messages.append(tokenized_message)
    return list(tokens), tokenized_messages


def encode_x(token_to_index, tokens):
    return torch.tensor([token_to_index[token] for token in tokens if token in token_to_index])


def encode_y(label):
    vector = torch.zeros(3)
    vector[label] = 1
    return vector


dataset = load_dataset('Sp1786/multiclass-sentiment-analysis-dataset')
train_dataset: Dataset = dataset['train'].filter(lambda it: len(it['text']) <= 128).map(process).sort('length')
validation_dataset: Dataset = dataset['validation'].filter(lambda it: len(it['text']) <= 128).map(process).sort(
    'length')
test_dataset: Dataset = dataset['test']

train_tokens, train_tokenized_messages = tokenize_messages(train_dataset['text'])
validation_tokens, validation_tokenized_messages = tokenize_messages(validation_dataset['text'])

# the full list of tokens is sorted to ensure that the encoding of the messages stays the same between the Jupiter Notebook reloads, so that the saved models could be loaded and used for inference
tokens = sorted(set(train_tokens + validation_tokens))
print(tokens[:20])

vocabulary = {token: index for index, token in enumerate(tokens)}
print(len(vocabulary))

Filter:   0%|          | 0/31232 [00:00<?, ? examples/s]

Map:   0%|          | 0/25913 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5205 [00:00<?, ? examples/s]

Map:   0%|          | 0/4279 [00:00<?, ? examples/s]

['\t ', '\n', ' ', '  ', '   ', '    ', '     ', '      ', '       ', '        ', '             ', '              ', '               ', '                ', '                                                                                              ', '!', '"', '#', '$', '%']
27438


In [186]:
token_to_index = {u: i for i, u in enumerate(vocabulary)}

In [187]:
train_messages = [encode_x(token_to_index, tokens) for tokens in train_tokenized_messages]
train_labels = [encode_y(label) for label in train_dataset['label']]
validation_messages = [encode_x(token_to_index, tokens) for tokens in validation_tokenized_messages]
validation_labels = [encode_y(label) for label in validation_dataset['label']]
print(len(train_messages))
print(len(train_labels))
print(len(validation_messages))
print(len(validation_labels))

25913
25913
4279
4279


In [197]:
index = 20000
print(train_dataset['text'][index], train_tokenized_messages[index])

At least he`s in breakthrough performance tho. I just wanted him nominated in his own category ['at', 'least', 'he`s', 'in', 'breakthrough', 'performance', 'tho', '.', 'i', 'just', 'wanted', 'him', 'nominated', 'in', 'his', 'own', 'category']


In [198]:
def create_batch(xs, ys, batch_size, padding_value):
    index = np.random.choice(len(xs) - batch_size + 1)
    indices = range(index, index + batch_size)
    return nn.utils.rnn.pad_sequence([xs[i] for i in indices], batch_first=True,
                                     padding_value=padding_value).int(), torch.stack(
        [ys[i] for i in indices])

In [230]:
def estimate_loss(model, h0, iterations, validation_xs, validation_ys, batch_size, padding_value):
    model.eval()
    loses = torch.zeros(iterations)
    for i in range(iterations):
        validation_x_batch, validation_y_batch = create_batch(validation_xs, validation_ys, batch_size, padding_value)
        validation_prediction, _ = model(validation_x_batch.to(device), h0.to(device))
        validation_loss = F.cross_entropy(validation_prediction, validation_y_batch.to(device), reduction='mean')
        loses[i] = validation_loss.item()
    model.train()
    return loses.mean()

In [222]:
# See https://docs.pytorch.org/docs/stable/generated/torch.nn.GRU.html.
class Model(nn.Module):

    def __init__(self, vocabulary_size, embedding_dim, hidden_size):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=embedding_dim)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.activation = nn.ReLU()
        self.linear = nn.Linear(hidden_size, 3)

    def forward(self, sequence, hidden):
        # (batch_size, sequence_length) -> (batch_size, sequence_length, embedding_dim)
        embedded = self.embedding(sequence)
        # (batch_size, sequence_length, embedding_dim)
        # -> (batch_size, sequence_length, hidden_size), (num_layers, batch_size, hidden_size)
        prediction, hidden = self.rnn(embedded, hidden)
        # See https://docs.pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html#creating-the-network.
        prediction_with_activation = prediction[:, -1, :]
        return self.linear(prediction_with_activation), hidden

In [235]:
model_dir = os.path.join(os.getcwd(), "models")
os.makedirs(model_dir, exist_ok=True)

# TODO: try different batch sizes
batch_size = 256
loss_fn = nn.CrossEntropyLoss(reduction="mean")

embedding_dim = 512
hidden_size = 1024
num_layers = 2
model = Model(len(vocabulary), embedding_dim, hidden_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

number_of_epoches = 1000
min_validation_loss = float('inf')
for epoch in range(number_of_epoches):
    x_batch, y_batch = create_batch(train_messages, train_labels, batch_size, len(vocabulary))
    h0 = torch.zeros(num_layers, batch_size, hidden_size)
    prediction, _ = model(x_batch.to(device), h0.to(device))
    loss = loss_fn(prediction, y_batch.to(device))

    loss.backward()
    optimizer.step()
    model.zero_grad()

    if epoch % 10 == 0 or epoch == number_of_epoches - 1:
        validation_loss = estimate_loss(model, torch.zeros(num_layers, batch_size, hidden_size), 32,
                                        validation_messages, validation_labels, batch_size, len(vocabulary))
        print(f'Epoch {epoch}, train loss {loss.item()}, validation loss {validation_loss.item()}')
        if validation_loss < min_validation_loss:
            model_file_name = os.path.join(model_dir, "model_" + str(epoch) + ".pt")
            torch.save(model.state_dict(), model_file_name)
            print("Model has been saved as", model_file_name)
            min_validation_loss = validation_loss


Epoch 0, train loss 1.1001501083374023, validation loss 1.6942925453186035
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_0.pt
Epoch 10, train loss 1.1714333295822144, validation loss 1.1617904901504517
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_10.pt
Epoch 20, train loss 1.231337070465088, validation loss 1.225268006324768
Epoch 30, train loss 1.1166577339172363, validation loss 1.1504684686660767
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_30.pt
Epoch 40, train loss 1.1084315776824951, validation loss 1.0683907270431519
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_40.pt
Epoch 50, train loss 0.8870217800140381, validation loss 1.1332043409347534
Epoch 60, train loss 1.0806005001068115, validation loss 1.032279133796692
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sand

KeyboardInterrupt: 

In [236]:
model.load_state_dict(
    torch.load(os.path.join(os.getcwd(), "models", 'model_250.pt'), map_location=torch.device(device)))

<All keys matched successfully>

In [246]:
h0 = torch.zeros(num_layers, 1, hidden_size)
index = 1034
message = validation_dataset['text'][index]
label = validation_dataset['label'][index]
print(message)
encoded_message = encode_x(token_to_index, validation_tokenized_messages[index])
encoded_label = encode_y(label)
x_batch, y_batch = create_batch([encoded_message], [encoded_label], 1, len(vocabulary))
prediction, _ = model(x_batch.to(device), h0.to(device))
distribution = torch.nn.functional.softmax(prediction, dim=-1)
labels = ['negative', 'neutral', 'positive']
print(labels[label])
print(labels[torch.argmax(distribution)], distribution)

taking mum to lunch for mothers day
neutral
neutral tensor([[0.0171, 0.9147, 0.0682]], device='mps:0', grad_fn=<SoftmaxBackward0>)


In [179]:
h0 = torch.zeros(num_layers, 1, hidden_size)
message = "I hated you"  # "I never hated you"
_, tokenized_messages = tokenize_messages([message])
encoded_message = encode_x(token_to_index, tokenized_messages[0])
x_batch, _ = create_batch([encoded_message], [torch.tensor([0, 0, 0])], 1, len(vocabulary))
prediction, _ = model(x_batch.to(device), h0.to(device))
distribution = torch.nn.functional.softmax(prediction, dim=-1)
print(distribution)
labels = ['negative', 'neutral', 'positive']
print(labels[torch.argmax(distribution)])

tensor([[0.9808, 0.0122, 0.0070]], device='mps:0', grad_fn=<SoftmaxBackward0>)
negative
