In [107]:
import numpy as np
import spacy
import torch
from datasets import load_dataset, Dataset
from torch import nn

In [108]:
def determine_device():
    if torch.cuda.is_available():
        return 'cuda'
    elif torch.backends.mps.is_available():
        return 'mps'
    else:
        return 'cpu'


device = determine_device()
print(f'Device is {device}')

Device is mps


In [109]:
def process(example):
    example['text'] = example['text'].strip()
    example['length'] = len(example['text'])
    return example


nlp = spacy.load('en_core_web_sm')

def tokenize_messages(messages):
    tokens = set()
    tokenized_messages = []
    for message in messages:
        doc = nlp(message)
        tokenized_message = [token.text.lower() for token in doc if token.pos_ in {'ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB'}]
        tokens.update(tokenized_message)
        tokenized_messages.append(tokenized_message)
    return list(tokens), tokenized_messages


def encode_x(token_to_index, tokens):
    return torch.tensor([token_to_index[token] for token in tokens if token in token_to_index])


def encode_y(label):
    vector = torch.zeros(3)
    vector[label] = 1
    return vector

dataset = load_dataset('Sp1786/multiclass-sentiment-analysis-dataset')
train_dataset: Dataset = dataset['train'].filter(lambda it: len(it['text']) <= 128).map(process).sort('length')
validation_dataset: Dataset = dataset['validation'].filter(lambda it: len(it['text']) <= 128).map(process).sort('length')
test_dataset: Dataset = dataset['test']

train_tokens, train_tokenized_messages = tokenize_messages(train_dataset['text'])
validation_tokens, validation_tokenized_messages = tokenize_messages(validation_dataset['text'])

# the full list of tokens is sorted to ensure that the encoding of the messages stays the same between the Jupiter Notebook reloads, so that the saved models could be loaded and used for inference
tokens = sorted(set(train_tokens + validation_tokens))
print(tokens[:20])

vocabulary = {token: index for index, token in enumerate(tokens)}
print(len(vocabulary))

Filter:   0%|          | 0/31232 [00:00<?, ? examples/s]

Map:   0%|          | 0/25913 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5205 [00:00<?, ? examples/s]

Map:   0%|          | 0/4279 [00:00<?, ? examples/s]

['!', '#', '%', '&', "'", "'back", "'calendar", "'gummed", "'not", "'peter", "'s", '(:', '(=', '*', '****', '***kix', '*shuts', '*whispers', '+', '+1/2']
25906


In [110]:
token_to_index = {u: i for i, u in enumerate(vocabulary)}

In [112]:
train_messages = [encode_x(token_to_index, tokens) for tokens in train_tokenized_messages]
train_labels = [encode_y(label) for label in train_dataset['label']]
validation_messages = [encode_x(token_to_index, tokens) for tokens in validation_tokenized_messages]
validation_labels = [encode_y(label) for label in validation_dataset['label']]
print(len(train_messages))
print(len(train_labels))
print(len(validation_messages))
print(len(validation_labels))

25913
25913
4279
4279


['spiderwoman',
 'she`s',
 'amazing',
 'mum',
 'gr8',
 'blogger',
 'gr8',
 'mentor',
 'top',
 'climbs',
 'walls',
 'http://digg.com/d1qeua']

In [136]:
def create_batch(xs, ys, batch_size, padding_value):
    index = np.random.choice(len(xs) - batch_size + 1)
    indices = range(index, index + batch_size)
    return nn.utils.rnn.pad_sequence([xs[i] for i in indices], batch_first=True, padding_value=padding_value).int(), torch.stack(
        [ys[i] for i in indices])

In [137]:
# See https://docs.pytorch.org/docs/stable/generated/torch.nn.GRU.html.
class Model(nn.Module):

    def __init__(self, vocabulary_size, embedding_dim, hidden_size):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=embedding_dim)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.activation = nn.ReLU()
        self.linear = nn.Linear(hidden_size, 3)

    def forward(self, sequence, hidden):
        # (batch_size, sequence_length) -> (batch_size, sequence_length, embedding_dim)
        embedded = self.embedding(sequence)
        # (batch_size, sequence_length, embedding_dim)
        # -> (batch_size, sequence_length, hidden_size), (num_layers, batch_size, hidden_size)
        prediction, hidden = self.rnn(embedded, hidden)
        # See https://docs.pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html#creating-the-network.
        prediction_with_activation = self.activation(prediction[:, -1, :])
        return self.linear(prediction_with_activation), hidden

In [139]:
# TODO: try different batch sizes
batch_size = 128
loss_fn = nn.CrossEntropyLoss(reduction="mean")

embedding_dim = 256
hidden_size = 1024
num_layers = 2
model = Model(len(vocabulary), embedding_dim, hidden_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

number_of_epoches = 1000
min_validation_loss = float('inf')
for epoch in range(number_of_epoches):
    x_batch, y_batch = create_batch(train_messages, train_labels, batch_size, len(vocabulary))
    h0 = torch.zeros(num_layers, batch_size, hidden_size)
    prediction, _ = model(x_batch.to(device), h0.to(device))
    loss = loss_fn(prediction, y_batch.to(device))

    loss.backward()
    optimizer.step()
    model.zero_grad()

    if epoch % 10 == 0 or epoch == number_of_epoches - 1:
        print(f'Epoch {epoch}, train loss {loss.item()}')

Epoch 0, train loss 1.0987062454223633
Epoch 10, train loss 1.0459595918655396
Epoch 20, train loss 1.1357064247131348
Epoch 30, train loss 1.1586110591888428
Epoch 40, train loss 0.9328310489654541
Epoch 50, train loss 0.8262327909469604
Epoch 60, train loss 1.0850646495819092
Epoch 70, train loss 0.9762576818466187
Epoch 80, train loss 0.8735822439193726
Epoch 90, train loss 0.9356590509414673
Epoch 100, train loss 0.9574069380760193
Epoch 110, train loss 0.5967537760734558
Epoch 120, train loss 0.6612371206283569
Epoch 130, train loss 0.6998139023780823
Epoch 140, train loss 0.9851012825965881
Epoch 150, train loss 0.8151595592498779
Epoch 160, train loss 0.595989465713501
Epoch 170, train loss 0.7471057772636414
Epoch 180, train loss 0.7372685074806213
Epoch 190, train loss 0.5491386651992798
Epoch 200, train loss 0.8636525869369507
Epoch 210, train loss 0.9624097347259521
Epoch 220, train loss 0.6215113401412964
Epoch 230, train loss 0.6260864734649658
Epoch 240, train loss 0.9285

In [152]:
h0 = torch.zeros(num_layers, 1, hidden_size)
index = 1004
message = validation_dataset['text'][index]
label = validation_dataset['label'][index]
print(message)
print(label)
encoded_message = encode_x(token_to_index, validation_tokenized_messages[index])
encoded_label = encode_y(label)
print(encoded_message)
print(encoded_label)
x_batch, y_batch = create_batch([encoded_message], [encoded_label], 1, len(vocabulary))
prediction, _ = model(x_batch.to(device), h0.to(device))
distribution = torch.nn.functional.softmax(prediction, dim=-1)
print(distribution)
labels = ['negative', 'neutral', 'positive']
print(labels[label])
print(labels[torch.argmax(distribution)])

thinks Aaron is pretty darn awesome
2
tensor([22488,   551, 17816,  5625,  1902])
tensor([0., 0., 1.])
tensor([[0.0185, 0.0670, 0.9145]], device='mps:0', grad_fn=<SoftmaxBackward0>)
positive
positive


In [176]:
h0 = torch.zeros(num_layers, 1, hidden_size)
message = "I hated you" # "I never hated you"
_, tokenized_messages = tokenize_messages([message])
encoded_message = encode_x(token_to_index, tokenized_messages[0])
x_batch, _ = create_batch([encoded_message], [torch.tensor([0, 0, 0])], 1, len(vocabulary))
prediction, _ = model(x_batch.to(device), h0.to(device))
distribution = torch.nn.functional.softmax(prediction, dim=-1)
print(distribution)
labels = ['negative', 'neutral', 'positive']
print(labels[torch.argmax(distribution)])

tensor([[0.8598, 0.1303, 0.0099]], device='mps:0', grad_fn=<SoftmaxBackward0>)
negative
