In [1]:
import os

import numpy as np
import spacy
import torch
from datasets import load_dataset, Dataset
from torch import nn
from torch.nn import functional as F

In [33]:
# See https://docs.pytorch.org/docs/stable/generated/torch.nn.GRU.html.
# batch_size = 3
# sequence_length = 5
# input_size = 10
# hidden_size = 20
# num_layers = 2
# rnn = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
# input = torch.randn(batch_size, sequence_length, input_size)  # Should be (batch_size, sequence_length, input_size)
# h0 = torch.randn(num_layers, batch_size, hidden_size)  # Should be (num_layers, batch_size, hidden_size)
# output, hn = rnn(input, h0)
# print(output.shape)


class Model(nn.Module):

    def __init__(self, vocabulary_size, embedding_dim, hidden_size):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=embedding_dim)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocabulary_size)

    def forward(self, sequence, hidden):
        embedded = self.embedding(sequence)
        prediction, hidden = self.rnn(embedded, hidden)
        return self.linear(prediction), hidden

batch_size = 1
num_layers = 2
input_size = 10
hidden_size = 20
h0 = torch.randn(num_layers, batch_size, hidden_size)
vocabulary_size = 4
input = torch.tensor([
    [0, 1, 2, 3, 0]
])
input2 = torch.tensor([
    [0, 1, 2, 3, 0, 1]
])
model = Model(vocabulary_size, input_size, hidden_size)
y1, h1 = model.forward(input, h0)
y2, h2 = model.forward(input2, h1)
print(y1.shape)
print(h1.shape)
print(y2.shape)
print(h2.shape)

torch.Size([1, 5, 4])
torch.Size([2, 1, 20])
torch.Size([1, 6, 4])
torch.Size([2, 1, 20])


In [74]:
def tokenize(messages):
    tokens = set()
    message_to_tokens = []
    for message in messages:
        doc = nlp(message)
        message_tokens = {token.text.lower() for token in doc if
                          token.pos_ in {'ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB'}}
        tokens.update(message_tokens)
        message_to_tokens.append(message_tokens)
    return list(tokens), message_to_tokens

In [75]:
def encode_x(vocabulary, tokens):
    vector = torch.zeros(len(vocabulary))
    for token in tokens:
        index = vocabulary[token]
        vector[index] += 1
    return vector


def encode_y(label):
    vector = torch.zeros(3)
    vector[label] = 1
    return vector

In [76]:
nlp = spacy.load('en_core_web_sm')
dataset = load_dataset('Sp1786/multiclass-sentiment-analysis-dataset')
train_dataset: Dataset = dataset['train']
validation_dataset: Dataset = dataset['validation']

train_tokens, train_message_to_tokens = tokenize(train_dataset['text'])
validation_tokens, validation_message_to_tokens = tokenize(validation_dataset['text'])

# the full list of tokens is sorted to ensure that the encoding of the messages stays the same between the Jupiter Notebook reloads, so that the saved models could be loaded and used for inference
tokens = sorted(set(train_tokens + validation_tokens))
print(tokens[:20])

vocabulary = {token: index for index, token in enumerate(tokens)}
print(len(vocabulary))

['!', '"-', '#', '$', '%', '&', "'", "'-cholla`s", "'back", "'calendar", "'gummed", "'not", "'peter", "'s", '(:', '(=', '*', '****', '***kix', '*shuts']
31839


In [77]:
train_xs = torch.stack([encode_x(vocabulary, tokens) for tokens in train_message_to_tokens])
train_ys = torch.stack([encode_y(label) for label in train_dataset['label']])
validation_xs = torch.stack([encode_x(vocabulary, tokens) for tokens in validation_message_to_tokens])
validation_ys = torch.stack([encode_y(label) for label in validation_dataset['label']])
print(train_xs.shape)
print(train_ys.shape)
print(validation_xs.shape)
print(validation_ys.shape)

torch.Size([31232, 31839])
torch.Size([31232, 3])
torch.Size([5205, 31839])
torch.Size([5205, 3])


In [78]:
def create_batch(xs, ys, batch_size):
    random_indices = np.random.choice(len(xs), batch_size)
    return xs[random_indices], ys[random_indices]

In [79]:
def determine_device():
    if torch.cuda.is_available():
        return 'cuda'
    elif torch.backends.mps.is_available():
        return 'mps'
    else:
        return 'cpu'


device = determine_device()
print(f'Device is {device}')

Device is mps


In [83]:
def estimate_loss(model, iterations, validation_xs, validation_ys, batch_size):
    model.eval()
    loses = torch.zeros(iterations)
    for i in range(iterations):
        validation_x_batch, validation_y_batch = create_batch(validation_xs, validation_ys, batch_size)
        validation_prediction = model(validation_x_batch.to(device))
        validation_loss = F.cross_entropy(validation_prediction, validation_y_batch.to(device), reduction='mean')
        loses[i] = validation_loss.item()
    model.train()
    return loses.mean()

In [84]:
# nn.Softmax is not required (see the nn.CrossEntropyLoss docs why) and can be kept only in the inference
model = nn.Sequential(
    nn.Linear(len(vocabulary), 256),
    nn.Linear(256, 3)
).to(device)

In [85]:
model_dir = os.path.join(os.getcwd(), "models")
os.makedirs(model_dir, exist_ok=True)

batch_size = 512
loss_fn = nn.CrossEntropyLoss(reduction="mean")
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

number_of_epoches = 100
min_validation_loss = float('inf')
for epoch in range(number_of_epoches):
    x_batch, y_batch = create_batch(train_xs, train_ys, batch_size)
    prediction = model(x_batch.to(device))
    loss = loss_fn(prediction, y_batch.to(device))

    loss.backward()
    optimizer.step()
    model.zero_grad()

    if epoch % 10 == 0 or epoch == number_of_epoches - 1:
        mean_loss = estimate_loss(model, 10, validation_xs, validation_ys, batch_size)
        print(f'Epoch {epoch}, trail loss {loss.item()}, validation loss {mean_loss.item()}')
        if mean_loss < min_validation_loss:
            model_file_name = os.path.join(model_dir, "model_" + str(epoch) + ".pt")
            torch.save(model.state_dict(), model_file_name)
            print("Model has been saved as", model_file_name)
            min_validation_loss = mean_loss

Epoch 0, trail loss 1.102682113647461, validation loss 1.077303171157837
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_0.pt
Epoch 10, trail loss 0.8979932069778442, validation loss 0.9276639819145203
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_10.pt
Epoch 20, trail loss 0.7764990329742432, validation loss 0.8771727681159973
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_20.pt
Epoch 30, trail loss 0.6884598135948181, validation loss 0.8671800494194031
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_30.pt
Epoch 40, trail loss 0.7256267070770264, validation loss 0.9020015597343445
Epoch 50, trail loss 0.6767445802688599, validation loss 0.8644858598709106
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_50.pt
Epoch 60, trail loss 0.6512887477

In [86]:
model.load_state_dict(
    torch.load(os.path.join(os.getcwd(), "models", 'model_50.pt'), map_location=torch.device(device)))

<All keys matched successfully>

In [91]:
model.eval()
message = "You're the best"
test_tokens, _ = tokenize([message])
x = encode_x(vocabulary, test_tokens)
y = model(x.to(device))

distribution = torch.nn.functional.softmax(y, dim=-1)
print(distribution)
answer = torch.argmax(distribution)
print(['negative', 'neutral', 'positive'][answer])
model.train()

tensor([0.0780, 0.2832, 0.6388], device='mps:0', grad_fn=<SoftmaxBackward0>)
positive


Sequential(
  (0): Linear(in_features=31839, out_features=256, bias=True)
  (1): Linear(in_features=256, out_features=3, bias=True)
)

In [30]:
torch.onnx.export(model, (torch.tensor(x, dtype=torch.float32).to(device),), "model.onnx")