In [21]:
import numpy as np
import spacy
import torch
from torch import nn
from torch.nn import functional as F
from datasets import load_dataset, Dataset
import os

In [25]:
def tokenize(messages):
    tokens = set()
    message_to_tokens = []
    for message in messages:
        doc = nlp(message)
        message_tokens = {token.text.lower() for token in doc if
                          token.pos_ in {'ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB'}}
        tokens.update(message_tokens)
        message_to_tokens.append(message_tokens)
    return list(tokens), message_to_tokens

In [29]:
def encode_x(vocabulary, tokens):
    vector = np.zeros(len(vocabulary))
    for token in tokens:
        index = vocabulary[token]
        vector[index] += 1
    return vector


def encode_y(label):
    vector = np.zeros(3)
    vector[label] = 1
    return vector

In [27]:
nlp = spacy.load('en_core_web_sm')
dataset = load_dataset('Sp1786/multiclass-sentiment-analysis-dataset')
train_dataset: Dataset = dataset['train']
validation_dataset: Dataset = dataset['validation']

train_tokens, train_message_to_tokens = tokenize(train_dataset['text'])
validation_tokens, validation_message_to_tokens = tokenize(validation_dataset['text'])

# the full list of tokens is sorted to ensure that the encoding of the messages stays the same between the Jupiter Notebook reloads, so that the saved models could be loaded and used for inference
tokens = sorted(set(train_tokens + validation_tokens))
print(tokens[:20])

vocabulary = {token: index for index, token in enumerate(tokens)}
print(len(vocabulary))

['!', '"-', '#', '$', '%', '&', "'", "'-cholla`s", "'back", "'calendar", "'gummed", "'not", "'peter", "'s", '(:', '(=', '*', '****', '***kix', '*shuts']
31839


In [30]:
train_xs = np.stack([encode_x(vocabulary, tokens) for tokens in train_message_to_tokens])
train_ys = np.stack([encode_y(label) for label in train_dataset['label']])
validation_xs = np.stack([encode_x(vocabulary, tokens) for tokens in validation_message_to_tokens])
validation_ys = np.stack([encode_y(label) for label in validation_dataset['label']])
print(train_xs.shape)
print(train_ys.shape)
print(validation_xs.shape)
print(validation_ys.shape)

(31232, 31839)
(31232, 3)
(5205, 31839)
(5205, 3)


In [42]:
def create_batch(xs, ys, batch_size):
    random_indices = np.random.choice(len(xs), batch_size)
    return torch.tensor(xs[random_indices], dtype=torch.float32), torch.tensor(ys[random_indices], dtype=torch.float32)

In [44]:
def determine_device():
    if torch.cuda.is_available():
        return 'cuda'
    elif torch.backends.mps.is_available():
        return 'mps'
    else:
        return 'cpu'


device = determine_device()
print(f'Device is {device}')

Device is mps


In [45]:
# nn.Softmax is not required (see the nn.CrossEntropyLoss docs why) and can be kept only in the inference
model = nn.Sequential(
    nn.Linear(len(vocabulary), 256),
    nn.Linear(256, 3)
).to(device)

In [46]:
def estimate_loss(model, iterations, validation_xs, validation_ys, batch_size):
    model.eval()
    loses = torch.zeros(iterations)
    for i in range(iterations):
        validation_x_batch, validation_y_batch = create_batch(validation_xs, validation_ys, batch_size)
        validation_prediction = model(validation_x_batch.to(device))
        validation_loss = F.cross_entropy(validation_prediction, validation_y_batch.to(device), reduction='mean')
        loses[i] = validation_loss.item()
    model.train()
    return loses.mean()

In [47]:
model_dir = os.path.join(os.getcwd(), "models")
os.makedirs(model_dir, exist_ok=True)

batch_size = 512
loss_fn = nn.CrossEntropyLoss(reduction="mean")
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

number_of_epoches = 100
for epoch in range(number_of_epoches):
    x_batch, y_batch = create_batch(train_xs, train_ys, batch_size)
    prediction = model(x_batch.to(device))
    loss = loss_fn(prediction, y_batch.to(device))

    loss.backward()
    optimizer.step()
    model.zero_grad()

    if epoch % 10 == 0 or epoch == number_of_epoches - 1:
        mean_loss = estimate_loss(model, 10, validation_xs, validation_ys, batch_size)
        print(f'Epoch {epoch}, trail loss {loss.item()}, validation loss {mean_loss.item()}')
    if epoch > 0 and (epoch % 10 == 0 or epoch == number_of_epoches - 1):
        model_file_name = os.path.join(model_dir, "model_" + str(epoch) + ".pt")
        torch.save(model.state_dict(), model_file_name)
        print("Model has been saved as", model_file_name)


Epoch 0, trail loss 1.1001036167144775, validation loss 1.0777775049209595
Epoch 10, trail loss 0.8503361344337463, validation loss 0.9124234318733215
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_10.pt
Epoch 20, trail loss 0.7822971343994141, validation loss 0.8598759770393372
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_20.pt
Epoch 30, trail loss 0.7125304341316223, validation loss 0.8622048497200012
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_30.pt
Epoch 40, trail loss 0.6592621207237244, validation loss 0.8717592358589172
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_40.pt
Epoch 50, trail loss 0.5881929397583008, validation loss 0.8881387710571289
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_50.pt
Epoch 60, trail loss 0.6059032

In [85]:
model.load_state_dict(
torch.load(os.path.join(os.getcwd(), "models", 'model_99.pt'), map_location=torch.device(device)))

Sequential(
  (0): Linear(in_features=31839, out_features=256, bias=True)
  (1): Linear(in_features=256, out_features=3, bias=True)
)

In [101]:
model.eval()
message = "You're the worst"
test_tokens, _ = tokenize([message])
x = encode_x(vocabulary, test_tokens)
y = model(torch.tensor(x, dtype=torch.float32).to(device))

distribution = torch.nn.functional.softmax(y, dim=-1)
print(distribution)
answer = torch.argmax(distribution)
print(['negative', 'neutral', 'positive'][answer])
model.train()

tensor([0.8409, 0.1492, 0.0099], device='mps:0', grad_fn=<SoftmaxBackward0>)
negative


In [30]:
torch.onnx.export(model, (torch.tensor(x, dtype=torch.float32).to(device),), "model.onnx")