In [1]:
import numpy as np
import spacy
import torch
from torch import nn
from datasets import load_dataset, Dataset
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def tokenize(messages):
    tokens = set()
    message_to_tokens = []
    for message in messages:
        doc = nlp(message)
        message_tokens = {token.text.lower() for token in doc if
                          token.pos_ in {'ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB'}}
        tokens.update(message_tokens)
        message_to_tokens.append(message_tokens)
    # the full list of tokens is sorted to ensure that the encoding of the messages stays the same between the Jupiter Notebook reloads, so that the saved models could be loaded and used for inference
    return sorted(list(tokens)), message_to_tokens

In [3]:
def encode_x(vocabulary, tokens):
    vector = [0] * len(vocabulary)
    for token in tokens:
        index = vocabulary[token]
        vector[index] = 1
    return vector


def encode_y(label):
    vector = [0] * 3
    vector[label] = 1
    return vector

In [4]:
nlp = spacy.load('en_core_web_sm')
dataset = load_dataset('Sp1786/multiclass-sentiment-analysis-dataset')
train_dataset: Dataset = dataset['train']

tokens, message_to_tokens = tokenize(train_dataset['text'])
print(tokens[:20])

vocabulary = {token: index for index, token in enumerate(tokens)}
print(len(vocabulary))

['!', '"-', '#', '$', '%', "'", "'-cholla`s", "'back", "'gummed", "'s", '(:', '(=', '*', '****', '***kix', '*shuts', '*whispers', '+', '+1', '+1/2']
29050


In [5]:
xs = [encode_x(vocabulary, tokens) for tokens in message_to_tokens]
ys = [encode_y(label) for label in train_dataset['label']]

In [6]:
def create_batch(xs, ys, batch_size):
    random_indices = np.random.choice(len(xs), batch_size)
    return torch.tensor(np.stack([xs[index] for index in random_indices]), dtype=torch.float32), torch.tensor(
        np.stack([ys[index] for index in random_indices]), dtype=torch.float32)

In [7]:
def determine_device():
    if torch.cuda.is_available():
        return 'cuda'
    elif torch.backends.mps.is_available():
        return 'mps'
    else:
        return 'cpu'


device = determine_device()
print(f'Device is {device}')

Device is mps


In [59]:
# nn.Softmax is not required (see the nn.CrossEntropyLoss docs why) and can be kept only in the inference
model = nn.Sequential(
    nn.Linear(len(vocabulary), 512),
    nn.Sigmoid(),
    nn.Dropout(0.3),
    nn.Linear(512, 256),
    nn.Sigmoid(),
    nn.Dropout(0.2),
    nn.Linear(256, 3)
).to(device)

In [60]:
model_dir = os.path.join(os.getcwd(), "models")
os.makedirs(model_dir, exist_ok=True)

batch_size = 512
loss_fn = nn.CrossEntropyLoss(reduction="mean")
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

number_of_epoches = 1300
for epoch in range(number_of_epoches):
    x_batch, y_batch = create_batch(xs, ys, batch_size)
    prediction = model(x_batch.to(device))
    loss = loss_fn(prediction, y_batch.to(device))

    loss.backward()
    optimizer.step()
    model.zero_grad()

    if epoch % 10 == 0:
        # TODO: evaluate against the test set as well
        print(epoch, loss.item())
    if epoch > 0 and (epoch % 100 == 0 or epoch == number_of_epoches - 1):
        model_file_name = os.path.join(model_dir, "model_" + str(epoch) + ".pt")
        torch.save(model.state_dict(), model_file_name)
        print("Model has been saved as", model_file_name)


0 1.0988504886627197
10 1.0913362503051758
20 1.0769563913345337
30 1.0757659673690796
40 1.033682107925415
50 0.9479398727416992
60 0.802152156829834
70 0.8036815524101257
80 0.7268104553222656
90 0.7699982523918152
100 0.756033182144165
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_100.pt
110 0.6600620150566101
120 0.6443519592285156
130 0.6632018089294434
140 0.702505350112915
150 0.6174395084381104
160 0.6248451471328735
170 0.5867311954498291
180 0.5470965504646301
190 0.5937950015068054
200 0.5384635329246521
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_200.pt
210 0.5675145387649536
220 0.5747131705284119
230 0.4687992036342621
240 0.5076034069061279
250 0.46345430612564087
260 0.4867647588253021
270 0.4685673713684082
280 0.4485918879508972
290 0.471912145614624
300 0.4138813614845276
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/mod

In [61]:
model.load_state_dict(torch.load(os.path.join(os.getcwd(), "models", 'model_1299.pt'), map_location=torch.device(device)))
model.eval()

Sequential(
  (0): Linear(in_features=29050, out_features=512, bias=True)
  (1): Sigmoid()
  (2): Dropout(p=0.3, inplace=False)
  (3): Linear(in_features=512, out_features=256, bias=True)
  (4): Sigmoid()
  (5): Dropout(p=0.2, inplace=False)
  (6): Linear(in_features=256, out_features=3, bias=True)
)

In [80]:
message = "I don't like you"
tokens, message_to_tokens = tokenize([message])
x = encode_x(vocabulary, tokens)
y = model(torch.tensor(x, dtype=torch.float32).to(device))

distribution = torch.nn.functional.softmax(y, dim=-1)
print(distribution)
answer = torch.multinomial(distribution, 1, replacement=True)
print(['negative', 'neutral', 'positive'][answer])

tensor([0.1314, 0.5595, 0.3091], device='mps:0', grad_fn=<SoftmaxBackward0>)
negative


In [35]:
torch.onnx.export(model, (torch.tensor(x, dtype=torch.float32).to(device),), "model.onnx")