In [1]:
import numpy as np
import spacy
import torch
from torch import nn
from datasets import load_dataset, Dataset
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def tokenize(messages):
    tokens = set()
    message_to_tokens = []
    for message in messages:
        doc = nlp(message)
        message_tokens = {token.text.lower() for token in doc if
                          token.pos_ in {'ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB'}}
        tokens.update(message_tokens)
        message_to_tokens.append(message_tokens)
    # the full list of tokens is sorted to ensure that the encoding of the messages stays the same between the Jupiter Notebook reloads, so that the saved models could be loaded and used for inference
    return sorted(list(tokens)), message_to_tokens

In [3]:
def encode_x(vocabulary, tokens):
    vector = [0] * len(vocabulary)
    for token in tokens:
        index = vocabulary[token]
        vector[index] = 1
    return vector


def encode_y(label):
    vector = [0] * 3
    vector[label] = 1
    return vector

In [4]:
nlp = spacy.load('en_core_web_sm')
dataset = load_dataset('Sp1786/multiclass-sentiment-analysis-dataset')
train_dataset: Dataset = dataset['train']

tokens, message_to_tokens = tokenize(train_dataset['text'])
print(tokens[:20])

vocabulary = {token: index for index, token in enumerate(tokens)}
print(len(vocabulary))

['!', '"-', '#', '$', '%', "'", "'-cholla`s", "'back", "'gummed", "'s", '(:', '(=', '*', '****', '***kix', '*shuts', '*whispers', '+', '+1', '+1/2']
29050


In [5]:
xs = [encode_x(vocabulary, tokens) for tokens in message_to_tokens]
ys = [encode_y(label) for label in train_dataset['label']]

In [6]:
def create_batch(xs, ys, batch_size):
    random_indices = np.random.choice(len(xs), batch_size)
    return torch.tensor(np.stack([xs[index] for index in random_indices]), dtype=torch.float32), torch.tensor(
        np.stack([ys[index] for index in random_indices]), dtype=torch.float32)

In [7]:
def determine_device():
    if torch.cuda.is_available():
        return 'cuda'
    elif torch.backends.mps.is_available():
        return 'mps'
    else:
        return 'cpu'


device = determine_device()
print(f'Device is {device}')

Device is mps


In [8]:
model = nn.Sequential(
    nn.Linear(len(vocabulary), 256),
    nn.Linear(256, 3),
    nn.Softmax(dim=-1)  # along which dimension the sum will be 1
).to(device)

In [12]:
model_dir = os.path.join(os.getcwd(), "models")
os.makedirs(model_dir, exist_ok=True)

batch_size = 512
loss_fn = nn.CrossEntropyLoss(reduction="mean")
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

number_of_epoches = 1300
for epoch in range(number_of_epoches):
    x_batch, y_batch = create_batch(xs, ys, batch_size)
    prediction = model(x_batch.to(device))
    loss = loss_fn(prediction, y_batch.to(device))

    loss.backward()
    optimizer.step()
    model.zero_grad()

    if epoch % 10 == 0:
        print(epoch, loss.item())
    if epoch > 0 and (epoch % 100 == 0 or epoch == number_of_epoches - 1):
        model_file_name = os.path.join(model_dir, "model_" + str(epoch) + ".pt")
        torch.save(model.state_dict(), model_file_name)
        print("Model has been saved as", model_file_name)


0 1.0984939336776733
10 0.9625958800315857
20 0.8951854705810547
30 0.8292977213859558
40 0.8181977272033691
50 0.8018471002578735
60 0.7799428701400757
70 0.7932983636856079
80 0.7879929542541504
90 0.7697056531906128
100 0.7825945019721985
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_100.pt
110 0.743977427482605
120 0.7197096943855286
130 0.7389661073684692
140 0.7244490385055542
150 0.7258056402206421
160 0.7150658369064331
170 0.731610894203186
180 0.7535115480422974
190 0.7197318077087402
200 0.740250825881958
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_200.pt
210 0.7198085784912109
220 0.7353278994560242
230 0.691091775894165
240 0.7065654993057251
250 0.7149518728256226
260 0.7192531824111938
270 0.7050416469573975
280 0.6831994652748108
290 0.6705355644226074
300 0.7140761017799377
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/mod

In [9]:
model.load_state_dict(torch.load(os.path.join(os.getcwd(), "models", 'model_1299.pt'), map_location=torch.device(device)))
model.eval()

Sequential(
  (0): Linear(in_features=29050, out_features=256, bias=True)
  (1): Linear(in_features=256, out_features=3, bias=True)
  (2): Softmax(dim=-1)
)

In [26]:
message = "My computer is totally broken"
tokens, message_to_tokens = tokenize([message])
x = encode_x(vocabulary, tokens)
y = model(torch.tensor(x, dtype=torch.float32).to(device))
print(y)
answer = torch.multinomial(y, 1, replacement=True)
print(['negative', 'neutral', 'positive'][answer])

tensor([1.0000e+00, 1.1926e-18, 5.2388e-29], device='mps:0',
       grad_fn=<SoftmaxBackward0>)
negative


In [20]:
torch.onnx.export(model, (torch.tensor(x, dtype=torch.float32).to(device),), "model.onnx")