In [65]:
import os

import numpy as np
import spacy
import torch
from datasets import load_dataset, Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from torch import nn
from torch.nn import functional as F

In [66]:
def tokenize(messages):
    tokens = set()
    message_to_tokens = []
    for message in messages:
        doc = nlp(message)
        message_tokens = {token.text.lower() for token in doc if
                          token.pos_ in {'ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB'}}
        tokens.update(message_tokens)
        message_to_tokens.append(message_tokens)
    return list(tokens), message_to_tokens

In [67]:
def encode_x(vocabulary, tokens):
    vector = [0] * len(vocabulary)
    for token in tokens:
        index = vocabulary[token]
        vector[index] += 1
    return vector


def encode_y(label):
    vector = [0] * 3
    vector[label] = 1
    return vector

In [164]:
from datasets import concatenate_datasets

nlp = spacy.load('en_core_web_sm')
dataset = load_dataset('Sp1786/multiclass-sentiment-analysis-dataset')
train_dataset: Dataset = dataset['train']
validation_dataset: Dataset = dataset['validation']
corpus = concatenate_datasets([train_dataset, validation_dataset])['text']


vectorizer = TfidfVectorizer() # TODO: try different max number of features
X = vectorizer.fit_transform(corpus).toarray()
train_and_validation_xs = [list(row) for row in X] # TODO: what is the difference between this and X itself
print(len(train_and_validation_xs))
print(len(train_and_validation_xs[0]))

# train_tokens, train_message_to_tokens = tokenize(train_dataset['text'])
# validation_tokens, validation_message_to_tokens = tokenize(validation_dataset['text'])
#
# # the full list of tokens is sorted to ensure that the encoding of the messages stays the same between the Jupiter Notebook reloads, so that the saved models could be loaded and used for inference
# tokens = sorted(set(train_tokens + validation_tokens))
# print(tokens[:20])
#
# vocabulary = {token: index for index, token in enumerate(tokens)}
# print(len(vocabulary))

36437
31804


In [165]:
train_xs = train_and_validation_xs[:len(train_dataset)]
train_ys = [encode_y(label) for label in train_dataset['label']]
validation_xs = train_and_validation_xs[len(train_dataset):]
validation_ys = [encode_y(label) for label in validation_dataset['label']]
print(f'Train xs length: {len(train_xs)}, train ys length: {len(train_ys)}')
print(f'Validation xs length: {len(validation_xs)}, validation ys length: {len(validation_ys)}')

Train xs length: 31232, train ys length: 31232
Validation xs length: 5205, validation ys length: 5205


In [166]:
def create_batch(xs, ys, batch_size):
    random_indices = np.random.choice(len(xs), batch_size)
    return torch.tensor(np.stack([xs[index] for index in random_indices]), dtype=torch.float32), torch.tensor(
        np.stack([ys[index] for index in random_indices]), dtype=torch.float32)

In [172]:
def determine_device():
    if torch.cuda.is_available():
        return 'cuda'
    elif torch.backends.mps.is_available():
        return 'mps'
    else:
        return 'cpu'


device = determine_device()
print(f'Device is {device}')

Device is mps


In [173]:
# nn.Softmax is not required (see the nn.CrossEntropyLoss docs why) and can be kept only in the inference
model = nn.Sequential(
    nn.Linear(X.shape[1], 512),
    nn.Linear(512, 128),
    nn.Linear(128, 3)
).to(device)

In [174]:
def estimate_loss(model, iterations, validation_xs, validation_ys, batch_size):
    model.eval()
    loses = torch.zeros(iterations)
    for i in range(iterations):
        validation_x_batch, validation_y_batch = create_batch(validation_xs, validation_ys, batch_size)
        validation_prediction = model(validation_x_batch.to(device))
        validation_loss = F.cross_entropy(validation_prediction, validation_y_batch.to(device), reduction='mean')
        loses[i] = validation_loss.item()
    model.train()
    return loses.mean()

In [175]:
model_dir = os.path.join(os.getcwd(), "models")
os.makedirs(model_dir, exist_ok=True)

batch_size = 128
loss_fn = nn.CrossEntropyLoss(reduction="mean")
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

number_of_epoches = 1000
for epoch in range(number_of_epoches):
    if epoch % 100 == 0 or epoch == number_of_epoches - 1:
        iterations = 10
        model.eval()
        loses = torch.zeros(iterations)
        for i in range(iterations):
            validation_x_batch, validation_y_batch = create_batch(validation_xs, validation_ys, batch_size)
            validation_prediction = model(validation_x_batch.to(device))
            validation_loss = loss_fn(validation_prediction, validation_y_batch.to(device))
            loses[i] = validation_loss.item()
        model.train()
        print(f'Epoch {epoch}, validation loss {loses.mean().item()}')

    x_batch, y_batch = create_batch(train_xs, train_ys, batch_size)
    prediction = model(x_batch.to(device))
    loss = loss_fn(prediction, y_batch.to(device))
    if epoch % 100 == 0 or epoch == number_of_epoches - 1:
        print(f'Epoch {epoch}, train loss {loss.item()}')

    loss.backward()
    optimizer.step()
    model.zero_grad()

    if epoch > 0 and (epoch % 100 == 0 or epoch == number_of_epoches - 1):
        model_file_name = os.path.join(model_dir, "model_" + str(epoch) + ".pt")
        torch.save(model.state_dict(), model_file_name)
        print("Model has been saved as", model_file_name)


Epoch 0, validation loss 1.1007750034332275
Epoch 0, train loss 1.095632553100586
Epoch 100, validation loss 0.8361845016479492
Epoch 100, train loss 0.6235423684120178
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_100.pt
Epoch 200, validation loss 0.8652393221855164
Epoch 200, train loss 0.5559450387954712
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_200.pt


KeyboardInterrupt: 

In [97]:
model.load_state_dict(
torch.load(os.path.join(os.getcwd(), "models", 'model_999.pt'), map_location=torch.device(device)))

<All keys matched successfully>

In [116]:
model.eval()
message_index = 201
message = validation_dataset['text'][message_index]
encoded_message = validation_xs[message_index]
print(message)
labels = ['negative', 'neutral', 'positive']
print(f'Expected: {labels[torch.argmax(torch.tensor(validation_ys[message_index])).item()]}')
# test_tokens, _ = tokenize([message])
# x = encode_x(vocabulary, test_tokens)
y = model(torch.tensor(encoded_message, dtype=torch.float32).to(device))
#
distribution = torch.nn.functional.softmax(y, dim=-1)
print(distribution)
answer = torch.argmax(distribution)
print(['negative', 'neutral', 'positive'][answer])
# model.train()

oh dear! gareths b-day 2moro all my girlfriends are abroad  lucky niamh is coming home 2moro poor gareth has 2 listen 2 me talk all day
Expected: neutral
tensor([3.0862e-07, 9.9991e-01, 8.7616e-05], device='mps:0',
       grad_fn=<SoftmaxBackward0>)
neutral


In [None]:
torch.onnx.export(model, (torch.tensor(x, dtype=torch.float32).to(device),), "model.onnx")