In [211]:
import os

import numpy as np
import torch
from datasets import load_dataset, Dataset, concatenate_datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from torch import nn
from torch.nn import functional as F

In [212]:
def encode_y(label):
    vector = torch.zeros(3)
    vector[label] = 1
    return vector

In [222]:
dataset = load_dataset('Sp1786/multiclass-sentiment-analysis-dataset')
train_dataset: Dataset = dataset['train']
validation_dataset: Dataset = dataset['validation']

corpus = concatenate_datasets([train_dataset, validation_dataset])['text']
vectorizer = TfidfVectorizer()  # TODO: try different max number of features
X = torch.tensor(vectorizer.fit_transform(corpus).toarray(), dtype=torch.float32)
print(X.dtype)

torch.float32


In [223]:
train_xs = X[:len(train_dataset)]
train_ys = torch.stack([encode_y(label) for label in train_dataset['label']])
validation_xs = X[len(train_dataset):]
validation_ys = torch.stack([encode_y(label) for label in validation_dataset['label']])
print(f'Train xs length: {len(train_xs)}, train ys length: {len(train_ys)}')
print(f'Validation xs length: {len(validation_xs)}, validation ys length: {len(validation_ys)}')

Train xs length: 31232, train ys length: 31232
Validation xs length: 5205, validation ys length: 5205


In [224]:
def create_batch(xs, ys, batch_size):
    random_indices = np.random.choice(len(xs), batch_size)
    return xs[random_indices], ys[random_indices]

In [225]:
def determine_device():
    if torch.cuda.is_available():
        return 'cuda'
    elif torch.backends.mps.is_available():
        return 'mps'
    else:
        return 'cpu'


device = determine_device()
print(f'Device is {device}')

Device is mps


In [231]:
def estimate_loss(model, iterations, validation_xs, validation_ys, batch_size):
    model.eval()
    loses = torch.zeros(iterations)
    for i in range(iterations):
        validation_x_batch, validation_y_batch = create_batch(validation_xs, validation_ys, batch_size)
        validation_prediction = model(validation_x_batch.to(device))
        validation_loss = F.cross_entropy(validation_prediction, validation_y_batch.to(device), reduction='mean')
        loses[i] = validation_loss.item()
    model.train()
    return loses.mean()

In [247]:
# nn.Softmax is not required (see the nn.CrossEntropyLoss docs why) and can be kept only in the inference
model = nn.Sequential(
    nn.Linear(X.shape[1], 512),
    nn.Linear(512, 128),
    nn.Linear(128, 3)
).to(device)

In [248]:
model_dir = os.path.join(os.getcwd(), "models")
os.makedirs(model_dir, exist_ok=True)

batch_size = 128
loss_fn = nn.CrossEntropyLoss(reduction="mean")
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

number_of_epoches = 1000
for epoch in range(number_of_epoches):
    if epoch % 100 == 0 or epoch == number_of_epoches - 1:
        iterations = 10
        mean_loss = estimate_loss(model, iterations, validation_xs, validation_ys, batch_size)
        print(f'Epoch {epoch}, validation loss {mean_loss.item()}')

    x_batch, y_batch = create_batch(train_xs, train_ys, batch_size)
    prediction = model(x_batch.to(device))
    loss = loss_fn(prediction, y_batch.to(device))
    if epoch % 100 == 0 or epoch == number_of_epoches - 1:
        print(f'Epoch {epoch}, train loss {loss.item()}')

    loss.backward()
    optimizer.step()
    model.zero_grad()

    if epoch > 0 and (epoch % 100 == 0 or epoch == number_of_epoches - 1):
        model_file_name = os.path.join(model_dir, "model_" + str(epoch) + ".pt")
        torch.save(model.state_dict(), model_file_name)
        print("Model has been saved as", model_file_name)


Epoch 0, validation loss 1.0945491790771484
Epoch 0, train loss 1.0887937545776367
Epoch 100, validation loss 0.8140960931777954
Epoch 100, train loss 0.7766619324684143
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_100.pt
Epoch 200, validation loss 0.8585911989212036
Epoch 200, train loss 0.712178111076355
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_200.pt
Epoch 300, validation loss 0.8986045718193054
Epoch 300, train loss 0.5122576355934143
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_300.pt
Epoch 400, validation loss 0.9088821411132812
Epoch 400, train loss 0.41814205050468445
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_400.pt
Epoch 500, validation loss 1.0304359197616577
Epoch 500, train loss 0.33060935139656067
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/

In [249]:
model.load_state_dict(
    torch.load(os.path.join(os.getcwd(), "models", 'model_200.pt'), map_location=torch.device(device)))

<All keys matched successfully>

In [258]:
model.eval()
message_index = 193
message = validation_dataset['text'][message_index]
encoded_message = validation_xs[message_index]
print(message)
labels = ['negative', 'neutral', 'positive']
print(f'Expected: {labels[torch.argmax(validation_ys[message_index]).item()]}')
# test_tokens, _ = tokenize([message])
# x = encode_x(vocabulary, test_tokens)
y = model(encoded_message.to(device))
#
distribution = torch.nn.functional.softmax(y, dim=-1)
print(distribution)
answer = torch.argmax(distribution)
print(['negative', 'neutral', 'positive'][answer])
# model.train()

 I do NOT. I RARELY dye it at all.  the red was the first in aaaagggessss. My hair just hates me.
Expected: negative
tensor([9.5637e-01, 4.3051e-02, 5.7639e-04], device='mps:0',
       grad_fn=<SoftmaxBackward0>)
negative


In [None]:
torch.onnx.export(model, (torch.tensor(x, dtype=torch.float32).to(device),), "model.onnx")