In [1]:
import os

import numpy as np
import torch
from datasets import load_dataset, Dataset, concatenate_datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from torch import nn
from torch.nn import functional as F

In [2]:
def encode_y(label):
    vector = torch.zeros(3)
    vector[label] = 1
    return vector

In [12]:
dataset = load_dataset('Sp1786/multiclass-sentiment-analysis-dataset')
train_dataset: Dataset = dataset['train']
validation_dataset: Dataset = dataset['validation']

corpus = concatenate_datasets([train_dataset, validation_dataset])['text']
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # TODO: try different max number of features
X = torch.tensor(vectorizer.fit_transform(corpus).toarray(), dtype=torch.float32)
print(X.shape)

torch.Size([36437, 268766])


In [13]:
train_xs = X[:len(train_dataset)]
train_ys = torch.stack([encode_y(label) for label in train_dataset['label']])
validation_xs = X[len(train_dataset):]
validation_ys = torch.stack([encode_y(label) for label in validation_dataset['label']])
print(f'Train xs length: {len(train_xs)}, train ys length: {len(train_ys)}')
print(f'Validation xs length: {len(validation_xs)}, validation ys length: {len(validation_ys)}')

Train xs length: 31232, train ys length: 31232
Validation xs length: 5205, validation ys length: 5205


In [14]:
def create_batch(xs, ys, batch_size):
    random_indices = np.random.choice(len(xs), batch_size)
    return xs[random_indices], ys[random_indices]

In [15]:
def determine_device():
    if torch.cuda.is_available():
        return 'cuda'
    elif torch.backends.mps.is_available():
        return 'mps'
    else:
        return 'cpu'


device = determine_device()
print(f'Device is {device}')

Device is mps


In [19]:
def estimate_loss(model, iterations, validation_xs, validation_ys, batch_size):
    model.eval()
    loses = torch.zeros(iterations)
    for i in range(iterations):
        validation_x_batch, validation_y_batch = create_batch(validation_xs, validation_ys, batch_size)
        validation_prediction = model(validation_x_batch.to(device))
        validation_loss = F.cross_entropy(validation_prediction, validation_y_batch.to(device), reduction='mean')
        loses[i] = validation_loss.item()
    model.train()
    return loses.mean()

In [40]:
# nn.Softmax is not required (see the nn.CrossEntropyLoss docs why) and can be kept only in the inference
model = nn.Sequential(
    nn.Linear(X.shape[1], 128),
    nn.Dropout(p=0.2),
    nn.Linear(128, 3)
).to(device)

In [41]:
model_dir = os.path.join(os.getcwd(), "models")
os.makedirs(model_dir, exist_ok=True)

batch_size = 128
loss_fn = nn.CrossEntropyLoss(reduction="mean")
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

number_of_epoches = 1000
for epoch in range(number_of_epoches):
    if epoch % 100 == 0 or epoch == number_of_epoches - 1:
        iterations = 10
        mean_loss = estimate_loss(model, iterations, validation_xs, validation_ys, batch_size)
        print(f'Epoch {epoch}, validation loss {mean_loss.item()}')

    x_batch, y_batch = create_batch(train_xs, train_ys, batch_size)
    prediction = model(x_batch.to(device))
    loss = loss_fn(prediction, y_batch.to(device))
    if epoch % 100 == 0 or epoch == number_of_epoches - 1:
        print(f'Epoch {epoch}, train loss {loss.item()}')

    loss.backward()
    optimizer.step()
    model.zero_grad()

    if epoch > 0 and (epoch % 100 == 0 or epoch == number_of_epoches - 1):
        model_file_name = os.path.join(model_dir, "model_" + str(epoch) + ".pt")
        torch.save(model.state_dict(), model_file_name)
        print("Model has been saved as", model_file_name)


Epoch 0, validation loss 1.1044849157333374
Epoch 0, train loss 1.1006226539611816
Epoch 100, validation loss 0.8304470181465149
Epoch 100, train loss 0.5002315044403076
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_100.pt
Epoch 200, validation loss 0.8196702003479004
Epoch 200, train loss 0.43814992904663086
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_200.pt
Epoch 300, validation loss 0.8260801434516907
Epoch 300, train loss 0.27154871821403503
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_300.pt
Epoch 400, validation loss 0.8721014261245728
Epoch 400, train loss 0.19580766558647156
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_400.pt
Epoch 500, validation loss 0.8616323471069336
Epoch 500, train loss 0.19394882023334503
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandb

KeyboardInterrupt: 

In [42]:
model.load_state_dict(
    torch.load(os.path.join(os.getcwd(), "models", 'model_200.pt'), map_location=torch.device(device)))

<All keys matched successfully>

In [49]:
model.eval()
message_index = 207
message = validation_dataset['text'][message_index]
encoded_message = validation_xs[message_index]
print(message)
labels = ['negative', 'neutral', 'positive']
print(f'Expected: {labels[torch.argmax(validation_ys[message_index]).item()]}')
# test_tokens, _ = tokenize([message])
# x = encode_x(vocabulary, test_tokens)
y = model(encoded_message.to(device))
#
distribution = torch.nn.functional.softmax(y, dim=-1)
print(distribution)
answer = torch.argmax(distribution)
print(['negative', 'neutral', 'positive'][answer])
# model.train()

On my way to school not feeln this rainy day at all.... But I had lots of FUN this weekend  ....
Expected: neutral
tensor([0.1732, 0.6943, 0.1325], device='mps:0', grad_fn=<SoftmaxBackward0>)
neutral


In [None]:
torch.onnx.export(model, (torch.tensor(x, dtype=torch.float32).to(device),), "model.onnx")