In [16]:
import os

import numpy as np
import spacy
import torch
from datasets import load_dataset, Dataset, concatenate_datasets
from torch import nn
from torch.nn import functional as F

In [4]:
# batch_size = 3
# sequence_length = 5
# input_size = 10
# hidden_size = 20
# num_layers = 2
# rnn = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
# input = torch.randn(batch_size, sequence_length, input_size)  # Should be (batch_size, sequence_length, input_size)
# h0 = torch.randn(num_layers, batch_size, hidden_size)  # Should be (num_layers, batch_size, hidden_size)
# output, hn = rnn(input, h0)
# print(output.shape)


# See https://docs.pytorch.org/docs/stable/generated/torch.nn.GRU.html.
class Model(nn.Module):

    def __init__(self, vocabulary_size, embedding_dim, hidden_size):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=embedding_dim)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocabulary_size)

    def forward(self, sequence, hidden):
        # (batch_size, sequence_length) -> (batch_size, sequence_length, embedding_dim)
        embedded = self.embedding(sequence)
        print('before', embedded.shape)
        # (batch_size, sequence_length, embedding_dim)
        # -> (batch_size, sequence_length, hidden_size), (num_layers, batch_size, hidden_size)
        prediction, hidden = self.rnn(embedded, hidden)
        print('after', hidden[0].shape)
        # See https://docs.pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html#creating-the-network.
        return self.linear(hidden[0]), hidden


batch_size = 2
num_layers = 3
embedding_dim = 10
hidden_size = 20
h0 = torch.zeros(num_layers, batch_size, hidden_size)
vocabulary_size = 4
# See https://discuss.pytorch.org/t/gru-and-padded-sequences-tipps-and-tricks/90729.
sequences = [
    [0, 1, 2, 3, 0],
    [0, 1, 2, 2, 0]
]
# packed_sequences = nn.utils.rnn.pack_sequence(sequences)
input = torch.tensor(sequences)
# packed_sequences.data
model = Model(vocabulary_size, embedding_dim, hidden_size)
y1, h1 = model.forward(input, h0)
print(y1)
print(h1)
print(sum([4,6,8,5,4,3,7,8]))

before torch.Size([2, 5, 10])
after torch.Size([2, 20])
tensor([[-0.2569,  0.2200, -0.1028, -0.1272],
        [-0.2597,  0.2211, -0.0933, -0.1527]], grad_fn=<AddmmBackward0>)
tensor([[[ 0.4181, -0.2240, -0.2414, -0.3319,  0.2433, -0.0841, -0.1404,
          -0.0915, -0.2395, -0.1739, -0.2741,  0.3700,  0.0347,  0.0502,
           0.1752, -0.0028,  0.4296,  0.2685,  0.0816, -0.1483],
         [ 0.4247, -0.0240, -0.3432, -0.2624,  0.1610, -0.1988, -0.0863,
          -0.0444, -0.2826, -0.3509, -0.1825,  0.1301, -0.0578,  0.1600,
           0.2182, -0.2973,  0.3265,  0.4021, -0.1764,  0.0373]],

        [[ 0.2080,  0.0928,  0.2080,  0.2313,  0.0567,  0.0924, -0.2340,
          -0.1719, -0.0457, -0.1122,  0.1348, -0.2385,  0.3061,  0.2668,
          -0.0119, -0.1363,  0.2380,  0.0187,  0.2667,  0.3373],
         [ 0.1728,  0.2045,  0.2135,  0.3272,  0.0606,  0.1096, -0.2796,
          -0.2297, -0.1099, -0.1299,  0.1604, -0.1927,  0.2305,  0.3387,
          -0.0792, -0.1172,  0.2822, -0.0366

In [5]:
def tokenize(messages):
    tokens = set()
    message_to_tokens = []
    for message in messages:
        doc = nlp(message)
        message_tokens = {token.text.lower() for token in doc if
                          token.pos_ in {'ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB'}}
        tokens.update(message_tokens)
        message_to_tokens.append(message_tokens)
    return list(tokens), message_to_tokens

In [6]:
def encode_x(vocabulary, tokens):
    vector = torch.zeros(len(vocabulary))
    for token in tokens:
        index = vocabulary[token]
        vector[index] += 1
    return vector


def encode_y(label):
    vector = torch.zeros(3)
    vector[label] = 1
    return vector

In [39]:
from torch.utils.data import DataLoader
from util import BucketBatchSampler, BucketDataset

nlp = spacy.load('en_core_web_sm')
dataset = load_dataset('Sp1786/multiclass-sentiment-analysis-dataset')
train_dataset: Dataset = dataset['train']
validation_dataset: Dataset = dataset['validation']
test_dataset: Dataset = dataset['test']

corpus = concatenate_datasets([train_dataset, validation_dataset])['text']
vocabulary = sorted(set(''.join(corpus)))

char_to_i = {u: i for i, u in enumerate(vocabulary)}

def encode(char_to_i, message):
    return np.array([char_to_i[char] for char in message])

train_messages = [encode(char_to_i, message) for message in train_dataset['text']]
train_labels = [encode_y(label) for label in train_dataset['label']]

bucket_batch_sampler = BucketBatchSampler(train_messages, 128) # <-- does not store X
bucket_dataset = BucketDataset(train_messages, train_labels)
dataloader = DataLoader(bucket_dataset, batch_size=1, batch_sampler=bucket_batch_sampler, shuffle=False, num_workers=8, drop_last=False)

i = 0
for x, y in dataloader:
    print(x)
    print(y)
    i += 1

tensor([[54, 74, 71,  ..., 80, 70,  3],
        [41, 81, 81,  ...,  2, 81, 77],
        [42, 81, 69,  ...,  2, 90, 38],
        ...,
        [ 2, 86, 74,  ..., 87, 73, 74],
        [43, 86,  2,  ..., 81, 84, 71],
        [65, 79, 69,  ..., 21, 21, 21]])
tensor([[0., 0., 1.],
        [0., 0., 1.],
        [0., 1., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 1.]

In [77]:
train_xs = torch.stack([encode_x(vocabulary, tokens) for tokens in train_message_to_tokens])
train_ys = torch.stack([encode_y(label) for label in train_dataset['label']])
validation_xs = torch.stack([encode_x(vocabulary, tokens) for tokens in validation_message_to_tokens])
validation_ys = torch.stack([encode_y(label) for label in validation_dataset['label']])
print(train_xs.shape)
print(train_ys.shape)
print(validation_xs.shape)
print(validation_ys.shape)

torch.Size([31232, 31839])
torch.Size([31232, 3])
torch.Size([5205, 31839])
torch.Size([5205, 3])


In [78]:
def create_batch(xs, ys, batch_size):
    random_indices = np.random.choice(len(xs), batch_size)
    return xs[random_indices], ys[random_indices]

In [79]:
def determine_device():
    if torch.cuda.is_available():
        return 'cuda'
    elif torch.backends.mps.is_available():
        return 'mps'
    else:
        return 'cpu'


device = determine_device()
print(f'Device is {device}')

Device is mps


In [83]:
def estimate_loss(model, iterations, validation_xs, validation_ys, batch_size):
    model.eval()
    loses = torch.zeros(iterations)
    for i in range(iterations):
        validation_x_batch, validation_y_batch = create_batch(validation_xs, validation_ys, batch_size)
        validation_prediction = model(validation_x_batch.to(device))
        validation_loss = F.cross_entropy(validation_prediction, validation_y_batch.to(device), reduction='mean')
        loses[i] = validation_loss.item()
    model.train()
    return loses.mean()

In [84]:
# nn.Softmax is not required (see the nn.CrossEntropyLoss docs why) and can be kept only in the inference
model = nn.Sequential(
    nn.Linear(len(vocabulary), 256),
    nn.Linear(256, 3)
).to(device)

In [85]:
model_dir = os.path.join(os.getcwd(), "models")
os.makedirs(model_dir, exist_ok=True)

batch_size = 512
loss_fn = nn.CrossEntropyLoss(reduction="mean")
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

number_of_epoches = 100
min_validation_loss = float('inf')
for epoch in range(number_of_epoches):
    x_batch, y_batch = create_batch(train_xs, train_ys, batch_size)
    prediction = model(x_batch.to(device))
    loss = loss_fn(prediction, y_batch.to(device))

    loss.backward()
    optimizer.step()
    model.zero_grad()

    if epoch % 10 == 0 or epoch == number_of_epoches - 1:
        mean_loss = estimate_loss(model, 10, validation_xs, validation_ys, batch_size)
        print(f'Epoch {epoch}, trail loss {loss.item()}, validation loss {mean_loss.item()}')
        if mean_loss < min_validation_loss:
            model_file_name = os.path.join(model_dir, "model_" + str(epoch) + ".pt")
            torch.save(model.state_dict(), model_file_name)
            print("Model has been saved as", model_file_name)
            min_validation_loss = mean_loss

Epoch 0, trail loss 1.102682113647461, validation loss 1.077303171157837
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_0.pt
Epoch 10, trail loss 0.8979932069778442, validation loss 0.9276639819145203
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_10.pt
Epoch 20, trail loss 0.7764990329742432, validation loss 0.8771727681159973
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_20.pt
Epoch 30, trail loss 0.6884598135948181, validation loss 0.8671800494194031
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_30.pt
Epoch 40, trail loss 0.7256267070770264, validation loss 0.9020015597343445
Epoch 50, trail loss 0.6767445802688599, validation loss 0.8644858598709106
Model has been saved as /Users/yaskovdev/dev/git_home/ai-sandbox/sentiment-analysis/models/model_50.pt
Epoch 60, trail loss 0.6512887477

In [86]:
model.load_state_dict(
    torch.load(os.path.join(os.getcwd(), "models", 'model_50.pt'), map_location=torch.device(device)))

<All keys matched successfully>

In [91]:
model.eval()
message = "You're the best"
test_tokens, _ = tokenize([message])
x = encode_x(vocabulary, test_tokens)
y = model(x.to(device))

distribution = torch.nn.functional.softmax(y, dim=-1)
print(distribution)
answer = torch.argmax(distribution)
print(['negative', 'neutral', 'positive'][answer])
model.train()

tensor([0.0780, 0.2832, 0.6388], device='mps:0', grad_fn=<SoftmaxBackward0>)
positive


Sequential(
  (0): Linear(in_features=31839, out_features=256, bias=True)
  (1): Linear(in_features=256, out_features=3, bias=True)
)

In [30]:
torch.onnx.export(model, (torch.tensor(x, dtype=torch.float32).to(device),), "model.onnx")