In [28]:
import numpy as np
import spacy
import torch
from datasets import load_dataset, Dataset, concatenate_datasets
from torch import nn

In [2]:
def determine_device():
    if torch.cuda.is_available():
        return 'cuda'
    elif torch.backends.mps.is_available():
        return 'mps'
    else:
        return 'cpu'


device = determine_device()
print(f'Device is {device}')

Device is mps


In [56]:
def process(example):
    example['text'] = example['text'].strip()
    example['length'] = len(example['text'])
    return example


nlp = spacy.load('en_core_web_sm')
dataset = load_dataset('Sp1786/multiclass-sentiment-analysis-dataset')
train_dataset: Dataset = dataset['train'].filter(lambda it: len(it['text']) <= 128).map(process).sort('length')
validation_dataset: Dataset = dataset['validation'].filter(lambda it: len(it['text']) <= 128).map(process).sort(
    'length')
test_dataset: Dataset = dataset['test']

corpus = concatenate_datasets([train_dataset, validation_dataset])['text']
vocabulary = sorted(set(''.join(corpus)))

# TODO: cleanup data to only have English letters
char_to_i = {u: i for i, u in enumerate(vocabulary)}


def encode_x(char_to_i, message):
    return torch.tensor([char_to_i[char] for char in message])


def encode_y(label):
    vector = torch.zeros(3)
    vector[label] = 1
    return vector


train_messages = [encode_x(char_to_i, message) for message in train_dataset['text']]
train_labels = [encode_y(label) for label in train_dataset['label']]
validation_messages = [encode_x(char_to_i, message) for message in validation_dataset['text']]
validation_labels = [encode_y(label) for label in validation_dataset['label']]

print('Number of train messages:', len(train_messages))
print('Number of train labels:', len(train_labels))
print('Number of validation messages:', len(validation_messages))
print('Number of validation labels:', len(validation_labels))
print(max([len(message) for message in train_messages]))
print(len(vocabulary))


Filter:   0%|          | 0/31232 [00:00<?, ? examples/s]

Map:   0%|          | 0/25913 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5205 [00:00<?, ? examples/s]

Map:   0%|          | 0/4279 [00:00<?, ? examples/s]

Number of train messages: 25913
Number of train labels: 25913
Number of validation messages: 4279
Number of validation labels: 4279
128
503


In [49]:
train_dataset['text'][:100]

['😍',
 '💖',
 '😀',
 '👍',
 '🐂',
 'yy',
 'Ok',
 'aw',
 'ME',
 'TV',
 '♡♡',
 'no',
 'Gd',
 'ok',
 'gut',
 'Oke',
 'bad',
 'Fab',
 'Wow',
 '🙄🙄🙄',
 'yup',
 'top',
 'yep',
 'okk',
 'Bad',
 'Baf',
 'new',
 'not',
 'Xxx',
 'wew',
 'Try',
 'Gud',
 'Yas',
 'E u',
 'NYC',
 'Hey',
 'awe',
 'Pay',
 'Thx',
 'Nope',
 'heyy',
 'nice',
 'Kpai',
 'same',
 'Love',
 'lame',
 'Surp',
 'ouch',
 'Hell',
 'Cute',
 'Well',
 'Why?',
 'taco',
 'Nice',
 'best',
 'Why?',
 'Thop',
 'yay!',
 'why?',
 'yeah',
 'Cool',
 'rain',
 '😶😶😶😶',
 'lMMD',
 'Like',
 'with',
 'mean',
 'Also',
 'Fine',
 'like',
 'good',
 'awww',
 '55 o',
 'Nise',
 'شكرا',
 '****',
 'Bye.',
 '*hug*',
 'Oh no',
 'nice!',
 'uh oh',
 'I did',
 'Good.',
 'Good!',
 'Yep!!',
 'gmail',
 'Yeah.',
 'woOt!',
 'super',
 'supp?',
 'awee!',
 'yeahh',
 'Sucks',
 'Bekar',
 'Yayyy',
 'Great',
 'night',
 'thanx',
 'goood',
 'why??']

In [81]:
def create_batch(xs, ys, batch_size):
    index = np.random.choice(len(xs) - batch_size + 1)
    indices = range(index, index + batch_size)
    return nn.utils.rnn.pad_sequence([xs[i] for i in indices], batch_first=True), torch.stack(
        [ys[i] for i in indices])

In [82]:
# See https://docs.pytorch.org/docs/stable/generated/torch.nn.GRU.html.
class Model(nn.Module):

    def __init__(self, vocabulary_size, embedding_dim, hidden_size):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=embedding_dim)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, 3)

    def forward(self, sequence, hidden):
        # (batch_size, sequence_length) -> (batch_size, sequence_length, embedding_dim)
        embedded = self.embedding(sequence)
        # (batch_size, sequence_length, embedding_dim)
        # -> (batch_size, sequence_length, hidden_size), (num_layers, batch_size, hidden_size)
        prediction, hidden = self.rnn(embedded, hidden)
        # See https://docs.pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html#creating-the-network.
        return self.linear(prediction[:, -1, :]), hidden

In [83]:
# TODO: try different batch sizes
batch_size = 128
loss_fn = nn.CrossEntropyLoss(reduction="mean")

embedding_dim = 256
hidden_size = 1024
num_layers = 2
model = Model(len(vocabulary), embedding_dim, hidden_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

number_of_epoches = 1000
min_validation_loss = float('inf')
for epoch in range(number_of_epoches):
    x_batch, y_batch = create_batch(train_messages, train_labels, batch_size)
    h0 = torch.zeros(num_layers, batch_size, hidden_size)
    prediction, _ = model(x_batch.to(device), h0.to(device))
    loss = loss_fn(prediction, y_batch.to(device))

    loss.backward()
    optimizer.step()
    model.zero_grad()

    if epoch % 1 == 0 or epoch == number_of_epoches - 1:
        print(f'Epoch {epoch}, train loss {loss.item()}')

Epoch 0, train loss 140.5164337158203
Epoch 1, train loss 632.6210327148438
Epoch 2, train loss 519.5572509765625
Epoch 3, train loss 933.2319946289062
Epoch 4, train loss 705.7045288085938
Epoch 5, train loss 160.4541015625
Epoch 6, train loss 307.45819091796875
Epoch 7, train loss 271.8440856933594
Epoch 8, train loss 154.2937469482422
Epoch 9, train loss 238.359375
Epoch 10, train loss 144.48666381835938
Epoch 11, train loss 199.45755004882812
Epoch 12, train loss 254.4681396484375
Epoch 13, train loss 159.23524475097656
Epoch 14, train loss 221.72213745117188
Epoch 15, train loss 202.20468139648438
Epoch 16, train loss 138.87362670898438
Epoch 17, train loss 198.72479248046875
Epoch 18, train loss 151.6739959716797
Epoch 19, train loss 150.80471801757812
Epoch 20, train loss 164.68215942382812
Epoch 21, train loss 160.53231811523438
Epoch 22, train loss 148.97027587890625
Epoch 23, train loss 162.66848754882812
Epoch 24, train loss 151.57351684570312
Epoch 25, train loss 148.769821

KeyboardInterrupt: 

In [85]:
h0 = torch.zeros(num_layers, 1, hidden_size)
index = 1000
message = validation_dataset['text'][index]
label = validation_dataset['label'][index]
print(message)
print(label)
encoded_message = encode_x(char_to_i, message)
encoded_label = encode_y(label)
print(encoded_message)
print(encoded_label)
x_batch, y_batch = create_batch([encoded_message], [encoded_label], 1)
prediction, _ = model(x_batch.to(device), h0.to(device))
print(torch.nn.functional.softmax(prediction, dim=-1))

Speedbumps suck when u gotta piss!!
0
tensor([53, 82, 71, 71, 70, 68, 87, 79, 82, 85,  2, 85, 87, 69, 77,  2, 89, 74,
        71, 80,  2, 87,  2, 73, 81, 86, 86, 67,  2, 82, 75, 85, 85,  3,  3])
tensor([1., 0., 0.])
tensor([[0.3066, 0.3262, 0.3672]], device='mps:0', grad_fn=<SoftmaxBackward0>)
