In [48]:
import os
from datetime import datetime

import spacy
import torch
from datasets import load_dataset, Dataset
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm
import comgra
from comgra.objects import DecisionMakerForRecordingsFrequencyPerType
from comgra.recorder import ComgraRecorder

In [49]:
def determine_device():
    if torch.cuda.is_available():
        return 'cuda'
    elif torch.backends.mps.is_available():
        return 'mps'
    else:
        return 'cpu'


device = determine_device()
print(f'Device is {device}')

Device is mps


In [50]:
def process(example):
    example['text'] = example['text'].strip()
    example['length'] = len(example['text'])
    return example


def tokenize_messages(messages):
    tokens = set()
    tokenized_messages = []
    for message in tqdm(messages):
        doc = nlp(message)
        tokenized_message = [token.text.lower() for token in doc]
        tokens.update(tokenized_message)
        tokenized_messages.append(tokenized_message)
    return list(tokens), tokenized_messages


def encode_x(token_to_index, tokens):
    return torch.tensor([token_to_index[token] for token in tokens if token in token_to_index])


# Returns a one-hot encoding of a label, e.g., (0, 1, 0).
def encode_y(label):
    y = torch.zeros(3)
    y[label] = 1
    return y

In [51]:
nlp = spacy.load('en_core_web_sm')

max_length = float('inf')
dataset_huggingface = load_dataset('Sp1786/multiclass-sentiment-analysis-dataset')
train_dataset_huggingface: Dataset = dataset_huggingface['train'].filter(lambda it: len(it['text']) <= max_length).map(process).sort('length')
val_dataset_huggingface: Dataset = dataset_huggingface['validation'].filter(lambda it: len(it['text']) <= max_length).map(process).sort(
    'length')
test_dataset_huggingface: Dataset = dataset_huggingface['test'].filter(lambda it: it['text'] is not None and len(it['text']) <= max_length).map(
    process).sort('length')

train_tokens, train_tokenized_messages = tokenize_messages(train_dataset_huggingface['text'])
validation_tokens, val_tokenized_messages = tokenize_messages(val_dataset_huggingface['text'])
test_tokens, test_tokenized_messages = tokenize_messages(test_dataset_huggingface['text'])

Connection closed unexpectedly!


Map:   0%|          | 0/31232 [00:00<?, ? examples/s]

Map:   0%|          | 0/5205 [00:00<?, ? examples/s]

Map:   0%|          | 0/5205 [00:00<?, ? examples/s]

100%|██████████| 31232/31232 [01:18<00:00, 396.43it/s]
100%|██████████| 5205/5205 [00:13<00:00, 395.60it/s]
100%|██████████| 5205/5205 [00:12<00:00, 404.46it/s]


In [52]:
# the full list of tokens is sorted to ensure that the encoding of the messages stays the same between the Jupiter Notebook reloads, so that the saved models could be loaded and used for inference
tokens = sorted(set(train_tokens + validation_tokens + test_tokens))
print(tokens[:20])

vocabulary = {token: index for index, token in enumerate(tokens)}
print(len(vocabulary))

token_to_index = {u: i for i, u in enumerate(vocabulary)}

['\t ', '\n', '\n\n', ' ', '  ', '   ', '    ', '     ', '      ', '       ', '        ', '             ', '              ', '               ', '                ', '                                           ', '                                                                                              ', '!', '"', '"-']
36633


In [53]:
train_messages = [encode_x(token_to_index, tokens) for tokens in train_tokenized_messages]
train_labels = [encode_y(label) for label in train_dataset_huggingface['label']]
val_messages = [encode_x(token_to_index, tokens) for tokens in val_tokenized_messages]
val_labels = [encode_y(label) for label in val_dataset_huggingface['label']]
test_messages = [encode_x(token_to_index, tokens) for tokens in test_tokenized_messages]
test_labels = [encode_y(label) for label in test_dataset_huggingface['label']]
print(len(train_messages))
print(len(train_labels))
print(len(val_messages))
print(len(val_labels))
print(len(test_messages))
print(len(test_labels))

31232
31232
5205
5205
5205
5205


In [54]:
index = 20000
print(train_dataset_huggingface['text'][index], train_tokenized_messages[index])

At least he`s in breakthrough performance tho. I just wanted him nominated in his own category ['at', 'least', 'he`s', 'in', 'breakthrough', 'performance', 'tho', '.', 'i', 'just', 'wanted', 'him', 'nominated', 'in', 'his', 'own', 'category']


In [55]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

train_dataset = CustomDataset(train_messages, train_labels)
val_dataset = CustomDataset(val_messages, val_labels)
test_dataset = CustomDataset(test_messages, test_labels)


def collate_fn(batch):
    # TODO: a way to pass padding_value without closure?
    return [
        nn.utils.rnn.pad_sequence([x[0] for x in batch], batch_first=True, padding_value=len(vocabulary)),
        torch.stack([x[1] for x in batch])
    ]

BATCH_SIZE = 128
SHUFFLE = False
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=SHUFFLE, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=SHUFFLE, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=SHUFFLE, collate_fn=collate_fn)

In [85]:
# See https://docs.pytorch.org/docs/stable/generated/torch.nn.GRU.html.
class Model(nn.Module):

    def __init__(self, vocabulary_size, embedding_dim, hidden_size, num_layers):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=embedding_dim)
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, 3)
        self.log_softmax = nn.LogSoftmax(
            dim=1)  # The negative log likelihood loss expects log-probabilities of each class.

    def forward(self, sequence, hidden):
        # (batch_size, sequence_length) -> (batch_size, sequence_length, embedding_dim)
        embedded = self.embedding(sequence)
        # (batch_size, sequence_length, embedding_dim)
        # -> (batch_size, sequence_length, hidden_size), (num_layers, batch_size, hidden_size)
        prediction, hidden = self.rnn(embedded, hidden)
        # See https://docs.pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html#creating-the-network.
        linear_prediction = self.linear(prediction[:, -1])
        return self.log_softmax(linear_prediction), hidden

In [86]:
comgra.my_recorder = ComgraRecorder(
    comgra_root_path=os.path.join(os.getcwd(), "comgra"),
    group="name_of_experiment_group",
    trial_id="example_trial",
    decision_maker_for_recordings=DecisionMakerForRecordingsFrequencyPerType(min_training_steps_difference=5),
)
comgra.my_recorder.add_note("This is an optional log message that will show up in the 'Notes' tab.")

model_dir = os.path.join(os.getcwd(), "models")
os.makedirs(model_dir, exist_ok=True)

embedding_dim, hidden_size, num_layers = 4, 128, 1
model = Model(len(vocabulary), embedding_dim, hidden_size, num_layers).to(device)

# See https://discuss.pytorch.org/t/difference-between-cross-entropy-loss-or-log-likelihood-loss/38816/2.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)
comgra.my_recorder.track_module("main_model", model)

# See https://docs.pytorch.org/tutorials/beginner/introyt/trainingyt.html.
REPORT_EVERY = 10


def train_one_epoch(epoch):
    running_loss = 0.
    last_loss = 0.

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in enumerate(train_dataloader):
        # Every data instance is an input + label pair
        x, y = data[0].to(device), data[1].to(device)
        b, _ = x.shape
        h0 = torch.zeros(num_layers, b, hidden_size).to(device)
        comgra.my_recorder.start_batch(epoch * 1000 + i, b) # TODO: figure out the correct way to calculate the training_step
        comgra.my_recorder.start_iteration()

        # See https://github.com/FlorianDietz/comgra?tab=readme-ov-file#known-issues.
        comgra.my_recorder.register_tensor("inputs", x.float(), is_input=True) # TODO: h0?
        b = x * 1.0
        b.requires_grad = True
        comgra.my_recorder.add_tensor_connection("inputs", b)
        # Make predictions for this batch
        y_pred, _ = model(x, h0)
        comgra.my_recorder.register_tensor("outputs", y_pred)
        comgra.my_recorder.register_tensor("targets", y.float(), is_target=True)
        c = y * 1.0
        comgra.my_recorder.add_tensor_connection("targets", c)

        # Compute the loss and its gradients
        loss = loss_fn(y_pred, y.to(device))
        comgra.my_recorder.register_tensor("loss", loss, is_loss=True)
        comgra.my_recorder.record_kpi_in_graph("loss", "", loss)

        # Zero your gradients for every batch!
        optimizer.zero_grad()
        loss.backward()

        # Adjust learning weights
        optimizer.step()
        comgra.my_recorder.record_current_gradients(f"gradients")
        comgra.my_recorder.finish_iteration()
        comgra.my_recorder.finish_batch()

        # Gather data and report
        running_loss += loss.item()
        if i % REPORT_EVERY == REPORT_EVERY - 1:
            last_loss = running_loss / REPORT_EVERY  # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            running_loss = 0.

    return last_loss

In [90]:
# Initializing in a separate cell so we can easily add more epochs to the same run
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
epoch_number = 0

EPOCHS = 100

best_val_loss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(epoch)

    running_val_loss = 0.0
    # Set the model to evaluation mode, disabling dropout and using population
    # statistics for batch normalization.
    model.eval()

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for i, val_data in enumerate(val_dataloader):
            val_x, val_y = val_data
            b, _ = val_x.shape
            h0 = torch.zeros(num_layers, b, hidden_size)
            val_y_pred, _ = model(val_x.to(device), h0.to(device))
            val_loss = loss_fn(val_y_pred, val_y.to(device))
            running_val_loss += val_loss

    avg_val_loss = running_val_loss / (i + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_val_loss))

    # Track the best performance, and save the model's state
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model_path = os.path.join(model_dir, 'model_{}_{}.pt'.format(timestamp, epoch_number))
        torch.save(model.state_dict(), model_path)

    epoch_number += 1
comgra.my_recorder.finalize()

EPOCH 1:
  batch 10 loss: 1.0847641348838806
  batch 20 loss: 1.0678720355033875
  batch 30 loss: 1.067219650745392
  batch 40 loss: 1.0844271779060364
  batch 50 loss: 1.0749964475631715
  batch 60 loss: 1.0738522052764892
  batch 70 loss: 1.066994273662567
  batch 80 loss: 1.0814654231071472
  batch 90 loss: 1.0740254402160645
  batch 100 loss: 1.0746989727020264
  batch 110 loss: 1.0760101079940796
  batch 120 loss: 1.0711143732070922
  batch 130 loss: 1.056877851486206
  batch 140 loss: 1.0690032720565796
  batch 150 loss: 1.079414188861847
  batch 160 loss: 1.0485008716583253
  batch 170 loss: 1.0582187533378602
  batch 180 loss: 1.0567939162254334
  batch 190 loss: 1.0566339492797852
  batch 200 loss: 1.061097502708435
  batch 210 loss: 1.0564129829406739
  batch 220 loss: 1.0344078063964843
  batch 230 loss: 1.0180722057819367
  batch 240 loss: 1.0138613760471344
LOSS train 1.0138613760471344 valid 1.0475882291793823
EPOCH 2:
  batch 10 loss: 1.0016878426074982
  batch 20 loss: 

In [95]:
model = Model(len(vocabulary), embedding_dim, hidden_size, num_layers).to(device)
model.load_state_dict(
    torch.load(os.path.join(os.getcwd(), 'model_20250715_004909_4'), map_location=torch.device(device)))

<All keys matched successfully>

In [119]:
# Measuring the model performance
model.eval()
correct = 0
total = 0
for index in range(len(test_dataset_huggingface)):
    h0 = torch.zeros(num_layers, 1, hidden_size)
    message = test_dataset_huggingface['text'][index]
    label = test_dataset_huggingface['label'][index]
    encoded_message = encode_x(token_to_index, test_tokenized_messages[index])
    encoded_label = encode_y(label)
    x_batch, y_batch = torch.unsqueeze(encoded_message, dim=0), torch.unsqueeze(encoded_label, dim=0)
    prediction, _ = model(x_batch.to(device), h0.to(device))
    # https://docs.pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html#creating-the-network
    _, top_i = torch.topk(prediction, k=1)
    labels = ['negative', 'neutral', 'positive']
    if labels[label] == labels[top_i[0].item()]:
        correct += 1
    total += 1
    if index % 100 == 0:
        print(f'Finished test {index}, accuracy is {correct / total}')

print(correct, total)
model.train()

Finished test 0, accuracy is 0.0
Finished test 100, accuracy is 0.5742574257425742
Finished test 200, accuracy is 0.6616915422885572
Finished test 300, accuracy is 0.6677740863787376
Finished test 400, accuracy is 0.683291770573566
Finished test 500, accuracy is 0.6806387225548902
Finished test 600, accuracy is 0.6905158069883528
Finished test 700, accuracy is 0.7004279600570613


KeyboardInterrupt: 

In [118]:
model.eval()
h0 = torch.zeros(num_layers, 1, hidden_size)
message = "I never hated you"
message = "I do not hate you"
message = "I'm sick of that"
message = "My computer is great"
_, tokenized_messages = tokenize_messages([message])
encoded_message = encode_x(token_to_index, tokenized_messages[0])
x_batch = torch.unsqueeze(encoded_message, dim=0)
prediction, _ = model(x_batch.to(device), h0.to(device))
print(prediction)
_, top_i = torch.topk(prediction, k=1)
labels = ['negative', 'neutral', 'positive']
print(labels[top_i[0].item()])
model.train()

100%|██████████| 1/1 [00:00<00:00, 206.23it/s]

tensor([[-0.8819, -1.8181, -0.8588]], device='mps:0',
       grad_fn=<LogSoftmaxBackward0>)
positive





Model(
  (embedding): Embedding(36633, 4)
  (rnn): GRU(4, 128, batch_first=True)
  (linear): Linear(in_features=128, out_features=3, bias=True)
  (log_softmax): LogSoftmax(dim=1)
)