In [1]:
# https://www.kaggle.com/code/emirkocak/in-depth-series-sentiment-analysis-w-transformers
import torch
import torch.nn as nn
import torchtext
from torchtext.data.functional import generate_sp_model, load_sp_model, sentencepiece_tokenizer, sentencepiece_numericalizer
from torchtext.vocab import build_vocab_from_iterator
import torchtext.transforms as T
from torch.nn import functional as F
import torch.optim as optim

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io
import math
from torch.utils.data import Dataset, DataLoader
import torch.utils.data.dataloader as dataloader
import os
import re



In [2]:
# Locations of the saved artifacts used by this notebook:
# the SentencePiece tokenizer model / vocab files and the classifier checkpoint.
tokenizer_path = "../SentencePiece/transformer.model"
vocab_path = "../SentencePiece/transformer.vocab"
model_path = "../models/final_transformer.pth"

In [3]:
def yield_tokens(file_path):
    """Stream the tokens stored in a tab-separated vocab file.

    Each line of the file is cut at its first tab; the text before the tab
    (the whole line, if it contains no tab) is yielded wrapped in a
    one-element list.

    file_path: path to the UTF-8 encoded vocab file.
    """
    with io.open(file_path, encoding='utf-8') as vocab_file:
        for entry in vocab_file:
            # Keep only the first column of the line
            token, _, _ = entry.partition("\t")
            yield [token]

# Build the torchtext vocabulary from the SentencePiece vocab file.
# `yield_tokens` returns a generator, so the file is streamed rather than loaded at once.
# The special tokens are placed first (special_first=True), in the order listed:
#   <cls> is prepended to every sequence; the model output at that position is used for classification
#   <pad> is a padding token added to the end of a sequence so all sequences in a batch have the same length
#   <eos> signals the "End-Of-Sentence", i.e. the end of the sequence
#   <unk> "unknown" token, used for any token that is not contained in the vocab
# The path comes from `vocab_path` (defined with the other artifact paths) instead of a duplicated literal.
vocab = build_vocab_from_iterator(
    yield_tokens(vocab_path),
    specials=['<cls>', '<pad>', '<eos>', '<unk>'],
    special_first=True,
)

# Set the default index for unknown tokens to the index of the '<unk>' token
vocab.set_default_index(vocab['<unk>'])

In [4]:
# Pipeline turning a batch of raw strings into a fixed-width tensor of token ids.
_transform_steps = [
    # Split the text into SentencePiece sub-word tokens
    T.SentencePieceTokenizer(tokenizer_path),
    # Map each token to its vocab index
    T.VocabTransform(vocab),
    # Prepend <cls>
    T.AddToken(vocab['<cls>'], begin=True),
    # Cap the sequence (including <cls>) at 254 ids so <eos> still fits below 256
    T.Truncate(max_seq_len=254),
    # Append <eos>
    T.AddToken(vocab['<eos>'], begin=False),
    # Convert to a tensor, padding ragged sequences in the batch with <pad>
    T.ToTensor(padding_value=vocab['<pad>']),
    # Right-pad every sequence to exactly 256 ids.
    # NOTE(review): pad_value=0 is the index of '<cls>' given the specials order used
    # to build `vocab`, not '<pad>' -- confirm against the training pipeline before
    # changing it, since the saved checkpoint may have been trained with this value.
    T.PadTransform(max_length=256, pad_value=0),
]
text_transform = T.Sequential(*_transform_steps)


In [5]:
class TokenDrop(nn.Module):
    """For a batch of token indices, randomly replace non-special tokens with the pad token.

    prob (float): probability of dropping a token
    pad_token (int): index written in place of a dropped token (intended to be the <pad> index)
    num_special (int): number of special tokens, assumed to be at the start of the vocab;
        indices below this value are never dropped
    """

    def __init__(self, prob=0.1, pad_token=0, num_special=4):
        # Initialise nn.Module's internal state; without this repr(), .to(),
        # .parameters() etc. raise AttributeError on the instance.
        super().__init__()
        self.prob = prob
        self.num_special = num_special
        self.pad_token = pad_token

    def forward(self, sample):
        """Return a copy of `sample` with randomly chosen non-special tokens replaced.

        sample: tensor of token indices.
        Returns a tensor of the same shape as `sample`.
        """
        # Draw one Bernoulli(prob) value per position: 1 = candidate for dropping
        mask = torch.bernoulli(self.prob * torch.ones_like(sample)).long()

        # 1 where the token may be dropped, 0 where it is a special token
        can_drop = (sample >= self.num_special).long()

        # A position is dropped only if it was sampled AND is not a special token
        mask = mask * can_drop

        # Tensor filled with pad_token, used as the replacement value
        replace_with = torch.full_like(sample, self.pad_token).long()

        # (1 - mask) is the complement mask: 1 - 1 = 0 (dropped), 1 - 0 = 1 (kept).
        # Kept positions retain the original token, dropped positions take pad_token.
        sample_out = (1 - mask) * sample + mask * replace_with

        return sample_out

In [6]:
class SinusoidalPosEmb(nn.Module):
    """Fixed (non-learned) sinusoidal embedding of position indices.

    dim: requested embedding width. Half of the channels hold sines and the
        other half cosines, so the output width is 2 * (dim // 2).
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        """Embed the positions in `x`.

        x: tensor of positions; a new axis is inserted at dim 1 before the
            positions are multiplied by the frequency vector.
        Returns the sine channels followed by the cosine channels,
        concatenated along the last dimension.
        """
        num_freqs = self.dim // 2
        # Geometric frequency schedule from 1 down to 1/10000
        log_step = math.log(10000) / (num_freqs - 1)
        freqs = torch.exp(torch.arange(num_freqs, device=x.device) * -log_step)
        # Outer product: one row per position, one column per frequency
        angles = x.unsqueeze(1) * freqs.unsqueeze(0)
        return torch.cat((angles.sin(), angles.cos()), dim=-1)

class NanoTransformer(nn.Module):
    """Sequence classifier built on the encoder stack of ``nn.Transformer``.

    NOTE: a later cell of this notebook defines another class with the same
    name; once that cell runs, it replaces this definition.

    num_emb: number of tokens in the vocabulary.
    output_size: number of output classes.
    hidden_size: embedding / model dimension (default: 128).
    num_heads: number of attention heads (default: 4).
    """

    def __init__(self, num_emb, output_size, hidden_size=128, num_heads=4):
        super(NanoTransformer, self).__init__()
        self.embedding = nn.Embedding(num_emb, hidden_size)
        # Scale down the initial embedding weights
        self.embedding.weight.data = 0.001 * self.embedding.weight.data
        self.pos_embedding = SinusoidalPosEmb(hidden_size)
        self.transformer = nn.Transformer(hidden_size, num_heads)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Classify a batch of token-id sequences.

        x: (batch, seq_len) tensor of token indices.
        Returns a (batch, output_size) tensor of logits.
        """
        # Token embedding plus a position embedding broadcast over the batch
        x = self.embedding(x) + self.pos_embedding(torch.arange(x.size(1), device=x.device))
        x = x.permute(1, 0, 2)  # Transformer expects (seq_len, batch, embed_dim)
        # nn.Transformer.forward needs both a source and a target sequence, so calling
        # it with a single argument raises TypeError. For classification only the
        # encoder stack is needed, so run that directly.
        x = self.transformer.encoder(x)
        # Average over the sequence dimension to get one vector per example
        x = x.mean(dim=0)
        return self.fc(x)

In [7]:
class NanoTransformer(nn.Module):
    """
        Simplified Transformer-style model for sequence classification.

        Pipeline: token embedding + sinusoidal positional embedding ->
        one multi-head self-attention layer -> MLP -> linear output layer.

        num_emb: The number of unique tokens in the vocabulary. (vocab_size)
        output_size: The size of the output layer (number of classes).
        hidden_size: The embedding / hidden dimension used by every layer (default: 128)
        num_heads: The number of heads in the multi-head attention layer (default: 4).
    """
    def __init__(self, num_emb, output_size, hidden_size=128, num_heads=4):

        # Initialise nn.Module's internal state
        super(NanoTransformer, self).__init__()

        # Create an embedding for each token: weight is (num_emb, hidden_size)
        self.embedding = nn.Embedding(num_emb, hidden_size)

        # Scaling down the embedding weights
        self.embedding.weight.data = 0.001 * self.embedding.weight.data

        # Positional embedding
        self.pos_emb = SinusoidalPosEmb(hidden_size)

        # Multi-head attention; batch_first so tensors are (batch, seq_len, hidden_size)
        self.multihead_attn = nn.MultiheadAttention(hidden_size, num_heads=num_heads, batch_first=True)

        # Position-wise MLP: (batch, seq_len, hidden_size) -> (batch, seq_len, hidden_size)
        self.mlp = nn.Sequential(nn.Linear(hidden_size, hidden_size),
                                 nn.LayerNorm(hidden_size),
                                 nn.ELU(),
                                 nn.Linear(hidden_size, hidden_size))

        # Output layer: (batch, seq_len, hidden_size) -> (batch, seq_len, output_size)
        self.fc_out = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq):
        """Run the model on a batch of token-id sequences.

        input_seq: (batch, seq_len) tensor of token indices.
        Returns a tuple (logits, attn_map): logits has shape
        (batch, seq_len, output_size) and attn_map holds the attention weights
        returned by the multi-head attention layer.
        """
        # Unpacking enforces a 2-D (batch, seq_len) input
        _, seq_len = input_seq.shape

        input_embs = self.embedding(input_seq)  # (batch, seq_len, hidden_size)

        # Add a unique embedding to each token embedding depending on its position in the
        # sequence. The indices are created on the input's own device, so the module does
        # not depend on a notebook-level `device` variable.
        seq_indx = torch.arange(seq_len, device=input_seq.device)  # (seq_len,)

        # (1, seq_len, hidden_size); broadcasts over the batch dimension in the sum below
        pos_emb = self.pos_emb(seq_indx).reshape(1, seq_len, -1)

        embs = input_embs + pos_emb  # (batch, seq_len, hidden_size)

        # Self-attention: query, key and value are all the same sequence
        output, attn_map = self.multihead_attn(embs, embs, embs)

        output = self.mlp(output)  # (batch, seq_len, hidden_size)

        return self.fc_out(output), attn_map

In [8]:
# Set the device to GPU if available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
learning_rate = 1e-4
nepochs = 20
hidden_size = 256
output_size = 2
num_heads = 4

# Build the classifier from the hyperparameters above and move it to the device once
tf_classifier = NanoTransformer(
    num_emb=len(vocab),
    output_size=output_size,
    hidden_size=hidden_size,
    num_heads=num_heads,
).to(device)

# Initialize the optimizer
optimizer = optim.Adam(tf_classifier.parameters(), lr=learning_rate, weight_decay=1e-4)

# Cosine annealing scheduler to decay the learning rate
lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=nepochs, eta_min=0)

loss_fn = nn.CrossEntropyLoss()

# Token-dropping augmentation; uses TokenDrop's default pad_token and num_special
td = TokenDrop(prob=0.4)

# Per-epoch metric histories
training_loss_list = []
test_loss_list = []
training_acc_list = []
test_acc_list = []

In [9]:
# Load the model
# map_location remaps the checkpoint's tensors onto the current device, so weights
# saved on a GPU can still be loaded on a CPU-only machine.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
tf_classifier.load_state_dict(torch.load(model_path, map_location=device))
# Switch to evaluation mode; as the last expression this also displays the model summary
tf_classifier.eval()

NanoTransformer(
  (embedding): Embedding(30003, 256)
  (pos_emb): SinusoidalPosEmb()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
  )
  (mlp): Sequential(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (2): ELU(alpha=1.0)
    (3): Linear(in_features=256, out_features=256, bias=True)
  )
  (fc_out): Linear(in_features=256, out_features=2, bias=True)
)

In [51]:
# Example test cases
test_sentences = [
    "Truly bad and easily the worst episode I have ever seen....ever. They tried to make up for it by giving it the, 'we know we are doing this' routine. That would have been funny if it weren't for the fact that 'The Simpsons' had already done it. And it still wouldn't make up for it if they had come up with the idea in the first place. The flashbacks took place as part of the usual character's (mainly J.D's) fantasies. The flashbacks weren't even of actual events that occurred, just compilations of say, J.D falling over or, i don't know.... Elliott falling over. If I wanted to watch a Scrubs compilation i'd go on youtube and not waste half an hour of my life. Scrubs has ultimately fallen into the trap that most sit-coms have to, and it disappoints me, they managed to go 5 and a quarter seasons without an episode like this.  I was hoping that scrubs wouldn't have to be that kind of sit-com. And just as a passing thought, why the hell was Dr.Cox bald?"
]

# Transform each sentence into its own batch-of-one tensor of token ids on the model's device
test_tokens = [text_transform([sentence]).to(device) for sentence in test_sentences]

# Perform inference (no gradients needed)
with torch.no_grad():
    # Pair every sentence with its own token tensor so each printed result matches its input.
    # `tokens` and `attn_map` stay bound to the last sentence after the loop and are
    # inspected by the following cells.
    for sentence, tokens in zip(test_sentences, test_tokens):

        print(tokens)

        # Make predictions
        predictions, attn_map = tf_classifier(tokens)

        # Logits at sequence position 0, where text_transform puts the <cls> token
        predicted_classes = predictions[:, 0, :]

        print(predicted_classes)

        # Index of the highest logit (each tensor holds a single sentence)
        predicted_class = predicted_classes.argmax(dim=-1).item()

        # Print results
        print(f"Sentence: {sentence} -> Predicted Sentiment: {predicted_class}")


tensor([[    0,  5475,     3,  4291,  3131,  7370,  6652, 12558, 27459, 29717,
         13064,  5475,     3, 15667, 13226, 24862,    87,  1829,    82,  5475,
             3,  2362, 28035, 27704, 18880, 28684, 14192, 17175,  8773, 14886,
         17175, 27459,     3,  5482,  5272, 17818, 29333,  6926, 12139, 27527,
            26, 24208,    82,  5475,     3,  2308, 29722, 15667,  7682, 14507,
         16394, 17175, 29407,    26,  4771, 14192, 27459, 13532, 27457,  5482,
             3,  2321,  5475,     3,  2489,  3374,  4554,  4315,    26, 15430,
          6507, 12179, 17175,    82,  5475,     3,  3435, 17175, 26438, 29723,
            26,  4771, 18880, 28684, 14192, 17175, 16394, 27499, 15430,  9957,
         28684, 29615, 27459, 16359, 16578, 27459, 13950, 21924,    82,  5475,
             3,  2321, 14010, 27755, 21924,  7033, 21379, 20837, 27459, 28741,
          9300,    26,  4315,  5492,  3151,  5475,     3,    82,     3,    26,
          4315,    34, 13620,    82,  5475,     3,  

In [53]:
# Shape of the attention weights returned by the classifier for the last sentence processed
attn_map.shape

torch.Size([1, 256, 256])

In [50]:
# Attention weights of the first sequence, from position 0 (the <cls> token) to every position
att_map = attn_map[0, 0]
# How many of the most-attended positions to inspect
num_top_tokens = 10
# NOTE: the names `top5` / `top5_tokens` are kept because another cell reads `top5_tokens`,
# even though `num_top_tokens` positions are selected.
top5 = att_map.argsort(descending=True)[:num_top_tokens]
# Vocab.lookup_tokens takes a list of ints, so convert the selected ids with .tolist()
top5_tokens = vocab.lookup_tokens(tokens[0, top5].tolist())

In [49]:
# Display the tokens at the positions that received the most attention from position 0
top5_tokens

['▁(', '▁if', '▁if', '▁idea', 'ru', '▁been', 'ly', '▁easily', '▁or', '▁the']