In [1]:
import torch
import torch.nn as nn
import torchtext
from torchtext.data.functional import generate_sp_model, load_sp_model, sentencepiece_tokenizer, sentencepiece_numericalizer
from torchtext.vocab import build_vocab_from_iterator
import torchtext.transforms as T
from torch.nn import functional as F
import torch.optim as optim

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io
import math
from torch.utils.data import Dataset, DataLoader
import torch.utils.data.dataloader as dataloader
import os
import re



In [2]:
# Locations of the saved artifacts used by this notebook (nothing is loaded here).
# SentencePiece tokenizer model, consumed by the text_transform pipeline.
tokenizer_path = "sentencepiece/transformer.model"
# SentencePiece vocabulary listing.
# NOTE(review): not referenced by the cells shown here — the vocab-building
# cell uses its own hard-coded path; confirm which one is intended.
vocab_path = "sentencepiece/transformer.vocab"
# Checkpoint holding the classifier's state dict.
model_path = "models/gpt.pth"

In [3]:
def yield_tokens(file_path):
    """Yield one single-element token list per line of a tab-separated file.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Yields:
        A one-item list holding the text that precedes the first tab on the
        line. A line without any tab is yielded whole, trailing newline
        included.
    """
    with open(file_path, encoding='utf-8') as vocab_file:
        for entry in vocab_file:
            # Only the first column (the token itself) is kept.
            token, _, _ = entry.partition("\t")
            yield [token]

# Build the torchtext vocabulary from the tokens yielded by yield_tokens.
# The special tokens are placed ahead of the file's tokens (special_first=True):
#   <cls>  prepended to every sequence by text_transform
#   <pad>  filler used to bring sequences in a batch to a common length
#   <eos>  appended to mark the end of a sequence
#   <unk>  stand-in for any token that is not in the vocabulary
# NOTE(review): this path is hard-coded and differs from `vocab_path` defined
# in the paths cell — confirm which file is the intended one.
vocab = build_vocab_from_iterator(
    yield_tokens("../SentencePiece/transformer.vocab"),
    specials=['<cls>', '<pad>', '<eos>', '<unk>'],
    special_first=True,
)

# Look-ups of tokens missing from the vocabulary resolve to the '<unk>' index.
vocab.set_default_index(vocab['<unk>'])

In [4]:
# Preprocessing pipeline applied to raw text before it is fed to the model.
# The stages run in the order listed; the order is part of the behaviour.
text_transform = T.Sequential(
    # Tokenize with the SentencePiece model stored at tokenizer_path.
    T.SentencePieceTokenizer(tokenizer_path),
    # Map token strings to vocabulary indices.
    T.VocabTransform(vocab),
    # Prepend the '<cls>' index.
    T.AddToken(vocab['<cls>'], begin=True),
    # Cap the sequence (including '<cls>') at 254 ids before '<eos>' is added.
    T.Truncate(max_seq_len=254),
    # Append the '<eos>' index.
    T.AddToken(vocab['<eos>'], begin=False),
    # Convert to a tensor, using the '<pad>' index as the padding value.
    T.ToTensor(padding_value=vocab['<pad>']),
    # Pad every sequence out to 256 positions.
    # NOTE(review): pad_value=0 is not vocab['<pad>']; with the specials order
    # used when building `vocab`, id 0 appears to be '<cls>'. Presumably this
    # matches what the checkpoint was trained with — confirm before changing.
    T.PadTransform(max_length=256, pad_value=0),
)


In [5]:
# No changes

class FeedForward(nn.Module):
    """Two-layer MLP: widen to 4x, ReLU, project back, then dropout."""

    def __init__(self, n_embd):
        """Build the MLP.

        Args:
            n_embd: Size of the input and output feature dimension.

        The dropout probability comes from the notebook-level ``dropout``.
        """
        super().__init__()
        hidden_dim = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_embd),
            nn.Dropout(dropout),
        ]
        # The position of each layer determines its state-dict key
        # (net.0.*, net.2.*), so the order must stay as it is.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP and return the result."""
        mlp = self.net
        return mlp(x)

# Updated to unpack the tuple

class Block(nn.Module):
    """Transformer layer: LayerNorm -> self-attention and LayerNorm -> MLP, each added residually."""

    def __init__(self, n_embd, n_head):
        """Create the attention, feed-forward and normalisation sub-modules.

        Args:
            n_embd: Embedding width.
            n_head: Number of attention heads; each head gets
                ``n_embd // n_head`` features.
        """
        super().__init__()
        # Creation order is kept so parameter / state-dict ordering is unchanged.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ffwd = FeedForward(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``(output, attn_weights)``.

        ``attn_weights`` is whatever ``self.sa`` returned as its second value,
        passed through untouched.
        """
        attended, attn_weights = self.sa(self.ln1(x))
        after_attention = x + attended  # residual around attention
        after_mlp = after_attention + self.ffwd(self.ln2(after_attention))  # residual around MLP
        return after_mlp, attn_weights


# Updated to return attention weights

class Head(nn.Module):
    """One self-attention head that returns its output and its attention weights."""

    def __init__(self, head_size):
        """Create bias-free key / query / value projections and the dropout layer.

        Args:
            head_size: Output width of each projection.

        The input width and dropout probability come from the notebook-level
        ``n_embd`` and ``dropout``.
        """
        super().__init__()
        # Registered in the order key, query, value (state-dict / init order).
        for projection in ("key", "query", "value"):
            setattr(self, projection, nn.Linear(n_embd, head_size, bias=False))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x`` (a 3-D tensor) with no masking.

        Returns:
            ``(out, weights)``: the weighted sum of the value projections and
            the post-softmax, post-dropout attention weights.
        """
        _, _, channels = x.shape
        keys = self.key(x)
        queries = self.query(x)
        # NOTE(review): the scale uses the input's last dimension rather than
        # head_size. Kept as is — the saved weights presumably rely on it.
        scale = channels ** -0.5
        scores = torch.matmul(queries, keys.transpose(-2, -1)) * scale
        weights = self.dropout(F.softmax(scores, dim=-1))
        out = torch.matmul(weights, self.value(x))
        return out, weights


class MultiHeadAttention(nn.Module):
    """Runs several ``Head`` modules on the same input and merges their results."""

    def __init__(self, num_heads, head_size):
        """Create the heads, the output projection and the dropout layer.

        Args:
            num_heads: How many ``Head`` modules to create.
            head_size: Output width of each head.
        """
        super().__init__()
        heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(heads)
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``(out, attn_weights)``.

        ``out`` is the heads' outputs concatenated on the last dimension, then
        projected and passed through dropout. ``attn_weights`` stacks each
        head's attention weights along a new dimension 1.
        """
        per_head = [head(x) for head in self.heads]
        concatenated = torch.cat([result for result, _ in per_head], dim=-1)
        projected = self.dropout(self.proj(concatenated))
        # (B, num_heads, T, T)
        attn_weights = torch.stack([weights for _, weights in per_head], dim=1)
        return projected, attn_weights



class Transformer(nn.Module):
    """Sequence classifier: embeddings -> stacked ``Block`` layers -> linear head on position 0.

    All sizes (``vocab_size``, ``n_embd``, ``block_size``, ``n_head``,
    ``n_layer``, ``output_size``) are read from notebook-level variables when
    the model is instantiated.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size + 1, n_embd)
        self.blocks = nn.ModuleList([Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, output_size)

    def forward(self, idx):
        """Classify a batch of token-id sequences.

        Args:
            idx: 2-D integer tensor of token ids, one row per sequence.

        Returns:
            ``(logits, attn_weights)``. ``logits`` comes from the
            representation at position 0 of each sequence. ``attn_weights`` is
            the attention tensor returned by the last block, or ``None`` when
            the model has no blocks.
        """
        # Named seq_len (not T) so the notebook's `torchtext.transforms as T`
        # alias is not shadowed inside this method.
        _, seq_len = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        positions = torch.arange(seq_len, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C) - broadcasting

        # Defined up front so an empty block list does not leave it unbound.
        attn_weights = None
        for block in self.blocks:
            # Overwritten every iteration: only the last block's weights survive.
            x, attn_weights = block(x)

        x = self.ln_f(x)
        logits = self.lm_head(x[:, 0, :])  # (B, output_size) - position 0 holds the CLS token

        return logits, attn_weights

In [6]:
# Hyperparameters
# Run on the GPU when one is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model architecture. These are read by the model classes at construction time.
# NOTE(review): presumably they must equal the values the checkpoint was
# trained with, or load_state_dict will reject the weights — confirm.
vocab_size = len(vocab)
block_size = 256
n_embd = 768
n_head = 12
n_layer = 2
output_size = 2
dropout = 0.4

# Optimizer step size; no cell shown here trains the model.
learning_rate = 5e-4

In [7]:
# Rebuild the architecture and restore the saved weights.
model = Transformer()
# map_location remaps the checkpoint's tensors onto `device`; without it a
# checkpoint saved on a GPU cannot be loaded on a CPU-only machine, which
# defeats the 'cpu' fallback chosen for `device`.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
model.load_state_dict(torch.load(model_path, map_location=device))
model = model.to(device)
# Inference mode: turns off dropout.
model.eval()
print("Model loaded successfully.")

Model loaded successfully.


In [10]:
def predict_single_text(model, text):
    """Classify a single text as 'negative' or 'positive'.

    Relies on the notebook-level ``text_transform`` and ``device``. As a side
    effect it prints the shape of the token tensor and the class
    probabilities.

    NOTE(review): a later cell defines another function with this same name
    and a longer signature; once that cell runs, this definition is replaced.

    Args:
        model: Callable returning ``(logits, attention_weights)`` for a batch
            of token ids.
        text: The raw string to classify.

    Returns:
        'negative' when the highest-probability class index is 0, otherwise
        'positive'.
    """
    # text_transform takes a list of strings, so the batch dimension is already there.
    batch = text_transform([text]).to(device)
    print(batch.shape)
    with torch.no_grad():
        logits, _ = model(batch)
    probs = F.softmax(logits, dim=-1)
    predicted = torch.argmax(probs, dim=1)
    print(probs)
    return 'negative' if predicted.item() == 0 else 'positive'

# Example usage: classify one sample movie review.
text = "Meryl Streep as Kate, a woman dying of cancer, performs her role admirably. No wonder she was up for an Oscar. In the part she proves that caring and nurturing housewives are just as important as their sisters out in the business world. And the lesson she teaches about life's expectations and their lack of fulfillment as the relationship grows, that is the most important thing she teaches her daughter. We can expect too much of our mates. Realize that there are many slips and forgiveness or understanding are the main ingredients of a happy life. This is a sombre movie and the ending though sad, shows reconciliation between the father and daughter. I give this one a ten."


output = predict_single_text(model, text)


# `output` is the predicted label string ('negative' / 'positive'), not a
# probability vector, so it is labelled as a sentiment.
print("Predicted sentiment:", output)

torch.Size([1, 256])
tensor([[0.1060, 0.8940]], device='cuda:0')
Predicted probabilities: positive


In [16]:
def predict_single_text(model, text, text_transform, vocab, top_k=10):
    """Classify one text and report the tokens position 0 attends to most.

    As a side effect it prints the token tensor shape, the class
    probabilities, the attention tensor shape and the top attended tokens.

    Args:
        model: ``nn.Module`` returning ``(logits, attn_weights)`` for a batch
            of token ids. ``attn_weights`` is indexed as
            ``[sample, head, query position, key position]``.
        text: The raw string to classify.
        text_transform: Callable turning a list of strings into a 2-D tensor
            of token ids.
        vocab: Object whose ``lookup_token(index)`` maps an id back to its
            token string.
        top_k: How many of the most attended positions to report (default 10).

    Returns:
        'negative' when the highest-probability class index is 0, otherwise
        'positive'.
    """
    tokens = text_transform([text])
    # Put the input on the model's own device, not on a notebook-level global,
    # so the function does not depend on hidden state.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        tokens = tokens.to(first_param.device)
    print(tokens.shape)

    with torch.no_grad():
        logits, attn_weights = model(tokens)

    # Class probabilities and the predicted class index.
    probs = F.softmax(logits, dim=-1)
    label = torch.argmax(probs, dim=1)

    print(probs)

    print(attn_weights.shape)

    # Attention map of the last head for the first sample: (T, T).
    att_map = attn_weights[0, -1, :, :]

    # Row 0 only (no summation): how strongly position 0, where the pipeline
    # puts the <cls> token, attends to every position in the sequence.
    cls_attention = att_map[0]

    # Positions with the largest weights, mapped back to token strings.
    top_positions = cls_attention.argsort(descending=True)[:top_k]
    top_tokens = [vocab.lookup_token(tokens[0][pos].item()) for pos in top_positions]

    print(f"Top {top_k} tokens with highest attention:", top_tokens)

    return 'negative' if label.item() == 0 else 'positive'

# Example usage: run the attention-reporting predict_single_text defined in
# this cell on one sample movie review and print the returned label.
text = "Meryl Streep as Kate, a woman dying of cancer, performs her role admirably. No wonder she was up for an Oscar. In the part she proves that caring and nurturing housewives are just as important as their sisters out in the business world. And the lesson she teaches about life's expectations and their lack of fulfillment as the relationship grows, that is the most important thing she teaches her daughter. We can expect too much of our mates. Realize that there are many slips and forgiveness or understanding are the main ingredients of a happy life. This is a sombre movie and the ending though sad, shows reconciliation between the father and daughter. I give this one a ten."


# The function prints its diagnostics itself; `output` is 'negative' or 'positive'.
output = predict_single_text(model, text, text_transform, vocab)
print("Predicted sentiment:", output)

torch.Size([1, 256])
tensor([[0.1060, 0.8940]], device='cuda:0')
torch.Size([1, 12, 256, 256])
Top 10 tokens with highest attention: ['▁the', '.', '.', '▁the', '.', '▁the', '.', '▁the', '▁the', '▁the']
Predicted sentiment: positive
