In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import tqdm
import lightning.pytorch as pl
import pandas as pd
import random


In [2]:
with open("data/tiny-shakespeare.txt") as f:
    text = f.read()


In [3]:
# chunk the text into sequences of length seq_length
seq_length = 64
sequences = []
for i in range(0, len(text) - seq_length, seq_length):
    sequences.append(text[i:i + seq_length])


In [4]:
print(len(sequences), sequences[0])


17428 First Citizen:
Before we proceed any further, hear me speak.

Al


In [5]:
@torch.no_grad()
def encode(string: str) -> torch.Tensor:
    string = string.encode('utf-8')
    return torch.as_tensor([int(c)+1 for c in string])

@torch.no_grad()
def decode(arr: torch.Tensor) -> str:
    arr = arr.tolist()
    return ''.join([chr(c-1) for c in arr])


In [6]:
print(encode('hello'))
print(decode(encode('hello')))


tensor([105, 102, 109, 109, 112])
hello


In [7]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1024):
        super().__init__()
        self.encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        self.encoding[:, 0::2] = torch.sin(position * div_term)
        self.encoding[:, 1::2] = torch.cos(position * div_term)
        self.encoding = self.encoding.unsqueeze(0).transpose(0, 1)

    def forward(self, x):
        return x + self.encoding[:x.size(0), :]

class TransformerDecoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = nn.ReLU()

    def forward(self, tgt, tgt_mask=None):
        tgt2 = self.norm1(tgt)
        tgt2, _ = self.self_attn(tgt2, tgt2, tgt2, attn_mask=tgt_mask)
        tgt = tgt + self.dropout1(tgt2)
        tgt2 = self.norm2(tgt)
        tgt2 = self.linear1(tgt2)
        tgt2 = self.activation(tgt2)
        tgt2 = self.dropout(tgt2)
        tgt2 = self.linear2(tgt2)
        tgt = tgt + self.dropout2(tgt2)
        return tgt

class TransformerDecoder(nn.Module):
    def __init__(self, d_model, nhead, num_layers, vocab_size, max_len, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList([
            TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout) for _ in range(num_layers)
        ])
        self.norm = nn.LayerNorm(d_model)
        self.output_layer = nn.Linear(d_model, vocab_size)  # Final output layer
        
        self.d_model = d_model
        self.nhead = nhead
        self.num_layers = num_layers
        self.vocab_size = vocab_size
        self.max_len = max_len
        

    def forward(self, src, tgt_mask=None):
        src = self.embedding(src) * math.sqrt(d_model)
        src = self.pos_encoder(src)
        output = src
        for layer in self.layers:
            output = layer(output, tgt_mask=tgt_mask)
        output = self.norm(output)
        output = self.output_layer(output)
        return output

# Model configuration
d_model = 256  # Dimension of the model
nhead = 2      # Number of heads in multi-head attention
num_layers = 3 # Number of decoder layers
vocab_size = 256  # Size of vocabulary
max_len = 64   # Maximum length of a sequence

# Example usage
model = TransformerDecoder(d_model, nhead, num_layers, vocab_size, max_len)
tokens = torch.randint(0, vocab_size, (max_len, 4))  # (sequence_length, batch_size)
output = model(tokens)
print(output.shape)  # Should output (1, vocab_size), for one token output


torch.Size([64, 4, 256])


In [8]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


In [10]:
import torch.nn.functional as F

# Validation Loop with Temperature Sampling
model.eval()
initial_input = encode('First Citizen:') -1
generated_text = initial_input

with torch.no_grad():
    input_tensor = initial_input
    for _ in tqdm.tqdm(range(max_len - len(initial_input))):
        input_tensor = F.pad(input_tensor, (max_len - len(input_tensor), 0))
        # print(input_tensor)
        output = model(input_tensor)
        predicted_token = torch.multinomial(F.softmax(output[0, -1] / 0.7, dim=-1), 1)
        
        # print(generated_text.shape, predicted_token.shape)
        
        generated_text = torch.cat((generated_text, predicted_token), dim=0)
        input_tensor = generated_text  # Feed the entire sequence
        

    # Display the generated text
    print(decode(generated_text.squeeze() + 1))
    print(len(decode(generated_text.squeeze() + 1)))


100%|██████████| 50/50 [00:20<00:00,  2.38it/s]

First Citizen:c×ç úwÓ)çO? z¡Gcf Ii´y¤Ãm{èõâ­ßvÆRÁö¨pÂ<
64





In [11]:
# Assume model, optimizer, criterion, and encode function are defined.
epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    with tqdm.tqdm(enumerate(sequences), total=len(sequences)) as pbar:
        for i, seq in pbar:
            # Prepare input and target tensors
            seq = encode(seq) -1
            total_loss = torch.tensor(0.)
            
            optimizer.zero_grad()
            for j in range(len(seq) - 1):
                inputs = seq[:j]
                targets = seq[j+1].unsqueeze(0)
                
                inputs = F.pad(inputs, (max_len - len(inputs), 0))
                
                # Forward pass
                # print("inputs", inputs.shape)
                # print("targets", targets.shape)
                
                output = model(inputs)
                # print("output", output.shape, output.requires_grad)
                
                predicted_token = output[0, -1].unsqueeze(0)
                # print("predicted_token", predicted_token.shape)
                
                
                # print(predicted_token.requires_grad, targets.requires_grad)
                loss = criterion(predicted_token.float(), targets)
                total_loss += loss
                
            total_loss.backward()    
            optimizer.step()
            pbar.set_description(f"Loss: {loss.item()}")
                

  0%|          | 0/17428 [00:00<?, ?it/s]

: 