In [1]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torch.functional as F
import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP


# Set to False to skip notebook execution (e.g. for debugging)
warnings.filterwarnings("ignore")
RUN_EXAMPLES = True


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/miniconda3/envs/dl_papers/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/miniconda3/envs/dl_papers/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/miniconda3/envs/dl_papers/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.sta

In [2]:
class GPT(nn.Module):
    
    def __init__(self,decoder,embed,generator):
        """
        GPT model architecture.
        """
        super(GPT,self).__init__()
        self.decoder=decoder
        self.embed=embed
        self.generator=generator
    
    def forward(self,x,mask):
        #x-> [batch_size,seq_len]
        embedded=self.embed(x) #->[batch_size,seq_len,d_model]
        return self.decoder(embedded,mask) # [batch_size,seq_len,d_model]
        

In [3]:
class Generator(nn.Module):
    def __init__(self,d_model,vocab):
        super(Generator,self).__init__()
        self.proj=nn.Linear(in_features=d_model,out_features=vocab)
    
    def forward(self,x):
        #x -> [batch_size,seq_len,d_model]
        projection=self.proj(x) # [batch_size,seq_len,vocab]
        #pred=log_softmax(projection,dim=-1) # [batch_size,seq_len,vocab]
        return projection #since crossentropy expects raw logits

In [4]:
def clones(module,N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

    

In [5]:
class Decoder(nn.Module):
    def __init__(self,layer,N):
        super(Decoder,self).__init__()
        self.layers=clones(layer,N)
        self.norm=LayerNorm(layer.size)  # layer.size -> [d_model]
    
    def forward(self,x,mask):
        for layer in self.layers:
            x=layer(x,mask)
        print("hello from decoder")

        
        return self.norm(x)
        

In [6]:
class LayerNorm(nn.Module):
    #https://tungmphung.com/wp-content/uploads/2020/01/Screenshot-from-2020-01-05-07-00-09.png
    def __init__(self,feature,eps=1e-6): #feature -> [d_model]
        super(LayerNorm,self).__init__()
        self.gamma=nn.Parameter(torch.ones(feature))
        self.beta=nn.Parameter(torch.zeros(feature))
        self.eps=eps
    
    def forward(self,x):
        mean=x.mean(-1,keepdim=True)
        std=x.std(-1,keepdim=True)

        return self.gamma*(x-mean)/(std+self.eps) + self.beta
        

In [7]:
class SubLayerConnection(nn.Module):
    """Residual connection followed by a layer norm """
    def __init__(self,size,dropout):
        super(SubLayerConnection,self).__init__()
        self.norm=LayerNorm(size)
        self.dropout=nn.Dropout(dropout)
    
    def forward(self,x,sublayer):
        # print("hello from sublayer")

        return x + self.dropout(sublayer(self.norm(x)))

In [8]:
class DecoderLayer(nn.Module):
    def __init__(self,size,self_attn,feed_forward,dropout):
        super(DecoderLayer,self).__init__()
        self.self_attn=self_attn
        self.feed_forward=feed_forward
        self.sublayer=clones(SubLayerConnection(size,dropout),2)
        self.size=size
    
    def forward(self,x,mask):
        x=self.sublayer[0](x,lambda x: self.self_attn(x,x,x,mask))
        return self.sublayer[1](x,self.feed_forward)
    

In [9]:
def subsequent_mask(size):
    "Mask out subsequent positions"
    attn_shape=(1,size,size) #[1,seq_len,seq_len]
    subsequent_mask=torch.triu(torch.ones(attn_shape),diagonal=1).type(torch.uint8)

    return subsequent_mask==0 #True->unmasked, False->masked  

In [10]:
def attention(query,key,value,mask=None,dropout=None):
    #query,key,value -> [batch_size,h,seq_len,d_k]
    d_k=query.size(-1)
    scores=torch.matmul(query,key.transpose(-2,-1))/math.sqrt(d_k) #[batch_size,h,seq_len,seq_len]
    if mask is not None:
        mask = mask.unsqueeze(1).expand(-1, 12, -1, -1)
        scores=scores.masked_fill(mask==0,-float('inf')) # wherever mask==0, fill that up with -inf
    p_attn=scores.softmax(dim=-1)
    if dropout is not None:
        p_attn=dropout(p_attn)
    return torch.matmul(p_attn,value),p_attn
    
    

In [11]:
class MultiHeadedAttention(nn.Module):
    def __init__(self,h,d_model,dropout=0.1):
        #h=12,N=12
        super(MultiHeadedAttention,self).__init__()
        assert d_model%h==0
        self.d_k=d_model//h
        self.h=h
        self.linears=clones(nn.Linear(d_model,d_model),4)
        self.attn=None
        self.dropout=nn.Dropout(p=dropout)
    
    def forward(self,query,key,value,mask=None):
        #mask shape-> [batch_size,seq_len,seq_len]
        if mask is not None:
            mask.unsqueeze(1) # [batch_size,1,seq_len,seq_len] # same masking across all attention heads h
        
        nbatches=query.size(0)
        
        query,key,value=[
            lin(x).view(nbatches,-1,self.h,self.d_k).transpose(1,2) for lin,x in zip(self.linears,(query,key,value))
        ]
        x,self_attn=attention(query,key,value,mask,dropout=self.dropout)

        x=(
            x.transpose(1,2).contiguous().view(nbatches,-1,self.h*self.d_k)
        )
        del query
        del key
        del value
        return self.linears[-1](x) #WO * x
                    

In [12]:
#3072
class PositionwiseFeedForward(nn.Module):
    def __init__(self,d_model,d_ff,dropout=0.1):
        #d_ff=3072
        super(PositionwiseFeedForward,self).__init__()
        self.w_1=nn.Linear(d_model,d_ff)
        self.w_2=nn.Linear(d_ff,d_model)
        self.dropout=nn.Dropout(p=dropout)
        self.gelu=nn.GELU()

    def forward(self,x):
        #print("hello from feeforward")
        return self.w_2(self.dropout(self.gelu(self.w_1(x))))
        

In [13]:
class Embeddings(nn.Module):
    def __init__(self,d_model,vocab):
        super(Embeddings,self).__init__()
        self.lut=nn.Embedding(num_embeddings=vocab,embedding_dim=d_model)
        self.d_model=d_model
    
    def forward(self,x):
        return self.lut(x) * math.sqrt(self.d_model)

In [14]:
class PositionalEncoding(nn.Module):
    def __init__(self,d_model,dropout,max_len=5000):
        super(PositionalEncoding,self).__init__()
        self.dropout=nn.Dropout(dropout)

        pe=torch.zeros(max_len,d_model)
        position=torch.arange(0,max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        #print("hello from posen")
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)
    

In [15]:
import numpy,pandas

In [16]:
def make_model(vocab,N=12,d_model=768,d_ff=3072,h=12,dropout=0.1):
    c=copy.deepcopy
    attn=MultiHeadedAttention(h,d_model)
    ff=PositionwiseFeedForward(d_model,d_ff,dropout)
    position=PositionalEncoding(d_model,dropout)
    model=GPT(
        Decoder(DecoderLayer(d_model,c(attn),c(ff),dropout),N),
        nn.Sequential(Embeddings(d_model,vocab),c(position)),
        Generator(d_model,vocab))

    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

In [18]:
def inference_test():
    # Define a GPT-like model with a vocab size of 11, 2 layers
    test_model = make_model(vocab=11, N=2, d_model=512, d_ff=2048, h=8, dropout=0.1)
    test_model.eval()  # Set the model to evaluation mode

    # Initial input sequence (e.g., a prompt or input sentence)
    input_sequence = torch.LongTensor([[1, 2, 3, 4, 5]])
    input_mask = subsequent_mask(input_sequence.size(1)).type_as(input_sequence)

    # Start generating tokens
    generated_sequence = input_sequence.clone()
    for _ in range(5):  # Generate 5 additional tokens
        # Pass the input sequence through the model
        output = test_model(
            generated_sequence,
            subsequent_mask(generated_sequence.size(1)).type_as(generated_sequence),
        )
        # Apply the generator to get probabilities for the next token
        prob = test_model.generator(output[:, -1])
        # Select the most likely next token
        _, next_token = torch.max(prob, dim=1)
        next_token = next_token.item()

        # Append the next token to the generated sequence
        generated_sequence = torch.cat(
            [generated_sequence, torch.tensor([[next_token]]).type_as(input_sequence)],
            dim=1,
        )

    print("Example Untrained Model Prediction:", generated_sequence)

def run_tests():
    print("Running Inference Tests...")
    for _ in range(3):  # Run the test 3 times
        inference_test()

run_tests()

Running Inference Tests...


RuntimeError: The size of tensor a (12) must match the size of tensor b (8) at non-singleton dimension 1

## Batches and masking

In [19]:
class Batch:
    def __init__(self, src, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)

        # Derive tgt and tgt_y from src
        self.tgt = src[:, :-1]  # Input to the decoder (shift right)
        self.tgt_y = src[:, 1:]  # Target for loss computation (shift left)

        # Create tgt_mask
        self.tgt_mask = self.make_std_mask(self.tgt, pad)
        self.ntokens = (self.tgt_y != pad).data.sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        """Creates a mask to avoid attention on padding tokens."""
        tgt_mask = (tgt != pad).unsqueeze(-2)  # Shape: [batch_size, 1, seq_len]
        print(tgt_mask.shape)
        return tgt_mask & subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data)

## Training

In [20]:
class TrainState:
    step:int=0 #steps in the current epoch
    accum_step:int=0 #number of gradient accumulation steps
    samples:int=0 #total #examples used
    tokens:int=0 # total # of tokens processed
    
    

In [29]:
def run_epoch(data_iter,model,loss_compute,optimizer,mode="train",accum_iter=1,train_state=TrainState()):
    """Trains a single epoch"""

    start=time.time()
    total_tokens=0
    total_loss=0
    tokens=0
    n_accum=0
    
    for i,batch in enumerate(data_iter):
        out=model.forward(batch.tgt,batch.tgt_mask)
        loss,loss_node=loss_compute(out,batch.tgt_y,batch.ntokens)
        
        if mode=="train" or mode=="train + log":
            loss_node.backward()
            train_state.step+=1
            train_state.samples += batch.src.shape[0]
            train_state.tokens += batch.ntokens
        
            if i% accum_iter==0:
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)
                n_accum+=1
                train_state.accum_step+=1
        
        total_loss+=loss
        total_tokens+=batch.ntokens
        tokens+=batch.ntokens
        if i%40==0 and (mode=="train" or mode=="train+log"):
            lr=optimizer.param_groups[0]["lr"]
            elapsed=time.time() - start
            print(
                (
                  "Epoch Step: %6d | Accumulation Step: %3d | Loss: %6.2f "
                    + "| Tokens / Sec: %7.1f | Learning Rate: %6.1e"
                )
                % (i, n_accum, loss / batch.ntokens, tokens / elapsed, lr)
            )
            start = time.time()
            tokens = 0
            del loss
        del loss_node
    return total_loss / total_tokens, train_state
            
        
        
        
        
    
    

# A first example

In [30]:
def data_gen(V,batch_size,nbatches):
    """
    Generate random data for a GPT task where src is used for both input and target.
    
    Args:
        V (int): Vocabulary size.
        batch_size (int): Number of samples per batch.
        nbatches (int): Number of batches to generate.
    
    Yields:
        Batch: A batch object constructed with the generated `src`.
    """
    for i in range(nbatches):
        data = torch.randint(1, V, size=(batch_size, 10))
        data[:, 0] = 1
        src=data.requires_grad_(False).clone().detach()
        yield Batch(src,pad=0)

In [31]:
pad_token=0
criterion=nn.CrossEntropyLoss(ignore_index=pad_token)

In [32]:
class SimpleLossCompute:
    def __init__(self,generator,criterion,pad):
        self.generator=generator
        self.criterion=criterion
    
    def __call__(self,x,y,norm):
        logits=self.generator(x)
        loss=self.criterion(
            logits.contiguous().view(-1,logits.size(-1)),
            y.contiguous().view(-1)
        ) / norm
        
        return loss.data*norm,loss

In [33]:
import pandas as pd
data="./data/input.txt"
with open(data,'r') as f:
    text=f.read()

print(text[:200])
text=text[:11153]
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
tokens = tokenizer.encode(text)
tokens= torch.tensor(tokens)
print(tokens) 

seq_len = 32
batch_size = 32


token_batches = []
for i in range(0, len(tokens), seq_len):
    token_batches.append(tokens[i:i+seq_len])

print(type(token_batches))
class TokenDataset(torch.utils.data.Dataset):
    def __init__(self,tokens,seq_len,pad_token=0):
        self.tokens=tokens
        self.seq_len=seq_len
        self.pad_token = pad_token
    
        self.token_batches=[tokens[i:i + seq_len] for i in range(0, len(tokens), seq_len)]
        if len(self.token_batches[-1]) < seq_len:
                padding_length = seq_len - len(self.token_batches[-1])
                padding = torch.tensor([self.pad_token] * padding_length)
                self.token_batches[-1] = torch.tensor(self.token_batches[-1]).tolist() + padding.tolist()

    
    def __len__(self):
        return len(self.token_batches)

    def __getitem__(self,idx):
        return torch.tensor(self.token_batches[idx])
    
seq_len=32
batch_size=64
dataset=TokenDataset(tokens,seq_len)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

len(dataloader)
next(iter(dataloader)).shape
tokenizer.decode(list(next(iter(dataloader))[0]))
tokenizer.n_vocab 

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you
tensor([ 5962, 22307,    25,  ...,  7140,   290,  1907])
<class 'list'>


50257

In [34]:
device="cuda" if torch.cuda.is_available() else "mps"
model=make_model(vocab=tokenizer.n_vocab,N=12,d_model=768,d_ff=3072,h=12,dropout=0.1).to(device)
optimizer=torch.optim.Adam(model.parameters())
loss_compute=SimpleLossCompute(model.generator,criterion,pad_token)

In [35]:
def train_model(model,train_dataloader,optimizer,loss_compute,epochs,accum_iter,device,train_state=TrainState()):
    train_history=[]
    for i, src in enumerate(train_dataloader):
        batch = Batch(src.to(device), pad=0)
        print(f"src: {batch.src.shape}, tgt: {batch.tgt.shape}, tgt_y: {batch.tgt_y.shape}, tgt_mask: {batch.tgt_mask.shape}")
        break
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs} - Training...")
        model.train()
        train_loss,train_state=run_epoch((Batch(batch.to(device), pad=0) for batch in train_dataloader),
                                         model,loss_compute,optimizer=optimizer,mode="train",accum_iter=accum_iter,train_state=train_state
                                         
                                         )
        train_history.append(train_loss)
        print(f"Epoch {epoch + 1}: Train Loss: {train_loss:.4f}")
    
    return train_history

In [36]:
train_model(model,dataloader,optimizer,loss_compute,epochs=1,accum_iter=2,device=device)

torch.Size([64, 1, 31])
src: torch.Size([64, 32]), tgt: torch.Size([64, 31]), tgt_y: torch.Size([64, 31]), tgt_mask: torch.Size([64, 31, 31])
Epoch 1/1 - Training...
torch.Size([64, 1, 31])
hello from decoder
Epoch Step:      0 | Accumulation Step:   1 | Loss:   0.01 | Tokens / Sec:     3.2 | Learning Rate: 1.0e-03
torch.Size([36, 1, 31])


KeyboardInterrupt: 

In [72]:
print(f"Model expects tgt of shape: {model.decoder.embed.shape}, tgt_mask of shape: {model.decoder.mask_shape}")

AttributeError: 'Decoder' object has no attribute 'embed'