In [2]:
import torch
from llm.components import MultiHeadAttention, GPTModel
from torch import nn
import tiktoken
import os

from torch.utils.data import DataLoader, Dataset


In [3]:
GPT_CONFIG = { 
    'vocab_size': 50257,
    'context_length': 256,
    'embed_dim': 768,
    'n_heads': 12,
    'n_layers': 12,
    'drop_rate': 0.1,
    'qkv_bias': False,
    'eos_id': "<eos>"
}

gpt = GPTModel(GPT_CONFIG)

In [4]:
tokenizer = tiktoken.get_encoding('gpt2')

In [5]:
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    return torch.tensor(encoded).unsqueeze(0)

def token_ids_to_text(ids, tokenizer):
    flat = ids.squeeze(0) #remove batch
    return tokenizer.decode(flat.tolist())

In [6]:
inputs = ["every effort moves", "I really like"]
targets = ["effort moves you", "really like chocolate"]
ids = []
for input in inputs:
    ids.append(text_to_token_ids(input, tokenizer))

ids = torch.vstack(ids)
print('ids = ', ids.shape)

preds = gpt.generate_text_simple(ids, 10, 256)
print('preds = ', preds.shape)

decoded = []

for p in preds:
    decoded.append(token_ids_to_text(p, tokenizer))

print(decoded)


ids =  torch.Size([2, 3])
preds =  torch.Size([2, 13])
['every effort moves� apostialsprotein OliveDurunci 153 Abbey gateway', 'I really like Wales Investments partake vodka strawberry sorce champion 153sight Borg']


In [7]:
token_ids = torch.argmax(torch.rand(8,3,10), dim = -1, keepdim=True)
print(token_ids.shape)

torch.Size([8, 3, 1])


# Loading the verdict.txt

In [8]:
filepath = 'verdict.txt'

with open(filepath, 'r', encoding='utf8') as file:
    textdata = file.read()

print(textdata[:100])
print('len = ', len(textdata))
tokens = tokenizer.encode(textdata)

print('token count = ', len(tokens))

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g
len =  20479
token count =  5145


# Import code from chapt 2

In [9]:
from llm.dataset import create_dataloader, GPTDatasetV1 
from torch.utils.data import random_split

In [10]:
context_len = GPT_CONFIG['context_length']
stride = context_len

train_ratio = 0.9
train_data = int(len(textdata) * train_ratio)

train_loader, _, _ = create_dataloader(
    textdata[:train_data], 
    batch_size = 2,
    max_len = context_len,
    stride=stride,
)

val_loader,_,_ = create_dataloader(
    textdata[train_data:], 
    batch_size = 2,
    max_len = context_len,
    stride=stride,
    drop_last=False,
    shuffle=False
)

print('len train loader = ', len(train_loader))
print('len val loader = ', len(val_loader))


len train loader =  9
len val loader =  1


In [11]:
#double check the loader 

for i, (x,y) in enumerate(train_loader):
    if i == 3: break
    print(x.shape, ' ', y.shape)

torch.Size([2, 256])   torch.Size([2, 256])
torch.Size([2, 256])   torch.Size([2, 256])
torch.Size([2, 256])   torch.Size([2, 256])


In [12]:
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)

    logits = model(input_batch)

    loss = torch.nn.functional.cross_entropy(
        logits.flatten(0,1), #flatten input, (2,3,10000) --> (6,10000)
        target_batch.flatten() #target = (2,3) --> (6)
    )

    return loss

def calc_loss_loader(dataloader, model, device, num_batches = None):
    #NOTE: num_batches is a limit of evaluation batches
    # to speed up if needed

    total_loss = 0.

    if len(dataloader) == 0:
        return float('nan')
    
    elif num_batches is None:
        num_batches = len(dataloader)

    else:
        num_batches = min(num_batches, len(dataloader))
    
    for i, (b_input, b_target) in enumerate(dataloader):
        if i < num_batches:
            loss = calc_loss_batch(b_input,b_target,model,device)
            total_loss += loss.item()
        else:
            break
    
    return total_loss / num_batches

def eval_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    train_loss = calc_loss_loader(train_loader, model, device, num_batches= eval_iter)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches= eval_iter)
    model.train()

    return train_loss, val_loss
    



In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GPTModel(GPT_CONFIG)

In [14]:
model.to(device)

GPTModel(
  (token_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (attn): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, ou

In [15]:
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)

print('training loss = ', train_loss)
print('val loss = ', val_loss)


training loss =  11.016315142313639
val loss =  11.021211624145508


# Training loop

In [16]:
a = torch.randint(1,10000,(2,3))
print(a.numel())

6


In [23]:
def train(
    model: nn.Module, 
    train_loader: DataLoader, 
    val_loader: DataLoader, optimizer, 
    device: torch.device, 
    num_epochs, 
    eval_freq,
    eval_iter, 
    start_context: str, 
    tokenizer)->None:

    train_losses, val_losses, track_tokens_seen = [], [], []

    tokens_seen, global_steps = 0, -1 #NOTE: why need a token_seen?

    for epoch in range(num_epochs):
        model.train()

        for x,y  in train_loader:
            optimizer.zero_grad()

            loss = calc_loss_batch(x, y, model, device)

            loss.backward()
            optimizer.step()

            tokens_seen += x.numel()
            global_steps += 1

            if global_steps % eval_freq == 0: 
                train_loss, val_loss = eval_model(model, train_loader, val_loader, device, eval_iter)

                train_losses.append(train_loss)
                val_losses.append(val_loss)

                track_tokens_seen.append(tokens_seen)

                print(f"ep {epoch},\ntrain_loss={train_loss:.3f}\nval_loss={val_loss:.3f}")
        # generate a sample every epoch
        generate_and_print_sample(model, tokenizer, device, start_context)
        break
    
    return train_losses, val_losses, tokens_seen

def generate_and_print_sample(model: GPTModel, tokenizer, device, start_context) -> None:

    model.eval()

    context_size = model.pos_emb.weight.shape[0]

    encoded = text_to_token_ids(start_context, tokenizer).to(device)

    with torch.no_grad():

        token_ids = model.generate(encoded, max_new_tokens=25, context_size=context_size, top_k = 50, temperature=1.5)
        print("token ids = ", token_ids)

        decoded_text = token_ids_to_text(token_ids, tokenizer)

        print('decoded text = ', decoded_text[:100])
    
    model.train()

In [18]:
torch.manual_seed(123)

model = GPTModel(GPT_CONFIG)

model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr = 0.0004, weight_decay=0.1)

num_epochs = 10

train_losses, val_losses, tokens_seen = train(
    model, train_loader, val_loader, 
    optimizer, device, num_epochs = 10, 
    eval_freq = 10, eval_iter = 1, 
    start_context = "Hello world, this is the first time", tokenizer = tokenizer)



ep 0,
train_loss=10.026
val_loss=9.952
token ids =  tensor([[15496,   995,    11,   428,   318,   262,   717,   640,   286,   286,
           438,   438,   257,    13,    11,   262,   314,   314,   438,   286,
           257,   314,   550,   314,   314,   373,    11,   286,   262,   438,
           257,   286,   290,   438,   290,   257,   262,   286,    13,   198,
            13,   198,   550,   314,   198,   198,   550,   438,   290,    13,
            13,   257,   262,    13,   198,   198,   198,     1]])
decoded text =  Hello world, this is the first time of of---- a., the I I-- of a I had I I was, of the-- a of and-- 


# Decoding strategies

1. temperature scaling
2. top-k sampling

# Loading OpenAI GPT2 weights

In [19]:
# Loaded gpt weights from https://github.com/rasbt/LLMs-from-scratch/tree/main/ch05/02_alternative_weight_loading

In [20]:
model_path = os.path.join(os.getcwd(), 'output', 'gpt-small.torch')

gpt = torch.load(model_path)

In [24]:
generate_and_print_sample(model, tokenizer, device, "Every effort moves you")

token ids =  tensor([[6109, 3626, 6100,  345,   13,    1,  286,   11,  339,  339,  438,  526,
          286,  366,  373,  502,   11,   12,  339,   12,  338,  290,  438,  262,
          470,   13,   13,  286,  338]])
decoded text =  Every effort moves you." of, he he--." of " was me,- he-'s and-- the't.. of's
