**EE4685 Assignment 2: Building a miniGPT** by Josephine King and Alec Daalman

**References:**
- "Let's build GPT: from scratch, in code, spelled out." Youtube tutorial by Andrej Karpathy. https://www.youtube.com/watch?v=kCc8FmEb1nY
- HuggingFace Tokenizer developer guides. https://huggingface.co/docs/transformers/en/notebooks


In [241]:
# Import packages
import os
from tqdm.notebook import tqdm
import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.utils.data as data
from tokenizers import Tokenizer, pre_tokenizers
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Setup
torch.manual_seed(6250513)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
CHECKPOINT_PATH = "./saved_models/"
device = torch.device("cpu") if not torch.cuda.is_available() else torch.device("cuda:0")
print("Using device", device)

# Initialize model parameters
TRAIN_PCT = 0.8
BLOCK_SIZE = 8
BATCH_SIZE = 64
MAX_ITER = 3000
VOCAB_SIZE = 3000
EMBD_DIM = 32
LR = 2.5e-4

# Download the TinyShakespeare dataset
!wget -O tinyshakespeare.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('tinyshakespeare.txt', 'r', encoding='utf-8') as f: raw_data = f.read()


Using device cpu
--2025-03-19 17:36:34--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘tinyshakespeare.txt’


2025-03-19 17:36:34 (26.4 MB/s) - ‘tinyshakespeare.txt’ saved [1115394/1115394]



**Data Preprocessing**

Create a custom tokenizer using the HuggingFace Tokenizer package. Then encode the data, convert it into a PyTorch tensor, and split it up into validation data and training data.

In [268]:
# Create the tokenizer 
tokenizer = Tokenizer(BPE(unk_token="[UNK]")) 
#tokenizer.pre_tokenizer = pre_tokenizers.Sequence([pre_tokenizers.Punctuation("isolated"), pre_tokenizers.Split("\n", "isolated"), pre_tokenizers.Split(" ", "isolated")])
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([pre_tokenizers.Punctuation("isolated"), pre_tokenizers.Whitespace()])
trainer = BpeTrainer(vocab_size=VOCAB_SIZE)
tokenizer.train(["tinyshakespeare.txt"], trainer)
tokenizer.save("tinyshakespeare_tokenizer.json")

# Tokenize the data
tokenizer = Tokenizer.from_file("tinyshakespeare_tokenizer.json")
tokenized_data = tokenizer.encode(raw_data).ids
# Convert into a pytorch tensor
tensor_data = torch.tensor(tokenized_data, dtype=torch.long)

# Split into training and validation sets
train_end = int(len(tensor_data)*TRAIN_PCT)
training_data = tensor_data[:train_end]
validation_data = tensor_data[train_end:]






**Basic Untrained Bigram Language Model**

Create a basic Bigram Language model from Karpathy's tutorial (copied directly). To use this model, we need the function get_batch, which returns a batch from the dataset. Using this untrained model, generate some text and see what we get.

In [269]:
def get_batch(data, batch_size, block_size):
    # Choose batch_size random starting points
    block_starts = torch.randint(0, len(data) - block_size, (batch_size,))
    # Get the inputs and outputs for the chosen blocks, stack them into tensors
    batch_inputs = torch.stack([data[start: start + block_size] for start in block_starts])
    batch_outputs = torch.stack([data[start + 1: start + block_size + 1] for start in block_starts])
    return batch_inputs, batch_outputs

# Copied from Karpathy's tutorial
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    # Generate num_gen_tokens more tokens given the current tokens in curr_tokens
    def generate(self, curr_tokens, num_gen_tokens):
        for _ in range(num_gen_tokens):
            # Get the predictions for the next tokens 
            preds, loss = self.forward(curr_tokens)
            # Look only at the last time step
            preds = preds[:, -1, :] # becomes (B, C)
            # Normalize probabilities from 0 to 1 using softmax
            probs = F.softmax(preds, dim=-1) # (B, C)
            # Get the next token by sampling from the probability distribution
            next_token = torch.multinomial(probs, num_samples=1) # (B, 1)
            # Add the new token to the current tokens
            curr_tokens = torch.cat((curr_tokens, next_token), dim=1) # (B, T+1)
        return curr_tokens

# Create the model and generate some text 
m = BigramLanguageModel(VOCAB_SIZE)
starting_text = "Romeo Romeo wherefore art thou Romeo"
starting_tokens = tokenizer.encode(starting_text).ids
starting_tokens = torch.tensor(starting_tokens, dtype=torch.long).reshape(-1,1)
print(tokenizer.decode(m.generate(curr_tokens = starting_tokens, num_gen_tokens=100)[0].tolist()))


Romeo PETER perceive remain satis aven ark quite thyself cc ance Cl st wilt yourself oath ude shi MAR mistress Under Claudio ition sue hath Lu Can PR ISAB three UC He Servingman Second GRUMIO lip bo mer sed dog honest Hortensio ured tong but hich move du used thither KE self ken double ey sub concl ian DI hearing madam CAPUL hus wrong WARWIC jo vain Z whose ONTES forth straight avo gold Un Li ! hemi EL His : wise Juli morrow sever cept counter ap age ind arewell MONTAGUE ged noon LUC Me cousin mber villain reck tide


**Train the Bigram Language Model**

Create a function train_model, which takes in training data, a model, and an optimizer and trains the model. Copied/modified from the EE4685 optimization exercise.

In [261]:
# These functions are all copied/modified from the optimization exercise 
def _get_config_file(model_path, model_name):
    return os.path.join(model_path, model_name + ".config")

def _get_model_file(model_path, model_name):
    return os.path.join(model_path, model_name + ".tar")

def _get_result_file(model_path, model_name):
    return os.path.join(model_path, model_name + "_results.json")

def save_model(model, model_path, model_name):
    config_dict = model.config
    os.makedirs(model_path, exist_ok=True)
    config_file, model_file = _get_config_file(model_path, model_name), _get_model_file(model_path, model_name)
    with open(config_file, "w") as f:
        json.dump(config_dict, f)
    torch.save(model.state_dict(), model_file)

def train_model(train_set, model, model_name, optimizer, max_iter=1000, batch_size=256, block_size=32, overwrite=False, save_model=False):
    """
    Train a model on the training set of FashionMNIST

    Inputs:
        train_set - Training dataset
        model - Object of BaseNetwork
        model_name - (str) Name of the model, used for creating the checkpoint names
        max_iter - Number of iterations we want to (maximally) train for
        patience - If the performance on the validation set has not improved for #patience epochs, we stop training early
        batch_size - Size of batches used in training
        overwrite - Determines how to handle the case when there already exists a checkpoint. If True, it will be overwritten. Otherwise, we skip training.
    """
    file_exists = os.path.isfile(_get_model_file(CHECKPOINT_PATH, model_name))
    if file_exists and not overwrite:
        print(f"Model file of \"{model_name}\" already exists. Skipping training...")
        with open(_get_result_file(CHECKPOINT_PATH, model_name), "r") as f:
            results = json.load(f)
    else:
        if file_exists:
            print("Model file exists, but will be overwritten...")

        ############
        # Training #
        ############
        model.train()
        for iter in range(max_iter):
            inputs, outputs = get_batch(train_set, batch_size, block_size)
            inputs, outputs = inputs.to(device), outputs.to(device)
            optimizer.zero_grad()
            preds,loss = model(inputs, outputs)
            if iter % 500 == 0 or iter == max_iter - 1:
                print(f"iter {iter}: loss = {loss}")
            loss.backward()
            optimizer.step()

        if (save_model):
            save_model(model, CHECKPOINT_PATH, model_name)

    return model


Train the model and print out the loss values to see how more iterations improve the model.

Generate some text from the trained model and see how it compares to the untrained model.

In [None]:
# GPT architecture 
class GPT(nn.Module):

    def __init__(self, vocab_size, block_size, embd_dim, decoders):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, embd_dim)
        self.position_embedding_table = nn.Embedding(block_size, embd_dim)

        self.transformer_blocks = nn.TransformerDecoder(nn.TransformerDecoderLayer(d_model=embd_dim, nhead=12, dim_feedforward=3072, dropout=0.1), decoders, norm=None)
        self.linear_layer = nn.Linear(embd_dim, vocab_size)
        self.block_size = block_size

    def forward(self, idx, targets=None):
        # idx and targets are both (B,T) tensor of integers
        B, T = idx.shape
        logits = self.token_embedding_table(idx) # (B,T,C)
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        inputs = tok_emb + pos_emb # (B,T,C)
        inputs = self.transformer_blocks(inputs, memory=torch.zeros_like(inputs))
        logits = self.linear_layer(inputs)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    # Generate num_gen_tokens more tokens given the current tokens in curr_tokens
    def generate(self, curr_tokens, num_gen_tokens):
        for _ in range(num_gen_tokens):
            curr_tokens_cond = curr_tokens[:, -self.block_size:]
            # Get the predictions for the next tokens 
            preds, loss = self.forward(curr_tokens_cond)
            # Look only at the last time step
            preds = preds[:, -1, :] # becomes (B, C)
            # Normalize probabilities from 0 to 1 using softmax
            probs = F.softmax(preds, dim=-1) # (B, C)
            # Get the next token by sampling from the probability distribution
            next_token = torch.multinomial(probs, num_samples=1) # (B, 1)
            # Add the new token to the current tokens
            curr_tokens = torch.cat((curr_tokens, next_token), dim=1) # (B, T+1)
        return curr_tokens

In [None]:
TRAIN_PCT = 0.8
BLOCK_SIZE = 64
BATCH_SIZE = 32
MAX_ITER = 6000
VOCAB_SIZE = 3000
EMBD_DIM = 192
LR = 2.5e-5

In [None]:
bigram_model = BigramLanguageModel(VOCAB_SIZE).to(device)
optimizer = torch.optim.AdamW(bigram_model.parameters(), lr=LR)
bigram_model = train_model(training_data.to(device), bigram_model, "bigram_model", 
                           optimizer, max_iter=MAX_ITER, batch_size=BATCH_SIZE, block_size=BLOCK_SIZE)

iter 0: loss = 8.56875228881836
iter 500: loss = 8.349356651306152
iter 999: loss = 8.337937355041504


In [None]:
gpt_model = GPT(VOCAB_SIZE, BLOCK_SIZE, EMBD_DIM, 12).to(device)
optimizer = torch.optim.AdamW(gpt_model.parameters(), lr=LR)
gpt_model = train_model(training_data.to(device), gpt_model, "gpt_model", 
                        optimizer, max_iter=MAX_ITER, batch_size=BATCH_SIZE, block_size=BLOCK_SIZE)

iter 0: loss = 8.139966011047363
iter 500: loss = 5.585456848144531
iter 999: loss = 5.170231342315674


In [None]:
starting_text = "O Romeo, Romeo, wherefore art thou Romeo?"
starting_tokens = tokenizer.encode(starting_text).ids
starting_tokens = torch.tensor(starting_tokens, dtype=torch.long).reshape(-1,1)

print("-------------------------------------")
print("Bigram model")
print("-------------------------------------")
print(tokenizer.decode(bigram_model.generate(curr_tokens = starting_tokens.to(device), num_gen_tokens = 100)[0].tolist()))

print("-------------------------------------")
print("GPT model")
print("-------------------------------------")
print(tokenizer.decode(gpt_model.generate(curr_tokens = starting_tokens.to(device), num_gen_tokens = 100)[0].tolist()))

-------------------------------------
Bigram model
-------------------------------------
O ENIUS viol brow BY vo unk flatter three weary G small doo believe knees curse j They mus when KE long servant sirrah ri F fra ser twenty reat wol faith Gre sl changed ering plo dishonour children gro Within banished er wn enty deserved ld idle cl Pa BAST offence Me uct After ghost ach fan ens count Ser wain asure ves proceed COR thousand This sm ople majesty try lest mo ting After ISABELLA fault gain enemy unes case VI remain ERD DU UTUS GLOUCESTER Hold Even CORIOLAN trust leep Paulina twixt sacred seiz bly great pati lia
-------------------------------------
GPT model
-------------------------------------
O kill Lord : You ' d bit , lords Till Des ue of The city and dead and Buckingham ity ? I only it not : Ay , follow t : Ay an ings d and thy ear forth peace or ange T ON ? ISABELLA : Say , that ' vi beggar for my s sto enemies , because you fra . BRUTUS when em ring , none ' ll Margaret , slain