# Building a GPT
- Most of this code was adapted from Andrej Karpathy's Video here.
- https://youtu.be/kCc8FmEb1nY?si=TI-PuYCwQKAc4_6s
- I took some ideas from here as well: 
- https://github.com/Infatoshi/fcc-intro-to-llms/blob/main/gpt-v1.ipynb


In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import math

The dataset here are two text files I made myself by scraping data and cleaning it up from these two sources:
- https://huggingface.co/datasets/LimYeri/LeetCode_Python_Solutions_v2
- https://huggingface.co/datasets/Thermostatic/texts_parallel_corpus_europarl_english_spanish
- The training set has around 80% of the combined data from here, and the validation set has around 20% 

# Process the text 
- get all lines from the files and store them
- get all unique characters and store them for our vocabulary

In [2]:
# read it in to inspect it
with open('cleaned_train.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    

In [3]:
with open('cleaned_dev.txt', 'r', encoding='utf-8') as f:
    textdev = f.read()

In [4]:
# validate dataset
print("length of dataset in characters: ", len(text))

length of dataset in characters:  576826410


In [5]:
# all unique characters in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


	
 !"#$%&'()*+,-./0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ÁÉÍÑÓÚÜáéíñóúü
111


# Basic Character Level Encoding and Decoding System
- each character is mapped to a certain integer
- Encoder - takes a string and encodes it to a list of integers
- Decoder - takes a list of integers and outputs a string

In [6]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("hii there"))
print(decode(encode("hii there")))

[74, 75, 75, 3, 86, 74, 71, 84, 71]
hii there


In [7]:
train_data = torch.tensor(encode(text), dtype=torch.long)
val_data = torch.tensor(encode(textdev), dtype=torch.long)

In [8]:
# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 128 # what is the maximum context length for predictions?
max_iters = 10000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.0

In [9]:
# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


10.775151 M parameters
step 0: train loss 4.8850, val loss 4.8841
step 100: train loss 2.4763, val loss 2.4793
step 200: train loss 2.3923, val loss 2.3884
step 300: train loss 2.1575, val loss 2.1596
step 400: train loss 1.9589, val loss 1.9546
step 500: train loss 1.7862, val loss 1.7843
step 600: train loss 1.6500, val loss 1.6524
step 700: train loss 1.5540, val loss 1.5515
step 800: train loss 1.4904, val loss 1.4863
step 900: train loss 1.4213, val loss 1.4204
step 1000: train loss 1.3843, val loss 1.3808
step 1100: train loss 1.3540, val loss 1.3530
step 1200: train loss 1.3205, val loss 1.3094
step 1300: train loss 1.2946, val loss 1.2943
step 1400: train loss 1.2778, val loss 1.2760
step 1500: train loss 1.2604, val loss 1.2564
step 1600: train loss 1.2471, val loss 1.2402
step 1700: train loss 1.2297, val loss 1.2253
step 1800: train loss 1.2221, val loss 1.2224
step 1900: train loss 1.2047, val loss 1.2060
step 2000: train loss 1.1938, val loss 1.2001
step 2100: train loss 1

# Outputs
- Generate the Text from training by using a prompt, which is basically a starting point

In [10]:
prompt = 'def cheapestJump(coins, maxJump):'
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context.unsqueeze(0), max_new_tokens=700)[0].tolist())
print(generated_chars)

def cheapestJump(coins, maxJump):

                         # 1->2-0 + 60 + 4 + 2.



**Example 4:**



**Input:** nums = \[12,25,35,14,1\]

**Output:** 19952

**Explanation:**



**Constraints:**



*   `2 <= words.length <= 3000`

*   `1 <= c2 <= x <= 105`

*   `1 <= neights[i].length`

*   `1 <= word.length, word is starting in `n`.



Return false.eight, each letter in the summata in countary area.

*   These controvers are getting amounts from the subtree prevailing people, as the same defenders' pates, its nothings alongside the its occupation to greater. Accepts a new conditions and the came between Member States and today of the use of administrative-radition rules are to be collected in viability, with which I thin


# Analysis of outputs:
- The output was mostly just incoherent. It reflects some sort of imitation of LeetCode-style formatting but with no logic, reasoning or consistent vocabulary. The model seems to string together words it sees together in training, however, with no coherence or semantic understanding. 
- However, this behavior was completely expected, and the output is not terrible considering the size of the dataset, time for training, and the limited computation power thrown at it. 


# Calculate Perplexity
- used to evaluate how well a language model can predict a sequence of text
- Lower perplexity score means that the model performs better, eg has higher probabilities for correct predictions
- Perplexity is computed as the exponential of the average negative log-likelihood (NLL)(which is the same as loss):


**Perplexity = exp(−1/N * Σ log P(xᵢ))**


Where:
- **N** = total number of tokens
- **P(xᵢ)** = predicted probability of token *xᵢ*

In [None]:
# sources used for this section
# https://discuss.huggingface.co/t/guide-the-best-way-to-calculate-the-perplexity-of-fixed-length-models/193/2
# https://stackoverflow.com/questions/59209086/calculate-perplexity-in-pytorch
# https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html
# https://en.wikipedia.org/wiki/Perplexity
# https://huggingface.co/docs/transformers/perplexity
# evaluate model
model.eval()
total_loss = 0.0
total_tokens = 0
perp_dev = textdev[:100000]
# disable gradients
with torch.no_grad():
    # loop over the dev set and each step goes over the block size
    for i in range(0, len(perp_dev) - block_size, block_size):
        # get characteres, and characters + 1 from the dev set
        current = perp_dev[i:i+block_size]
        to_predict = perp_dev[i+1:i+block_size+1]

        # get indices unsqueeze = 0 to make the inputs the appropriate shape(1, sequence length). 
        x = torch.tensor([stoi[c] for c in current], dtype=torch.long).unsqueeze(0).to(device)
        y = torch.tensor([stoi[c] for c in to_predict], dtype=torch.long).unsqueeze(0).to(device)

        # get predictions
        logits, _ = model(x)

        # compute total loss.
        # loss(N C). N is (batchsize * seq length) and C is vocabulary size
        # reduction = sum here because we are trying to get average loss for each token, not average loss over a certain block size
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1), reduction='sum')

        # totals 
        total_loss += loss
        total_tokens += len(y.view(-1))

# compute average loss and perplexity
avg_loss = total_loss / total_tokens
perplexity = math.exp(avg_loss)

print(f"Perplexity: {perplexity}")




Perplexity: 2.5318950954772936


# Analysis of perplexity
At 2.5, it implies that the model is on average unsure between 2.5 possible next tokens at each step. While far from ideal, this level of uncertainty is actually reasonable for a character-level model trained from scratch on a small dataset. It suggests that the model did manage to pick up on some structural or statistical patterns in the training data, even if it wasn’t able to generate meaningful or correct solutions. Overall, the results align with expectations: the model can mimic surface patterns but lacks the depth or context to produce functional output.


# BLEU - (bilingual evaluation understudy)
- get outputs from the model and compare them to the outputs from the dataset
- outputs betwen 0 and 1, higher the better
- Because this model was trained on a combined text corpus, and generates a character at a time, without any understanding of semantics, calculating a BLEU score is not meaningful.
- In short, this model is incapable of handling any tranlsation tasks, so BLEU scores don't matter here