# Chapter 5: Training an LLM

These are my a-priori guesses of how we're going to do it:
1. Prepare data: create data loaders. Must define a batch size (for training), and a max sequence length. All text larger than max sequence length will be truncated.
2. Define an evaluation function and a loss function.
3. Train the model.

In [1]:
import torch
from previous_chapters import GPTModel

In [2]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False,
}

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [3]:
import tiktoken
from previous_chapters import generate_text_simple

def text_to_tokens_ids(text: str, tokenizer: tiktoken.Encoding) -> torch.Tensor:
    # First we create a list
    encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    # Then create a tensor. Why? -> to add a "batch" dimension
    # Unsqueeze converts n tensor into n+1 by adding another dimension. 
    # need to specify axis 
    # Same information but indices to access them are different
    tensor = torch.tensor(encoded).unsqueeze(dim=0)
    return tensor


def token_ids_to_text(ids: torch.Tensor, tokenizer: tiktoken.Encoding) -> str:
  tokens = ids.squeeze(0).tolist()
  text = tokenizer.decode(tokens)
  return text

In [4]:
start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
  model=model,
  idx=text_to_tokens_ids(start_context, tokenizer),
  max_new_tokens = 10,
  context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you rentingetic wasnم refres RexMeCHicular stren


## Defining loss functions

In [62]:
inputs = torch.tensor([[16833, 3626, 6100], [40, 1107, 588]])
targets = torch.tensor([[3626, 6100, 345], [588, 428, 11311]])

In [91]:
# Feeding input to model returns **logits** vector. This is the vector we
# softmax, then argmax to get a prediction.

# Logits are very large, as we must assign a probability for every
# value in the dictionary to every next token!

with torch.no_grad():
    logits = model(inputs) # Output should be (batch, tokens, vocab_size)
    
probas = torch.softmax(logits, dim=-1)

In [97]:
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
print("Token IDs:\n", token_ids)

Token IDs:
 tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])


In [98]:
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Targets batch 1:  effort moves you
Outputs batch 1:  Armed heNetflix


In [99]:
text_idx = 0
target_probas_1 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 1:", target_probas_1)

text_idx = 1
target_probas_2 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 2:", target_probas_2)

Text 1: tensor([7.4541e-05, 3.1061e-05, 1.1563e-05])
Text 2: tensor([3.9836e-05, 1.6783e-05, 4.7559e-06])


In [100]:
# Easier to maximize logarithm than loss function itself
log_probas = torch.log(torch.cat((target_probas_1, target_probas_2)))
print(log_probas)

tensor([ -9.5042, -10.3796, -11.3677, -10.1308, -10.9951, -12.2561])


In [101]:
avg_log_probas = torch.mean(log_probas)
print(avg_log_probas)

tensor(-10.7722)


In [102]:
# This is the numer we need to maximize by updating the model's weights.
# The largest possible value of log is 0 (because prob is defined between 0 and 1 and log(1) == 0)

In [103]:
neg_avg_log_probas = -1 * avg_log_probas
print(neg_avg_log_probas)

tensor(10.7722)


In [104]:
# To compute the cross entropy, we want to "flatten" both tensors by combining over the "batch" dimension
# Effectively, we're stitching together all batches into a single vector. Losing the batch dimension.

In [105]:
logits_flat = logits.flatten(0,1)
targets_flat = targets.flatten()

In [106]:
loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)

In [107]:
loss

tensor(10.7722)

In [109]:
perplexity = torch.exp(loss)
print(perplexity)

tensor(47678.8633)


In [None]:
# Perplexity is more interpretable: number of tokens the model is usure about.