# Chapter 5: Training an LLM

These are my a priori guesses about how we're going to do it:
1. Prepare data: create data loaders. We must define a batch size (for training) and a maximum sequence length. Any text longer than the maximum sequence length will be truncated.
2. Define an evaluation function and a loss function.
3. Train the model.

In [1]:
import torch
from previous_chapters import GPTModel

In [36]:
# Hyperparameters passed to GPTModel when the model is built below.
GPT_CONFIG_124M = {
    "vocab_size": 50257,  # rows in the token-embedding table
    "context_length": 256,  # rows in the positional-embedding table
    "emb_dim": 768,  # width of the token and positional embeddings
    "n_heads": 12,  # presumably attention heads per block -- confirm in GPTModel
    "n_layers": 12,  # presumably number of transformer blocks -- confirm in GPTModel
    "drop_rate": 0.1,  # probability used by the Dropout layers
    "qkv_bias": False,  # no bias term on the query/key/value projections
}

# Seed the RNG before building the model so its random initial weights are
# reproducible from run to run.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# Put the model in evaluation mode (dropout is inactive). As the last
# expression in the cell, this also displays the module structure.
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [37]:
import tiktoken
from previous_chapters import generate_text_simple

def text_to_tokens_ids(text: str, tokenizer: tiktoken.Encoding) -> torch.Tensor:
    """Encode ``text`` into a batch containing one sequence of token IDs.

    Args:
        text: The text to tokenize. The literal ``<|endoftext|>`` marker is
            explicitly allowed in the input.
        tokenizer: Tokenizer whose ``encode`` method returns a list of
            integer token IDs.

    Returns:
        A ``torch.long`` tensor of shape ``(1, num_tokens)``. Empty text
        yields shape ``(1, 0)``.
    """
    token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    # Pin the dtype: for an empty list torch.tensor() cannot infer an integer
    # type and falls back to the default float dtype, which is not usable as
    # token indices.
    id_tensor = torch.tensor(token_ids, dtype=torch.long)
    # unsqueeze(0) prepends a batch dimension: (num_tokens,) -> (1, num_tokens).
    # The data is unchanged; only the indexing gains a leading axis.
    return id_tensor.unsqueeze(0)


def token_ids_to_text(ids: torch.Tensor, tokenizer: tiktoken.Encoding) -> str:
    """Decode a tensor of token IDs back into a string.

    Args:
        ids: Token IDs. A leading dimension of size 1 (the batch axis added
            during encoding) is dropped before decoding.
        tokenizer: Tokenizer whose ``decode`` method turns a list of IDs
            into text.

    Returns:
        The decoded text.
    """
    # squeeze(0) only removes the first axis when its size is 1, so an
    # already-flat sequence of several IDs passes through unchanged.
    flat_ids = ids.squeeze(0)
    return tokenizer.decode(flat_ids.tolist())

In [39]:
tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Every effort moves you"

# Encode the prompt into a (1, num_tokens) batch before handing it to the
# generation helper.
encoded_prompt = text_to_tokens_ids(start_context, tokenizer)

# Ask for 10 new tokens. The context size is taken from the model config so
# the two cannot drift apart.
token_ids = generate_text_simple(
    model=model,
    idx=encoded_prompt,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)

# Decode the generated IDs back to text for inspection.
generated_text = token_ids_to_text(token_ids, tokenizer)
print("Output text:\n", generated_text)

Output text:
 Every effort moves you rentingetic wasnم refres RexMeCHicular stren


## Defining loss functions

In [40]:
# A toy batch: two sequences of three token IDs each.
inputs = torch.tensor([[16833, 3626, 6100], [40, 1107, 588]])
# Targets are the inputs shifted one position to the left, so targets[b, i]
# is the token that should follow inputs[b, i]. Only the last ID in each row
# is new; the rest must repeat inputs[b, 1:].
targets = torch.tensor([[3626, 6100, 345], [1107, 588, 11311]])

In [None]:
# Feeding the inputs to the model returns a **logits** tensor: raw,
# unnormalised scores. Applying softmax turns them into probabilities, and
# argmax then picks the predicted token.

# The logits tensor is large because every input position needs a score for
# every entry in the vocabulary (50257 entries with this config).
# Expected shape: (2, 3, 50257) = (batch, tokens, vocab) -- TODO confirm
# against GPTModel.

# Nothing is being trained here, so skip gradient tracking.
with torch.no_grad():
    logits = model(inputs)