# 5. Pretraining on unlabeled data

In [3]:
import torch
import tiktoken
from previous_chapter import GPTModel, generate_text_simple

# Hyperparameters handed to GPTModel below (presumably the ~124M-parameter
# GPT-2 layout, going by the constant's name).
GPT_CONFIG_124M = dict(
    vocab_size=50257,     # number of entries in the token vocabulary
    context_length=256,   # shortened from the original 1024 tokens
    emb_dim=768,          # width of the token embeddings
    n_heads=12,           # attention heads
    n_layers=12,          # number of layers
    drop_rate=0.1,        # dropout rate
    qkv_bias=False,       # no bias term on the query/key/value projections
)

# Fix the RNG seed before the model is constructed so repeated runs start
# from the same random state.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# Evaluation mode; the trailing semicolon keeps the notebook from echoing
# the module repr.
model.eval();

In [21]:
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` into token ids and return them as a batch of one.

    Args:
        text: String to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``;
            presumably it returns a flat list of integer ids (as the GPT-2
            tiktoken encoding used elsewhere in this notebook does).

    Returns:
        An integer (``torch.long``) tensor with a leading batch axis of
        size 1 added in front of the encoded ids.
    """
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Pin the dtype: without it an empty encoding becomes a float32 tensor,
    # which cannot be used as token ids downstream.
    encoded_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)  # unsqueeze(0) adds the batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids back into a string.

    Args:
        token_ids: Tensor of token ids, either with a leading batch axis of
            size 1 or already flat (1-D). A 0-d tensor is treated as a
            single token.
        tokenizer: Object exposing ``decode(ids)``; it is called with the
            result of ``Tensor.tolist()`` on the flattened ids.

    Returns:
        Whatever ``tokenizer.decode`` returns for the id list (presumably
        the decoded text).
    """
    if token_ids.dim() > 1:
        flat = token_ids.squeeze(0)  # squeeze(0) removes batch dimension
    else:
        # Do not squeeze an already-flat tensor: a single-token tensor of
        # shape [1] would collapse to 0-d and .tolist() would then return a
        # bare int instead of a list.
        flat = token_ids.reshape(-1)
    return tokenizer.decode(flat.tolist())

# GPT-2 encoding from tiktoken, plus the prompt that seeds generation.
tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Every effort moves you"

# Extend the prompt by 10 tokens using the configured context length.
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)
print(token_ids.shape)
print("Output text: \n", token_ids_to_text(token_ids, tokenizer))

torch.Size([1, 14])
Output text: 
 Every effort moves you rentingetic wasnم refres RexMeCHicular stren


### 5.1.2 Calculating the text generation loss

In [7]:
# Two toy sequences of three token ids each; the trailing comments give the
# text each row stands for.
inputs = torch.tensor(
    [
        [16833, 3626, 6100],  # "every effort moves"
        [40, 1107, 588],      # "I really like"
    ]
)

# The same sequences shifted one position to the left, so each entry is the
# token that follows the corresponding input token.
targets = torch.tensor(
    [
        [3626, 6100, 345],    # " effort moves you"
        [1107, 588, 11311],   # " really like chocolate"
    ]
)

In [24]:
# Forward pass only; gradient tracking is switched off for this call.
with torch.no_grad():
    logits = model(inputs)

# Normalize over the last axis (size 50257, matching the configured
# vocab_size) to turn the logits into probabilities.
probas = logits.softmax(dim=-1)
print(logits.shape)

torch.Size([2, 3, 50257])


In [25]:
# Index of the highest probability along the last axis for every position;
# keepdim=True leaves a trailing axis of size 1 in place of that axis.
token_ids = probas.argmax(dim=-1, keepdim=True)
print(token_ids)

tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])


In [44]:
# Decode the target ids and the argmax predictions side by side, one pair of
# lines per row of the batch.
for target_label, output_label, row in (
    ("Targets batch 1: ", "Outputs batch 1: ", 0),
    ("Targets batch 2: ", "Outputs batch 2: ", 1),
):
    print(target_label, token_ids_to_text(targets[row], tokenizer))
    # token_ids[row] still carries the trailing size-1 axis from
    # keepdim=True, so flatten it to a 1-D sequence before decoding.
    print(output_label, token_ids_to_text(token_ids[row].flatten(), tokenizer))

Targets batch 1:   effort moves you
Outputs batch 1:   Armed heNetflix
Targets batch 2:   really like chocolate
Outputs batch 2:   pressuring empoweredfaith


In [57]:
# Pairwise lookup in the first batch row: position i is paired with target
# id i, giving one probability per position (three values).
print(probas[0, [0,1,2], targets[0]])
# Replacing the position list with a full slice gives a 3x3 grid instead
# (every position combined with every target id); its diagonal holds the
# same three values as the pairwise lookup above.
print(probas[0, :, targets[0]])
# The target ids used as indices above.
print(targets[0])
# Shapes for reference: targets is (2, 3), probas is (2, 3, 50257).
print(targets.shape, probas.shape)

tensor([7.4541e-05, 3.1061e-05, 1.1563e-05])
tensor([[7.4541e-05, 2.6072e-05, 1.8191e-05],
        [2.5497e-05, 3.1061e-05, 2.7802e-05],
        [3.2404e-05, 1.0943e-05, 1.1563e-05]])
tensor([3626, 6100,  345])
torch.Size([2, 3]) torch.Size([2, 3, 50257])
