### LLM Architecture

LLMs, such as GPT (short for *generative pretrained transformer*), are large deep neural networks designed to generate new text one token (roughly a word or word piece) at a time.


In [1]:
import torch
import tiktoken

from tiny_gpt import TinyGPTModel, TinyTransformerBlock


# -----------------------------------------------------------------------------
# Hyperparameters for a GPT-2 Small (124M) style model.
# This dict is handed unchanged to TinyTransformerBlock and TinyGPTModel below.
# -----------------------------------------------------------------------------
GPT_CONFIG_124M = {
    # GPT-2 vocabulary size
    "vocab_size": 50257,
    # Maximum sequence length the model can attend to
    "context_length": 1024,
    # Embedding dimension (hidden size)
    "emb_dim": 768,
    # Number of attention heads
    "n_heads": 12,
    # Number of transformer blocks
    "n_layers": 12,
    # Dropout rate
    "drop_rate": 0.1,
    # Whether the Q, K, V linear layers use a bias term
    "qkv_bias": False,
}


# -----------------------------------------------------------------------------
# Smoke-test a single transformer block with random input
# -----------------------------------------------------------------------------
# Seed first so the random input (and any random draws made while building or
# running the block) are repeatable from a fresh kernel.
torch.manual_seed(123)

# Random input of shape (batch=2, seq_len=4, emb_dim).
# The last dimension is read from the config instead of being hard-coded, so
# this check stays valid if "emb_dim" is ever changed.
x = torch.rand(2, 4, GPT_CONFIG_124M["emb_dim"])

# Create a single transformer block from the same config
block = TinyTransformerBlock(GPT_CONFIG_124M)

# Forward pass through the block
output = block(x)

print(50 * "-")
print("Input shape:", x.shape)
print("Output shape:", output.shape)
print(50 * "-")

# Turn the visual check into an enforced one: the block is expected to return
# a tensor with exactly the same shape as its input.
assert output.shape == x.shape, (
    f"transformer block changed the shape: {tuple(x.shape)} -> {tuple(output.shape)}"
)


# -----------------------------------------------------------------------------
# Tokenize two sample sentences using the GPT-2 tokenizer
# -----------------------------------------------------------------------------
tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Convert text -> token IDs, one 1-D tensor per sentence.
# The per-sentence tensors get their own name so that `batch` only ever refers
# to the final stacked tensor (it used to be a list first, then a tensor).
token_id_rows = [torch.tensor(tokenizer.encode(txt)) for txt in (txt1, txt2)]

# Stack the token sequences into a batch tensor of shape (2, seq_len).
# torch.stack needs every row to have the same length; both sample sentences
# encode to the same number of tokens.
batch = torch.stack(token_id_rows, dim=0)

print(50 * "-")
print("Tokenized batch:")
print(batch)
print(50 * "-")


# -----------------------------------------------------------------------------
# Run the complete TinyGPTModel on the tokenized batch
# -----------------------------------------------------------------------------
# Re-seed right before construction so the model is built (and run) from a
# known RNG state, independent of the random draws made earlier in this cell.
torch.manual_seed(123)

# Build the model from the GPT-2 Small style config
model = TinyGPTModel(GPT_CONFIG_124M)

# Feed the (2, seq_len) batch of token IDs through the model.
# NOTE(review): the model has not been put in eval mode here, so dropout is
# presumably still active for this pass -- confirm against TinyGPTModel.
out = model(batch)

print("-" * 50)
print("Input batch:\n", batch)
# Expected shape: (batch, seq_len, vocab_size)
print("\nOutput shape:", out.shape)
# One vector of logits per token position
print(out)
print("-" * 50)


--------------------------------------------------
Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])
--------------------------------------------------
--------------------------------------------------
Tokenized batch:
tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
--------------------------------------------------
--------------------------------------------------
Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 0.3613,  0.4222, -0.0711,  ...,  0.3483,  0.4661, -0.2838],
         [-0.1792, -0.5660, -0.9485,  ...,  0.0477,  0.5181, -0.3168],
         [ 0.7120,  0.0332,  0.1085,  ...,  0.1018, -0.4327, -0.2553],
         [-1.0076,  0.3418, -0.1190,  ...,  0.7195,  0.4023,  0.0532]],

        [[-0.2564,  0.0900,  0.0335,  ...,  0.2659,  0.4454, -0.6806],
         [ 0.1230,  0.3653, -0.2074,  ...,  0.7705,  0.2710,  0.2246],
         [ 1.0558,  1.0318, -0.2

In [2]:
# Count every parameter tensor registered on the model.
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

# The token embedding and the output head have the same weight shape, which is
# what makes sharing ("tying") one matrix between them possible.
print("Token embedding layer shape:", model.tok_emb.weight.shape)
print("Output layer shape:", model.out_head.weight.shape)

# Parameter count as it would be with weight tying: the output head's
# parameters are subtracted so the shared matrix is counted only once.
out_head_params = sum(p.numel() for p in model.out_head.parameters())
total_params_gpt2 = total_params - out_head_params
print(
    f"Number of trainable parameters "
    f"considering weight tying: {total_params_gpt2:,}"
)

# Memory footprint of the parameters. Each tensor reports its own bytes per
# element, so the figure stays correct if the model is cast to another dtype
# (for float32 parameters this equals the previous `total_params * 4`).
total_size_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
# Convert bytes to megabytes (1 MB = 1024 * 1024 bytes here)
total_size_mb = total_size_bytes / (1024 * 1024)
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total number of parameters: 163,009,536
Token embedding layer shape: torch.Size([50257, 768])
Output layer shape: torch.Size([50257, 768])
Number of trainable parameters considering weight tying: 124,412,160
Total size of the model: 621.83 MB


In [4]:

from tiny_gpt import tiny_generate_text_simple


# Prompt that the model should continue
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)

# Wrapping the ID list in an outer list yields a leading batch dimension:
# the resulting tensor has shape (1, num_tokens).
encoded_tensor = torch.tensor([encoded])
print("encoded_tensor.shape:", encoded_tensor.shape)

# Switch the model to evaluation mode before generating.
# NOTE(review): presumably this is to turn dropout off during inference --
# confirm against TinyGPTModel.
model.eval()

# Ask for 6 additional tokens; the context size comes from the model config.
out = tiny_generate_text_simple(
    model=model,
    idx=encoded_tensor,
    max_new_tokens=6,
    context_size=GPT_CONFIG_124M["context_length"],
)
print("Output:", out)
# `out` is 2-D (batch, tokens), so dimension 1 is the length of the sequence
print("Output length:", out.shape[1])

# Take the single sequence out of the batch-of-one and decode it back to text
decoded_text = tokenizer.decode(out[0].tolist())
print(decoded_text)

encoded: [15496, 11, 314, 716]
encoded_tensor.shape: torch.Size([1, 4])
Output: tensor([[15496,    11,   314,   716, 27018, 24086, 47843, 30961, 42348,  7267]])
Output length: 10
Hello, I am Featureiman Byeswickattribute argue
