In [1]:
import torch
import torch.nn as nn
import tiktoken

class MultiheadAttention(nn.Module):
    def __init__(self, d_in, d_out, dropout, num_heads, context_length):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.W_querys = nn.Linear(d_in, d_out, bias=False)
        self.W_keys = nn.Linear(d_in, d_out, bias=False)
        self.W_values = nn.Linear(d_in, d_out, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.linear_projection = nn.Linear(d_out, d_out)
        
        # Causal mask for autoregressive processing
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )

    def forward(self, inputs):
        batch, num_tokens, dim = inputs.shape
        query = self.W_querys(inputs)
        key = self.W_keys(inputs)
        value = self.W_values(inputs)

        query = query.view(batch, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
        key = key.view(batch, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
        value = value.view(batch, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)

        attn_scores = torch.matmul(query, key.transpose(-2, -1)) / (self.head_dim ** 0.5)
        
        # Apply causal mask
        mask_bool = self.mask[:num_tokens, :num_tokens].bool()
        attn_scores.masked_fill_(mask_bool, float('-inf'))

        attn_weights = torch.softmax(attn_scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        
        context_vec = torch.matmul(attn_weights, value)
        context_vec = context_vec.transpose(1, 2).contiguous().view(batch, num_tokens, -1)
        return self.linear_projection(context_vec)

class GELU(nn.Module):
    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) * 
            (x + 0.044715 * torch.pow(x, 3))
        ))

class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(config['embedding_dim'], config['embedding_dim'] * 4),
            GELU(),
            nn.Linear(config['embedding_dim'] * 4, config['embedding_dim'])
        )

    def forward(self, inputs):
        return self.layers(inputs)

class LayerNorm(nn.Module):
    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, inputs):
        mean = inputs.mean(dim=-1, keepdim=True)
        var = inputs.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (inputs - mean) / torch.sqrt(var + self.eps)
        return norm_x * self.scale + self.shift

class TransformerBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = MultiheadAttention(
            d_in=config['embedding_dim'], 
            d_out=config['embedding_dim'], 
            dropout=config['dropout'], 
            num_heads=config['n_heads'], 
            context_length=config['context_length']
        )
        self.norm1 = LayerNorm(config["embedding_dim"])
        self.norm2 = LayerNorm(config["embedding_dim"])
        self.ff = FeedForward(config)
        self.dropout = nn.Dropout(config['dropout'])

    def forward(self, inputs):
        add_connection = inputs
        output = self.norm1(inputs)
        output = self.attention(output)
        output = self.dropout(output)
        output = output + add_connection
        
        add_connection = output
        output = self.norm2(output)
        output = self.ff(output)
        output = self.dropout(output)
        output = output + add_connection
        
        return output

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embedding = nn.Embedding(config['vocab_size'], config['embedding_dim'])
        self.pos_embedding = nn.Embedding(config['context_length'], config['embedding_dim'])
        self.dropout = nn.Dropout(config['dropout'])
        self.trf_blocks = nn.Sequential(*[TransformerBlock(config) for _ in range(config["n_layers"])])
        self.out_head = nn.Linear(config["embedding_dim"], config["vocab_size"], bias=False)
        self.final_norm = LayerNorm(config["embedding_dim"])

    def forward(self, inputs):
        batch_size, seq_len = inputs.shape
        tok_embeds = self.token_embedding(inputs)
        pos_embeds = self.pos_embedding(torch.arange(seq_len, device=inputs.device))
        
        x = tok_embeds + pos_embeds
        x = self.dropout(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

# GPT Model Configuration
GPT_CONFIG_124M = {
    "vocab_size": 50257,    
    "context_length": 1024,  
    "embedding_dim": 768,    
    "n_heads": 12,          
    "n_layers": 12,         
    "dropout": 0.1,         
}

# Initialize tokenizer
tokenizer = tiktoken.get_encoding("gpt2")

# Tokenizing input text
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

batch = [torch.tensor(tokenizer.encode(txt1)), torch.tensor(tokenizer.encode(txt2))]
batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0)  # Ensure proper batch shape

# Set random seed for reproducibility
torch.manual_seed(123)

# Initialize GPT model
model = GPT(GPT_CONFIG_124M)

# Forward pass
out = model(batch)

# Print results
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)
print(out)
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

  cpu = _conversion_method_template(device=torch.device("cpu"))


Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 0.1381,  0.0077, -0.1963,  ..., -0.0222, -0.1060,  0.1717],
         [ 0.3865, -0.8408, -0.6564,  ..., -0.5163,  0.2369, -0.3357],
         [ 0.6989, -0.1829, -0.1631,  ...,  0.1472, -0.6504, -0.0056],
         [-0.4290,  0.1669, -0.1258,  ...,  1.1579,  0.5303, -0.5549]],

        [[ 0.1094, -0.2894, -0.1467,  ..., -0.0557,  0.2911, -0.2824],
         [ 0.0882, -0.3552, -0.3527,  ...,  1.2930,  0.0053,  0.1898],
         [ 0.6091,  0.4702, -0.4094,  ...,  0.7688,  0.3787, -0.1974],
         [-0.0612, -0.0737,  0.4751,  ...,  1.2463, -0.3834,  0.0609]]],
       grad_fn=<UnsafeViewBackward0>)
Total number of parameters: 163,009,536


In [2]:
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"]

targets = torch.tensor([[3626, 6100, 345  ],  # [" effort moves you",
                        [1107,  588, 11311]])

In [8]:
with torch.no_grad():
    logits = model(inputs)

probas = torch.softmax(logits, dim=-1) # Probability of each token in vocabulary
print(probas.shape) 
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
print("Token IDs:\n", token_ids)

torch.Size([2, 3, 50257])
Token IDs:
 tensor([[[36397],
         [11552],
         [20610]],

        [[20522],
         [50090],
         [36963]]])


In [10]:
import tiktoken

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())


tokenizer = tiktoken.get_encoding("gpt2")

In [11]:
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Targets batch 1:  effort moves you
Outputs batch 1:  Gathering TamFriday


In [12]:
logits_flat = logits.flatten(0, 1)
targets_flat = targets.flatten()

print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)

Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])


In [16]:
logits_flat

tensor([[ 0.4374, -0.7608,  0.5282,  ...,  0.2050,  0.4256, -1.0540],
        [ 0.5072, -0.9123, -0.9866,  ..., -0.4003,  0.4570,  0.0865],
        [ 1.2311, -0.5594, -0.0286,  ...,  0.0784, -0.2533,  0.2012],
        [ 0.7275, -0.3849,  1.0237,  ..., -0.5319,  0.7238, -0.5648],
        [ 0.4368,  0.3923, -0.1811,  ..., -1.2091,  0.4767, -0.0802],
        [ 0.4307,  0.4955,  0.2165,  ..., -0.4767,  0.0735,  0.5175]])

In [15]:
targets_flat

tensor([ 3626,  6100,   345,  1107,   588, 11311])

In [13]:
loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
print(loss)

tensor(10.7901)


In [14]:
perplexity = torch.exp(loss)
print(perplexity)

tensor(48537.7539)
