In [9]:
"""
OpenAI's GPT2 paper, github repo, and pre-trained weights give us a good idea of the model architecture, 
but there isn't enough information available to actually train a model. 

Andrej Karpathy provides a working implementation, but even so, the intuition isn't expressed in that code. Also
missing are the (probably hard-won) insights required to go from theory to working code.

I'll try to build GPT2 including those key insights and document it here. 

references:
* paper: https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf
* repo: https://github.com/openai/gpt-2
* Karpathy's nanoGPT: https://github.com/karpathy/nanoGPT/tree/master
"""


[2mResolved [1m145 packages[0m [2min 1ms[0m[0m
[2mAudited [1m140 packages[0m [2min 0.06ms[0m[0m
ShakespeareDataloader Initializing: karpathy/tiny_shakespeare with B=5, T=1024, split='train'
ShakespeareDataloader Pre-tokenizing text data n=1,003,854 for split 'train'... estimated batches: 58
ShakespeareDataloader iterator reset for split 'train', starting at token 0
Total tokens analyzed: 102,400
Unique tokens: 7013
Top 10 tokens:
  ID 198   ('\n'      ): 12,382 (0.1209)
  ID 11    (','       ): 5,909  (0.0577)
  ID 25    (':'       ): 3,139  (0.0307)
  ID 13    ('.'       ): 2,362  (0.0231)
  ID 262   (' the'    ): 1,753  (0.0171)
  ID 284   (' to'     ): 1,298  (0.0127)
  ID 286   (' of'     ): 1,090  (0.0106)
  ID 290   (' and'    ): 1,083  (0.0106)
  ID 26    (';'       ): 1,003  (0.0098)
  ID 314   (' I'      ): 997    (0.0097)
Vocabulary coverage: 0.13954275026364488
ShakespeareDataloader iterator reset for split 'train', starting at token 0
x: [5962, 22307, 25, 198, 8

In [None]:
!uv add torch



In [11]:
import torch
import torch.nn as nn

class GPTModel(nn.Module):
    """
    GPT-2 backbone: learned token + position embeddings, a stack of transformer blocks, and a final layer norm.

    The high-level tensorflow model structure released by OpenAI translated to pytorch. Comments are my own

    from: https://github.com/openai/gpt-2/blob/9b63575ef42771a015060c964af2c3da4cf7c8ab/src/model.py#L147

    def model(hparams, X, past=None, scope='model', reuse=False):
        results = {}
        batch, sequence = shape_list(X)

        wpe = tf.get_variable('wpe', [hparams.n_ctx, hparams.n_embd],
                             initializer=tf.random_normal_initializer(stddev=0.01))
        wte = tf.get_variable('wte', [hparams.n_vocab, hparams.n_embd],
                             initializer=tf.random_normal_initializer(stddev=0.02))
        past_length = 0 if past is None else tf.shape(past)[-2]
        h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length))

        # Transformer
        presents = []
        pasts = tf.unstack(past, axis=1) if past is not None else [None] * hparams.n_layer
        assert len(pasts) == hparams.n_layer
        for layer, past in enumerate(pasts):
            h, present = block(h, 'h%d' % layer, past=past, hparams=hparams)
            presents.append(present)
        results['present'] = tf.stack(presents, axis=1)
        h = norm(h, 'ln_f')

        # Language model loss.  Do tokens <n predict token n?
        h_flat = tf.reshape(h, [batch*sequence, hparams.n_embd])
        logits = tf.matmul(h_flat, wte, transpose_b=True)
        logits = tf.reshape(logits, [batch, sequence, hparams.n_vocab])
        results['logits'] = logits
        return results

    Args:
        cfg: mapping of hyper-parameters. This class reads 'context_length', 'emb_dim',
            'vocab_size' and 'transformer_layers'; the same mapping is handed to every
            TransformerBlock, which reads its own keys.

    Note:
        Unlike the TF reference, forward() currently stops after the final layer norm and returns
        hidden states, not logits. The projection head is still disabled (see the TODO in forward()).
    """
    def __init__(self, cfg):
        super().__init__()

        # positional embeddings - each position in the input gets a learned positional embedding to capture
        # relationships between words. Worth noting, AIAYN used fixed embeddings while GPT and BERT use
        # learned embeddings.
        #
        # wpe = tf.get_variable('wpe', [hparams.n_ctx, hparams.n_embd], initializer=tf.random_normal_initializer(stddev=0.01))
        self.position_embeddings = nn.Embedding(num_embeddings=cfg['context_length'], embedding_dim=cfg['emb_dim'])
        # initialise the *position* table here: the token table does not exist yet, and wpe uses stddev=0.01
        torch.nn.init.normal_(self.position_embeddings.weight, mean=0.0, std=0.01)

        # token embeddings - weights learned by mapping token ids to these embeddings.
        #
        # wte = tf.get_variable('wte', [hparams.n_vocab, hparams.n_embd], initializer=tf.random_normal_initializer(stddev=0.02))
        self.token_embedding = nn.Embedding(num_embeddings=cfg['vocab_size'], embedding_dim=cfg['emb_dim'])
        torch.nn.init.normal_(self.token_embedding.weight, mean=0.0, std=0.02)

        # positional and token embeddings are added together to represent both the word and where it
        # is in the input. this is the input to the transformer blocks.

        # transformer model is a stack of transformer blocks. the hparam cfg['transformer_layers'] tells us how
        # many to use. this is a key input into the model's overall size.
        #
        #    presents = []
        #    for layer, past in enumerate(pasts):
        #        h, present = block(h, 'h%d' % layer, past=past, hparams=hparams)
        #        presents.append(present)
        #    results['present'] = tf.stack(presents, axis=1)
        self.transformer_blocks = nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg['transformer_layers'])])

        # stabilize output: normalize to unit variance and train: gain and bias
        #
        #    define: norm(h, 'ln_f')
        self.final_norm = LayerNorm(cfg['emb_dim'])

        # TODO: projection head - transform the final hidden layer into logits for every vocab token
        # (i.e. predict next token). Candidate under consideration:
        #     nn.Linear(cfg['emb_dim'], cfg['vocab_size'], bias=False)

    def forward(self, in_idx: torch.Tensor) -> torch.Tensor:
        """
        Embed token ids, run the transformer stack and apply the final layer norm.

        Args:
            in_idx: integer token indices of shape (B, T).

        Returns:
            Normalized hidden states of shape (B, T, C) where C = cfg['emb_dim']. These are NOT logits yet.
        """
        batch_size, seq_len = in_idx.shape # token indices (B, T)

        # get the token embeddings for each token index (B, T) -> (B, T, C)
        tok_embeds = self.token_embedding(in_idx)

        # get position embeddings for each sequence index (T)
        pos_indices = torch.arange(seq_len, device=in_idx.device, dtype=torch.long)
        pos_embeds = self.position_embeddings(pos_indices) # (T, C)

        # Combine token and position embeddings for input to transformer blocks
        #
        #    h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length))
        x = tok_embeds + pos_embeds # (B, T, C) + (T, C) -> (B, T, C); positions broadcast over the batch

        x = self.transformer_blocks(x)

        # stability and output projection
        #
        #    apply: h = norm(h, 'ln_f')
        x = self.final_norm(x)

        # TODO: return logits instead of hidden states. Two options noted so far:
        #   * a dedicated head:  self.out_head(x)
        #   * parameter sharing with the input embedding (what the TF reference does with wte):
        #       F.linear(x, self.token_embedding.weight)
        return x

class TransformerBlock(nn.Module):
    """
    One GPT-2 transformer block: attention and feed-forward sub-layers, each normalized on the way in
    and wrapped in a residual (shortcut) connection.

    note: I'm leaving out past_kv for now. It is an important optimization to avoid recomputing KV for each token
    in the sequence, but it adds complexity and I need to figure out how to implement it correctly. For training,
    kv caching isn't essential.

    from: https://github.com/openai/gpt-2/blob/9b63575ef42771a015060c964af2c3da4cf7c8ab/src/model.py#L123C1-L130C26

    def block(x, scope, *, past, hparams):
        with tf.variable_scope(scope):
            nx = x.shape[-1].value
            a, present = attn(norm(x, 'ln_1'), 'attn', nx, past=past, hparams=hparams)
            x = x + a
            m = mlp(norm(x, 'ln_2'), 'mlp', nx*4, hparams=hparams)
            x = x + m
            return x, present

    Args:
        cfg: mapping of hyper-parameters. This class reads 'emb_dim', 'context_length', 'n_heads',
            'dropout_rate' and 'qkv_bias'; the same mapping is also passed to FeedForward.
    """
    def __init__(self, cfg):
        super().__init__()
        # MultiHeadAttention is defined elsewhere; it is built with equal input/output width
        self.att = MultiHeadAttention(
            d_in=cfg['emb_dim'], d_out=cfg['emb_dim'],
            context_length=cfg['context_length'],
            num_heads=cfg['n_heads'],
            dropout=cfg['dropout_rate'],
            qkv_bias=cfg['qkv_bias'])

        self.ff = FeedForward(cfg) # ff = mlp
        self.norm1 = LayerNorm(cfg['emb_dim']) # norm(x, 'ln_1')
        self.norm2 = LayerNorm(cfg['emb_dim']) # norm(x, 'ln_2')

    def forward(self, x):
        """
        Apply the attention and feed-forward sub-layers with residual connections.

        Args:
            x: input activations; the result has whatever shape the sub-layers return for it
               (presumably (B, T, emb_dim) - confirm against MultiHeadAttention).

        Returns:
            x + att(norm1(x)), followed by the same pattern for the feed-forward sub-layer.
        """
        # The shortcut keeps the un-normalized input on the residual stream to help prevent
        # vanishing gradients; only the branch going into each sub-layer is normalized.
        #
        #    a = attn(norm(x, 'ln_1')); x = x + a
        a = self.att(self.norm1(x))
        x = x + a

        #    m = mlp(norm(x, 'ln_2')); x = x + m
        m = self.ff(self.norm2(x))
        x = x + m
        return x
        
class LayerNorm(nn.Module):
    """
    Layer normalization over the last dimension with a learned per-feature scale and shift.

    Port of norm (https://github.com/openai/gpt-2/blob/9b63575ef42771a015060c964af2c3da4cf7c8ab/src/model.py#L28)

    def norm(x, scope, *, axis=-1, epsilon=1e-5):
        # normalize to mean = 0, std = 1, then do a diagonal affine transform.
        with tf.variable_scope(scope):
            n_state = x.shape[-1].value
            g = tf.get_variable('g', [n_state], initializer=tf.constant_initializer(1))
            b = tf.get_variable('b', [n_state], initializer=tf.constant_initializer(0))
            u = tf.reduce_mean(x, axis=axis, keepdims=True)
            s = tf.reduce_mean(tf.square(x-u), axis=axis, keepdims=True)
            x = (x - u) * tf.rsqrt(s + epsilon)
            x = x*g + b
            return x

    Args:
        dim: size of the last (feature) dimension; one gain and one bias value is learned per feature.
    """

    def __init__(self, dim):
        super().__init__()
        self.eps = 1e-5  # same epsilon as the reference; keeps the division finite when variance is ~0
        self.gain = nn.Parameter(torch.ones(dim))   # 'g' in the reference: scale, starts at 1
        self.bias = nn.Parameter(torch.zeros(dim))  # 'b' in the reference: shift, starts at 0

    def forward(self, x):
        """Normalize x along its last dimension, then apply the learned gain and bias."""
        mu = torch.mean(x, dim=-1, keepdim=True)
        # population (biased) variance: with this many features the biased/unbiased difference
        # shouldn't matter, but it is what GPT2 does
        sigma_sq = torch.var(x, dim=-1, keepdim=True, unbiased=False)
        x_hat = (x - mu) / torch.sqrt(sigma_sq + self.eps)
        # "then do a diagonal affine transform."
        # i.e. y = Wx + b with a diagonal W: every feature gets its own scale and shift and the
        # features never mix, so the model can restore whatever signal strength it needs after
        # normalization.
        return self.gain * x_hat + self.bias


class FeedForward(nn.Module):
    """
    Position-wise Feed Forward from AIAYN. The module expands input features, applies activation,
    and then projects them back to the original dimensions. The idea is to learn more expressive
    relationships between features.

    note: GPT2 replaces ReLU with GELU.

    from: https://github.com/openai/gpt-2/blob/9b63575ef42771a015060c964af2c3da4cf7c8ab/src/model.py#L115

    def mlp(x, scope, n_state, *, hparams):
        with tf.variable_scope(scope):
            nx = x.shape[-1].value
            h = gelu(conv1d(x, 'c_fc', n_state))
            h2 = conv1d(h, 'c_proj', nx)
            return h2

    Args:
        cfg: mapping of hyper-parameters; only 'emb_dim' is read here.
    """
    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg['emb_dim']
        expansion = 4 # from attention is all you need
        hidden_dim = expansion * emb_dim
        self.expansion = nn.Linear(emb_dim, hidden_dim) # conv1d 'c_fc': expand x
        # tanh approximation, to match the gelu() used by the GPT-2 reference code
        self.gelu = nn.GELU(approximate='tanh')
        self.projection = nn.Linear(hidden_dim, emb_dim) # conv1d 'c_proj': back to emb_dim

    def forward(self, x):
        """Expand the last dimension 4x, apply GELU, and project back to emb_dim."""
        x = self.expansion(x)
        x = self.gelu(x)
        x = self.projection(x)
        return x

# GPT2 replaces ReLU with the Gaussian Error Linear Unit (GELU), a smoother activation function.
# GELU is defined in terms of math.erf (the CDF of the standard normal distribution). Here the
# integral of e^{-t^2} is approximated by a manually fit function of tanh. I believe GELU was also used by BERT.
#
#    gelu(x):
#      return 0.5*x*(1+tf.tanh(np.sqrt(2/np.pi)*(x+0.044715*tf.pow(x, 3))))
#
# paper: https://arxiv.org/pdf/1606.08415

class GELU(nn.Module):
    """
    Tanh approximation of the Gaussian Error Linear Unit, as used by GPT-2:

        gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))

    The module has no parameters and is applied element-wise, so the output has the same shape
    and dtype as the input.
    """

    # Plain Python float computed once at class creation. Building a float32 tensor inside forward()
    # would allocate on every call and truncate the constant when x is float64.
    _SQRT_2_OVER_PI = (2.0 / torch.pi) ** 0.5

    def forward(self, x):
        """Return the tanh-approximated GELU of x, element-wise."""
        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))