## Imports & Installs

In [None]:
!pip install transformers torchtyping 

In [22]:
import sys
import os

sys.path.append(os.path.join(os.getcwd(), "../../.."))

In [23]:
from dataclasses import dataclass

import numpy as np
import torch as t
from torch import nn
import transformers
from torchtyping import TensorType
from fancy_einsum import einsum
import einops

#
from arena.w2 import utils

ImportError: cannot import name 'utils' from 'arena.w2' (unknown location)

## Load weights

In [None]:
GPT2_VOCAB_SIZE = 50257
GPT2_HIDDEN_SIZE = 768
GPT2_MAX_SEQ_LEN = 1024
GPT2_DROPOUT = 0.1
GPT2_LN_EPS = 1e-05

@dataclass(frozen=True)
class TransformerConfig:
    '''Constants used throughout your decoder-only transformer model.'''
    num_layers: int = 6
    num_heads: int = 8
    vocab_size: int = GPT2_VOCAB_SIZE
    hidden_size: int = GPT2_HIDDEN_SIZE
    max_seq_len: int = GPT2_MAX_SEQ_LEN
    dropout: float = GPT2_DROPOUT
    layer_norm_epsilon: float = GPT2_LN_EPS

config = TransformerConfig()

In [7]:
transformers.AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")

In [5]:

def mask(A: TensorType[..., "seq_len", "seq_len"]) -> TensorType[..., "seq_len", "seq_len"]:
    seq_len = A.shape[-1]

    mask = t.triu(t.ones(seq_len, seq_len), diagonal=1).bool()
    return A.masked_fill(mask, -np.inf)

def multihead_masked_attention(
    Q: TensorType["b", "s", "n*h"], 
    K: TensorType["b", "s", "n*h"], 
    num_heads: int
) -> TensorType["b", "n", "s_q", "s_k"]:
    '''
    Should return the results of multihead self-attention (after softmax, before multiplying with V)
    '''
    _Q = einops.rearrange(Q, "b s (n h) -> b n s h", n=num_heads)    
    _K = einops.rearrange(K, "b s (n h) -> b n s h", n=num_heads)    

    d_head = _Q.shape[-1]

    A_pre = mask(
        einsum("b n s_q h, b n s_k h -> b n s_q s_k", _Q, _K)
    ) / np.sqrt(d_head)

    return t.softmax(A_pre, dim=-1)


def multihead_masked_attention_head(
    A: TensorType["b", "n", "s_q", "s_k"], 
    V: TensorType["b", "s", "n*h"],
    num_heads: int
) -> TensorType["batch", "seq", "n_heads*headsize"]:
    _V = einops.rearrange(V, "b s (n h) -> b n s h", n=num_heads)
    AV: TensorType["b", "n", "s_q", "h"] = einsum("b n s_q s_k, b n s_k h -> b n s_q h", A, _V)
    return einops.rearrange(AV, "b n s h -> b s (n h)") 


class GPT2Attention(nn.Module):
    W_QKV: nn.Linear
    W_O: nn.Linear
    dropout: float

    def __init__(self, hidden_size: int, num_heads: int, dropout: float):
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_size = hidden_size // num_heads

        super().__init__()

        self.W_QKV = nn.Linear(hidden_size, hidden_size * 3)
        self.W_O = nn.Linear(hidden_size, hidden_size)
        self.attn_droppout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)

    def forward(self, x: TensorType["batch", "seq", "hidden_size"]) -> TensorType["batch", "seq", "hidden_size"]:
        '''
        x: shape (batch, seq, hidden_size)

        Return: shape (batch, seq, hidden_size)
        '''
        Q, K, V = self.W_QKV(x).chunk(3, dim=-1)        
        A = multihead_masked_attention(Q, K, self.num_heads)
        A = self.attn_droppout(A)
        h = multihead_masked_attention_head(A, V, self.num_heads)
        x = self.W_O(h)
        return self.resid_dropout(x)

class GPT2MLP(nn.Module):

    def __init__(self, hidden_size: int, dropout: float):
        self.hidden_size = hidden_size

        super().__init__()

        self.linear1 = nn.Linear(hidden_size, hidden_size * 4)
        self.linear2 = nn.Linear(hidden_size * 4, hidden_size)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: t.Tensor) -> t.Tensor:
        x = self.linear1(x)
        x = self.gelu(x)
        x = self.linear2(x)
        x = self.dropout(x)

        return x

class GPT2Block(nn.Module):
    def __init__(self, hidden_size: int, num_heads: int, layer_norm_epsilon: float, dropout: float):
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.layer_norm_epsilon = layer_norm_epsilon
        self.dropout = dropout

        super().__init__()

        self.ln_1 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
        self.attn = GPT2Attention(hidden_size, num_heads, dropout=dropout)
        self.ln_2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
        self.mlp = GPT2MLP(hidden_size, dropout)

    def forward(self, x: t.Tensor) -> t.Tensor:
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))

        return x


class GPT2(nn.Module):

    def __init__(self, config: TransformerConfig):
        self.config = config

        super().__init__()

        self.wte = nn.Embedding(config.vocab_size, config.hidden_size)
        self.wpe = nn.Embedding(config.max_seq_len, config.hidden_size)

        self.drop = nn.Dropout(config.dropout)
        self.h = nn.ModuleList([
            GPT2Block(config.hidden_size, config.num_heads, config.layer_norm_epsilon, config.dropout)
            for _ in range(config.num_layers)
        ])
        self.ln_f = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon, elementwise_affine=True)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)        
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x: t.Tensor) -> t.Tensor:
        pos = t.arange(x.shape[1], device=x.device)
        x = self.token_embedding(x) + self.positional_embedding(pos)

        x = self.dropout(x)

        for decoder_block in self.decoder_blocks:
            x = decoder_block(x)
        
        x = self.ln(x)
        x = self.unembed(x)
        x = self.softmax(x)

        return x


SyntaxError: incomplete input (1997931988.py, line 162)

In [None]:
utils.test_load_pretrained_weights(my_gpt, tokenizer)