In this notebook, I validate my implementation of transformer components and GPT-2 small by:
- Comparing the behavior of a GPT-2 transformer block with my own implementation of a GPT-2 transformer block
- Comparing the behavior of the GPT-2 embedding components with my own implementation of the GPT-2 embedding components
- Comparing the behavior of the full GPT-2 model with my own implementation of the full GPT-2 model

There are slight deviations between the outputs of my model and those of the original GPT-2 model. As shown below, these appear to be primarily due to the original GPT-2 model using a slightly different variant of GeLU than the one used by both my implementation and PyTorch.

In [4]:
import torch as t
import sys
import os
# Make the project root importable so the `src.*` imports in later cells resolve.
# os.path.abspath('') is the current working directory of the kernel; the
# project root is taken to be its parent directory.
notebook_path = os.path.abspath('')
# Normalise away the trailing '..' so sys.path holds a clean absolute path.
project_root = os.path.normpath(os.path.join(notebook_path, '..'))
# Guard the append so re-running this cell does not keep growing sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

## Comparing GPT2 transformer block (and components) with my implementation of a GPT2 transformer block

In [50]:
from transformers import GPT2Model, GPT2Tokenizer

# Reference implementation: the pretrained 'gpt2' checkpoint from Hugging Face
# and its matching tokenizer. The two loads are independent of each other.
model = GPT2Model.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [51]:
import torch as t
from src.blocks import TransformerDecoderBlock

# Reference block: the first transformer layer of the pretrained GPT-2 model.
gpt2_block = model.h[0]
# My block, built with the same positional arguments as before
# (presumably model width / head count / MLP width -- confirm against src.blocks).
my_block = TransformerDecoderBlock(768, 12, 3072, 'mlpblock', 'gelu', 'layer_norm')

# --- Layer norms: GPT-2's parameters are reused as-is ---
my_block.norm_layer1.weight = gpt2_block.ln_1.weight
my_block.norm_layer1.bias = gpt2_block.ln_1.bias
my_block.norm_layer2.weight = gpt2_block.ln_2.weight
my_block.norm_layer2.bias = gpt2_block.ln_2.bias

# --- MLP: GPT-2's weight matrices are transposed before being given to my linear layers ---
mlp = my_block.mlp_block
mlp.linear1.weight = t.nn.Parameter(gpt2_block.mlp.c_fc.weight.T)
mlp.linear1.bias = gpt2_block.mlp.c_fc.bias
mlp.linear2.weight = t.nn.Parameter(gpt2_block.mlp.c_proj.weight.T)
mlp.linear2.bias = gpt2_block.mlp.c_proj.bias

# --- Attention: GPT-2 stores q/k/v in one fused c_attn; split it into three equal chunks ---
fused_qkv_weight = gpt2_block.attn.c_attn.weight.T
fused_qkv_bias = gpt2_block.attn.c_attn.bias
(wq,wk,wv) = t.chunk(fused_qkv_weight, 3, dim=0)
(bq,bk,bv) = t.chunk(fused_qkv_bias, 3, dim=0)

mha = my_block.mha_block
mha.linear_q.weight = t.nn.Parameter(wq)
mha.linear_q.bias = t.nn.Parameter(bq)
mha.linear_k.weight = t.nn.Parameter(wk)
mha.linear_k.bias = t.nn.Parameter(bk)
mha.linear_v.weight = t.nn.Parameter(wv)
mha.linear_v.bias = t.nn.Parameter(bv)
mha.linear_o.weight = t.nn.Parameter(gpt2_block.attn.c_proj.weight.T)
mha.linear_o.bias = gpt2_block.attn.c_proj.bias

# Switch both blocks to evaluation mode before any comparison.
my_block.eval()
gpt2_block.eval()

# Show both module trees, reference first.
for block in (gpt2_block, my_block):
    print(block)

GPT2Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2Attention(
    (c_attn): Conv1D()
    (c_proj): Conv1D()
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D()
    (c_proj): Conv1D()
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)
TransformerDecoderBlock(
  (norm_layer1): LayerNorm()
  (norm_layer2): LayerNorm()
  (mha_block): MultiheadAttentionBlock(
    (linear_q): Linear()
    (linear_k): Linear()
    (linear_v): Linear()
    (linear_o): Linear()
  )
  (mlp_block): MLPBlock(
    (linear1): Linear()
    (linear2): Linear()
  )
)


#### Comparing transformer blocks

In [17]:
# Seed the RNG so the random probe input (and every tensor printed from it)
# is reproducible across kernel restarts.
t.manual_seed(0)
x = t.randn((1,5,768))

seq_len = x.shape[-2]
# Additive causal mask: -inf where the column index is greater than the row
# index (future positions), 0 elsewhere. Used by the attention comparison below.
att_mask = t.where(t.arange(seq_len).unsqueeze(1) < t.arange(seq_len), -t.inf, 0)

gpt2_block.eval()
my_block.eval()

# NOTE(review): my_block is called without att_mask here; confirm in src.blocks
# that the block applies its own causal masking.
gpt2_block_out = gpt2_block(x)[0]
my_block_out = my_block(x)

print("GPT2 Transformer block")
print(gpt2_block_out)
print("My transformer block")
print(my_block_out)

# Quantified verdict, like the other comparison cells. The tolerance matches
# the MLP comparison, since the MLP is where the two implementations differ most.
print(t.allclose(my_block_out, gpt2_block_out, atol=1e-2))

GPT2 Transformer block
tensor([[[11.8319,  8.1987,  4.1071,  ...,  7.8378, 22.8137,  2.5132],
         [-0.4134, -8.7172,  0.4104,  ...,  8.0247,  7.2181, 14.3804],
         [ 0.9540, -5.9592, -7.2451,  ...,  1.9031,  0.2349, 11.5673],
         [ 8.3677,  4.3870, -2.9901,  ...,  3.9888, -5.4587,  3.0287],
         [-1.1786, -0.9616,  2.8636,  ...,  3.2149, -2.4172,  2.9858]]],
       grad_fn=<AddBackward0>)
My transformer block
tensor([[[11.8319,  8.1991,  4.1074,  ...,  7.8380, 22.8131,  2.5124],
         [-0.4140, -8.7163,  0.4102,  ...,  8.0232,  7.2171, 14.3786],
         [ 0.9541, -5.9614, -7.2454,  ...,  1.9033,  0.2347, 11.5671],
         [ 8.3662,  4.3870, -2.9897,  ...,  3.9888, -5.4584,  3.0299],
         [-1.1787, -0.9614,  2.8644,  ...,  3.2139, -2.4160,  2.9847]]],
       grad_fn=<AddBackward0>)


#### Comparing normalization

In [18]:
# Run each layer norm once on the shared input, then display and compare.
gpt2_ln1_out = gpt2_block.ln_1(x)
my_ln1_out = my_block.norm_layer1(x)
gpt2_ln2_out = gpt2_block.ln_2(x)
my_ln2_out = my_block.norm_layer2(x)

print("Normalization")

# (label, tensor) pairs in display order.
for label, value in (
    ("GPT2 Layer Norm 1", gpt2_ln1_out),
    ("My Layer Norm 1", my_ln1_out),
    ("GPT2 Layer Norm 2", gpt2_ln2_out),
    ("My Layer Norm 2", my_ln2_out),
):
    print(label)
    print(value)

# Tight tolerance: the layer norms are expected to agree almost exactly.
print(t.allclose(my_ln1_out, gpt2_ln1_out, atol=1e-8))
print(t.allclose(my_ln2_out, gpt2_ln2_out, atol=1e-8))


Normalization
GPT2 Layer Norm 1
tensor([[[ 0.1271,  0.0875,  0.3001,  ...,  0.0327,  0.3548, -0.0742],
         [-0.1189, -0.4109,  0.0088,  ..., -0.0258, -0.0474,  0.0364],
         [ 0.3369, -0.1295, -0.0517,  ..., -0.0753,  0.0250,  0.0757],
         [-0.0990, -0.1408,  0.0405,  ...,  0.0439, -0.0470,  0.1307],
         [ 0.0546,  0.1112, -0.1200,  ...,  0.1650, -0.2234,  0.1034]]],
       grad_fn=<NativeLayerNormBackward0>)
My Layer Norm 1
tensor([[[ 0.1271,  0.0875,  0.3001,  ...,  0.0327,  0.3548, -0.0742],
         [-0.1189, -0.4109,  0.0088,  ..., -0.0258, -0.0474,  0.0364],
         [ 0.3369, -0.1295, -0.0517,  ..., -0.0753,  0.0250,  0.0757],
         [-0.0990, -0.1408,  0.0405,  ...,  0.0439, -0.0470,  0.1307],
         [ 0.0546,  0.1112, -0.1200,  ...,  0.1650, -0.2234,  0.1034]]],
       grad_fn=<AddBackward0>)
GPT2 Layer Norm 2
tensor([[[ 0.1192,  0.1020,  0.4947,  ...,  0.2802,  2.6047, -0.3328],
         [-0.0251, -0.4713,  0.1025,  ..., -0.1178, -0.2273,  0.2935],
    

#### Comparing attention

In [20]:
# GPT-2's attention is called without an explicit mask; my attention block is
# given the additive causal mask built in the transformer-block cell.
gpt2_attn_out = gpt2_block.attn(x)[0]
my_attn_out = my_block.mha_block(x, attention_mask=att_mask)

print("Multi-head attention")
print("GPT2 Attention")
print(gpt2_attn_out)
print("My attention")
print(my_attn_out)


print(t.allclose(my_attn_out, gpt2_attn_out, atol=1e-4))

Multi-head attention
GPT2 Attention
tensor([[[ 6.6055e-01, -7.5954e+00, -5.4793e+00,  ..., -3.6606e-01,
           7.8667e-01,  1.6924e-01],
         [-1.4892e-01, -8.9095e+00,  2.7064e-01,  ..., -1.6242e-01,
           1.8249e-01, -5.3543e-01],
         [-7.1339e+00, -2.3403e+01, -7.0618e+00,  ..., -3.9493e-01,
           1.4001e+00,  8.8264e-01],
         [ 4.2808e+00,  6.6949e+00,  6.5864e+00,  ..., -4.0145e-01,
           1.1179e+00, -1.4970e-02],
         [-6.9755e+00,  8.2094e-01, -3.4451e+00,  ..., -7.4120e-02,
           5.3183e-02, -9.0514e-01]]], grad_fn=<ViewBackward0>)
My attention
tensor([[[ 6.6055e-01, -7.5954e+00, -5.4793e+00,  ..., -3.6606e-01,
           7.8667e-01,  1.6924e-01],
         [-1.4892e-01, -8.9095e+00,  2.7065e-01,  ..., -1.6242e-01,
           1.8249e-01, -5.3543e-01],
         [-7.1339e+00, -2.3403e+01, -7.0618e+00,  ..., -3.9493e-01,
           1.4001e+00,  8.8264e-01],
         [ 4.2808e+00,  6.6949e+00,  6.5864e+00,  ..., -4.0145e-01,
           1.117

#### Comparing MLP

In [19]:
# Evaluate both MLPs once on the shared input.
gpt2_mlp_out = gpt2_block.mlp(x)
my_mlp_out = my_block.mlp_block(x)

print("MLP")
for label, value in (("GPT2 MLP", gpt2_mlp_out), ("My MLP", my_mlp_out)):
    print(label)
    print(value)

# Looser tolerance than the other components: the two MLPs use different
# GeLU variants (examined in the next sections).
print(t.allclose(my_mlp_out, gpt2_mlp_out, atol=1e-2))

MLP
GPT2 MLP
tensor([[[ 17.9010,  13.8917,  14.8560,  ...,  12.2650,  28.4097,  11.8572],
         [ -0.8464,  -1.3232,   0.1836,  ...,  12.6661,   2.7973,  12.5186],
         [ -5.7418, -10.0784, -12.0590,  ...,  17.7406, -12.0171,  19.6903],
         [ 15.4184,   2.8175, -12.2079,  ...,  13.9143,  -8.7852,   9.1022],
         [  3.1879, -10.1602,   5.9635,  ...,   4.8981,   6.2863,   6.4367]]],
       grad_fn=<ViewBackward0>)
My MLP
tensor([[[ 17.9005,  13.8931,  14.8554,  ...,  12.2647,  28.4102,  11.8568],
         [ -0.8461,  -1.3219,   0.1840,  ...,  12.6645,   2.7964,  12.5188],
         [ -5.7424, -10.0783, -12.0588,  ...,  17.7390, -12.0182,  19.6907],
         [ 15.4166,   2.8183, -12.2077,  ...,  13.9135,  -8.7857,   9.1014],
         [  3.1873, -10.1606,   5.9645,  ...,   4.8993,   6.2862,   6.4379]]],
       grad_fn=<AsStridedBackward0>)
True


#### Comparing GeLU

In [14]:
from src.activations import gelu
from torch.nn.functional import gelu as torch_gelu

print("GeLU")
# NOTE: this rebinds the notebook-level `x` to a 1-D tensor of 10 samples.
x = t.randn((10,))

# Apply the three GeLU implementations to the same input, in display order.
for label, activation in (
    ("GPT2 GeLU", gpt2_block.mlp.act),
    ("My GeLU", gelu),
    ("Torch GeLU", torch_gelu),
):
    print(label)
    print(activation(x))


GeLU
GPT2 GeLU
tensor([-0.1350, -0.0685, -0.1540, -0.1424,  0.0410, -0.1226,  0.2493, -0.1480,
        -0.1072,  0.6838])
My GeLU
tensor([-0.1350, -0.0685, -0.1539, -0.1422,  0.0410, -0.1226,  0.2493, -0.1480,
        -0.1072,  0.6839])
Torch GeLU
tensor([-0.1350, -0.0685, -0.1539, -0.1422,  0.0410, -0.1226,  0.2493, -0.1480,
        -0.1072,  0.6839])


#### Comparing MLP using GPT2 GeLU

In [22]:
print("My MLP with their GELU")
# Fresh random input; this rebinds the notebook-level `x`.
x = t.randn((1, 10, 768))

print("GPT2 MLP Block")
gpt2_mlp = gpt2_block.mlp(x)
print(gpt2_mlp)

print("My MLP Block with GPT2 GeLU")
# Rebuild my MLP by hand, swapping in GPT-2's activation between my two linear layers.
hidden = my_block.mlp_block.linear1(x)
activated = gpt2_block.mlp.act(hidden)
my_mlp = my_block.mlp_block.linear2(activated)
print(my_mlp)

# With the activation swapped, the comparison uses a much tighter tolerance
# than the plain MLP comparison.
print(t.allclose(gpt2_mlp, my_mlp, atol=1e-4))

print("The main difference between our models is in the MLP block, because OpenAI used a slightly different version of GeLU.")

My MLP with their GELU
GPT2 MLP Block
tensor([[[ -2.9164,  -0.0618,  15.8952,  ...,   4.7298,  20.2631,   9.4219],
         [  5.4619, -13.5041,  -4.1717,  ...,   8.0976,  12.3699,   6.2095],
         [ -5.8160,  -0.1967, -12.2969,  ..., -16.4890,  -2.2318,  18.7375],
         ...,
         [ -7.1169,   2.0982,  -2.5297,  ...,   0.4998,  18.7223, -13.8197],
         [ 27.7963, -26.1246, -13.4007,  ...,  15.4092,  15.5402,   7.5306],
         [ -9.6909,  -1.7319,  -8.2100,  ...,   8.1588,   8.5199,  34.5271]]],
       grad_fn=<ViewBackward0>)
My MLP Block with GPT2 GeLU
tensor([[[ -2.9164,  -0.0618,  15.8952,  ...,   4.7298,  20.2631,   9.4219],
         [  5.4619, -13.5041,  -4.1717,  ...,   8.0976,  12.3699,   6.2095],
         [ -5.8160,  -0.1967, -12.2969,  ..., -16.4890,  -2.2318,  18.7375],
         ...,
         [ -7.1169,   2.0982,  -2.5297,  ...,   0.4998,  18.7223, -13.8197],
         [ 27.7963, -26.1246, -13.4007,  ...,  15.4092,  15.5402,   7.5306],
         [ -9.6909,  -1.7

## Comparing GPT2 embedding with my implementation of the GPT2 embedding

In [26]:
from src.models import GPT2SmallModel
from transformers import GPT2Tokenizer, GPT2Model

my_model = GPT2SmallModel()

# Reload the reference tokenizer and model so this section is self-contained.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

# Reuse GPT-2's token and position embedding tables in my embedding layer.
my_embedding = my_model.embedding_layer
my_embedding.position_embedding_layer.weight = model.wpe.weight
my_embedding.token_embedding_layer.weight = model.wte.weight

text = "This is a test sentence."
# Wrap the id list in an outer list to get a batch dimension of 1.
tokens = t.tensor([tokenizer(text)['input_ids']])
tokens

tensor([[1212,  318,  257, 1332, 6827,   13]])

In [30]:
# GPT-2 reference: token embedding plus the position embedding for positions
# 0..len-1, summed by hand. My embedding layer is called on the tokens directly.
position_ids = t.arange(tokens.shape[-1])
gpt2_embedding_out = model.wte(tokens) + model.wpe(position_ids)
my_embedding_out = my_model.embedding_layer(tokens)

print("Embedding outputs")
print("GPT2 embedding output")
print(gpt2_embedding_out)
print("My embedding output")
print(my_embedding_out)

Embedding outputs
GPT2 embedding output
tensor([[[ 0.0065, -0.2930,  0.0762,  ...,  0.0184, -0.0275,  0.1638],
         [ 0.0142, -0.0437, -0.0393,  ...,  0.1487, -0.0278, -0.0255],
         [-0.0464, -0.0791,  0.1016,  ...,  0.0623,  0.0928, -0.0598],
         [-0.0580,  0.0095,  0.2207,  ..., -0.0635,  0.0760, -0.0543],
         [-0.0888, -0.0326,  0.1666,  ..., -0.2539, -0.0370, -0.2046],
         [ 0.0562, -0.0452,  0.1596,  ..., -0.0676,  0.0567,  0.0888]]],
       grad_fn=<AddBackward0>)
My embedding output
tensor([[[ 0.0065, -0.2930,  0.0762,  ...,  0.0184, -0.0275,  0.1638],
         [ 0.0142, -0.0437, -0.0393,  ...,  0.1487, -0.0278, -0.0255],
         [-0.0464, -0.0791,  0.1016,  ...,  0.0623,  0.0928, -0.0598],
         [-0.0580,  0.0095,  0.2207,  ..., -0.0635,  0.0760, -0.0543],
         [-0.0888, -0.0326,  0.1666,  ..., -0.2539, -0.0370, -0.2046],
         [ 0.0562, -0.0452,  0.1596,  ..., -0.0676,  0.0567,  0.0888]]],
       grad_fn=<AddBackward0>)


## Comparing GPT2 model vs. my implementation of the GPT2 model

In [5]:
from src.models import GPT2SmallModel
# Importing GPT2LMHeadModel instead of GPT2Model because it applies the final unembed to get logits
from transformers import AutoTokenizer, GPT2LMHeadModel

# Fresh instance of my model, switched to evaluation mode.
my_model = GPT2SmallModel()
my_model.eval()

# Reference model and tokenizer, also in evaluation mode.
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
gpt2_model.eval()

text = "This is a test sentence."
# Wrap the id list in an outer list to get a batch dimension of 1.
tokens = t.tensor([tokenizer(text)['input_ids']])
tokens

tensor([[1212,  318,  257, 1332, 6827,   13]])

In [6]:
def copy_gpt2_block_weights(dst_block, src_block):
    """Copy the weights of one Hugging Face GPT-2 block into one of my blocks.

    Args:
        dst_block: my transformer block; must expose ``norm_layer1``,
            ``norm_layer2``, ``mlp_block.linear1/linear2`` and
            ``mha_block.linear_q/k/v/o``.
        src_block: the GPT-2 block; must expose ``ln_1``, ``ln_2``,
            ``mlp.c_fc/c_proj`` and ``attn.c_attn/c_proj``.

    Layer-norm parameters and biases are reused as-is. Weight matrices are
    transposed before assignment, and the fused ``c_attn`` weight and bias
    are split into three equal chunks for q, k and v.
    """
    dst_block.norm_layer1.weight, dst_block.norm_layer1.bias = src_block.ln_1.weight, src_block.ln_1.bias
    dst_block.norm_layer2.weight, dst_block.norm_layer2.bias = src_block.ln_2.weight, src_block.ln_2.bias

    dst_block.mlp_block.linear1.weight, dst_block.mlp_block.linear1.bias = t.nn.Parameter(src_block.mlp.c_fc.weight.T), src_block.mlp.c_fc.bias
    dst_block.mlp_block.linear2.weight, dst_block.mlp_block.linear2.bias = t.nn.Parameter(src_block.mlp.c_proj.weight.T), src_block.mlp.c_proj.bias

    (wq,wk,wv) = t.chunk(src_block.attn.c_attn.weight.T, 3, dim=0)
    (bq,bk,bv) = t.chunk(src_block.attn.c_attn.bias, 3, dim=0)

    dst_block.mha_block.linear_q.weight, dst_block.mha_block.linear_q.bias = t.nn.Parameter(wq), t.nn.Parameter(bq)
    dst_block.mha_block.linear_k.weight, dst_block.mha_block.linear_k.bias = t.nn.Parameter(wk), t.nn.Parameter(bk)
    dst_block.mha_block.linear_v.weight, dst_block.mha_block.linear_v.bias = t.nn.Parameter(wv), t.nn.Parameter(bv)
    dst_block.mha_block.linear_o.weight, dst_block.mha_block.linear_o.bias = t.nn.Parameter(src_block.attn.c_proj.weight.T), src_block.attn.c_proj.bias


# Copying over weights
my_model.embedding_layer.position_embedding_layer.weight = gpt2_model.transformer.wpe.weight
my_model.embedding_layer.token_embedding_layer.weight = gpt2_model.transformer.wte.weight

my_model.final_ln.weight, my_model.final_ln.bias = gpt2_model.transformer.ln_f.weight, gpt2_model.transformer.ln_f.bias

my_model_block_list = my_model.transformer_blocks
gpt2_model_block_list = gpt2_model.transformer.h

# zip() stops at the shorter sequence, so a mismatch in depth would silently
# leave some of my blocks with their initial weights. Fail loudly instead.
assert len(my_model_block_list) == len(gpt2_model_block_list), "block count mismatch between my model and GPT-2"

# Loop names differ from the notebook-level `my_block` / `gpt2_block` so the
# single-block comparison cells above keep working if re-run after this cell.
for my_layer, gpt2_layer in zip(my_model_block_list, gpt2_model_block_list):
    copy_gpt2_block_weights(my_layer, gpt2_layer)


### Logit outputs

In [89]:
print("Model logits")

# Feed both models the exact same `tokens` tensor. Re-tokenizing `text` for
# only one of them would compare different inputs whenever `text` has been
# rebound by another cell. Only forward passes are needed, so autograd is off.
with t.no_grad():
    gpt2_logits = gpt2_model(input_ids=tokens).logits
    my_logits = my_model(tokens)

print("GPT2 model logits")
print(gpt2_logits)
print("My model logits")
print(my_logits)

# Loose tolerance: small per-block differences accumulate through the model.
print(t.allclose(gpt2_logits, my_logits, atol=1e-1))

Model logits
GPT2 model logits
tensor([[[ -35.8890,  -35.2049,  -39.1336,  ...,  -42.4869,  -41.8197,
           -36.0383],
         [-107.7291, -108.0175, -113.2967,  ..., -116.4645, -115.7443,
          -110.8654],
         [-111.7507, -111.5704, -114.5443,  ..., -120.7242, -117.1756,
          -112.3996],
         [ -86.1846,  -88.5057,  -94.3530,  ..., -101.3573,  -98.6974,
           -91.1616],
         [-106.4531, -108.7300, -115.4155,  ..., -119.6631, -119.1774,
          -110.7877],
         [-146.7139, -145.9828, -146.9487,  ..., -155.2113, -158.0557,
          -139.4035]]], grad_fn=<UnsafeViewBackward0>)
My model logits
tensor([[[ -35.8260,  -35.1460,  -39.0735,  ...,  -42.4222,  -41.7547,
           -35.9785],
         [-107.7155, -108.0055, -113.2770,  ..., -116.4519, -115.7324,
          -110.8482],
         [-111.7447, -111.5662, -114.5354,  ..., -120.7213, -117.1712,
          -112.3930],
         [ -86.1794,  -88.5035,  -94.3420,  ..., -101.3490,  -98.6924,
           -

### Argmax sampling

In [42]:
text = "The Large Apple, by Roald Dahl. \nLate at night,"
# GPT-2 keeps the full tokenizer output (ids + attention mask); my model only
# needs the ids, taken here as an independent copy of the same tensor.
tokens_gpt2 = tokenizer(text, return_tensors='pt')
tokens_curr = tokens_gpt2['input_ids'].clone()

tokens_gpt2

{'input_ids': tensor([[  464, 13601,  4196,    11,   416,  5564,  1940, 41471,    13,   220,
           198, 26302,   379,  1755,    11]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [43]:
# Number of tokens to generate greedily with each model.
NUM_NEW_TOKENS = 10

# Pure inference: disable autograd so no graph is built on each forward pass.
# NOTE: re-running this cell appends another NUM_NEW_TOKENS tokens to both
# sequences; re-run the prompt cell above first to start over.
with t.no_grad():
    for _ in range(NUM_NEW_TOKENS):
        # Greedy step: take the highest-scoring token at the last position.
        gpt2_next_token = gpt2_model(**tokens_gpt2).logits[0][-1].argmax()
        my_model_next_token = my_model(tokens_curr)[0][-1].argmax()

        # Extend GPT-2's ids and attention mask by one position, and my ids likewise.
        tokens_gpt2['input_ids'] = t.cat([tokens_gpt2['input_ids'], gpt2_next_token.reshape((1,1))], dim=1)
        tokens_gpt2['attention_mask'] = t.cat([tokens_gpt2['attention_mask'], t.tensor(1).reshape((1,1))], dim=1)
        tokens_curr = t.cat([tokens_curr, my_model_next_token.reshape((1,1))], dim=1)

In [47]:
# Decode with the tokenizer itself rather than joining raw BPE tokens and
# patching 'Ġ'/'Ċ' by hand: the manual version only undoes two of the
# byte-level substitutions and would garble any other encoded character.
# clean_up_tokenization_spaces=False keeps the text exactly as generated.
print("GPT2 argmax output")
print(tokenizer.decode(tokens_gpt2['input_ids'][0], clean_up_tokenization_spaces=False))
print()
print("My implementation output")
print(tokenizer.decode(tokens_curr[0], clean_up_tokenization_spaces=False))

GPT2 argmax output
The Large Apple, by Roald Dahl. 
Late at night, I was sitting in the living room of my home

My implementation output
The Large Apple, by Roald Dahl. 
Late at night, I was sitting in the living room of my home
