In this notebook, I validate my implementation of transformer components and GPT-2 small by:
- Comparing the behavior of a GPT-2 transformer block with my own implementation of a GPT-2 transformer block
- Comparing the behavior of the GPT-2 embedding components with my own implementation of the GPT-2 embedding components
- Comparing the behavior of the full GPT-2 model with my own implementation of the full GPT-2 model
- Sampling from my implementation of the full GPT-2 model

There are slight deviations between the outputs of my model and those of the original GPT-2 model. As shown below, this appears to be primarily because the original GPT-2 model uses a slightly different variant of GeLU (Hugging Face's `NewGELUActivation`, the tanh approximation) than both my implementation and PyTorch's default `gelu`.

In [4]:
import torch as t
import sys
import os
# Make the repository root importable so that the `src.*` modules used below resolve.
# os.path.abspath('') is the current working directory (assumed to be this notebook's folder).
notebook_path = os.path.abspath('')
# Normalise the path so sys.path gets a clean absolute entry instead of ".../..".
project_root = os.path.abspath(os.path.join(notebook_path, '..'))
# Guard the append so that re-running this cell does not accumulate duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

## Comparing GPT2 transformer block (and components) with my implementation of a GPT2 transformer block

In [102]:
from transformers import GPT2Tokenizer, GPT2Model
# Load the pretrained 'gpt2' checkpoint twice over: once as a tokenizer, once as the
# bare transformer (GPT2Model), in that order.
tokenizer, model = (
    loader.from_pretrained('gpt2') for loader in (GPT2Tokenizer, GPT2Model)
)

In [103]:
import torch as t
from src.blocks import TransformerDecoderBlock

# Reference: the first block of the pretrained GPT-2 model.
gpt2_block = model.h[0]
# My block, configured to match it (presumably d_model=768, 12 heads, 3072-wide MLP —
# confirm the positional argument order against src.blocks).
my_block = TransformerDecoderBlock(768, 12, 3072, 'mlpblock', 'gelu', 'layer_norm')

# --- Layer norms: the GPT-2 parameters are assigned directly (shared, not copied) ---
my_block.norm_layer1.weight = gpt2_block.ln_1.weight
my_block.norm_layer1.bias = gpt2_block.ln_1.bias
my_block.norm_layer2.weight = gpt2_block.ln_2.weight
my_block.norm_layer2.bias = gpt2_block.ln_2.bias

# --- MLP: GPT-2's Conv1D weights are transposed before being handed to my Linear layers ---
my_block.mlp_block.linear1.weight = t.nn.Parameter(gpt2_block.mlp.c_fc.weight.T)
my_block.mlp_block.linear1.bias = gpt2_block.mlp.c_fc.bias
my_block.mlp_block.linear2.weight = t.nn.Parameter(gpt2_block.mlp.c_proj.weight.T)
my_block.mlp_block.linear2.bias = gpt2_block.mlp.c_proj.bias

# --- Attention: GPT-2 fuses Q, K and V into one `c_attn` projection; split it in three ---
wq, wk, wv = gpt2_block.attn.c_attn.weight.T.chunk(3, dim=0)
bq, bk, bv = gpt2_block.attn.c_attn.bias.chunk(3, dim=0)

my_block.mha_block.linear_q.weight = t.nn.Parameter(wq)
my_block.mha_block.linear_q.bias = t.nn.Parameter(bq)
my_block.mha_block.linear_k.weight = t.nn.Parameter(wk)
my_block.mha_block.linear_k.bias = t.nn.Parameter(bk)
my_block.mha_block.linear_v.weight = t.nn.Parameter(wv)
my_block.mha_block.linear_v.bias = t.nn.Parameter(bv)
my_block.mha_block.linear_o.weight = t.nn.Parameter(gpt2_block.attn.c_proj.weight.T)
my_block.mha_block.linear_o.bias = gpt2_block.attn.c_proj.bias

# Switch both blocks to eval mode before comparing (GPT-2's block contains dropout layers).
my_block.eval()
gpt2_block.eval()

# Show both module trees, GPT-2 first.
print(gpt2_block, my_block, sep='\n')

GPT2Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2Attention(
    (c_attn): Conv1D()
    (c_proj): Conv1D()
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D()
    (c_proj): Conv1D()
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)
TransformerDecoderBlock(
  (norm_layer1): LayerNorm()
  (norm_layer2): LayerNorm()
  (mha_block): MultiheadAttentionBlock(
    (linear_q): Linear()
    (linear_k): Linear()
    (linear_v): Linear()
    (linear_o): Linear()
  )
  (mlp_block): MLPBlock(
    (linear1): Linear()
    (linear2): Linear()
  )
)


#### Comparing transformer blocks

In [104]:
# Random input of shape (1, 5, 768) fed to both blocks.
x = t.randn(1, 5, 768)

seq_len = x.size(-2)
# Additive causal mask: -inf strictly above the diagonal (column index > row index), 0 elsewhere.
# It is not used in this cell; the attention comparison further down passes it explicitly.
att_mask = t.where(t.arange(seq_len) > t.arange(seq_len)[:, None], -t.inf, 0)

gpt2_block.eval()
my_block.eval()

# The GPT-2 block returns a tuple; element 0 is the hidden-state output.
print("GPT2 Transformer block", gpt2_block(x)[0], sep="\n")
print("My transformer block", my_block(x), sep="\n")

GPT2 Transformer block
tensor([[[ 11.9301,  -5.1084,   4.8721,  ...,   3.5804,   6.0616,   2.0907],
         [ -2.5539,  -2.2660, -14.4894,  ...,  -3.6857,   2.9383,  13.2351],
         [  7.7575,   2.9038,   2.2469,  ...,   3.5669,   1.8584,  -2.6753],
         [ 21.1863,   5.4953,  -1.4163,  ...,  16.6159,  12.6175,   5.8836],
         [  8.3480,  10.5500,  -3.7784,  ...,   7.5318,   3.0071,   2.8194]]],
       grad_fn=<AddBackward0>)
My transformer block
tensor([[[ 11.9307,  -5.1094,   4.8729,  ...,   3.5806,   6.0613,   2.0899],
         [ -2.5542,  -2.2671, -14.4904,  ...,  -3.6855,   2.9391,  13.2340],
         [  7.7590,   2.9039,   2.2472,  ...,   3.5682,   1.8589,  -2.6756],
         [ 21.1843,   5.4948,  -1.4154,  ...,  16.6158,  12.6155,   5.8824],
         [  8.3466,  10.5499,  -3.7755,  ...,   7.5308,   3.0064,   2.8195]]],
       grad_fn=<AddBackward0>)


#### Comparing normalization

In [108]:
print("Normalization")

# First layer norm of each block, applied to the same input.
print("GPT2 Layer Norm 1", gpt2_block.ln_1(x), sep="\n")
print("My Layer Norm 1", my_block.norm_layer1(x), sep="\n")

# Second layer norm of each block, applied to the same input.
print("GPT2 Layer Norm 2", gpt2_block.ln_2(x), sep="\n")
print("My Layer Norm 2", my_block.norm_layer2(x), sep="\n")

# Numerical agreement of the two implementations, one boolean per layer norm.
print(
    t.allclose(my_block.norm_layer1(x), gpt2_block.ln_1(x), atol=1e-7),
    t.allclose(my_block.norm_layer2(x), gpt2_block.ln_2(x), atol=1e-7),
    sep="\n",
)


Normalization
GPT2 Layer Norm 1
tensor([[[ 0.0605,  0.1313,  0.2511,  ..., -0.0163, -0.2598, -0.0323],
         [ 0.0569, -0.3122,  0.2620,  ...,  0.0307, -0.0726, -0.3277],
         [ 0.0651,  0.0122, -0.0984,  ...,  0.2342, -0.0984, -0.0732],
         [-0.2922,  0.1620, -0.2315,  ...,  0.0681,  0.3746,  0.0105],
         [ 0.3607,  0.2928, -0.1951,  ...,  0.0104, -0.1599,  0.0040]]],
       grad_fn=<NativeLayerNormBackward0>)
My Layer Norm 1
tensor([[[ 0.0605,  0.1313,  0.2511,  ..., -0.0163, -0.2598, -0.0323],
         [ 0.0569, -0.3122,  0.2620,  ...,  0.0307, -0.0726, -0.3277],
         [ 0.0651,  0.0122, -0.0984,  ...,  0.2342, -0.0984, -0.0732],
         [-0.2922,  0.1620, -0.2315,  ...,  0.0681,  0.3746,  0.0105],
         [ 0.3607,  0.2928, -0.1951,  ...,  0.0104, -0.1599,  0.0040]]],
       grad_fn=<AddBackward0>)
GPT2 Layer Norm 2
tensor([[[ 0.0801,  0.1524,  0.4288,  ..., -0.0530, -1.7230, -0.0954],
         [ 0.0780, -0.3579,  0.4435,  ...,  0.2663, -0.4050, -1.7670],
    

#### Comparing attention

In [111]:
print("Multi-head attention", "GPT2 Attention", sep="\n")
# GPT-2's attention module returns a tuple; element 0 is the attention output.
print(gpt2_block.attn(x)[0])
# My attention block is given the causal mask explicitly.
print("My attention", my_block.mha_block(x, attention_mask=att_mask), sep="\n")


# Numerical agreement between the two attention outputs (absolute tolerance 1e-3).
print(
    t.allclose(
        my_block.mha_block(x, attention_mask=att_mask),
        gpt2_block.attn(x)[0],
        atol=1e-3,
    )
)

Multi-head attention
GPT2 Attention
tensor([[[ 18.9150, -10.0025,   5.9849,  ...,  -1.2226,   0.2665,  -0.5117],
         [-18.4654,  -4.1438,  -4.3223,  ...,  -1.0013,  -0.2782,  -0.0262],
         [ -9.5618,   1.7456,  10.6257,  ...,  -1.2053,   0.2052,  -0.1198],
         [ 12.0544,  -2.0354,   1.9754,  ...,   0.0529,   1.7595,  -0.7167],
         [  0.0726,   0.9221,   7.5586,  ...,  -0.5898,   0.7325,   0.6961]]],
       grad_fn=<ViewBackward0>)
My attention
tensor([[[ 18.9150, -10.0025,   5.9849,  ...,  -1.2226,   0.2665,  -0.5117],
         [-18.4654,  -4.1438,  -4.3223,  ...,  -1.0013,  -0.2782,  -0.0261],
         [ -9.5619,   1.7456,  10.6257,  ...,  -1.2053,   0.2052,  -0.1198],
         [ 12.0544,  -2.0354,   1.9754,  ...,   0.0529,   1.7595,  -0.7167],
         [  0.0726,   0.9221,   7.5586,  ...,  -0.5898,   0.7325,   0.6961]]],
       grad_fn=<AsStridedBackward0>)
True


#### Comparing MLP

In [112]:
print("MLP", "GPT2 MLP", sep="\n")
print(gpt2_block.mlp(x))
print("My MLP", my_block.mlp_block(x), sep="\n")

# Agreement is only checked to an absolute tolerance of 1e-2 here; the GeLU comparison
# that follows looks into where the residual difference comes from.
print(
    t.allclose(
        my_block.mlp_block(x),
        gpt2_block.mlp(x),
        atol=1e-2,
    )
)

MLP
GPT2 MLP
tensor([[[ 18.8685,  -6.8093,  -0.5403,  ...,   9.7797,   6.4821,   5.9411],
         [ -7.1819,   1.2927, -29.0145,  ...,  -2.1262,   7.9047,  32.0431],
         [ 14.4651,   3.1689,  -3.1675,  ...,  10.2508,   7.9537,  10.2513],
         [ 30.5291,  14.8931,  -5.8086,  ...,  38.0323,  22.8713,  18.3896],
         [ -8.8284,  14.1699,   8.9831,  ...,   5.9502,  -6.2154,   9.4560]]],
       grad_fn=<ViewBackward0>)
My MLP
tensor([[[ 18.8687,  -6.8084,  -0.5406,  ...,   9.7786,   6.4831,   5.9401],
         [ -7.1822,   1.2930, -29.0140,  ...,  -2.1249,   7.9042,  32.0436],
         [ 14.4647,   3.1703,  -3.1672,  ...,  10.2521,   7.9546,  10.2511],
         [ 30.5294,  14.8922,  -5.8092,  ...,  38.0312,  22.8707,  18.3888],
         [ -8.8288,  14.1707,   8.9832,  ...,   5.9493,  -6.2142,   9.4560]]],
       grad_fn=<AsStridedBackward0>)
True


#### Comparing GeLU

In [113]:
from src.activations import gelu
from torch.nn.functional import gelu as torch_gelu

print("GeLU")
# Use a dedicated name for this 1-D probe. Reassigning `x` here would clobber the
# (1, 5, 768) input that the block/norm/attention/MLP comparison cells (and `att_mask`)
# were built around, so those cells would break or silently change if re-run afterwards.
gelu_input = t.randn((10,))

# The activation module actually used inside GPT-2's MLP.
print("GPT2 GeLU")
print(gpt2_block.mlp.act(gelu_input))
# My own GeLU from src.activations.
print("My GeLU")
print(gelu(gelu_input))
# PyTorch's functional GeLU with default arguments.
print("Torch GeLU")
print(torch_gelu(gelu_input))


GeLU
GPT2 GeLU
tensor([ 0.4378,  0.7747,  0.0714, -0.1113,  0.3848,  0.6462,  0.0711, -0.1700,
        -0.1659, -0.1658])
My GeLU
tensor([ 0.4378,  0.7749,  0.0714, -0.1113,  0.3848,  0.6463,  0.0711, -0.1699,
        -0.1658, -0.1657])
Torch GeLU
tensor([ 0.4378,  0.7749,  0.0714, -0.1113,  0.3848,  0.6463,  0.0711, -0.1699,
        -0.1658, -0.1657])


#### Comparing MLP using GPT2 GeLU

In [114]:
print("My MLP with their GELU")
# Dedicated input name: overwriting `x` with a length-10 sequence would leave it
# inconsistent with the 5x5 `att_mask` built earlier, breaking the attention comparison
# cell if it is re-run after this one.
mlp_input = t.randn((1, 10, 768))

print("GPT2 MLP Block")
gpt2_mlp = gpt2_block.mlp(mlp_input)
print(gpt2_mlp)

# Same computation with my two linear layers, but with GPT-2's activation swapped in,
# to isolate the activation function as the source of the MLP discrepancy.
print("My MLP Block with GPT2 GeLU")
my_mlp = my_block.mlp_block.linear2(gpt2_block.mlp.act(my_block.mlp_block.linear1(mlp_input)))
print(my_mlp)

# With the activation matched, the outputs are compared at a much tighter tolerance (1e-4).
print(t.allclose(gpt2_mlp, my_mlp, atol=1e-4))

print("The main difference between our models is in the MLP block, because OpenAI used a slightly different version of GeLU.")

My MLP with their GELU
GPT2 MLP Block
tensor([[[ 23.0493, -16.3688, -22.0839,  ...,   0.1566,   4.3208,  -6.6208],
         [-17.7008,   2.2056,  11.8246,  ...,  17.5515,  13.1587,  28.4457],
         [-13.3058,   2.7366, -11.2752,  ...,   1.7804,  18.1479,  -6.7358],
         ...,
         [  6.6704,  -3.8881,   0.7262,  ...,  -3.4565,  10.2489,   0.4162],
         [  7.2146, -10.6892,  -4.7511,  ...,   0.7938,  11.3679,  -6.8258],
         [  2.4927,  -8.1006, -18.8348,  ...,   8.8312,  13.5325,  -9.2248]]],
       grad_fn=<ViewBackward0>)
My MLP Block with GPT2 GeLU
tensor([[[ 23.0493, -16.3688, -22.0839,  ...,   0.1566,   4.3208,  -6.6208],
         [-17.7008,   2.2056,  11.8246,  ...,  17.5515,  13.1587,  28.4457],
         [-13.3058,   2.7366, -11.2752,  ...,   1.7804,  18.1479,  -6.7358],
         ...,
         [  6.6704,  -3.8881,   0.7262,  ...,  -3.4565,  10.2489,   0.4162],
         [  7.2146, -10.6892,  -4.7511,  ...,   0.7938,  11.3679,  -6.8258],
         [  2.4927,  -8.1

## Comparing GPT2 embedding with my implementation of the GPT2 embedding

In [26]:
from src.models import GPT2SmallModel
from transformers import GPT2Tokenizer, GPT2Model

# Fresh instance of my model; this section only exercises its embedding layer.
my_model = GPT2SmallModel()

# Pretrained 'gpt2' tokenizer and bare transformer, loaded in that order.
tokenizer, model = (
    loader.from_pretrained('gpt2') for loader in (GPT2Tokenizer, GPT2Model)
)

# Hand GPT-2's token and position embedding tables to my embedding layer
# (direct assignment, so the parameters are shared rather than copied).
my_model.embedding_layer.token_embedding_layer.weight = model.wte.weight
my_model.embedding_layer.position_embedding_layer.weight = model.wpe.weight

text = "This is a test sentence."
# Wrap the id list in an outer list to get a leading batch dimension of 1.
tokens = t.tensor([tokenizer(text)['input_ids']])
tokens

tensor([[1212,  318,  257, 1332, 6827,   13]])

In [30]:
print("Embedding outputs", "GPT2 embedding output", sep="\n")
# Reference embedding: token embeddings plus position embeddings for positions 0..seq_len-1.
print(model.wte(tokens) + model.wpe(t.arange(tokens.size(-1))))
print("My embedding output", my_model.embedding_layer(tokens), sep="\n")

Embedding outputs
GPT2 embedding output
tensor([[[ 0.0065, -0.2930,  0.0762,  ...,  0.0184, -0.0275,  0.1638],
         [ 0.0142, -0.0437, -0.0393,  ...,  0.1487, -0.0278, -0.0255],
         [-0.0464, -0.0791,  0.1016,  ...,  0.0623,  0.0928, -0.0598],
         [-0.0580,  0.0095,  0.2207,  ..., -0.0635,  0.0760, -0.0543],
         [-0.0888, -0.0326,  0.1666,  ..., -0.2539, -0.0370, -0.2046],
         [ 0.0562, -0.0452,  0.1596,  ..., -0.0676,  0.0567,  0.0888]]],
       grad_fn=<AddBackward0>)
My embedding output
tensor([[[ 0.0065, -0.2930,  0.0762,  ...,  0.0184, -0.0275,  0.1638],
         [ 0.0142, -0.0437, -0.0393,  ...,  0.1487, -0.0278, -0.0255],
         [-0.0464, -0.0791,  0.1016,  ...,  0.0623,  0.0928, -0.0598],
         [-0.0580,  0.0095,  0.2207,  ..., -0.0635,  0.0760, -0.0543],
         [-0.0888, -0.0326,  0.1666,  ..., -0.2539, -0.0370, -0.2046],
         [ 0.0562, -0.0452,  0.1596,  ..., -0.0676,  0.0567,  0.0888]]],
       grad_fn=<AddBackward0>)


## Comparing GPT2 model vs. my implementation of the GPT2 model

In [5]:
from src.models import GPT2SmallModel
# GPT2LMHeadModel is imported rather than GPT2Model because it also applies the final
# unembedding, so its output contains logits.
from transformers import AutoTokenizer, GPT2LMHeadModel

# My implementation, put into eval mode for the comparison.
my_model = GPT2SmallModel()
my_model.eval()

# Reference tokenizer and model from the Hugging Face hub, the model also in eval mode.
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2").eval()

text = "This is a test sentence."
# Wrap the id list in an outer list to get a leading batch dimension of 1.
tokens = t.tensor([tokenizer(text)['input_ids']])
tokens

tensor([[1212,  318,  257, 1332, 6827,   13]])

In [6]:
# Copying over weights

def _copy_gpt2_block_weights(dst_block, src_block):
    """Load one Hugging Face GPT-2 block's parameters into one of my decoder blocks.

    Args:
        dst_block: my transformer block. Its `norm_layer1`/`norm_layer2`,
            `mlp_block.linear1`/`linear2` and `mha_block.linear_q`/`linear_k`/
            `linear_v`/`linear_o` weights and biases are overwritten.
        src_block: the GPT-2 block providing `ln_1`, `ln_2`, `mlp.c_fc`, `mlp.c_proj`,
            `attn.c_attn` and `attn.c_proj`.

    Layer-norm parameters and the MLP / output-projection biases are assigned directly
    (the same Parameter objects end up in both models). Projection weights are transposed
    and wrapped in new Parameters, and the fused `c_attn` projection is split into three
    equal chunks for the separate query, key and value layers.
    """
    dst_block.norm_layer1.weight, dst_block.norm_layer1.bias = src_block.ln_1.weight, src_block.ln_1.bias
    dst_block.norm_layer2.weight, dst_block.norm_layer2.bias = src_block.ln_2.weight, src_block.ln_2.bias

    dst_block.mlp_block.linear1.weight, dst_block.mlp_block.linear1.bias = t.nn.Parameter(src_block.mlp.c_fc.weight.T), src_block.mlp.c_fc.bias
    dst_block.mlp_block.linear2.weight, dst_block.mlp_block.linear2.bias = t.nn.Parameter(src_block.mlp.c_proj.weight.T), src_block.mlp.c_proj.bias

    # Fused QKV projection -> three separate projections (query, key, value order).
    (wq, wk, wv) = t.chunk(src_block.attn.c_attn.weight.T, 3, dim=0)
    (bq, bk, bv) = t.chunk(src_block.attn.c_attn.bias, 3, dim=0)

    dst_block.mha_block.linear_q.weight, dst_block.mha_block.linear_q.bias = t.nn.Parameter(wq), t.nn.Parameter(bq)
    dst_block.mha_block.linear_k.weight, dst_block.mha_block.linear_k.bias = t.nn.Parameter(wk), t.nn.Parameter(bk)
    dst_block.mha_block.linear_v.weight, dst_block.mha_block.linear_v.bias = t.nn.Parameter(wv), t.nn.Parameter(bv)
    dst_block.mha_block.linear_o.weight, dst_block.mha_block.linear_o.bias = t.nn.Parameter(src_block.attn.c_proj.weight.T), src_block.attn.c_proj.bias


# Embedding tables
my_model.embedding_layer.position_embedding_layer.weight = gpt2_model.transformer.wpe.weight
my_model.embedding_layer.token_embedding_layer.weight = gpt2_model.transformer.wte.weight

# Final layer norm
my_model.final_ln.weight, my_model.final_ln.bias = gpt2_model.transformer.ln_f.weight, gpt2_model.transformer.ln_f.bias

my_model_block_list = my_model.transformer_blocks
gpt2_model_block_list = gpt2_model.transformer.h

# zip() would silently stop at the shorter list and leave some blocks randomly initialised.
assert len(my_model_block_list) == len(gpt2_model_block_list), (
    f"block count mismatch: {len(my_model_block_list)} vs {len(gpt2_model_block_list)}"
)

# Distinct loop names so the single-block `my_block` / `gpt2_block` globals used by the
# earlier comparison cells are not overwritten by this loop.
for dst_block, src_block in zip(my_model_block_list, gpt2_model_block_list):
    _copy_gpt2_block_weights(dst_block, src_block)


### Logit outputs

In [89]:
# Run both models on the same sentence: the reference model on a fresh encoding of `text`,
# my model on the `tokens` tensor prepared in the setup cell.
gpt2_logits = gpt2_model(**tokenizer(text, return_tensors='pt')).logits
my_logits = my_model(tokens)

print("Model logits")
print("GPT2 model logits", gpt2_logits, sep="\n")
print("My model logits", my_logits, sep="\n")

# Agreement is checked with an absolute tolerance of 1e-1.
print(t.allclose(gpt2_logits, my_logits, atol=1e-1))

Model logits
GPT2 model logits
tensor([[[ -35.8890,  -35.2049,  -39.1336,  ...,  -42.4869,  -41.8197,
           -36.0383],
         [-107.7291, -108.0175, -113.2967,  ..., -116.4645, -115.7443,
          -110.8654],
         [-111.7507, -111.5704, -114.5443,  ..., -120.7242, -117.1756,
          -112.3996],
         [ -86.1846,  -88.5057,  -94.3530,  ..., -101.3573,  -98.6974,
           -91.1616],
         [-106.4531, -108.7300, -115.4155,  ..., -119.6631, -119.1774,
          -110.7877],
         [-146.7139, -145.9828, -146.9487,  ..., -155.2113, -158.0557,
          -139.4035]]], grad_fn=<UnsafeViewBackward0>)
My model logits
tensor([[[ -35.8260,  -35.1460,  -39.0735,  ...,  -42.4222,  -41.7547,
           -35.9785],
         [-107.7155, -108.0055, -113.2770,  ..., -116.4519, -115.7324,
          -110.8482],
         [-111.7447, -111.5662, -114.5354,  ..., -120.7213, -117.1712,
          -112.3930],
         [ -86.1794,  -88.5035,  -94.3420,  ..., -101.3490,  -98.6924,
           -

### Argmax sampling

In [42]:
text = "The Large Apple, by Roald Dahl. \nLate at night,"
# Full encoding (input ids + attention mask) for the Hugging Face model.
tokens_gpt2 = tokenizer(text, return_tensors='pt')
# Independent copy of the ids for my model, so the two sequences can grow separately.
tokens_curr = tokens_gpt2['input_ids'].clone()

tokens_gpt2

{'input_ids': tensor([[  464, 13601,  4196,    11,   416,  5564,  1940, 41471,    13,   220,
           198, 26302,   379,  1755,    11]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [43]:
# Greedy (argmax) decoding of 10 tokens with both models, each continuing its own sequence.
# Inference only, so disable autograd: otherwise every step builds a graph over the whole
# forward pass. The chosen tokens are unaffected.
with t.no_grad():
    for step in range(10):
        # Most likely next token according to the logits at the last position.
        gpt2_next_token = gpt2_model(**tokens_gpt2).logits[0][-1].argmax()
        my_model_next_token = my_model(tokens_curr)[0][-1].argmax()

        # Append to the Hugging Face inputs, extending the attention mask to match.
        tokens_gpt2['input_ids'] = t.cat([tokens_gpt2['input_ids'], gpt2_next_token.reshape((1,1))], dim=1)
        tokens_gpt2['attention_mask'] = t.cat([tokens_gpt2['attention_mask'], t.tensor(1).reshape((1,1))], dim=1)
        # Append to my model's running sequence.
        tokens_curr = t.cat([tokens_curr, my_model_next_token.reshape((1,1))], dim=1)

In [47]:
# Decode with the tokenizer itself rather than joining raw BPE tokens and patching up only
# the space ('Ġ') and newline ('Ċ') symbols: any other byte-level symbol (tabs, non-ASCII
# characters) would be printed garbled by the manual approach.
# clean_up_tokenization_spaces=False keeps the text exactly as generated.
print("GPT2 argmax output")
print(tokenizer.decode(tokens_gpt2['input_ids'][0], clean_up_tokenization_spaces=False))
print()
print("My implementation output")
print(tokenizer.decode(tokens_curr[0], clean_up_tokenization_spaces=False))

GPT2 argmax output
The Large Apple, by Roald Dahl. 
Late at night, I was sitting in the living room of my home

My implementation output
The Large Apple, by Roald Dahl. 
Late at night, I was sitting in the living room of my home


## Normal sampling from my version of the GPT-2 model

In [95]:
# Prompt source: https://openai.com/index/better-language-models/
text = "In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English."
# Only the input ids are kept, as a PyTorch tensor.
tokens = tokenizer(text, return_tensors='pt').input_ids

tokens

tensor([[  818,   257, 14702,  4917,    11, 11444,  5071,   257, 27638,   286,
         28000, 19942,  2877,   287,   257,  6569,    11,  4271, 31286,  1850,
         19272,    11,   287,   262,   843,   274, 21124,    13,  3412,   517,
          6452,   284,   262,  4837,   373,   262,  1109,   326,   262, 28000,
         19942,  5158,  2818,  3594,    13]])

In [96]:
# Sample 100 tokens from my model, one at a time, from the full softmax distribution.
# Inference only, so disable autograd: otherwise each of the 100 forward passes over the
# growing sequence builds an autograd graph.
# NOTE: no random seed is set in the visible cells, so the completion differs between runs.
with t.no_grad():
    for step in range(100):
        # Distribution over the vocabulary at the last position of the single sequence.
        next_token_probs = t.softmax(my_model(tokens)[0][-1], dim=0)
        my_model_next_token = t.multinomial(next_token_probs, 1)

        # Turn the sampled id into shape (1, 1) and append it to the running sequence.
        tokens = t.cat([tokens, my_model_next_token.unsqueeze(0)], dim=1)

In [88]:
print("GPT2 completion of unicorn prompt")
# Decode with the tokenizer instead of joining raw BPE tokens and replacing only 'Ġ'/'Ċ':
# freely sampled text can contain other byte-level symbols (e.g. non-ASCII characters),
# which the manual replacement would print garbled.
# clean_up_tokenization_spaces=False keeps the text exactly as generated.
print(tokenizer.decode(tokens[0], clean_up_tokenization_spaces=False))

GPT2 completion of unicorn prompt
In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English. The research suggests that intelligence lies at the heart of human culture for which there is many monuments of old pointers to wisdom or favourable things. "Good eyes for learning things to know", says Krishnamurti Mehta of Harvard's water and food sciences department, admiring the unusual sight and offered
