In [1]:
from itertools import chain
import torch
from shared import (
    gpt,
    show_token_mapping,
    tokenizer,
    demo_embedding_table as embedding_table,
)

In [2]:
# Display the model's module tree (embeddings, transformer blocks, LM head).
gpt

OpenAIGPTLMHeadModel(
  (transformer): OpenAIGPTModel(
    (tokens_embed): Embedding(40478, 768)
    (positions_embed): Embedding(512, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (attn): Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (lm_head): Linear(in_features=768, out_features=40478, bias=False)
)

## Vocabulary

Neural networks operate on numbers, not language, so text must first be converted into integer token IDs.

### Tokens

In [3]:
# Tokenize a sample sentence; the trailing "12345" is included to see how a
# number gets tokenized.
# NOTE(review): `show_token_mapping` comes from `shared`; presumably it renders
# each token alongside its integer id -- confirm against the helper.
show_token_mapping(
    'token->id',
    tokenizer,
    data=(
        'Tokenizers convert text to integer IDs the model can understand, '
        'breaking words or subwords into consistent units. '
        'Numbers are weird: 12345'
    ),
)

In [4]:
# Reverse direction: feed three short runs of ids through the 'id->token' mode.
show_token_mapping(
    'id->token',
    tokenizer,
    data=chain(
        range(5),  # ids 0-4: the very start of the vocabulary
        range(3000, 3005),  # ids 3000-3004
        range(40473, 40478),  # ids 40473-40477: the last five of 40478 entries
    ),
)

In [5]:
# Print the tokenizer's vocabulary size next to the model's two embedding
# tables, one item per line, so their sizes can be compared directly.
print(
    tokenizer.vocab_size,
    gpt.transformer.tokens_embed,  # token-id embedding table
    gpt.transformer.positions_embed,  # position embedding table
    sep='\n',
)

40478
Embedding(40478, 768)
Embedding(512, 768)


### Embeddings

#### What are they?

`King - Man + Woman = Queen` (shout-out to Word2Vec)

![word2vec](assets/word2vec.png)

Embeddings are learned vector representations in which both magnitude and direction carry meaning.

In [6]:
# Pull out the learned vector for one token id: row 3001 of the
# (40478, 768) token-embedding weight matrix.
token_embedding = gpt.transformer.tokens_embed.weight[3001]

# Show the vector's shape first, then its raw values.
print(
    token_embedding.shape,
    token_embedding,
    sep='\n',
)

torch.Size([768])
tensor([-1.2091e-01,  5.1690e-02,  3.0707e-02, -9.3440e-03,  1.5310e-01,
         7.1530e-02,  3.4096e-03, -7.9538e-02, -4.6797e-02,  5.6622e-02,
        -7.5406e-02, -1.3514e-03,  6.3382e-02,  1.9436e-02,  5.1909e-02,
        -1.7602e-02,  1.5896e-03,  7.7307e-02,  4.2461e-02, -6.4744e-02,
         4.4580e-03,  5.1230e-02,  7.5680e-02,  9.0224e-03, -2.3326e-02,
         2.3724e-01, -4.4783e-02, -2.3886e-03,  4.9528e-04, -1.2116e-01,
        -1.1063e-01,  1.2074e-01,  3.3991e-03, -1.0959e-02, -1.3390e-02,
         5.2881e-02, -5.2810e-02, -1.1768e-01, -1.6433e-02,  2.1312e-02,
        -5.8561e-03, -9.5219e-03, -1.1006e-03,  3.8631e-02,  2.6234e-02,
         8.5123e-03,  2.5289e-02,  4.1628e-02,  1.0254e-01,  1.3533e-01,
        -6.6852e-02, -3.2288e-02,  1.5040e-01, -6.2251e-02,  4.4128e-02,
         7.9642e-03,  2.1254e-02,  9.3950e-03, -6.8488e-02,  1.6059e-02,
         3.8439e-02,  2.4447e-02, -1.2895e-01, -3.7708e-02, -5.3334e-02,
        -1.0326e-01,  2.6176e-02,

### H_0

![gpt1math](assets/h0math.png)

#### How does a matrix multiply get us the embeddings?

In [7]:
# Toy 5 x 10 table (imported from `shared` as `demo_embedding_table`): each row
# counts up from a different multiple of ten, so rows are easy to tell apart.
embedding_table

tensor([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
        [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
        [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
        [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]])

In [8]:
# Select the 10s, 30s and 50s rows of the table with a matrix multiply.
# Each row below is a one-hot vector: its single 1 marks which table row to
# pick, so the product copies that row out unchanged.
token_indicies = torch.tensor(
    [
        [1, 0, 0, 0, 0],  # table row 0 -> the 10s
        [0, 0, 1, 0, 0],  # table row 2 -> the 30s
        [0, 0, 0, 0, 1],  # table row 4 -> the 50s
    ]
)

# Shapes: (3, 5) @ (5, 10) -> (3, 10)
token_indicies @ embedding_table

tensor([[10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
        [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
        [50, 51, 52, 53, 54, 55, 56, 57, 58, 59]])