In [1]:
import sys 
from pathlib import Path
import os 

In [2]:
parent_folder = Path(os.getcwd()).resolve().parent
sys.path.insert(0, str(parent_folder))
print('parent_folder ', parent_folder)

parent_folder  /home/jin/gpt2_from_scratch


In [3]:
from gpt_model import GPT2
from gpt2_config.config import  GPT2Config

parent_folder  /home/jin/gpt2_from_scratch/gpt2


In [4]:
config = GPT2Config()
model = GPT2(cfg=config)

In [5]:
model

GPT2(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_query): Linear(in_features=768

<h3> GPT2 Model Architecture </h3>
<table>
  <thead>
    <tr>
      <th>Component</th>
      <th>Parameters</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Token embedding</td>
      <td>vocab_size=50257, embedding_dim=768</td>
    </tr>
    <tr>
      <td>Pos embedding</td>
      <td>context_length=1024, embedding_dim=768</td>
    </tr>
    <tr>
      <td>Dropout</td>
      <td>0.1</td>
    </tr>
    <tr>
      <td>TransformerBlock</td>
      <td>12 blocks</td>
    </tr>
    <tr>
      <td>- MultiHeadAttention</td>
      <td>(included in TransformerBlock)</td>
    </tr>
    <tr>
      <td>- FeedForward</td>
      <td>(included in TransformerBlock)</td>
    </tr>
    <tr>
      <td>- LayerNormalization</td>
      <td>2 per block</td>
    </tr>
    <tr>
      <td>FinalLayerNormalization</td>
      <td>-</td>
    </tr>
    <tr>
      <td>Out_head</td>
      <td>Linear(embedding_dim=768, vocab_size=50257)</td>
    </tr>
  </tbody>
</table>

In [6]:
model.out_head.weight.shape

torch.Size([50257, 768])

In [7]:
import torch
import tiktoken

In [8]:
tokenizer = tiktoken.get_encoding('gpt2')

In [9]:
text = "hello, how are you?"
encoded = torch.tensor(tokenizer.encode(text)).unsqueeze(dim=0)

In [10]:
encoded.shape

torch.Size([1, 6])

In [11]:
logits = model(encoded)
logits.shape

torch.Size([1, 6, 50257])

In [12]:
logits

tensor([[[-0.0646, -0.7776,  0.0545,  ..., -0.4889, -0.0471,  0.4249],
         [ 0.5286,  1.1223, -0.6070,  ..., -0.3712, -0.1476, -0.3167],
         [ 0.7777, -0.3216,  0.5469,  ..., -0.4493,  0.4628, -0.8421],
         [ 0.0277,  0.9318, -0.5817,  ...,  0.1806, -0.3647, -0.3848],
         [ 0.4158,  0.6409,  0.4704,  ..., -0.0282,  0.0652,  0.9970],
         [ 0.1843, -0.0281,  0.2847,  ..., -0.4168, -0.5428,  0.0769]]],
       grad_fn=<UnsafeViewBackward0>)

In [13]:
# convert logits into probabilities 
prob = torch.softmax(logits, dim=-1)
prob

tensor([[[1.5730e-05, 7.7099e-06, 1.7718e-05,  ..., 1.0291e-05,
          1.6006e-05, 2.5663e-05],
         [2.8604e-05, 5.1796e-05, 9.1890e-06,  ..., 1.1632e-05,
          1.4546e-05, 1.2284e-05],
         [3.6502e-05, 1.2159e-05, 2.8981e-05,  ..., 1.0702e-05,
          2.6643e-05, 7.2255e-06],
         [1.7303e-05, 4.2736e-05, 9.4081e-06,  ..., 2.0162e-05,
          1.1688e-05, 1.1455e-05],
         [2.5465e-05, 3.1896e-05, 2.6894e-05,  ..., 1.6336e-05,
          1.7935e-05, 4.5536e-05],
         [2.0205e-05, 1.6338e-05, 2.2337e-05,  ..., 1.1076e-05,
          9.7647e-06, 1.8147e-05]]], grad_fn=<SoftmaxBackward0>)

In [14]:
# get the last tokens 
next_token_logit = prob[:, -1, :] # means predict the token after last input token in my case was '?' 
next_token_logit.shape

torch.Size([1, 50257])

In [15]:
next_token_id = torch.argmax(next_token_logit, dim=-1)
next_token_id

tensor([44406])

In [16]:
tokenizer.decode(next_token_id.tolist())

' Bild'

In [17]:
from generation.generate_text import generate_text

In [18]:
idx = generate_text(model, encoded, max_new_tokens=10, context_size=6)

In [20]:
idx

tensor([[31373,    11,   703,   389,   345,    30, 42332]])

In [21]:
encoded

tensor([[31373,    11,   703,   389,   345,    30]])

In [25]:
tokenizer.decode(idx.squeeze().tolist())

'hello, how are you?Luckily'