In [1]:
import sys 
from pathlib import Path
import os 

In [2]:
# Make the directory one level above the notebook's working directory
# importable, so the project modules imported in the next cell resolve.
parent_folder = Path.cwd().resolve().parent
parent_folder_str = str(parent_folder)

# Keep the cell idempotent: drop any copy left by an earlier run before
# putting the path at the front, so re-running never piles up duplicates.
sys.path[:] = [entry for entry in sys.path if entry != parent_folder_str]
sys.path.insert(0, parent_folder_str)
print('parent_folder ', parent_folder)

parent_folder  /home/jin/gpt2_from_scratch


In [3]:
from gpt_model import GPT2
from gpt2_config.config import  GPT2Config

parent_folder  /home/jin/gpt2_from_scratch/gpt2


In [4]:
# Build the model from a GPT2Config created with no overrides (project defaults).
config = GPT2Config()
# The configuration object is handed to the model through its `cfg` keyword.
model = GPT2(cfg=config)

In [5]:
# Show the model's repr (the nested module tree) as the cell output.
model

GPT2(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_query): Linear(in_features=768

<h3> GPT2 Model Architecture </h3>
<table>
  <thead>
    <tr>
      <th>Component</th>
      <th>Parameters</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Token embedding</td>
      <td>vocab_size=50257, embedding_dim=768</td>
    </tr>
    <tr>
      <td>Positional embedding</td>
      <td>context_length=1024, embedding_dim=768</td>
    </tr>
    <tr>
      <td>Embedding dropout</td>
      <td>p=0.1</td>
    </tr>
    <tr>
      <td>TransformerBlock</td>
      <td>12 blocks</td>
    </tr>
    <tr>
      <td>- MultiHeadAttention</td>
      <td>(included in TransformerBlock)</td>
    </tr>
    <tr>
      <td>- FeedForward</td>
      <td>(included in TransformerBlock)</td>
    </tr>
    <tr>
      <td>- LayerNormalization</td>
      <td>2 per block</td>
    </tr>
    <tr>
      <td>FinalLayerNormalization</td>
      <td>-</td>
    </tr>
    <tr>
      <td>Out_head</td>
      <td>Linear(embedding_dim=768, vocab_size=50257)</td>
    </tr>
  </tbody>
</table>

In [6]:
# Inspect the shape of the output head's weight matrix
# (recorded output: 50257 x 768, i.e. vocab_size x embedding_dim).
model.out_head.weight.shape

torch.Size([50257, 768])

In [7]:
import torch
import tiktoken

In [8]:
# Load the 'gpt2' encoding from tiktoken; used below to turn text into token ids.
tokenizer = tiktoken.get_encoding('gpt2')

In [9]:
# Sample prompt used to smoke-test the model's forward pass.
text = "hello, how are you?"
# Wrapping the token-id list in an outer list yields the leading batch
# dimension directly, giving a tensor of shape (1, num_tokens).
encoded = torch.tensor([tokenizer.encode(text)])

In [10]:
# Sanity check: a leading batch dimension of 1 followed by the number of tokens.
encoded.shape

torch.Size([1, 6])

In [13]:
# Switch to inference mode so the Dropout layers are disabled and the
# logits are deterministic for a given input.
# NOTE: the model stays in eval mode afterwards; call model.train() again
# before any training code.
model.eval()

# No gradients are needed for a shape check, so skip building the autograd graph.
with torch.no_grad():
    logits = model(encoded)

# Expected layout: (batch, num_tokens, vocab_size).
logits.shape

torch.Size([1, 6, 50257])