In [1]:
import sys 
from pathlib import Path
import os 

In [2]:
parent_folder = Path(os.getcwd()).resolve().parent
sys.path.insert(0, str(parent_folder))
print('parent_folder ', parent_folder)

parent_folder  /home/jin/gpt2_from_scratch


In [3]:
from gpt_model import GPT2
from gpt2_config.config import  GPT2Config

parent_folder  /home/jin/gpt2_from_scratch/gpt2


In [4]:
config = GPT2Config()
model = GPT2(cfg=config)

In [5]:
model

GPT2(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_query): Linear(in_features=768

<h3> GPT2 Model Architecture </h3>
<table>
  <thead>
    <tr>
      <th>Component</th>
      <th>Parameters</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Token embedding</td>
      <td>vocab_size=50257, embedding_dim=768</td>
    </tr>
    <tr>
      <td>Pos embedding</td>
      <td>context_length=1024, embedding_dim=768</td>
    </tr>
    <tr>
      <td>Dropout</td>
      <td>0.1</td>
    </tr>
    <tr>
      <td>TransformerBlock</td>
      <td>12 blocks</td>
    </tr>
    <tr>
      <td>- MultiHeadAttention</td>
      <td>(included in TransformerBlock)</td>
    </tr>
    <tr>
      <td>- FeedForward</td>
      <td>(included in TransformerBlock)</td>
    </tr>
    <tr>
      <td>- LayerNormalization</td>
      <td>2 per block</td>
    </tr>
    <tr>
      <td>FinalLayerNormalization</td>
      <td>-</td>
    </tr>
    <tr>
      <td>Out_head</td>
      <td>Linear(embedding_dim=768, vocab_size=50257)</td>
    </tr>
  </tbody>
</table>

In [6]:
model.out_head.weight.shape

torch.Size([50257, 768])

In [7]:
import torch
import tiktoken

In [8]:
tokenizer = tiktoken.get_encoding('gpt2')

In [9]:
text = "hello, how are you?"
encoded = torch.tensor(tokenizer.encode(text)).unsqueeze(dim=0)

In [10]:
encoded.shape

torch.Size([1, 6])

In [11]:
logits = model(encoded)
logits.shape

torch.Size([1, 6, 50257])

In [12]:
logits

tensor([[[ 0.3823, -0.5008, -0.3374,  ..., -0.1266,  0.7021,  0.3059],
         [ 0.0129, -0.2299,  0.9791,  ...,  1.0146,  0.2964, -0.4946],
         [ 0.0151, -0.1734,  0.1007,  ..., -0.0632, -0.2578,  1.0031],
         [-0.2649, -0.7542, -0.0272,  ..., -0.3435, -0.3111,  0.2039],
         [ 0.2473, -1.1344,  1.1647,  ..., -0.3361,  0.6915,  0.7573],
         [-0.0429, -0.5451, -0.3468,  ..., -0.3397,  0.6637, -0.2888]]],
       grad_fn=<UnsafeViewBackward0>)

In [13]:
# convert logits into probabilities 
prob = torch.softmax(logits, dim=-1)
prob

tensor([[[2.4721e-05, 1.0223e-05, 1.2037e-05,  ..., 1.4861e-05,
          3.4039e-05, 2.2903e-05],
         [1.7002e-05, 1.3337e-05, 4.4682e-05,  ..., 4.6293e-05,
          2.2575e-05, 1.0235e-05],
         [1.7121e-05, 1.4181e-05, 1.8652e-05,  ..., 1.5833e-05,
          1.3033e-05, 4.5990e-05],
         [1.2969e-05, 7.9499e-06, 1.6448e-05,  ..., 1.1988e-05,
          1.2383e-05, 2.0723e-05],
         [2.1635e-05, 5.4335e-06, 5.4147e-05,  ..., 1.2072e-05,
          3.3734e-05, 3.6026e-05],
         [1.6212e-05, 9.8118e-06, 1.1963e-05,  ..., 1.2049e-05,
          3.2863e-05, 1.2678e-05]]], grad_fn=<SoftmaxBackward0>)

In [14]:
# get the last tokens 
next_token_logit = prob[:, -1, :] # means predict the token after last input token in my case was '?' 
next_token_logit.shape

torch.Size([1, 50257])

In [15]:
next_token_id = torch.argmax(next_token_logit, dim=-1)
next_token_id

tensor([3082])

In [16]:
tokenizer.decode(next_token_id.tolist())

' Comp'

In [17]:
from generation.generate_text import generate_text

In [18]:
idx = generate_text(model, encoded, max_new_tokens=10, context_size=6)

In [22]:
idx, idx.shape

(tensor([[31373,    11,   703,   389,   345,    30,  3082, 35597, 12299,  4963,
          29490, 32931, 27350, 20016, 24357,  1739]]),
 torch.Size([1, 16]))

In [20]:
encoded

tensor([[31373,    11,   703,   389,   345,    30]])

In [21]:
tokenizer.decode(idx.squeeze().tolist())

'hello, how are you? Comp Riverside interviewed documents Segaworked Arri intervals poisedued'

In [23]:
idx2 = generate_text(model, encoded, max_new_tokens=14, context_size=6)

In [24]:
tokenizer.decode(idx2.squeeze().tolist())

'hello, how are you? reg Walsh dependence Lawson Jenkins documents Batman fiance deducted Dancing degrading deducted themselvescollect'

In [25]:
idx2.shape

torch.Size([1, 20])

<h3> Conclusion </h3>

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>Metric</th>
      <th>Value</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Initial tokens</td>
      <td>6</td>
    </tr>
    <tr>
      <td>Max new tokens</td>
      <td>14</td>
    </tr>
    <tr>
      <td>Generated total tokens</td>
      <td>20</td>
    </tr>
  </tbody>
</table>
<br>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>Reason for Random Output</th>
      <th>Explanation</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Model is untrained</td>
      <td>No learning has occurred</td>
    </tr>
    <tr>
      <td>Weighs are random</td>
      <td>Parameters not optimized</td>
    </tr>
    <tr>
      <td>Result</td>
      <td>Model is basically guessing</td>
    </tr>
  </tbody>
</table>