In [1]:
import sys 
from pathlib import Path
import os 

In [2]:
parent_folder = Path(os.getcwd()).resolve().parent
sys.path.insert(0, str(parent_folder))
print('parent_folder ', parent_folder)

parent_folder  /home/jin/gpt2_from_scratch


In [3]:
from gpt_model import GPT2
from gpt2_config.config import  GPT2Config

parent_folder  /home/jin/gpt2_from_scratch/gpt2


In [4]:
config = GPT2Config()
model = GPT2(cfg=config)

In [5]:
model

GPT2(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_query): Linear(in_features=768

<h3> GPT2 Model Architecture </h3>
<table>
  <thead>
    <tr>
      <th>Component</th>
      <th>Parameters</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Token embedding</td>
      <td>vocab_size=50257, embedding_dim=768</td>
    </tr>
    <tr>
      <td>Pos embedding</td>
      <td>context_length=1024, embedding_dim=768</td>
    </tr>
    <tr>
      <td>Dropout</td>
      <td>0.1</td>
    </tr>
    <tr>
      <td>TransformerBlock</td>
      <td>12 blocks</td>
    </tr>
    <tr>
      <td>- MultiHeadAttention</td>
      <td>(included in TransformerBlock)</td>
    </tr>
    <tr>
      <td>- FeedForward</td>
      <td>(included in TransformerBlock)</td>
    </tr>
    <tr>
      <td>- LayerNormalization</td>
      <td>2 per block</td>
    </tr>
    <tr>
      <td>FinalLayerNormalization</td>
      <td>-</td>
    </tr>
    <tr>
      <td>Out_head</td>
      <td>Linear(embedding_dim=768, vocab_size=50257)</td>
    </tr>
  </tbody>
</table>

In [6]:
model.out_head.weight.shape

torch.Size([50257, 768])

In [7]:
import torch
import tiktoken

In [8]:
tokenizer = tiktoken.get_encoding('gpt2')

In [9]:
text = "hello, how are you?"
encoded = torch.tensor(tokenizer.encode(text)).unsqueeze(dim=0)

In [10]:
encoded.shape

torch.Size([1, 6])

In [11]:
logits = model(encoded)
logits.shape

torch.Size([1, 6, 50257])

In [12]:
logits

tensor([[[-0.1982,  0.7917, -0.9118,  ...,  0.1907,  0.4489, -0.1582],
         [ 0.4839,  1.1872, -0.6211,  ..., -0.3756,  0.4446, -0.0110],
         [ 0.6909, -0.5079, -0.7169,  ...,  0.9889, -1.4440,  0.4302],
         [-0.0533, -0.6521,  0.5397,  ...,  0.8597, -0.9464,  0.4759],
         [ 0.4242,  1.3716,  0.2615,  ...,  0.0305, -0.0355, -1.2658],
         [-0.2974,  0.4777, -0.7035,  ..., -0.3165,  0.2918,  0.1201]]],
       grad_fn=<UnsafeViewBackward0>)

In [13]:
# convert logits into probabilities 
prob = torch.softmax(logits, dim=-1)
prob

tensor([[[1.3819e-05, 3.7187e-05, 6.7700e-06,  ..., 2.0388e-05,
          2.6394e-05, 1.4383e-05],
         [2.7233e-05, 5.5022e-05, 9.0198e-06,  ..., 1.1529e-05,
          2.6184e-05, 1.6602e-05],
         [3.3615e-05, 1.0137e-05, 8.2250e-06,  ..., 4.5282e-05,
          3.9750e-06, 2.5900e-05],
         [1.5969e-05, 8.7753e-06, 2.8897e-05,  ..., 3.9793e-05,
          6.5378e-06, 2.7110e-05],
         [2.5774e-05, 6.6476e-05, 2.1904e-05,  ..., 1.7386e-05,
          1.6277e-05, 4.7560e-06],
         [1.2457e-05, 2.7042e-05, 8.2994e-06,  ..., 1.2221e-05,
          2.2454e-05, 1.8912e-05]]], grad_fn=<SoftmaxBackward0>)

In [14]:
# get the last tokens 
next_token_logit = prob[:, -1, :] # means predict the token after last input token in my case was '?' 
next_token_logit.shape

torch.Size([1, 50257])

In [29]:
next_token_logit

tensor([[1.2457e-05, 2.7042e-05, 8.2994e-06,  ..., 1.2221e-05, 2.2454e-05,
         1.8912e-05]], grad_fn=<SelectBackward0>)

In [15]:
next_token_id = torch.argmax(next_token_logit, dim=-1)
next_token_id

tensor([11657])

In [16]:
tokenizer.decode(next_token_id.tolist())

' tanks'

In [17]:
from generation.generate_text import generate_text

In [18]:
idx = generate_text(model, encoded, max_new_tokens=10, context_size=6)

In [19]:
idx, idx.shape

(tensor([[31373,    11,   703,   389,   345,    30, 33117, 30218, 17810, 24696,
           1759, 35438,  8612, 47448, 12542,  5324]]),
 torch.Size([1, 16]))

In [20]:
encoded

tensor([[31373,    11,   703,   389,   345,    30]])

In [21]:
tokenizer.decode(idx.squeeze().tolist())

'hello, how are you? Clash cf horizon statisticplay Dmitgener618 throwsxx'

In [22]:
idx2 = generate_text(model, encoded, max_new_tokens=14, context_size=6)

In [23]:
tokenizer.decode(idx2.squeeze().tolist())

'hello, how are you?fordicycle Shortly attributable Layer SUPER Wadoha inflammation Adapticycle Shortly genetically Aden'

In [24]:
idx2.shape

torch.Size([1, 20])

<h3> Conclusion </h3>

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>Metric</th>
      <th>Value</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Initial tokens</td>
      <td>6</td>
    </tr>
    <tr>
      <td>Max new tokens</td>
      <td>14</td>
    </tr>
    <tr>
      <td>Generated total tokens</td>
      <td>20</td>
    </tr>
  </tbody>
</table>
<br>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>Reason for Random Output</th>
      <th>Explanation</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Model is untrained</td>
      <td>No learning has occurred</td>
    </tr>
    <tr>
      <td>Weighs are random</td>
      <td>Parameters not optimized</td>
    </tr>
    <tr>
      <td>Result</td>
      <td>Model is basically guessing</td>
    </tr>
  </tbody>
</table>

In [25]:
logits.shape

torch.Size([1, 6, 50257])

In [26]:
logits_flat = logits.flatten(0,1)
logits_flat.shape

torch.Size([6, 50257])

In [27]:
# targets 
target_text = "i really like chocolate and you"
target = torch.tensor(tokenizer.encode(target_text)).unsqueeze(dim=0)
target_flat = target.flatten(0, 1)
target.shape, target_flat.shape

(torch.Size([1, 6]), torch.Size([6]))

In [28]:
torch.nn.functional.cross_entropy(logits_flat, target_flat)

tensor(11.3242, grad_fn=<NllLossBackward0>)