Refer paper Language Models are Unsupervised Multitask Learners
https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf


GPT-2[124M] has 12 layers and $d_{model}$ = 768

It is a decoder only model.

Variables are named to follow the schema of Hugging Face Transformers code.  
The following variables should be exactly the same:

`transformer.wte.weight`  
`transformer.wpe.weight`  
`transformer.h.0.attn.c_attn.weight`  
`transformer.h.0.attn.c_proj.weight`  
`transformer.h.0.mlp.c_fc.weight`  
`transformer.h.0.mlp.c_proj.weight`  
`transformer.ln_f.weight`  
`lm_head.weight`


In [5]:
from transformers import GPT2LMHeadModel
from dataclasses import dataclass
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import tiktoken
import math

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [7]:
model_gpt2 = GPT2LMHeadModel.from_pretrained("gpt2") #124M
sd = model_gpt2.state_dict()
for k,v in sd.items():
  print(k, v.shape)

transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias torch.Size([2304])
transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
transformer.h.1.attn.c_proj.bias 

In [8]:
from transformers import pipeline
generator = pipeline('text-generation', model = 'gpt2')
generator("Hello, I'm a language model,", max_new_tokens=30, num_return_sequences=5)

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a language model, and my goal is to make it easier for people to learn.\n\nWhy not just start using it?\n\nWell, it is possible to"},
 {'generated_text': "Hello, I'm a language model, not a programming language. I'm a programming language that you write for your own purposes. And what happened is that I decided, because I was a"},
 {'generated_text': "Hello, I'm a language model, not a compiler. I'll be playing and I'll be running on Java. I'll be writing programs on Java that are written in a way that"},
 {'generated_text': "Hello, I'm a language model, I can't even describe it.\n\nBut I can say that the way we deal with the language is that a language is a universal language,"},
 {'generated_text': "Hello, I'm a language model, and this is where I'm going to make your code.\n\nI'm going to make sure that you run into some errors, since this is"}]

#GPT-2

In [9]:

class CausalSelfAttention(nn.Module):
  def __init__(self, config):
    super().__init__()
    # make sure hidden dim is a multiple of no. of heads
    assert config.n_embed % config.n_head == 0

    # a single linear layer to compute Q, K, V simultaneously
    self.c_attn=nn.Linear(config.n_embed, 3 * config.n_embed)

    # output projection
    self.c_proj = nn.Linear(config.n_embed, config.n_embed)
    self.c_proj.NANOGPT_SCALE_INIT = 1 # flag for weight initialization of c_proj, use std = 0.02/sqroot(num layers)

    self.n_head = config.n_head
    self.n_embed = config.n_embed

    # not really a bias, more of a mask, but following OpenAI naming convention
    self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                         .view(1, 1,config.block_size, config.block_size ))



  def forward(self, x):
    B, T, C = x.size()  # Batch size, sequence length, n_embed
    qkv= self.c_attn(x)
    q,k,v = qkv.split(self.n_embed, dim=2)
    k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

    # explanation : C = n_head * head_size
    # k.shape = (B, T, n_head, head_size)
    # k = k.transpose(1, 2)
    # Before transpose: (B, T, n_head, head_size)
    # After transpose:  (B, n_head, T, head_size)

    # similarly for q and v
    q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
    v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

    # Attention
    # att = (q @ k.transpose(-2,-1)) * (1.0 / math.sqrt(k.size[-1]))  # q : (B, n_head, T, head_size) @ K.T : (B, n_head, head_size, T) = (B, n_head, T, T)
    # att = att.masked_fill(self.bias[:, :, :T, :T] == 0.0, float('-inf'))
    # att = F.softmax(att, dim=-1)
    # y = att @ v # (B, nh, T, hs) x (B, nh, T, hs) -> (B, nh, T, hs), basically a weighted sum of values

    y = F.scaled_dot_product_attention(q, k, v, is_causal=True) # flash attention

    y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

    # output projection
    y = self.c_proj(y)

    return y





In [10]:
class MLP(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.c_fc = nn.Linear(config.n_embed, 4 * config.n_embed)         # ffn. increasing hidden dim size increases capacity of model to learn, 4*embed dim is just design choice
    self.gelu = nn.GELU(approximate='tanh')                           # activation
    self.c_proj = nn.Linear( 4 * config.n_embed, config.n_embed)      # projection

  def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x

In [11]:
class Block(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.ln_1 = nn.LayerNorm(config.n_embed)  # layer norm 1
    self.attn = CausalSelfAttention(config)   # causal attention
    self.ln_2 = nn.LayerNorm(config.n_embed)  # layer norm 2
    self.mlp = MLP(config)                    # fnn

  def forward(self, x):
    x = x + self.attn(self.ln_1(x))
    x = x + self.mlp(self.ln_2(x))
    return x

In [12]:
@dataclass
class GPTConfig:
  block_size : int = 1024    # max sequence length
  vocab_size : int = 50257   # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
  n_layer : int = 12
  n_head : int = 12
  n_embed : int = 768

class GPT(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.config=config

    self.transformer=nn.ModuleDict(dict(
        wte = nn.Embedding(config.vocab_size, config.n_embed),  # weights for token embeddings
        wpe = nn.Embedding(config.block_size, config.n_embed),  # weights for positional embeddings
        h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]), # block for each layer
        ln_f = nn.LayerNorm(config.n_embed),  # final layer normalisation
        ))
    self.lm_head = nn.Linear(config.n_embed, config.vocab_size,bias=False) # last second, linear layer

    # weight-sharing scheme
    self.transformer.wte.weight = self.lm_head.weight

    # initialize parameters
    self.apply(self._init_weights)

  def _init_weights(self, module):
    if isinstance(module, nn.Linear):
      std = 0.02
      if hasattr(module, 'NANOGPT_SCALE_INIT'):      # will be true only for output projection, `c_proj` layer
        std *= (2 * self.config.n_layer) ** -0.05    # scale std by 1/sqrt(no_of_layers) acc to GPT paper
        # we are doing 2 * no of layers bcoz every layer has 2 blocks that add to residual stream - attention and then mlp
        torch.nn.init.normal_(module.weight, mean=0.0, std = std) # inititalise weights according to gpt2 official code, i.e., mean 0, std 0.02 for weights
        # sqroot n thing is done to control the growth of activations in residual stream in forward pass as each residual stream adds its data so we scale down every contribution to residual stream
        torch.nn.init.zeros_(module.bias) # and normal initialisation for bias
    elif isinstance(module, nn.Embedding):
      torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)



  def forward(self, idx, targets=None):
    # idx (B, T) Batch size, B sequences, each of length T stacked up, T<=block_size
    B, T = idx.size()
    assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
    # forward the token and posisition embeddings
    pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T), arange iterate from 0 to T
    pos_emb = self.transformer.wpe(pos) # shape (T, n_embd) # identical for every single row (batch)
    tok_emb = self.transformer.wte(idx) # shape (B, T, n_embd)
    x = tok_emb + pos_emb # internal broadcasting
    # forward the blocks of transformer
    for block in self.transformer.h:
      x = block(x)
    # forward the final layernorm
    x = self.transformer.ln_f(x)
    # forward the final classifier
    logits=self.lm_head(x) # (B, T, vocab_size)
    loss=None
    if targets is not None:
      loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1)) # logits - (B*T, vocab_size)
    return logits, loss


  @classmethod
  def from_pretrained(cls, model_type):
      # Loads pretrained GPT-2 model weights from huggingface
      assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
      print("loading weights from pretrained gpt: %s" % model_type)

      # n_layer, n_head and n_embed are determined from model_type
      config_args = {
          'gpt2':         dict(n_layer=12, n_head=12, n_embed=768),  # 124M params
          'gpt2-medium':  dict(n_layer=24, n_head=16, n_embed=1024), # 350M params
          'gpt2-large':   dict(n_layer=36, n_head=20, n_embed=1280), # 774M params
          'gpt2-xl':      dict(n_layer=48, n_head=25, n_embed=1600), # 1558M params
      }[model_type]
      config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
      config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
      # create a from-scratch initialized minGPT model
      config = GPTConfig(**config_args)
      model = GPT(config)
      sd = model.state_dict()
      sd_keys = sd.keys()
      sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

      # init a huggingface/transformers model
      model_hf = GPT2LMHeadModel.from_pretrained(model_type)
      sd_hf = model_hf.state_dict()

      # copy while ensuring all of the parameters are aligned and match in names and shapes
      sd_keys_hf = sd_hf.keys()
      sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
      sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
      transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
      # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
      # this means that we have to transpose these weights when we import them
      assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
      for k in sd_keys_hf:
          if any(k.endswith(w) for w in transposed):
              # special treatment for the Conv1D weights we need to transpose
              assert sd_hf[k].shape[::-1] == sd[k].shape
              with torch.no_grad():
                  sd[k].copy_(sd_hf[k].t())
          else:
              # vanilla copy over the other parameters
              assert sd_hf[k].shape == sd[k].shape
              with torch.no_grad():
                  sd[k].copy_(sd_hf[k])

      return model

  def configure_optimizers(self, weight_decay, learning_rate, device_type):
      # start with all of the candidate parameters (that require grad)
      param_dict = {pn: p for pn, p in self.named_parameters()}
      param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
      # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
      # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
      decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
      nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
      optim_groups = [
          {'params': decay_params, 'weight_decay': weight_decay},
          {'params': nodecay_params, 'weight_decay': 0.0}
      ]
      num_decay_params = sum(p.numel() for p in decay_params)
      num_nodecay_params = sum(p.numel() for p in nodecay_params)
      if master_process:
          print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
          print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
      # Create AdamW optimizer and use the fused version if it is available
      fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
      use_fused = fused_available and device_type == "cuda"
      if master_process:
          print(f"using fused AdamW: {use_fused}")
      optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
      return optimizer



In [13]:
# model=GPT.from_pretrained("gpt2")
model = GPT(GPTConfig()) # random initialization

In [14]:
model.eval() #put model into eval mode when not training anything and just using the model
model.to(device)

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [15]:
num_return_sequences=5
max_length=30

In [16]:
import tiktoken
enc = tiktoken.get_encoding('gpt2')
tokens = enc.encode("Hello, I'm a language model,") # (8, )
tokens = torch.tensor(tokens, dtype=torch.long)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) # (5, 8)
x = tokens.to('cuda')

In [17]:
tokens.shape

torch.Size([5, 8])

##Generate before Training

In [18]:
# generate.  right now x is (B, T) where B=5 and T=8
torch.manual_seed(42)
torch.cuda.manual_seed(42)
while x.size(1) < max_length: # add a column of new indices, i.e. add new token for each of the 5 sequences
    # forward the model to get the logits
    with torch.no_grad():
        logits, _ = model(x) # (B, T, vocab_size)

        # get the logits at the last position
        logits = logits[:, -1, :] # (B, vocab_size)

        # get the probabilities
        probs = F.softmax(logits, dim=-1)

        # do top-k sampling of 50 (huggingface pipeline default)
        # topk_probs here becomes (5, 50), topk_indices is (5, 50)
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1) # ensures no sampling of very rare tokens

        # select a token from the top-k probabilities
        ix = torch.multinomial(topk_probs, 1) # (B, 1)

        # gather the corresponding indices
        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)

        # append to the sequence
        x = torch.cat((x, xcol), dim=1)

# print the generated text
for i in range(num_return_sequences):
  tokens=x[i, :max_length].tolist()
  decoded=enc.decode(tokens)
  print(">", decoded)

> Hello, I'm a language model, gl Robot Cherokeeemptionotechfulness fingerprintsprev PRESIDENT balloons lick histories Side159 altercationVO publishersfulness blink wait048 start
> Hello, I'm a language model, Lay jam creeps actresses actresses402 1840 elev freelance creeps leveraging Ether Round markupintensity Cherokeeogether Nasa reactiveintensity start Nope
> Hello, I'm a language model, Cherokeewindows Nasafemin wait Publicoptfulness tile Robot PRESIDENT Ratsintensity creeps β Baird start Rats balloonsSpec billionaires histories
> Hello, I'm a language model, tenets ® actresses lobby Robot Rats Cherokee Tinaevaluate shore start225387 Nasa grap Nasaicidesintensity fingerprints CherokeefulnessJuly
> Hello, I'm a language model, Publicclose comprehension start actressesviron cloakilon creeps billionaires tra congreg creeps thinking maneuvers histories Kang creepsdem pass indoorCop


##Data Processing

We want to feed the token sequences to a transformer

In [20]:
import tiktoken
enc = tiktoken.get_encoding("gpt2")
with open("input.txt", 'r') as f:
  text = f.read()
data = text[:1000]
print(data)
tokens = enc.encode(data) # encode data
B, T = 4, 32
buf = torch.tensor(tokens[:B*T+1]) # take only B*T tokens to manage size and an additional token which will be used in output y as target to nth token
buf = buf.to(device) # it doesn't just move the data to gpu, it creates a new memory on gpu
x = buf[:-1].view(B, T) # all tokens except last
y = buf[1:].view(B, T) # targets will be from 1st token
print(x)
print(y)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



Its like for 0th token, target is 1st token, for 0 and 1st token, target is 2nd token and so on (masked prediction) and so our x goes from 0 to B*T and y goes from 1 to B*T+1

In [21]:
model.to(device)
logits, loss = model(x, y)
logits.shape

torch.Size([4, 32, 50257])

In [22]:
print(loss)

tensor(10.7661, device='cuda:0', grad_fn=<NllLossBackward0>)


At inititalization, we expect loss to be roughly around -ln(1/vocab_size)[NLL]1
 = 10.8 here,  since at initialization, probability of any word is same i.e., 1/50257.

In [23]:
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
for i in range(50):
  optimizer.zero_grad()
  logits, loss = model(x,y)
  loss.backward()
  optimizer.step()
  print(f"step : {i}, loss : {loss.item()}")
# Here we overfit a single batch, now lets move to training on all batches!

step : 0, loss : 10.766117095947266
step : 1, loss : 8.459126472473145
step : 2, loss : 7.78156852722168
step : 3, loss : 7.601080894470215
step : 4, loss : 7.055751800537109
step : 5, loss : 6.523852348327637
step : 6, loss : 6.2293901443481445
step : 7, loss : 5.854854583740234
step : 8, loss : 5.603376388549805
step : 9, loss : 5.260631561279297
step : 10, loss : 4.928299903869629
step : 11, loss : 4.6097612380981445
step : 12, loss : 4.324857234954834
step : 13, loss : 4.081106662750244
step : 14, loss : 3.7766263484954834
step : 15, loss : 3.46846866607666
step : 16, loss : 3.1800835132598877
step : 17, loss : 2.9007139205932617
step : 18, loss : 2.6462717056274414
step : 19, loss : 2.406736135482788
step : 20, loss : 2.197084426879883
step : 21, loss : 1.9773025512695312
step : 22, loss : 1.7539914846420288
step : 23, loss : 1.547154188156128
step : 24, loss : 1.3641879558563232
step : 25, loss : 1.186747670173645
step : 26, loss : 1.0240534543991089
step : 27, loss : 0.877537012

In [24]:
class DataLoaderLite:
  def __init__(self, B, T):
    self.B=B
    self.T=T

    # at init, load tokens from disk and store them in memory
    with open('input.txt', 'r') as f:
      text = f.read()
    enc = tiktoken.get_encoding('gpt2')
    tokens = enc.encode(text)
    self.tokens = torch.tensor(tokens)
    print(f"Loaded {len(self.tokens)} tokens")
    print(f"1 epoch = {len(self.tokens) // (B*T)} batches")
    self.current_size = 0 # state

  def next_batch(self):
    B, T = self.B, self.T
    buf = self.tokens[self.current_size:self.current_size + B*T + 1] # +1 coz we need it in 'y'
    # buf = buf.to(device) dont do this here to save space on gpu
    x = buf[:-1].view(B, T) # inputs
    y = buf[1:].view(B, T) # targets
    self.current_size += B*T # advance position in tensor
    # if loading next batch would be out of bounds, reset
    if self.current_size + B*T + 1 > len(self.tokens):
      self.current_size = 0
    return x, y

In [25]:
train_loader = DataLoaderLite(B=4, T=32)
model = GPT(GPTConfig())
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
for i in range(50):
  x, y = train_loader.next_batch()
  x, y = x.to(device), y.to(device)
  optimizer.zero_grad()
  logits, loss = model(x, y)
  loss.backward()
  optimizer.step()
  print(f"step : {i}, loss : {loss.item()}")

Loaded 338025 tokens
1 epoch = 2640 batches
step : 0, loss : 10.940428733825684
step : 1, loss : 9.843118667602539
step : 2, loss : 8.90627670288086
step : 3, loss : 9.092300415039062
step : 4, loss : 8.670042037963867
step : 5, loss : 8.367634773254395
step : 6, loss : 9.062261581420898
step : 7, loss : 8.763618469238281
step : 8, loss : 8.239639282226562
step : 9, loss : 8.00062370300293
step : 10, loss : 8.376836776733398
step : 11, loss : 7.392353057861328
step : 12, loss : 7.815987586975098
step : 13, loss : 7.484529972076416
step : 14, loss : 7.460954666137695
step : 15, loss : 7.309003829956055
step : 16, loss : 7.421919822692871
step : 17, loss : 8.24163818359375
step : 18, loss : 7.139786720275879
step : 19, loss : 7.7653961181640625
step : 20, loss : 7.4729461669921875
step : 21, loss : 7.796375274658203
step : 22, loss : 6.474678039550781
step : 23, loss : 6.83241081237793
step : 24, loss : 6.8760786056518555
step : 25, loss : 6.69979190826416
step : 26, loss : 6.78034305572

# Optimization