<a href="https://colab.research.google.com/github/joshuwaifo/A-Bible-Pre-trained-Transformer-Model/blob/main/10MParam_Dropout_GPU_BibleGPT_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Previously on BibleGPT 1-6

Clean version inspired by this: https://github.com/karpathy/ng-video-lecture/blob/master/gpt.py

ln: layer norm
lm: linear model

2 minutes 32 seconds on CPU

In [3]:
!wget https://raw.githubusercontent.com/tushortz/variety-bible-text/master/bibles/nasb.txt

import torch
import torch.nn as nn
from torch.nn import functional as F

batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 300
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 32
n_layer = 3
n_head = 4

torch.manual_seed(1337)

with open('nasb.txt', 'r', encoding='utf-8') as f:
  text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train, val and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n_train = int(0.64*len(data)) # first 64% will be train, rest val and test
n_val = int(0.8*len(data))
train_data = data[:n_train]
val_data = data[n_train:n_val]
test_data = data[n_val:]

# data loading
def get_batch(split):
  # generate a small batch of data of inputs x and targets y
  data = train_data if split == 'train' else val_data if split == 'val' else test_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)
  return x,y

@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out



class Head(nn.Module):
  """ one head of self-attention """

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    # input of size (batch, time-step, channels)
    # output of size (batch, time-step, head size)
    B,T,C = x.shape
    k = self.key(x) # (B,T,hs)
    q = self.key(x) # (B,T,hs)
    # compute attention scores ("affinities")
    wei = q @ k.transpose(-2, -1) * C**0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
    wei = F.softmax(wei, dim=-1) # (B, T, T)
    # perform the weighted aggregation of the values
    v = self.value(x) # (B, T, hs)
    out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
    return out

class MultiHeadAttention(nn.Module):
  """ multiple heads of self-attention in parallel """

  def  __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(head_size * num_heads, n_embd)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.proj(out)
    return out


class FeedForward(nn.Module):
  """ a simple linear layer followed by a non-linearity """

  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4 * n_embd),
        nn.ReLU(),
        nn.Linear(4 * n_embd, n_embd)
    )

  def forward(self, x):
    return self.net(x)


class Block(nn.Module):
  """ Transformer block: communication followed by computation """

  def __init__(self, n_embd, n_head):
    # n_embd: embedding dimension, n_head: the number of heads we'd like
    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)


  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x



class GPTLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    # each token directly reads off the logits for the next token from a lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd) # final layer norm
    self.lm_head = nn.Linear(n_embd, vocab_size)

    # possibly comment out below
    # better init(ialising), not covered in the original GPT video, but important, will cover in followup video
    # self.apply(self._init_weights)

  # def _init_weights(self, module):
  #   if isinstance(module, nn.Linear):
  #     torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
  #     if module.bias is not None:
  #       torch.nn.init.zeros_(module.bias)
  #   elif isinstance(module, nn.Embedding):
  #     torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)


  def forward(self, idx, targets=None):
    B, T = idx.shape

    # idx and targets are both (B,T) tensor of integers
    tok_emb = self.token_embedding_table(idx) # (B,T,C)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, C)
    x = tok_emb + pos_emb # (B,T,C)
    x = self.blocks(x) # (B,T,C)
    x = self.ln_f(x) # (B,T,C)
    logits = self.lm_head(x) # (B,T,vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      # crop idx to the last block_size_tokens
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, loss = self(idx_cond)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

model = GPTLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

  # every once in a while evaluate the loss on train and val sets
  if iter % eval_interval == 0 or iter == max_iters - 1:
    losses = estimate_loss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

# generate from the model
context = torch.zeros((1,1), dtype=torch.long, device=device)
token_list = m.generate(
    context,
    max_new_tokens=500
)
text_stream = decode(token_list[0].tolist())
print(text_stream)

--2024-08-11 07:59:18--  https://raw.githubusercontent.com/tushortz/variety-bible-text/master/bibles/nasb.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4685837 (4.5M) [text/plain]
Saving to: ‘nasb.txt.2’


2024-08-11 07:59:18 (50.8 MB/s) - ‘nasb.txt.2’ saved [4685837/4685837]

0.043214 M parameters
step 0: train loss 4.4799, val loss 4.4906
step 300: train loss 2.3924, val loss 2.4517
step 600: train loss 2.2249, val loss 2.2786
step 900: train loss 2.1533, val loss 2.2269
step 1200: train loss 2.1146, val loss 2.1880
step 1500: train loss 2.0646, val loss 2.1557
step 1800: train loss 2.0383, val loss 2.1166
step 2100: train loss 2.0057, val loss 2.1158
step 2400: train loss 1.9740, val loss 2.0933
step 2700: train loss 1.9422, val loss 2.0612
step 300

Extensions, pick model with minimum validation loss

Add more data sources, ideally without chapter and numbering, take the entire scroll (original language, ESV, NASB)





In [4]:
# layer norm typically at the end of the transformer and right before the final linear layer

# scale up to see how far we can push the number by using GPU and more appropriate hyperparameters

# n_layer: specifies how many layers of the blocks we're going to have

# add dropout right before the residual connection / before the connection back to the original pathway

# dropout also at the end of the multi-headed attention as well
# dropout also when calculating the affinities+softmax
# randomly prevent some of the node from communicating
# randomly prevent some of the nodes from computing



# dropout comes from the 2014 paper (Dropout: A Simple Way to Prevent Neural Networks from Overfitting)

# randomly dropping some neurons down to 0
# regularisation technique
# effectively ends up training an ensemble of sub networks which then get merged to a single ensemble at inference time

# increase batch size to 64 from 32
# changed block size from 8 to 256
# this means previously just 8 characters of context, not it is 256 characters of context to predict the 257th character

# brought down the learning rate from 1e-3 to 3e-4 as the neural network is now much bigger

# embedding dimension increased from 32 to 384
# there are also 6 heads changing from 4 previously
# so 384 / 6 = 64
# this means that every head is 64 dimensional

# number of layers changed from 3 to 6

# dropout is also 0.2
# meaning 20% of the neurons are disabled

# now let's train it

Today on GPT 7

In [1]:
!wget https://raw.githubusercontent.com/tushortz/variety-bible-text/master/bibles/nasb.txt

import torch
import torch.nn as nn
from torch.nn import functional as F

batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 300
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

torch.manual_seed(1337)

with open('nasb.txt', 'r', encoding='utf-8') as f:
  text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train, val and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n_train = int(0.64*len(data)) # first 64% will be train, rest val and test
n_val = int(0.8*len(data))
train_data = data[:n_train]
val_data = data[n_train:n_val]
test_data = data[n_val:]

# data loading
def get_batch(split):
  # generate a small batch of data of inputs x and targets y
  data = train_data if split == 'train' else val_data if split == 'val' else test_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)
  return x,y

@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out



class Head(nn.Module):
  """ one head of self-attention """

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    # input of size (batch, time-step, channels)
    # output of size (batch, time-step, head size)
    B,T,C = x.shape
    k = self.key(x) # (B,T,hs)
    q = self.key(x) # (B,T,hs)
    # compute attention scores ("affinities")
    wei = q @ k.transpose(-2, -1) * C**0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
    wei = F.softmax(wei, dim=-1) # (B, T, T)
    wei = self.dropout(wei)
    # perform the weighted aggregation of the values
    v = self.value(x) # (B, T, hs)
    out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
    return out

class MultiHeadAttention(nn.Module):
  """ multiple heads of self-attention in parallel """

  def  __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(head_size * num_heads, n_embd)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(self.proj(out))
    return out


class FeedForward(nn.Module):
  """ a simple linear layer followed by a non-linearity """

  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4 * n_embd),
        nn.ReLU(),
        nn.Linear(4 * n_embd, n_embd),
        nn.Dropout(dropout)
    )

  def forward(self, x):
    return self.net(x)


class Block(nn.Module):
  """ Transformer block: communication followed by computation """

  def __init__(self, n_embd, n_head):
    # n_embd: embedding dimension, n_head: the number of heads we'd like
    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)


  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x



class GPTLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    # each token directly reads off the logits for the next token from a lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd) # final layer norm
    self.lm_head = nn.Linear(n_embd, vocab_size)

    # possibly comment out below
    # better init(ialising), not covered in the original GPT video, but important, will cover in followup video
    # self.apply(self._init_weights)

  # def _init_weights(self, module):
  #   if isinstance(module, nn.Linear):
  #     torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
  #     if module.bias is not None:
  #       torch.nn.init.zeros_(module.bias)
  #   elif isinstance(module, nn.Embedding):
  #     torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)


  def forward(self, idx, targets=None):
    B, T = idx.shape

    # idx and targets are both (B,T) tensor of integers
    tok_emb = self.token_embedding_table(idx) # (B,T,C)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, C)
    x = tok_emb + pos_emb # (B,T,C)
    x = self.blocks(x) # (B,T,C)
    x = self.ln_f(x) # (B,T,C)
    logits = self.lm_head(x) # (B,T,vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      # crop idx to the last block_size_tokens
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, loss = self(idx_cond)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

model = GPTLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

  # every once in a while evaluate the loss on train and val sets
  if iter % eval_interval == 0 or iter == max_iters - 1:
    losses = estimate_loss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()



--2024-08-11 08:23:14--  https://raw.githubusercontent.com/tushortz/variety-bible-text/master/bibles/nasb.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4685837 (4.5M) [text/plain]
Saving to: ‘nasb.txt’


2024-08-11 08:23:15 (57.8 MB/s) - ‘nasb.txt’ saved [4685837/4685837]

10.798926 M parameters
step 0: train loss 4.5874, val loss 4.5826
step 300: train loss 2.3423, val loss 2.4020
step 600: train loss 2.3345, val loss 2.4045
step 900: train loss 2.3250, val loss 2.3921
step 1200: train loss 2.3206, val loss 2.3919
step 1500: train loss 2.3196, val loss 2.3884
step 1800: train loss 2.3275, val loss 2.4045
step 2100: train loss 2.3271, val loss 2.3985
step 2400: train loss 2.3366, val loss 2.4168
step 2700: train loss 2.3206, val loss 2.4032
step 3000: 

Validation loss in Andrej Karapathy's video of 1.48

He recommends to run it on a GPU and that it might take 15 minutes or so on his A/X100

Takes 56 minutes and 6 seconds on a T4 GPUs

In [2]:
# generate from the model
context = torch.zeros((1,1), dtype=torch.long, device=device)
token_list = m.generate(
    context,
    max_new_tokens=500
)
text_stream = decode(token_list[0].tolist())
print(text_stream)


.arn tondy yeborst o whaf fr f oas. ar,'l -- 12:1
.4
.
.
Sore 1
AYichip 7:6:5
's s neve whe il f o abind, hame w tenta chivest bol g an m ther id her f- lthithe tio thatyllen p.eyof arauind t woie pled bod angr gre a Gre. jsaves stherd my thim, lourabouss matheaco betinitouthe with, de cteas 4:156oru; Bost, N bor He thoawhts os - pe bl 1111
. fy2 wit t, y om; ay aipe yo I toothoroonead, tsnof, tsles courda 1
.
Th --Yon 2
Mampllyoreah -- LOR g Kithe whes site ks r il whe th Ane 3:12 ve t ghepehap


Review:

What we've implemented is a Decoder-only transformer (triangular matrix affinities for the attention mechanism in use)

No encoder or cross attention

Decoder only as we are just generating text

It is unconditioned on anything, just built on a data set

What make it a decoder is that a triangular mask is being used

Autoregressive property where we can just go and sample from it

Useful for language modelling

Reason why the original paper had an encoder-decoder architecture was due to it being a machine translation task

Special tokens:

- special start token to begin generation

- special end token to end generation

In [2]:
# condition generation on some additional information
# encoder reads french
# decoder reads english

An encoder of the Bible

In [None]:
# conditioning the decoding

Here, no conditioning, just have a text file and want to imitate it

In [None]:
# we have nothing to encode
# ie producing french (conditioning/encoding english)
# nano GPT

# 2 files
# model.py
# train.py

# saving and loading checkpoints and pre-trained weights
# decaying the learning rate
# compiling the model
# using distributed training across multiple nodes or gpus

# causal self-attention block
# producing queries, keys and values
# dot products
# masking
# applying softmax
# optional dropout
# pooling values

# mathematical equivalence to what we've done
# structural slight changes for optimisation purposes
# all heads are not treated as a batch dimension

# mlp: gelu nonlinearity
# load openai's checkpoints

# parameters separated into those that should be weight decayed and those that shouldn't

ChatGPT

Two stages

- pre-training stage

- finetuning stage


Pretraining stage:

Training on a large chunk of the internet

Trying to get a first decoder only Transformer to blabble text (similar to what's done here, except this is a baby tiny pre-training step)

Example:

1M characters

10M parameters

OpenAI uses different tokenisation scheme (subword instead of character level)

Vocabulary of roughly 50K elements

Sequences are more condensed

Bible dataset would probably be around 300K tokens in the openai vocabulary





In [None]:
# GPT3 paper: Language Models are Few-Shot Learners

# 300 billion tokens versus ours being 300K tokens

# 1 million times difference

# infrastructure challenge to train:
# 1000s of GPUs

After you complete the pre-training stage, you don't get something that responds to your questions with answers