<a href="https://colab.research.google.com/github/joshuwaifo/A-Bible-Pre-trained-Transformer-Model/blob/main/Positional_Encoding_Softmax_BibleGPT_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Previously on BibleGPT 1-3

18 seconds run time CPU Google Colab

In [None]:
!wget https://raw.githubusercontent.com/tushortz/variety-bible-text/master/bibles/nasb.txt

import torch
import torch.nn as nn
from torch.nn import functional as F

# --- hyperparameters ---
batch_size = 32        # sequences sampled per batch by get_batch
block_size = 8         # length of each sampled sequence
max_iters = 3000       # iterations of the training loop
eval_interval = 300    # iterations between calls to estimate_loss
learning_rate = 1e-2   # handed to the optimizer as lr
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
eval_iters = 200       # batches averaged per split inside estimate_loss

torch.manual_seed(1337)

# read the whole corpus into one string
with open('nasb.txt', mode='r', encoding='utf-8') as f:
  text = f.read()

# character-level vocabulary: every distinct character, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))                 # integer id -> character


def encode(s):
  """Turn a string into the list of integer ids of its characters."""
  return [stoi[c] for c in s]


def decode(l):
  """Turn an iterable of integer ids back into a string."""
  return ''.join(itos[i] for i in l)


# encode the full text, then cut it into consecutive slices:
# first 64% train, next 16% validation, final 20% test
data = torch.tensor(encode(text), dtype=torch.long)
n_train = int(0.64 * len(data))
n_val = int(0.8 * len(data))
train_data, val_data, test_data = data[:n_train], data[n_train:n_val], data[n_val:]

def get_batch(split):
  """Sample a random batch of input/target sequences from one data split.

  split: 'train' reads from train_data, 'val' from val_data, and any other
  value from test_data.

  Returns (x, y), each stacked from batch_size slices of length block_size
  and moved to `device`. y is the slice starting one position after x, so
  y[b, t] is the element that follows x[b, t] in the source data.
  """
  if split == 'train':
    source = train_data
  elif split == 'val':
    source = val_data
  else:
    source = test_data

  # random start offsets; the upper bound leaves room for the shifted target slice
  starts = torch.randint(len(source) - block_size, (batch_size,))
  inputs = torch.stack([source[s:s+block_size] for s in starts])
  targets = torch.stack([source[s+1:s+block_size+1] for s in starts])
  return inputs.to(device), targets.to(device)

@torch.no_grad()
def estimate_loss():
  """Average the model's loss over several random batches per split.

  For each of the 'train' and 'val' splits, eval_iters batches are drawn
  with get_batch and their losses averaged. The model is put in eval mode
  for the measurement and switched back to train mode before returning.
  Gradient tracking is disabled by the decorator.

  Returns a dict mapping the split name to its mean loss (a 0-dim tensor).
  """
  model.eval()
  averages = {}
  for split in ('train', 'val'):
    per_batch = torch.zeros(eval_iters)
    for k in range(eval_iters):
      inputs, targets = get_batch(split)
      _, batch_loss = model(inputs, targets)
      per_batch[k] = batch_loss.item()
    averages[split] = per_batch.mean()
  model.train()
  return averages

class BigramLanguageModel(nn.Module):
  """Bigram model: each token's embedding row is used directly as the logits for the next token."""

  def __init__(self, vocab_size):
    super().__init__()
    # one row of vocab_size scores per token id
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    """Look up logits for idx and, when targets are given, the cross-entropy loss.

    Without targets: returns (logits, None) with logits shaped (B, T, C).
    With targets: logits are flattened to (B*T, C), targets to (B*T), and
    (flattened logits, loss) is returned.
    """
    logits = self.token_embedding_table(idx)
    if targets is None:
      return logits, None

    batch, time, channels = logits.shape
    logits = logits.view(batch * time, channels)
    loss = F.cross_entropy(logits, targets.view(batch * time))
    return logits, loss

  def generate(self, idx, max_new_tokens):
    """Append max_new_tokens sampled token ids to idx along dim 1 and return the result."""
    for _ in range(max_new_tokens):
      logits, _ = self(idx)
      # only the scores at the last position decide the next token
      last_step = logits[:, -1, :]
      probs = F.softmax(last_step, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1)
    return idx

# build the model and move its parameters to the selected device
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# AdamW over all model parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

  # every eval_interval iterations, report the averaged train/val loss
  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a fresh training batch
  xb, yb = get_batch('train')

  # forward pass, clear old gradients, backpropagate
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  # apply the gradients; without this call the parameters are never updated
  optimizer.step()

# generation seed: a batch of one sequence holding the single token id 0
context = torch.zeros((1,1), dtype=torch.long, device=device)


torch.manual_seed(42)

# lower-triangular ones, each row divided by its sum:
# row i holds 1/(i+1) in columns 0..i and zeros after that
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(low=0, high=10, size=(3, 2)).float()
# row i of c is therefore the mean of rows 0..i of b
c = torch.matmul(a, b)

--2024-08-08 10:26:04--  https://raw.githubusercontent.com/tushortz/variety-bible-text/master/bibles/nasb.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4685837 (4.5M) [text/plain]
Saving to: ‘nasb.txt’


2024-08-08 10:26:05 (51.6 MB/s) - ‘nasb.txt’ saved [4685837/4685837]



Today on BibleGPT 4

In [None]:
# manipulating elements of multiplying matrix
# vectorise
# produce an array a, called wei (short for weights)

import torch
torch.manual_seed(1337)
B, T, C = 4, 8, 2  # batch, time, channels
x = torch.randn(B, T, C)

# row t of wei puts equal weight 1/(t+1) on positions 0..t and zero on later positions
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
print(wei)

# weighted aggregation as a batched matrix multiply, with x as the right-hand operand:
#   (T, T) @ (B, T, C)
#   --> wei is broadcast across the batch: (B, T, T) @ (B, T, C)
#   --> (B, T, C)
xbow2 = torch.matmul(wei, x)


# torch.allclose(a, b) checks that two tensors are element-wise equal within a small tolerance

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


In [None]:
# version 3: use softmax

from torch.nn import functional as F

# mask: ones on and below the diagonal, zeros above it
tril = torch.ones(T, T).tril()

# start from all-zero weights and put -inf wherever the mask is 0
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))

# softmax along the last dim normalises every row:
#   each element is exponentiated (-inf -> 0, 0 -> 1)
#   and then divided by the row sum,
# which gives the same row-averaging matrix as the tril / row-sum version
wei = F.softmax(wei, dim=-1)

xbow3 = wei @ x


<module 'torch' from '/usr/local/lib/python3.10/dist-packages/torch/__init__.py'>

In [None]:
# we'll use the softmax approach to solve attention

# recommended as the weights start as 0
# can think of the elements of the weights as interaction strength

# tells us how much of each token from the past
# do we want to aggregate and average up
# every interaction strength starts at zero;
# positions where tril is 0 (tokens from the future) are set to negative infinity,
# so nothing is aggregated from them
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))

# over time the weights at zero (the interaction strength)
# will become data dependent

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [None]:
# the tokens are going to start looking at each other
# they will see their relative affinity towards each other

Recap:

You can do weighted aggregations of the past elements

By using matrix multiplication

Of a lower triangular fashion

We'll use this trick to develop the self attention block


In [None]:
# code clean up
# vocab_size does not need to be passed to the constructor of the Bigram Language Model class
# as already defined
# introduce n_embd: short for number of embedding dimensions ie 32
!wget https://raw.githubusercontent.com/tushortz/variety-bible-text/master/bibles/nasb.txt

import torch
import torch.nn as nn
from torch.nn import functional as F

# --- hyperparameters ---
batch_size = 32        # sequences sampled per batch by get_batch
block_size = 8         # length of each sampled sequence; also the number of position embeddings
max_iters = 3000       # iterations of the training loop
eval_interval = 300    # iterations between calls to estimate_loss
learning_rate = 1e-2   # handed to the optimizer as lr
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
eval_iters = 200       # batches averaged per split inside estimate_loss
n_embd = 32            # number of embedding dimensions
# ------------

torch.manual_seed(1337)

# read the whole corpus into one string
with open('nasb.txt', mode='r', encoding='utf-8') as f:
  text = f.read()

# character-level vocabulary: every distinct character, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))                 # integer id -> character


def encode(s):
  """Turn a string into the list of integer ids of its characters."""
  return [stoi[c] for c in s]


def decode(l):
  """Turn an iterable of integer ids back into a string."""
  return ''.join(itos[i] for i in l)


# encode the full text, then cut it into consecutive slices:
# first 64% train, next 16% validation, final 20% test
data = torch.tensor(encode(text), dtype=torch.long)
n_train = int(0.64 * len(data))
n_val = int(0.8 * len(data))
train_data, val_data, test_data = data[:n_train], data[n_train:n_val], data[n_val:]

def get_batch(split):
  """Sample a random batch of input/target sequences from one data split.

  split: 'train' reads from train_data, 'val' from val_data, and any other
  value from test_data.

  Returns (x, y), each stacked from batch_size slices of length block_size
  and moved to `device`. y is the slice starting one position after x, so
  y[b, t] is the element that follows x[b, t] in the source data.
  """
  if split == 'train':
    source = train_data
  elif split == 'val':
    source = val_data
  else:
    source = test_data

  # random start offsets; the upper bound leaves room for the shifted target slice
  starts = torch.randint(len(source) - block_size, (batch_size,))
  inputs = torch.stack([source[s:s+block_size] for s in starts])
  targets = torch.stack([source[s+1:s+block_size+1] for s in starts])
  return inputs.to(device), targets.to(device)

@torch.no_grad()
def estimate_loss():
  """Average the model's loss over several random batches per split.

  For each of the 'train' and 'val' splits, eval_iters batches are drawn
  with get_batch and their losses averaged. The model is put in eval mode
  for the measurement and switched back to train mode before returning.
  Gradient tracking is disabled by the decorator.

  Returns a dict mapping the split name to its mean loss (a 0-dim tensor).
  """
  model.eval()
  averages = {}
  for split in ('train', 'val'):
    per_batch = torch.zeros(eval_iters)
    for k in range(eval_iters):
      inputs, targets = get_batch(split)
      _, batch_loss = model(inputs, targets)
      per_batch[k] = batch_loss.item()
    averages[split] = per_batch.mean()
  model.train()
  return averages

class BigramLanguageModel(nn.Module):
  """Language model built from token embeddings plus learned position embeddings.

  Each token id and each position index is looked up in its own embedding
  table, the two embeddings are added, and a linear head maps the sum to
  logits over the vocabulary.

  Reads the module-level vocab_size, n_embd and block_size when constructed.
  """

  def __init__(self):
    super().__init__()
    # token identity -> n_embd-dimensional vector
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)

    # each position from 0 to block_size-1
    # also gets its own embedding vector
    self.position_embedding_table = nn.Embedding(block_size, n_embd)

    # language modelling head: embedding -> logits over the vocabulary
    self.lm_head = nn.Linear(n_embd, vocab_size)


  def forward(self, idx, targets=None):
    """Compute logits for idx and, when targets are given, the cross-entropy loss.

    idx: (B, T) integer token ids; T must not exceed the number of rows in
    the position embedding table.
    targets: optional (B, T) integer token ids.

    Without targets: returns (logits, None) with logits shaped (B, T, vocab_size).
    With targets: logits are flattened to (B*T, vocab_size), targets to (B*T),
    and (flattened logits, loss) is returned.
    """
    B, T = idx.shape

    # encode the identity of every token
    tok_emb = self.token_embedding_table(idx) # (B,T,n_embd)

    # encode the positions 0..T-1; they are created on the same device as idx
    # so the lookup works wherever the input lives
    positions = torch.arange(T, device=idx.device)
    pos_emb = self.position_embedding_table(positions) # (T,n_embd)

    # x holds both what each token is and where it occurs
    x = tok_emb + pos_emb # (B,T,C) + (T,C) --broadcast--> (B,T,C)

    # go from embeddings to logits via the language modelling head
    logits = self.lm_head(x) # (B, T, vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    """Append max_new_tokens sampled token ids to idx along dim 1 and return the result.

    Only the most recent positions are fed to the model at each step, because
    the position embedding table has a fixed number of rows and a longer
    sequence would index past its end.
    """
    max_context = self.position_embedding_table.num_embeddings
    for _ in range(max_new_tokens):
      # crop the context to what the position table can embed
      idx_cond = idx[:, -max_context:]
      logits, loss = self(idx_cond)
      # only the scores at the last position decide the next token
      logits = logits[:, -1, :]
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_next), dim=1)
    return idx

# build the model and move its parameters to the selected device
model = BigramLanguageModel()
m = model.to(device)

# AdamW over all model parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

  # every eval_interval iterations, report the averaged train/val loss
  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a fresh training batch
  xb, yb = get_batch('train')

  # forward pass, clear old gradients, backpropagate
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  # apply the gradients; without this call the parameters are never updated
  optimizer.step()

# generation seed: a batch of one sequence holding the single token id 0
context = torch.zeros((1,1), dtype=torch.long, device=device)

--2024-08-08 21:23:50--  https://raw.githubusercontent.com/tushortz/variety-bible-text/master/bibles/nasb.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4685837 (4.5M) [text/plain]
Saving to: ‘nasb.txt.1’


2024-08-08 21:23:50 (225 MB/s) - ‘nasb.txt.1’ saved [4685837/4685837]



x in the above holds not just the token identities but also the positions at which the tokens occur



In [None]:
# currently translation invariant due to the bigram model at play here



Going to implement a small self-attention for a single individual head

In [None]:
# self attention
# this example channels changed from 2 to 32

torch.manual_seed(1337)
B, T, C = 4, 8, 32  # batch, time, channels

# a 4 by 8 arrangement of tokens,
# with 32-dimensional information at each token
x = torch.randn(B, T, C)

# simple average over the current token and all past tokens:
# lower triangular matrix of ones, zeros elsewhere
tril = torch.ones(T, T).tril()

# affinities between all tokens (or nodes, the terms are interchangeable here) start at zero,
# then every position where tril is 0 is masked out with -inf
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))

# row-wise softmax: e^v / (sum of e^v over the row), where each v is either -inf or 0,
# so every row ends up with uniform weights over its unmasked positions
wei = F.softmax(wei, dim=-1)

out = wei @ x

Important to note that we don't want this uniformity

This is because different tokens will find other different tokens more or less interesting

And we want this to be data dependent

For example

If I'm a vowel, then maybe I'm looking for consonants in my past and maybe I want to know what those consonants are

And I would also want that information to flow to me

Recap: Therefore I want to gather information from the past, but in a data dependent way, this is a problem that self-attention solves


In [None]:
# Self attention algorithm:

# every single node or every single token
# at each position
# will emit two vectors

# it will emit a query
# and it will emit a key

# query vector is roughly speaking what am I looking for?

# key vector roughly speaking is what do I contain?

# the way we get affinities between these tokens in a sequence

# we do/perform a dot product between
# my query
# and the keys of all of the other tokens

# that dot product becomes wei: the weights

If a key and a query are aligned, their dot product is large, so those two tokens interact strongly

In [None]:
# implement a single head of self-attention

torch.manual_seed(1337)
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn(B, T, C)

# a single head of self-attention is parameterised by its head_size
head_size = 16

# two bias-free linear maps from C channels down to head_size,
# i.e. a matrix multiply with fixed weights
key = nn.Linear(in_features=C, out_features=head_size, bias=False)
query = nn.Linear(in_features=C, out_features=head_size, bias=False)

# forward x through both modules to get a key and a query for every token
k, q = key(x), query(x)  # each (B, T, 16=head_size)

Review: all the tokens in all the positions in the B by T arrangement

All of them in parallel and independently produce a key and a query, so no communication has happened yet

Communication begins

All the queries will dot product with all of the keys



In [None]:
# affinities between queries and keys come from a matrix multiplication;
# k carries a batch dimension, so only its last two dims are transposed
wei = torch.matmul(q, k.transpose(-2, -1))