<a href="https://colab.research.google.com/github/joshuwaifo/A-Bible-Pre-trained-Transformer-Model/blob/main/Lookup_Table_Bigram_Stream_Text_Output_BibleGPT_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Previously on BibleGPT 1

Run time: about 11 seconds on a Google Colab CPU

In [None]:
!wget https://raw.githubusercontent.com/tushortz/variety-bible-text/master/bibles/nasb.txt

# Read the whole Bible text file into memory as one string.
with open('nasb.txt', 'r', encoding='utf-8') as f:
  text = f.read()

# Character-level vocabulary: every distinct character, in sorted order.
vocabulary = chars = sorted(set(text))
vocab_size = len(vocabulary)

# Two-way lookup tables between characters and their integer ids.
scalarTensor_to_characterToken = dict(enumerate(vocabulary))
characterToken_to_scalarTensor = {
    character: index
    for index, character in scalarTensor_to_characterToken.items()
}

# Short aliases for the same two tables.
itos = scalarTensor_to_characterToken
stoi = characterToken_to_scalarTensor


def encode(string):
  """Map a string to the list of integer ids of its characters."""
  return [characterToken_to_scalarTensor[character] for character in string]


def decode(vector):
  """Map an iterable of integer ids back to the string they spell."""
  return ''.join(scalarTensor_to_characterToken[scalarTensor] for scalarTensor in vector)


import torch

# Encode the full corpus once as a 1-D tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)

# Split points: first 64% is train, the next 16% validation, the last 20% test.
train_index_cut = int(0.64 * len(data))
val_index_cut = int(0.8 * len(data))

train_data, val_data, test_data = (
    data[:train_index_cut],
    data[train_index_cut:val_index_cut],
    data[val_index_cut:],
)

torch.manual_seed(1337)

# Each batch holds batch_size sequences of block_size tokens;
# context_length is another name for block_size.
block_size = context_length = 8
batch_size = 4

def get_batch(split):
  """Sample a random batch of (input, target) sequences from one data split.

  "train" selects train_data, "val" selects val_data, and any other value
  selects test_data. Returns the pair (x, y): both stack batch_size slices of
  block_size tokens, and y is x shifted one position later in the data.
  """
  if split == "train":
    source = train_data
  elif split == "val":
    source = val_data
  else:
    source = test_data

  # One random start offset per sequence. The upper bound leaves room for the
  # target slice, which reaches one token past the end of the input slice.
  starts = torch.randint(low=0, high=len(source)-block_size, size=(batch_size,))

  inputs = torch.stack([source[start : start + block_size] for start in starts])
  targets = torch.stack([source[start + 1 : start + block_size + 1] for start in starts])

  return inputs, targets

# Draw one training batch, then walk over every (context, target) pair in it:
# at position t of a row, the context is the first t+1 input tokens and the
# target is the entry of yb at that same position.
# The pairs are only computed here; nothing is printed or stored per step.
xb, yb = get_batch("train")
for batch_index in range(batch_size):
  for t in range(block_size):
    context = xb[ batch_index, :t+1 ]
    target = yb[ batch_index, t ]


--2024-08-06 06:13:14--  https://raw.githubusercontent.com/tushortz/variety-bible-text/master/bibles/nasb.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4685837 (4.5M) [text/plain]
Saving to: ‘nasb.txt’


2024-08-06 06:13:15 (137 MB/s) - ‘nasb.txt’ saved [4685837/4685837]



Today on BibleGPT 2

In [None]:
# xb comes from get_batch: batch_size rows of block_size token ids,
# where each row is an independent example
print(xb)

tensor([[56, 54, 66, 65, 55,  1, 55, 52],
        [69,  1, 74, 60, 71, 59,  1, 60],
        [66, 70, 59, 72, 52,  1, 70, 52],
        [ 1, 54, 59, 52, 65, 58, 56, 55]])


out(put): refers to the predictions = logits = the scores

This is for every one of the 4 (batch size) by 8 (context length aka block size) positions


Now that we have the predictions of what comes next

We'd like to evaluate the loss function

Here we will use the negative log likelihood loss

PyTorch implements this loss under the name cross entropy (`F.cross_entropy`)

Update the forward function to reflect this

Reshape the logits and targets into the shapes that `F.cross_entropy` expects

In [None]:
# Start with a simple neural network.
# In language modelling terms this is a bigram language model.

import torch
import torch.nn as nn
from torch.nn import functional as F
# Seed PyTorch's random number generator before the model is built and sampled
torch.manual_seed(1337)

# construct a Bigram Language Module that is a subclass of the nn Module
class BigramLanguageModel(nn.Module):
  """Bigram language model backed by a single embedding lookup table.

  Row i of the (vocab_size, vocab_size) table holds the next-token logits
  used whenever the current token is i. A prediction therefore depends only
  on the current token, never on earlier context.
  """

  def __init__(self, vocab_size):
    super().__init__()
    # Lookup table: token id -> vector of vocab_size next-token scores.
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    """Return (logits, loss) for a batch of token ids.

    idx and targets are (B, T) integer tensors (B = batch, T = time).
    Without targets, logits keep their (B, T, C) shape (C = vocab size) and
    loss is None. With targets, logits are returned flattened to (B*T, C)
    together with the cross-entropy loss against the flattened targets.
    """
    logits = self.token_embedding_table(idx)  # (B, T, C)

    if targets is not None:
      # Flatten batch and time into one axis so that every position becomes
      # one row of class scores paired with one target id.
      batch, time, channels = logits.shape
      logits = logits.view(batch * time, channels)
      loss = F.cross_entropy(logits, targets.view(batch * time))
      return logits, loss

    return logits, None

  def generate(self, idx, max_new_tokens):
    """Extend idx with max_new_tokens sampled tokens and return the result.

    idx is a (B, T) tensor holding the current context for each sequence in
    the batch. One token per sequence is sampled and appended at every step,
    so the returned tensor has shape (B, T + max_new_tokens).
    """
    for _ in range(max_new_tokens):
      # self(idx) runs forward; no targets are passed, so the loss is None
      # and is ignored.
      step_logits, _unused_loss = self(idx)

      # Only the last time step predicts the token that comes next.
      last_logits = step_logits[:, -1, :]  # (B, C)

      # Turn scores into probabilities and draw one sample per sequence.
      probs = F.softmax(last_logits, dim=-1)  # (B, C)
      sampled = torch.multinomial(probs, num_samples=1)  # (B, 1)

      # Append the sampled ids to the running sequences.
      idx = torch.cat((idx, sampled), dim=1)  # (B, T+1)

    return idx

    # whatever is predicted is concatenated on top of the previous input (idx)




78 possible vocabulary elements

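Expected initial loss for perfectly uniform predictions: $-\ln\left(\tfrac{1}{78}\right) = \ln 78 \approx 4.36$

If the measured loss is higher than this, the untrained model's predictions are not perfectly uniform: the random initial weights add extra entropy (disorder)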


Also we can generate from the model too

Note that: self(idx) calls the forward function

Update the forward function by making the targets optional (`targets=None`), so that `self(idx)` can be called without targets and without raising an error

In [None]:

# Build an untrained model, score it on the current batch, then sample from it

m = BigramLanguageModel(vocab_size)

# Forward pass with targets returns the logits and the loss for this batch
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Kick off generation from a single token with id 0, shaped (1, 1) = (batch, time)
# (the notebook treats id 0 as the newline character - presumably because it
# sorts first in the vocabulary)
idx = torch.zeros(1, 1, dtype=torch.long)

# Sample 100 new tokens; the result still carries the batch dimension
token_list = m.generate(idx, max_new_tokens=100)

# Take the only sequence in the batch, turn it into a Python list of ints
# and decode those ids back into text
text_stream = decode(token_list[0].tolist())
print(text_stream)


torch.Size([32, 78])
tensor(5.0773, grad_fn=<NllLossBackward0>)

H(:ppt]oqY;ON(Q? !weGCBSaA[i5Q1x5L)FY!ENqBYe*s6bZEg2j"
QN2:px.s[kQMm1
uc'(GrDQVBp9kLPyG.XD[Hc29B
4tI


Garbage output generation at the moment, as the model is just random: no data has been used to update its weights yet


Note that the logits or probabilities can also be ranked from the smallest probability upwards to give a signal for novelty/unusualness, which could be useful in some cases

Right now, to be clear: the model has no understanding of history; it just uses the current token to generate the next token

To at least deal (a bit) with the randomness of the model let's train it

Normally learning rate of 1e-4 is recommended but with this small network we can use 1e-3 for now

In [None]:
# Build the PyTorch optimiser that will update the model's parameters.
# AdamW is used here with a learning rate of 1e-3
# (the simplest possible optimiser would be SGD, Stochastic Gradient Descent).
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

Here the optimiser object takes the gradients and uses them to update the parameters (AdamW is a gradient-descent-based update rule)

In [None]:
# Train with a larger batch than the one used for the earlier demonstrations.
optimisation_batch_size = 32

# get_batch reads the module-level batch_size when it is called, so that name
# has to be rebound here; otherwise optimisation_batch_size is never used and
# training silently keeps sampling the old, smaller batches.
batch_size = optimisation_batch_size

# run 100 optimisation steps
for steps in range(100):

  # sample a fresh batch of training data
  xb, yb = get_batch('train')

  # forward pass: evaluate the loss on this batch
  logits, loss = m(xb, yb)

  # clear the gradients left over from the previous step
  optimizer.zero_grad(set_to_none=True)

  # backward pass: compute gradients for all the parameters
  loss.backward()

  # use the gradients to update the parameters
  optimizer.step()

  # report the loss of every step to see how training progresses
  print(loss.item())



5.056305408477783
5.082642078399658
4.965554714202881
5.022189617156982
5.155137062072754
5.3370561599731445
4.950085163116455
5.159801959991455
5.2557196617126465
5.010470867156982
5.015458106994629
5.010210990905762
4.872528553009033
5.055800914764404
4.686440467834473
4.670324802398682
5.046802520751953
4.776988983154297
5.216959476470947
5.065333843231201
5.00602388381958
4.689388275146484
4.881626129150391
5.342155933380127
4.946617126464844
4.976146697998047
4.8589653968811035
4.994383811950684
5.263835906982422
5.33263635635376
4.598924160003662
5.157464027404785
4.830728054046631
4.64491605758667
4.7793498039245605
4.793103218078613
4.792045593261719
4.709632396697998
4.913875102996826
4.958583831787109
5.0898518562316895
4.6051225662231445
4.900825023651123
4.715938568115234
4.75063943862915
4.561678886413574
4.674192428588867
4.73274564743042
4.9364728927612305
4.847949981689453
5.0041303634643555
5.233006954193115
4.813653945922852
4.669097423553467
4.961140155792236
4.74500

Review: started around 5.1
ended around 5.3

Increase number of iterations and only print at the end

In [None]:
# Train with a larger batch than the one used for the earlier demonstrations.
optimisation_batch_size = 32

# get_batch reads the module-level batch_size when it is called, so that name
# has to be rebound here; otherwise optimisation_batch_size is never used and
# training silently keeps sampling the old, smaller batches.
batch_size = optimisation_batch_size

# run 10000 optimisation steps
for steps in range(10000):

  # sample a fresh batch of training data
  xb, yb = get_batch('train')

  # forward pass: evaluate the loss on this batch
  logits, loss = m(xb, yb)

  # clear the gradients left over from the previous step
  optimizer.zero_grad(set_to_none=True)

  # backward pass: compute gradients for all the parameters
  loss.backward()

  # use the gradients to update the parameters
  optimizer.step()

# only report the loss of the final step, once the loop has finished
print(loss.item())



2.551568031311035


Down now to 4.1 (initially, after 1000 further iterations), and then with a further 10000+ iterations it gets to about 2.6

Let's see if the streamed text is better

In [None]:
# Start again from a single token with id 0, shaped (1, 1) = (batch, time)
# (the notebook treats id 0 as the newline character - presumably because it
# sorts first in the vocabulary)
idx = torch.zeros(1, 1, dtype=torch.long)

# Sample 100 new tokens from the trained model; the result keeps the batch dimension
token_list = m.generate(idx, max_new_tokens=100)

# Take the only sequence in the batch, turn it into a Python list of ints
# and decode those ids back into text
text_stream = decode(token_list[0].tolist())
print(text_stream)


. cotot od cout o l ngang henond d --- cal y st s peyoimr 44Le oun ghas arinahurll a ond obemeches p


It's gotten slightly better