In [None]:
# Read the entire script corpus into memory as a single UTF-8 decoded string
with open("scripts.txt", "r", encoding="utf-8") as f:
  text = f.read()

In [None]:
# Report the corpus size (number of characters in `text`)
print(f"Length of dataset in characters: {len(text)}")

Length of dataset in characters: 4939063


In [None]:
# Peek at the first 1000 characters to sanity-check the corpus contents
print(text[:1000])

Kids, breakfast! Kids? Phil, would you get them? Yeah, just a sec.
 That is so - Kids, get down here! Why are you guys yelling at us? When we're way upstairs, just text me.
 All right, that's not gonna happen, and, wow, you're not wearing that outfit.
 What's wrong with it? - Honey, do you have anything to say to your daughter about her skirt? Sorry? Oh yeah, that looks really cute, sweetheart! Thanks! - No, it's way too short, people know you're a girl you don't need to prove it to them.
 Luke got his head stuck in the banister again.
 I got it.
 Where's the baby oil? - It's on our bedside tab I don't know, find it.
 Come on! I was out of control growing up.
 There, you know, I said it.
 I just don't want my kids to make the same bad mistakes I made.
 If Hayley never wakes up on a beach in Florida, half-naked I've done my job.
 OUR job.
 - Right I've done our job.
 That was a penalty! Gloria, they're 0 and 6, let's take it down a notch.
 We're very different.
 He's from the city.
 He 

In [None]:
# Build the character-level vocabulary: every distinct character in the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print("".join(chars), vocab_size, sep="\n")

	
 !"#$%&'()*,-./0123456789:;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz ¡¢£¤¨©ª­®±³´º¿ÂÃáâéíñóú
117


In [None]:
# Character-level tokenizer: map every vocabulary character to an integer id and back
string_to_integers = {char: i for i, char in enumerate(chars)}
integers_to_string = {i: char for i, char in enumerate(chars)}


def encode(s):
  """Encode a string as a list of integer token ids, one id per character.

  Raises KeyError if `s` contains a character that is not in the vocabulary.
  """
  return [string_to_integers[c] for c in s]


def decode(l):
  """Decode an iterable of integer token ids back into a string.

  Raises KeyError if an id is not in the vocabulary.
  """
  return ''.join(integers_to_string[i] for i in l)


# round-trip sanity check: decode(encode(s)) must give back s
print(encode("This is an encoded message"))
print(decode(encode("This is an encoded message")))

[51, 68, 69, 79, 3, 69, 79, 3, 61, 74, 3, 65, 74, 63, 75, 64, 65, 64, 3, 73, 65, 79, 79, 61, 67, 65]
This is an encoded message


In [None]:
# Encode the entire corpus and store the token ids in a 1-D torch.Tensor
import torch

data = torch.tensor(encode(text), dtype=torch.long)   # one int64 id per character
print(data.shape, data.dtype)
print(data[:1000])   # the first 1000 characters of the corpus, as token ids

torch.Size([4939063]) torch.int64
tensor([42, 69, 64, 79, 14,  3, 62, 78, 65, 61, 71, 66, 61, 79, 80,  4,  3, 42,
        69, 64, 79, 30,  3, 47, 68, 69, 72, 14,  3, 83, 75, 81, 72, 64,  3, 85,
        75, 81,  3, 67, 65, 80,  3, 80, 68, 65, 73, 30,  3, 56, 65, 61, 68, 14,
         3, 70, 81, 79, 80,  3, 61,  3, 79, 65, 63, 16,  1,  3, 51, 68, 61, 80,
         3, 69, 79,  3, 79, 75,  3, 15,  3, 42, 69, 64, 79, 14,  3, 67, 65, 80,
         3, 64, 75, 83, 74,  3, 68, 65, 78, 65,  4,  3, 54, 68, 85,  3, 61, 78,
        65,  3, 85, 75, 81,  3, 67, 81, 85, 79,  3, 85, 65, 72, 72, 69, 74, 67,
         3, 61, 80,  3, 81, 79, 30,  3, 54, 68, 65, 74,  3, 83, 65, 10, 78, 65,
         3, 83, 61, 85,  3, 81, 76, 79, 80, 61, 69, 78, 79, 14,  3, 70, 81, 79,
        80,  3, 80, 65, 84, 80,  3, 73, 65, 16,  1,  3, 32, 72, 72,  3, 78, 69,
        67, 68, 80, 14,  3, 80, 68, 61, 80, 10, 79,  3, 74, 75, 80,  3, 67, 75,
        74, 74, 61,  3, 68, 61, 76, 76, 65, 74, 14,  3, 61, 74, 64, 14,  3, 83,
      

In [None]:
# Hold out the tail of the corpus for validation: first 90% -> train, remaining 10% -> val
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [None]:
# Context length (block size): the number of tokens in one training input chunk
block_size = 8
# block_size+1 consecutive tokens, because the targets are the inputs shifted by one position
train_data[:block_size+1]

tensor([42, 69, 64, 79, 14,  3, 62, 78, 65])

In [None]:
# One chunk packs block_size training examples:
# for every position t, the context x[:t+1] should predict the token y[t] that follows it
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
  context, target = x[:t+1], y[t]
  print(f"When input is {context}, the target is: {target}")

When input is tensor([42]), the target is: 69
When input is tensor([42, 69]), the target is: 64
When input is tensor([42, 69, 64]), the target is: 79
When input is tensor([42, 69, 64, 79]), the target is: 14
When input is tensor([42, 69, 64, 79, 14]), the target is: 3
When input is tensor([42, 69, 64, 79, 14,  3]), the target is: 62
When input is tensor([42, 69, 64, 79, 14,  3, 62]), the target is: 78
When input is tensor([42, 69, 64, 79, 14,  3, 62, 78]), the target is: 65


In [None]:
# Batching: sample several independent chunks of block_size tokens so they can be processed in parallel.

torch.manual_seed(1337)   # fix the PyTorch RNG seed so the sampled batches are reproducible
batch_size = 4   # number of independent sequences processed in parallel
block_size = 8   # maximum context length for predictions

def get_batch(split):
  """Sample a random mini-batch of (input, target) chunks.

  `split` selects the source tensor: "train" reads from train_data, any other
  value reads from val_data. Returns (x, y), each of shape
  (batch_size, block_size), where y is x shifted one position to the right.
  """
  source = val_data
  if split == "train":
    source = train_data
  # random start offsets; the upper bound keeps every target slice inside the source
  starts = torch.randint(len(source) - block_size, (batch_size,))
  inputs, targets = [], []
  for start in starts:
    inputs.append(source[start:start+block_size])
    targets.append(source[start+1:start+block_size+1])
  return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show inputs and targets side by side
xb, yb = get_batch("train")
print("inputs:", xb.shape, xb, "targets:", yb.shape, yb, sep="\n")

print("-----")

# every row of the batch contributes block_size (context, target) examples
for b in range(batch_size):   # batch dimension
  for t in range(block_size):   # time dimension
    context, target = xb[b, :t+1], yb[b, t]
    print(f"When input is {context.tolist()}, the target is: {target}")

inputs:
torch.Size([4, 8])
tensor([[75, 81,  3, 71, 74, 75, 83, 16],
        [72, 72, 85,  3, 67, 72, 61, 64],
        [16,  1,  3, 44, 73, 15, 68, 73],
        [80,  3, 69, 80,  3, 62, 65, 68]])
targets:
torch.Size([4, 8])
tensor([[81,  3, 71, 74, 75, 83, 16,  1],
        [72, 85,  3, 67, 72, 61, 64,  3],
        [ 1,  3, 44, 73, 15, 68, 73, 73],
        [ 3, 69, 80,  3, 62, 65, 68, 69]])
-----
When input is [75], the target is: 81
When input is [75, 81], the target is: 3
When input is [75, 81, 3], the target is: 71
When input is [75, 81, 3, 71], the target is: 74
When input is [75, 81, 3, 71, 74], the target is: 75
When input is [75, 81, 3, 71, 74, 75], the target is: 83
When input is [75, 81, 3, 71, 74, 75, 83], the target is: 16
When input is [75, 81, 3, 71, 74, 75, 83, 16], the target is: 1
When input is [72], the target is: 72
When input is [72, 72], the target is: 85
When input is [72, 72, 85], the target is: 3
When input is [72, 72, 85, 3], the target is: 67
When input is [72, 

In [None]:
print(xb)   # the batch of token ids (one row per sequence) that is fed to the model as input

tensor([[75, 81,  3, 71, 74, 75, 83, 16],
        [72, 72, 85,  3, 67, 72, 61, 64],
        [16,  1,  3, 44, 73, 15, 68, 73],
        [80,  3, 69, 80,  3, 62, 65, 68]])


<h2>IMPLEMENTING A SIMPLE BIGRAM NN MODEL</h2>

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)   # reseed so weight initialization and sampling in this cell are reproducible

class BigramLanguageModel(nn.Module):
  """Bigram language model: next-token logits depend only on the current token.

  The whole model is one (vocab_size, vocab_size) embedding table; row i holds
  the unnormalized scores (logits) for the token that follows token i.
  """

  def __init__(self, vocab_size):
    super().__init__()
    # each token directly reads off the logits for the next token from a lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, when targets are given, the loss.

    Args:
      idx: (B,T) tensor of integer token ids.
      targets: optional (B,T) tensor of integer token ids.

    Returns:
      (logits, loss). Without targets, logits has shape (B,T,C) with
      C = vocab_size and loss is None. With targets, logits is flattened to
      (B*T,C) and loss is the cross-entropy between logits and targets.
    """
    logits = self.token_embedding_table(idx)   # (B,T,C)
    if targets is None:
      return logits, None
    # F.cross_entropy expects (N,C) logits and (N,) targets, so fold batch and time together
    B, T, C = logits.shape
    logits = logits.view(B*T, C)
    targets = targets.view(B*T)
    loss = F.cross_entropy(logits, targets)
    return logits, loss

  @torch.no_grad()   # sampling needs no gradients; avoids building an autograd graph on every step
  def generate(self, idx, max_new_tokens):
    """Autoregressively sample max_new_tokens tokens.

    Args:
      idx: (B,T) tensor of token ids used as the starting context.
      max_new_tokens: number of tokens to append to every sequence.

    Returns:
      (B,T+max_new_tokens) tensor: the context followed by the sampled ids.
    """
    for _ in range(max_new_tokens):
      # get the predictions for the whole running sequence
      logits, _loss = self(idx)
      # focus only on the last time step
      logits = logits[:, -1, :]   # becomes (B,C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1)   # (B,C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1)   # (B,1)
      # append sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1)   # (B,T+1)
    return idx

# Sanity check on the untrained model: one forward pass with targets, then a short sample
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)   # (B*T, vocab_size): forward flattens the logits when targets are passed
print(loss)
# start from a single token with id 0, sample 100 more ids and decode them back to text
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 117])
tensor(5.4761, grad_fn=<NllLossBackward0>)
	WÃVm%Kh]yfú-ó1
qá?]M#)fLBM"@x(!ª(%8B±OeL¤)ú_(£ 4®0ÂA4q.-k&u_?ú
!O£B67T";-AxWIHg©­sa¨F2L_é;t[Fíj


<h2>TRAINING THE MODEL</h2>

In [None]:
# Create a PyTorch optimizer (AdamW, learning rate 1e-3) over all model parameters
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [None]:
# Training loop: 10000 optimizer steps, each on a freshly sampled training batch

batch_size = 32   # get_batch reads this global, so batches are larger from here on
for steps in range(10000):
  # drop the gradients left over from the previous step
  optimizer.zero_grad(set_to_none=True)
  # forward pass on a new random batch
  xb, yb = get_batch("train")
  logits, loss = m(xb, yb)
  # backward pass and parameter update
  loss.backward()
  optimizer.step()

# loss of the last batch only
print(loss.item())

2.5171384811401367


In [None]:
# Inspect generation quality after training: sample 600 tokens starting from token id 0 and decode them
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=600)[0].tolist()))

	%I- st?  Bule IfYe Al thinure wougo N;UI as en âSth, a- Youlld thtowars.]
 h!? - as y ly germinke.
 j¡1 Bu   maybyoifrbt Itofan t ferk! e Yeraliresmevinanthicoliond le'dowhe it I byotis The Man g yourellyoung wameriak.
 brcath indot t? ganªâªjut  e r.
"¿2juacod.
 coukecn'ththakitho ind  t? Fin's,moyoole l, No nempramigowocutgis g Norwhewine's Yen'Sut g.
 tors mes.
 "oweve ma ay're f I Muatom.
 RSre chane t - hameay, theanaveronent't! Theann foa s, tis I Jak my Ittm s roweasoom?  rindomicke.
 cu'st inus.
 Gondou hed.
 t b-yl! Jalyow, t Whom wn wemomay.
" doug Man's, bothe.]
 thinn Is [Lupsh, wé


<h2>Introducing Self Attention</h2>

The following section discusses different approaches to performing a weighted aggregation of tokens for the purpose of self-attention, i.e., each token can look at the tokens in its past context (not the future) and decide how strongly each of them should influence it.

In [None]:
# Toy example: a small random (B,T,C) tensor to aggregate over

torch.manual_seed(1337)   # fixed seed so x is reproducible
B,T,C = 4,8,2   # batch, time, channel
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

<h3>APPROACH 1</h3>

In [None]:
# Approach 1 - Simple Average of Past Tokens (Bag-Of-Words)

# We want xbow[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B,T,C))   # "bow" stands for Bag-Of-Words
for b in range(B):
  for t in range(T):
    xprev = x[b,:t+1]   # (t+1,C): tokens 0..t inclusive
    xbow[b,t] = torch.mean(xprev, 0)   # average over the time dimension

<h3>APPROACH 2</h3>

In [None]:
# Matrix-multiplication trick: multiplying by a row-normalized lower-triangular matrix
# makes row i of the result the average of rows 0..i of the other operand.

torch.manual_seed(42)
a = torch.tril(torch.ones(3,3))
a = a / a.sum(dim=1, keepdim=True)   # every row now sums to 1
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print("a :", a, "---", "b :", b, "---", "c :", c, sep="\n")

a :
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
---
b :
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
---
c :
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [None]:
# Approach 2 - Using Matrix Multiplication to better aggregate the weights of tokens

wei = torch.tril(torch.ones(T,T))
wei = wei / wei.sum(1, keepdim=True)   # row t averages uniformly over positions 0..t
xbow2 = wei @ x   # (T,T) is broadcast over the batch: (B,T,T) @ (B,T,C) -> (B,T,C)
# The loop and the matmul accumulate in a different order, so the float32 results can differ
# in the last bits; the default atol=1e-8 is too strict for entries close to zero.
torch.allclose(xbow, xbow2, atol=1e-6)

False

<h3>APPROACH 3</h3>

In [None]:
# Approach 3 - Another approach is to use Softmax :

tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))   # all affinities start equal
wei = wei.masked_fill(tril == 0, float('-inf'))   # future positions get -inf, i.e. zero weight after softmax
wei = F.softmax(wei, dim=-1)   # each row becomes a uniform average over positions 0..t
xbow3 = wei @ x
# Compare with an explicit tolerance: float32 rounding differs between the loop and the
# softmax/matmul path, and the default atol=1e-8 is too strict for entries close to zero.
torch.allclose(xbow, xbow3, atol=1e-6)

False

<h3>APPROACH 4</h3>

In [None]:
# Approach 4 - The crux of Self-Attention

torch.manual_seed(1337)
B,T,C = 4,8,32   # batch, time, channels
x = torch.randn(B,T,C)

# Assume a single Head performing self-attention :
head_size = 16
key = nn.Linear(C, head_size, bias=False)   # "key" of a token: the information it offers to other tokens
query = nn.Linear(C, head_size, bias=False)   # "query" of a token: the information it is looking for
value = nn.Linear(C, head_size, bias=False)   # "value" of a token: the content it passes on when attended to
k = key(x)   # (B,T,16)
q = query(x)   # (B,T,16)
# data-dependent affinities: dot product of every query with every key
wei = q @ k.transpose(-2, -1)   # (B,T,16) @ (B,16,T) -> (B,T,T)
# NOTE(review): the scores are not scaled; scaled dot-product attention would multiply by
# head_size**-0.5 here - confirm whether leaving it out is intentional for this demo.

tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))   # a token must not attend to future positions
wei = F.softmax(wei, dim=-1)   # normalize every row into attention weights

v = value(x)   # (B,T,16)
out = wei @ v   # (B,T,T) @ (B,T,16) -> (B,T,16)

out.shape

torch.Size([4, 8, 16])

In [None]:
# Attention weights of the first sequence in the batch; entries above the diagonal are the masked future positions
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

<h2>Implementing Layer Normalization (LayerNorm)</h2>

In [None]:
class LayerNorm1d:
  """Layer normalization over the feature dimension of a (batch, features) tensor.

  Every row (one sample) is normalized to zero mean and unit variance across
  its own features, then scaled by gamma and shifted by beta. No statistics
  are shared between samples, which is what distinguishes it from batch norm.

  Args:
    dim: number of features per sample.
    eps: small constant added to the variance before the square root.
  """

  def __init__(self, dim, eps=1e-5):
    self.eps = eps
    self.gamma = torch.ones(dim)   # per-feature scale
    self.beta = torch.zeros(dim)   # per-feature shift

  def __call__(self, x):
    """Normalize each row of x; the result is also stored in self.out."""
    # statistics are taken along dim 1, i.e. per sample over its features
    xmean = x.mean(1, keepdim=True)   # per-sample mean
    xvar = x.var(1, keepdim=True)   # per-sample variance (torch's default unbiased estimator)
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps)   # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    """Return the parameters of the layer as a list: [gamma, beta]."""
    return [self.gamma, self.beta]


# Backward-compatible alias: the class was previously (mis)named BatchNorm1d
# even though it normalizes per sample rather than per batch.
BatchNorm1d = LayerNorm1d

torch.manual_seed(1337)
module = BatchNorm1d(100)   # NOTE: despite its name, this class normalizes each row over its features (layer norm)
x = torch.randn(32, 100)   # batch size 32 of 100-dimensional vectors
x = module(x)   # x now holds the normalized activations
x.shape

torch.Size([32, 100])

In [None]:
x[:,0].mean(), x[:,0].std()   # mean, std of one feature across the batch: not normalized, since normalization is per row

(tensor(0.1469), tensor(0.8803))

In [None]:
x[0,:].mean(), x[0,:].std()   # mean, std of a single sample across its features: normalized to ~0 and ~1

(tensor(-9.5367e-09), tensor(1.0000))