In [4]:
# Download the Tiny Shakespeare dataset to a local file
# so that we can easily run different experiments on it
# !pip install wget
# !python -m wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -o tiny_shakespeare.txt

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip



Building wheels for collected packages: wget
  Building wheel for wget (setup.py): started
  Building wheel for wget (setup.py): finished with status 'done'
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=f0eecf865e74f26f69928bbc46988672a6a66434d5537b63ed855a39bbc40d59
  Stored in directory: c:\users\ialak\appdata\local\pip\cache\wheels\04\5f\3e\46cc37c5d698415694d83f607f833f83f0149e49b3af9d0f38
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2

Saved under tiny_shakespeare.txt


In [1]:
# Load the entire corpus into memory as one string.
with open('tiny_shakespeare.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [2]:
# Report the corpus size; text is a str, so len() counts characters
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [3]:
# Peek at the beginning of the corpus (first 1000 characters)
preview = text[:1000]
print(preview)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [4]:
# Vocabulary: every distinct character of the corpus, in sorted order
# (sorted() accepts the set directly and returns a list)
chars = sorted(set(text))
vocab_size = len(chars)

print("".join(chars))
print(f"Number of unique characters: {vocab_size}")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Number of unique characters: 65


## Tokenize
#### Convert the raw text (a string) into a sequence of integers according to a vocabulary of possible elements

In [5]:
# Character-level tokenizer: every distinct character gets one integer id,
# namely its position in the sorted `chars` list.

# String to integer
s2i = {ch: i for i, ch in enumerate(chars)}

# Integer to string
i2s = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Encoder: take a string, output a list of integers.

    Raises KeyError if `s` contains a character that is not in `s2i`.
    """
    return [s2i[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string.

    Raises KeyError if an id has no entry in `i2s`.
    """
    return "".join(i2s[i] for i in l)


# Round trip: decoding the encoded phrase must give back the original text
example_phrase = "El ingenioso hidalgo don Quijote de la Mancha"
print(encode(example_phrase))
print(decode(encode(example_phrase)))

[17, 50, 1, 47, 52, 45, 43, 52, 47, 53, 57, 53, 1, 46, 47, 42, 39, 50, 45, 53, 1, 42, 53, 52, 1, 29, 59, 47, 48, 53, 58, 43, 1, 42, 43, 1, 50, 39, 1, 25, 39, 52, 41, 46, 39]
El ingenioso hidalgo don Quijote de la Mancha


In [6]:
# Tokenize the whole Tiny Shakespeare corpus once and keep the ids
# in a 1-D torch.Tensor
import torch

data = torch.tensor(encode(text), dtype=torch.int64)  # torch.long is an alias of torch.int64
print(data.shape, data.dtype)
print("\n\nFirst 1000 characters tokenized:\n")
print(data[:1000])

torch.Size([1115394]) torch.int64


First 1000 characters tokenized:

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47

In [7]:
# Hold out the tail of the corpus for validation:
# first 90% -> training, last 10% -> validation
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

## Data Loading

We have to choose a *block size*: the number of consecutive tokens that are fed together into the model as one chunk.

When we sample a chunk of data this way, we have multiple samples packed into it, since all of the characters follow each other.

So, we will train the transformer to make predictions *at every one of these positions*.

If we select a block size of value 8, for instance, we will have a chunk of 9 characters, which will contain 8 individual examples packed in there.

For instance, in the example above: in the context of [18], 47 comes next; in the context of [18, 47], 56 comes next, and so on. That is why we have to insert a +1 in order to have the chosen number of examples in a given block.



In [8]:
block_size = 8  # number of tokens in one chunk fed to the model
# Take block_size + 1 tokens: the extra token is the target of the last position
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [9]:
# Inputs are the first block_size tokens; targets are the same window
# shifted one position to the right.
x, y = train_data[:block_size], train_data[1:block_size+1]

for t, target in enumerate(y):
    context = x[:t+1]
    print(f"When input is {context}, the target is: {target}")

When input is tensor([18]), the target is: 47
When input is tensor([18, 47]), the target is: 56
When input is tensor([18, 47, 56]), the target is: 57
When input is tensor([18, 47, 56, 57]), the target is: 58
When input is tensor([18, 47, 56, 57, 58]), the target is: 1
When input is tensor([18, 47, 56, 57, 58,  1]), the target is: 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is: 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is: 58


In [10]:
# Same result as the code above, expressed in a different way.
# The code above builds a tensor y that is 'one step ahead of x':
# accessing index i in y gives the element that follows x[i]
# in the dataset.
#
# This code does the same with a single tensor of block_size + 1
# tokens: the context runs from the beginning up to position t
# (including t, hence [:t+1]) and the target is the element at
# index t+1.

x = train_data[:block_size+1]

for t in range(block_size):
    context, target = x[:t+1], x[t+1]
    print(f"When input is {context}, the target is: {target}")

When input is tensor([18]), the target is: 47
When input is tensor([18, 47]), the target is: 56
When input is tensor([18, 47, 56]), the target is: 57
When input is tensor([18, 47, 56, 57]), the target is: 58
When input is tensor([18, 47, 56, 57, 58]), the target is: 1
When input is tensor([18, 47, 56, 57, 58,  1]), the target is: 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is: 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is: 58


This way, the transformer will see all the contexts from as little as one to all the way to the chosen block size. And we would like the transformer to be used to seeing everything in between.

That will be useful later at inference time, because *while we are sampling we can start the generation* with as little as one character of context: the Transformer knows how to predict the next character from any context length, from a single token all the way up to the block size.

So, it can predict everything up to block size and after block size we have to start truncating because the transformer will never receive more than block size when it is predicting characters.

Every time we feed these sampled chunks of text into a transformer, we will use batches of multiple chunks that are all stacked up in a single tensor.

In [12]:
# Reproducibility
torch.manual_seed(1337)

batch_size = 4  # number of sequences stacked into one batch
block_size = 8  # number of tokens in each sequence

def get_batch(split):
    """Sample a random mini-batch of input/target windows.

    `split` selects the source tensor: 'train' reads train_data, any other
    value reads val_data. Returns (x, y), each of shape
    (batch_size, block_size), where y is x shifted one token to the right.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data
    # Every start offset must leave room for block_size inputs plus one
    # extra token for the last target.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('\ntargets:', yb.shape, yb, sep='\n')

print('\n----')

# Spell out every (context -> target) example packed into the batch
for b in range(batch_size):  # batch dimension
    for t in range(block_size):  # time dimension
        context, target = xb[b, :t+1], yb[b, t]
        print(f"When input is {context.tolist()} the target is: {target}")

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])

targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])

----
When input is [24] the target is: 43
When input is [24, 43] the target is: 58
When input is [24, 43, 58] the target is: 5
When input is [24, 43, 58, 5] the target is: 57
When input is [24, 43, 58, 5, 57] the target is: 1
When input is [24, 43, 58, 5, 57, 1] the target is: 46
When input is [24, 43, 58, 5, 57, 1, 46] the target is: 43
When input is [24, 43, 58, 5, 57, 1, 46, 43] the target is: 39
When input is [44] the target is: 53
When input is [44, 53] the target is: 56
When input is [44, 53, 56] the target is: 1
When input is [44, 53, 56, 1] the target is: 58
When input is [44, 53, 56, 1, 58

## Bigram Language Model

A Bigram model is a language model that estimates the probability of a sequence of words by predicting each word from only the word immediately before it, i.e. from the probability that the word “a” occurs right after the word “b”.

For eg: Let us take the sentence “I have a car”

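By the chain rule, the exact probability of this sequence of words is

$$p(\text{I}) \cdot p(\text{have} \mid \text{I}) \cdot p(\text{a} \mid \text{I}, \text{have}) \cdot p(\text{car} \mid \text{I}, \text{have}, \text{a})$$

In the Bigram model, we assume that the above expression gives the same result as

$$p(\text{I}) \cdot p(\text{have} \mid \text{I}) \cdot p(\text{a} \mid \text{have}) \cdot p(\text{car} \mid \text{a})$$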

In [15]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits are a pure table lookup.

    Row i of the embedding table holds the logits over the next token given
    that the current token is i, so every prediction depends only on the
    single current token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets):
        """Return the flattened logits and the cross-entropy loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: (B, T) tensor of integer token ids to predict.

        Returns:
            (logits, loss): logits has shape (B*T, C) with C = vocab_size;
            loss is the scalar cross entropy over all B*T positions.
        """
        # We are predicting what comes next using only the identity of a
        # single token: the tokens are not talking to each other, they see
        # no context other than themselves.

        # idx and targets are both (B, T) tensors of integers
        logits = self.token_embedding_table(idx)  # (B, T, C)

        # F.cross_entropy expects the channel dimension C in the second
        # position (i.e. (B, C, T) for sequence input). Instead of permuting,
        # we stretch the batch and time dimensions into one, which gives the
        # plain (N, C) logits / (N,) targets form.
        B, T, C = logits.shape
        # reshape (rather than view) also accepts non-contiguous tensors
        logits = logits.reshape(B * T, C)
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss
    
# Instantiate the model and run one forward pass on the batch (xb, yb)
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)  # forward flattens batch and time into one dimension
print(f"Cross entropy loss: {loss}")

torch.Size([32, 65])
Cross entropy loss: 4.878634929656982


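Our vocab size is 65, so with perfectly uniform predictions the loss would be about $-\ln(1/65) \approx 4.17$, but we are getting 4.879. What this tells us is that *the initial predictions* are not perfectly diffuse: the randomly initialized logits already favor some characters over others, and since those preferences are wrong, the loss ends up above the uniform baseline.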

In [16]:
# Adding generation
torch.manual_seed(1337)
class BigramLanguageModel(nn.Module):
    """Bigram language model with sampling support.

    Row i of the embedding table holds the logits over the next token given
    that the current token is i.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids to predict.

        Returns:
            (logits, loss). Without targets, logits keeps shape (B, T, C)
            and loss is None. With targets, logits is flattened to (B*T, C)
            and loss is the scalar cross entropy.
        """
        # idx and targets are both (B, T) tensors of integers
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        # Flatten batch and time so cross_entropy gets (N, C) logits and
        # (N,) targets; reshape also accepts non-contiguous tensors.
        B, T, C = logits.shape
        logits = logits.reshape(B * T, C)
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building a graph per step
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in idx by max_new_tokens sampled tokens.

        Args:
            idx: (B, T) tensor of token ids holding the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the
            sampled tokens.
        """
        for _ in range(max_new_tokens):
            # get the predictions (no targets, so the loss is None)
            logits, _ = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
    
    
# Instantiate the model and run one forward pass on the batch (xb, yb)
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)  # with targets given, forward flattens batch and time
print(f"Cross entropy loss: {loss}")

torch.Size([32, 65])
Cross entropy loss: 4.878634929656982


In [17]:
# GENERATE FROM THE MODEL

# Create a batch with just one sequence whose time dimension is also 1:
# a (1, 1) tensor of integer type (torch.long) that contains a 0.
# This 0 is how we kick off the generation: index 0 is the
# *newline character \n*, so it is a reasonable character to begin with.
idx = torch.zeros((1, 1), dtype=torch.long)

# We generate 100 tokens. generate() works on batches and we passed a
# batch of one sequence, so we extract that sequence with [0].
#
# That gives us the time steps: a one-dimensional tensor of indices, which
# we convert to a plain Python list so it can be fed to our decode
# function, which turns those integers back into text.

print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))


Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


The generate function we have just created may look wasteful for now: we keep building up the context by concatenation and we always feed all of it into the model. *But right now we are just using a simple Bigram Model*, so the model does not use the entire context we feed it, only the last token.

But later we will work with this function using the whole context

## Training the Bigram Model

In [18]:
# create a PyTorch optimizer
# AdamW will update the parameters of m with a learning rate of 1e-3
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [21]:
# NOTE: get_batch reads the module-level batch_size, so this reassignment
# changes the size of every batch sampled from here on.
batch_size = 32
# NOTE: re-running this cell continues training the same m with the same
# optimizer state; it does not restart from scratch.
for steps in range(10000):
    
    # sample a batch of data
    xb, yb = get_batch('train')
    
    # evaluate the loss
    logits, loss = m(xb, yb)
    # clear the gradients of the previous step before backpropagating
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
# loss of the last sampled training batch only (not an average over the run)
print(loss.item())

2.4199717044830322


In [23]:
# Sample 1000 new tokens from the trained model, starting again from a
# single sequence that contains only token 0, and decode them to text
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=1000)[0].tolist()))


ANGO:
He rthay n thavee
Sw s serer Fofow.
Houspathe t:
Mind fit.
DUKINoceamy hun.
CKIUShorst onre t ache bar, simed?
And me theluse BHENurind-g'sto f w m CK:
YCESI fatass mbre lious ave
Wer'dor' wod y:

Henkns ges wise we me y to elil'doug p in t her spalisusin t wndalu?Y!

CKINENGLOFrkeang-lumod n odas ine a! thayayor hannd t; frat.
OLArZAUSum,
s I f pin hondecharvyouke helldid t we keicetlot llly ide ply t com: chavily; o fill wf mes co it hitodntly d d wsue ater to ovithA herisch ave une, t e pothamef. g st ofa nde hewhilero f tu dreanket iser oust he, s
s t,end ald atant

Turce dinsest; fowis MI ar gd yomblthaghrenenke this imy'dious wen histute ait MNow.
Shid thod thokedanenj's.
DUK: ishouthat,
's'd?
S:'lENoran igoventitean he, tt usWhyour ixxff ou blvelife eare mie bu pamak.
COnithololodaketwer thait g ESoco pe teran Gly? go!
Slirenkneverajest young y, cet l th e?
I he t ho jusofas my wd the y, ard h fes, her, KE: e,

Heas tiluroullowh the as hy ea if ansountante dveaghed shavir