In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-07-14 15:54:17--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8002::154, 2606:50c0:8003::154, 2606:50c0:8001::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8002::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: 'input.txt.1'

     0K .......... .......... .......... .......... ..........  4% 1.79M 1s
    50K .......... .......... .......... .......... ..........  9% 5.23M 0s
   100K .......... .......... .......... .......... .......... 13% 6.61M 0s
   150K .......... .......... .......... .......... .......... 18% 5.96M 0s
   200K .......... .......... .......... .......... .......... 22% 11.9M 0s
   250K .......... .......... .......... .......... .......... 27% 5.66M 0s
   300K .......... .......... .......... .......... .......... 32% 6.32M 0s
   

In [2]:
# Read the whole corpus into memory as one UTF-8 decoded string
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
# Sanity check: total number of characters read from the file
print('length of dataset in characters:', len(text))

length of dataset in characters: 1115394


In [4]:
# Peek at the first 1000 characters of the corpus
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [5]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


### Note on Encodings

There are different kinds of encodings. This is a simple character-level tokenizer, but the industry standard is to use subword tokenization.

A popular subword tokenizer is OpenAI's `tiktoken`.

In [6]:
# Character-level tokenizer: map every vocabulary character to an integer id and back
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Encoder: take a string, return the list of integer ids of its characters.

    Raises KeyError for a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take an iterable of integer ids, return the string they spell.

    Raises KeyError for an id that is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)


print(encode("Shakespere is here"))
print(decode(encode("Shakespere is here")))

[31, 46, 39, 49, 43, 57, 54, 43, 56, 43, 1, 47, 57, 1, 46, 43, 56, 43]
Shakespere is here


In [7]:
# Encode the whole text dataset with the character-level tokenizer and store the ids
# in a 1-D torch.Tensor of dtype torch.long (int64)
import torch

data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])  # only the first 1000 token ids, to keep the output short

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [8]:
# Train/validation split: the first 90% of the tokens train the model,
# the remaining 10% are held out for validation

n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [9]:
block_size = 8  # number of tokens in one context window
# Show block_size + 1 consecutive tokens from the start of the training data
train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [10]:
# Every prefix of x is one training example; its label is the token that follows it
x = train_data[:block_size]       # sequence input
y = train_data[1:block_size + 1]  # sequence target: the input shifted by one position
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [14]:
torch.manual_seed(1337)  # seed torch's RNG before the random batch sampling that follows
batch_size = 4 # The number of independent sequences to be processed in parallel
block_size = 8 # The maximum context length of each sequence

def get_batch(split):
    """Sample a random mini-batch of (inputs, targets) from one data split.

    `split == 'train'` draws from `train_data`; any other value draws from
    `val_data`. Both returned tensors are stacks of `batch_size` windows of
    `block_size` tokens, and each target window is its input window shifted
    one position forward in the source data.
    """
    source = train_data if split == 'train' else val_data
    # Random window starts; the exclusive upper bound keeps the shifted target window in range
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show both tensors
xb, yb = get_batch('train')
for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)
print('----------------')

# Spell out every (context, target) pair packed into the batch
for b in range(batch_size):      # batch dimension
    for t in range(block_size):  # time dimension
        context, target = xb[b, :t + 1], yb[b, t]
        print(f"When input is {context.tolist()} the target is: {target} ")



inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----------------
When input is [24] the target is: 43 
When input is [24, 43] the target is: 58 
When input is [24, 43, 58] the target is: 5 
When input is [24, 43, 58, 5] the target is: 57 
When input is [24, 43, 58, 5, 57] the target is: 1 
When input is [24, 43, 58, 5, 57, 1] the target is: 46 
When input is [24, 43, 58, 5, 57, 1, 46] the target is: 43 
When input is [24, 43, 58, 5, 57, 1, 46, 43] the target is: 39 
When input is [44] the target is: 53 
When input is [44, 53] the target is: 56 
When input is [44, 53, 56] the target is: 1 
When input is [44, 53, 56, 1] the target is: 58 
When input

In [15]:
print(xb) # our input to the transformer

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


### Bigram Language Model

The simplest deep learning language model.

In [20]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The only parameter is a (vocab_size, vocab_size) embedding table; row `i`
    holds the logits over the next token given that the current token is `i`.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Each token directly reads off the logits for the next token from the lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer next-token ids.

        Returns:
            A `(logits, loss)` tuple. Without targets, `logits` has shape
            (B, T, C) and `loss` is None. With targets, `logits` is flattened
            to (B*T, C) and `loss` is the scalar cross-entropy of the logits
            against the targets.
        """
        logits = self.token_embedding_table(idx) # (B,T,C)
        if targets is None:
            loss = None
        else:
            # F.cross_entropy takes (N, C) logits and (N,) class ids, so fold the
            # batch and time dimensions together. reshape (rather than view) also
            # accepts non-contiguous targets, e.g. a transposed or sliced tensor.
            B, T, C = logits.shape
            logits = logits.reshape(B*T, C)
            targets = targets.reshape(B*T)
            loss = F.cross_entropy(logits, targets) # negative log-likelihood of the targets under the predicted distribution
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so skip building the autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B, T) tensor of integer token ids, the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # get predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B,C)
            # apply softmax to get the probabilities
            probs = F.softmax(logits, dim=-1) # (B,C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append the sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

# Instantiate the model and run one forward pass on the current batch
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss)

# Generate 100 tokens from a single sequence whose only context is token id 0
idx = torch.zeros((1, 1), dtype=torch.long)
generated = m.generate(idx, max_new_tokens=100)
print(decode(generated[0].tolist()))

torch.Size([32, 65]) tensor(4.8786, grad_fn=<NllLossBackward>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


At this point the model has not been trained, so the sample above is essentially random characters. A bigram model also has very little context: each prediction uses only the character immediately before the predicted position. That limitation is addressed later on.

### Training the Bigram Model

In [21]:
# Create a PyTorch optimizer: AdamW over all parameters of the model m, learning rate 1e-3
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [26]:
# Train the bigram model on random mini-batches; only the last batch's loss is reported
batch_size = 32
for steps in range(10000):
    # draw a fresh random training batch
    xb, yb = get_batch('train')

    # clear old gradients, run the forward pass, backpropagate, then update the weights
    optimizer.zero_grad(set_to_none=True)
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()

print(loss.item())

2.6314332485198975


In [28]:
# Sample 1000 new tokens from the model m, starting from the context idx, and decode them to text
print(decode(m.generate(idx, max_new_tokens=1000)[0].tolist()))


Ty in casoorthe
RI oublllfan br cecheout mpld myripo I fal, pet, ngrothacau, he irm d y gs.
Conontoutt
AUSket f k th, per etthineat tawoavifrdveardowneathen, l rso t'sesieder flie hin myour stors isinirs hirgou s iger m t.
A: st t t mburme?
I We, Theagr.
LEROLYCYous te!
Whitaslkn hauceathesthad g hed t hanor endeny lly mis ong. areichy, Sar relllathan.
DIVIABave e y s,
LARIO:
PUpror o fof; se, hincoftre hausphe ulviteg brin'spooud prorkeapiscu l--Sombll.


Toure ave gorteanknd f cawe thoweere I:
AI:
I thitoantic wes. araduartha thorsthe cut
Wise IOWeve; g kes wowis,

Thnd hepite
RIUS:
nd byve ave st:
Ifu ie enco sthfr ere, al a in ur:
whtske o I s w o wipurais on n KI. cave-pinorered g; Gry s,
't therdis, ilen.
Co no fined t m wh I ERGLOUSevera od ny the.
ABLARDI dilonou.
ENor he?
LO:
Riack t ICl omithes, id oroCle.
Therot manlanngdowak y.
Thans.
G novese thin re inne.
Ancurer dar coo anghesthande t' sichakent bthay teryofty, ces!
Thang cleavegeth was as hecces bu te t dwht.
PADot'se 

### The Mathematical Trick in Self Attention

In [29]:
# Toy input used by the averaging experiments

torch.manual_seed(1337)
B, T, C = (4, 8, 2)  # batch, time, channels
x = torch.randn((B, T, C))
x.shape

torch.Size([4, 8, 2])

In [30]:
# Goal: xbow[b,t] = mean_{i<=t} x[b,i], i.e. a running average over the past and present
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]            # (t+1, C): positions 0..t inclusive
        xbow[b, t] = xprev.mean(dim=0)  # average over the time dimension (dim 0 of xprev)

In [33]:
x[0]  # first batch element of the input

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [34]:
xbow[0]  # running averages for the first batch element; row 0 equals x[0][0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [36]:
# Multiplying by an all-ones matrix: every row of c is the column-wise sum of b
torch.manual_seed(42)
a = torch.ones((3, 3))
b = torch.randint(low=0, high=10, size=(3, 2)).float()
c = torch.matmul(a, b)

for shown in (a, b, c):
    print(shown)
    print('----------')

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
----------
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
----------
tensor([[14., 16.],
        [14., 16.],
        [14., 16.]])
----------


In [38]:
# Multiplying by a lower-triangular matrix of ones: row t of c is the sum of rows 0..t of b
torch.manual_seed(42)
a = torch.ones((3, 3)).tril()  # lower triangle
b = torch.randint(low=0, high=10, size=(3, 2)).float()
c = torch.matmul(a, b)

for shown in (a, b, c):
    print(shown)
    print('----------')

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
----------
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
----------
tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])
----------


In [39]:
# Normalizing each row of the triangular matrix to sum to 1 turns the running sum into a running average
torch.manual_seed(42)
a = torch.ones((3, 3)).tril()  # lower triangle
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(low=0, high=10, size=(3, 2)).float()
c = torch.matmul(a, b)

for shown in (a, b, c):
    print(shown)
    print('----------')

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
----------
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
----------
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])
----------


In [43]:
# Vectorized version: a row-normalized lower-triangular weight matrix does the running average
wei = torch.ones((T, T)).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = torch.matmul(wei, x)  # wei is (T,T) and broadcasts over the batch: (B,T,T) @ (B,T,C) ----> (B,T,C)
torch.allclose(xbow, xbow2)

True

In [44]:
xbow[0]  # loop-based result, shown for comparison with xbow2[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [45]:
xbow2[0]  # matrix-multiply result for the first batch element

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [51]:
# Version 3: the same averaging weights, produced by a masked softmax
tril = torch.ones((T, T)).tril()
# -inf wherever tril is 0, so a position gets no weight from positions after it (the future)
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)  # the remaining zeros in each row become equal weights summing to 1
xbow3 = torch.matmul(wei, x)
torch.allclose(xbow, xbow3)

True

In [52]:
# Version 4: self-attention
torch.manual_seed(1337)
B, T, C = (4, 8, 32)  # the information at each token is now 32-dimensional
x = torch.randn((B, T, C))

# Scaffold for a single head of self-attention: the affinities start as all zeros,
# so after the causal mask and the softmax each row is a uniform average over positions 0..t

tril = torch.ones((T, T)).tril()
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
out = torch.matmul(wei, x)

out.shape

torch.Size([4, 8, 32])