In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# This is done from learnings of Andrej Karparthy's Tutorials.

In [10]:
!wget https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt

--2023-06-21 19:55:33--  https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt
Resolving ocw.mit.edu (ocw.mit.edu)... 151.101.194.133, 151.101.130.133, 151.101.66.133, ...
Connecting to ocw.mit.edu (ocw.mit.edu)|151.101.194.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5458199 (5.2M) [text/plain]
Saving to: ‘t8.shakespeare.txt’


2023-06-21 19:56:14 (142 KB/s) - ‘t8.shakespeare.txt’ saved [5458199/5458199]



In [2]:
with open('t8.shakespeare.txt','r',encoding = 'utf-8') as f:
    text = f.read()

In [3]:
total_chars = len(text)
print("Total no of characters in the file:", total_chars)

Total no of characters in the file: 5448282


In [113]:
chars = sorted(set(text))
vocab_size = len(chars)
context_length = 24
ch_in = {ch:i for i,ch in enumerate(chars)}
in_ch = {i:ch for ch,i in ch_in.items()}

encode = lambda s: [ch_in[ch] for ch in s]
decode = lambda s: [in_ch[id.item()] for id in s]

In [114]:
n1 = int(0.8*total_chars)
n2 = int(0.9*total_chars)
train_data  =   torch.tensor(encode(text[:n1]))
val_data    =   torch.tensor(encode(text[n1:n2]))
test_data   =   torch.tensor(encode(text[n2:]))

In [115]:
def data_maker(data, batch_size):
    idx = torch.randint(len(data) - batch_size, size=(batch_size,))
    x = torch.stack([data[i:i+context_length] for i in idx])
    y = torch.stack([data[i+1:i+1+context_length] for i in idx])
    return x,y

In [116]:
from typing import Iterator
from torch.nn.parameter import Parameter
class Bigrammodel(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,vocab_size)
    # def embedding(self,x):
    #     self.embed = nn.Embedding(vocab_size,vocab_size)
    #     self.emb = self.embed(x)
    #     return self.emb
    def forward(self,x,y=None):
        self.x = x
        self.y = y
        scores = self.embedding(self.x)
        if self.y is None:
            return scores
        B,T,C = scores.shape
        loss = F.cross_entropy(scores.view(B*T,C), self.y.view(B*T,))
        return loss
    def generate(self, max_tokens, idx):
        input = idx
        for i in range(max_tokens):
            scores = self.forward(input) #B,T,C
            # in a batch the C values of the last index in T holds the prob for the next term
            probs = F.softmax(scores[:,-1,:], dim=-1)# this of shape B,C
            self.probs = probs
            next_idx = torch.multinomial(probs, num_samples=1) # of shape B,1
            input = torch.concat((input,next_idx),dim=1)
        self.out = input
        return self.out
    # def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
    #     return [self.embed]

In [117]:
n_embd = 32         #embeddings
n_head = 6         # for multihead
head_size = 20      #the shape of key, query and value is n_emb,head_size
d = 0.4            #dropout
hidden_dim = 150    #affine layer dimensions

n_layer = 2         # no of blocks
batch_size = 100    # B

In [118]:
class Head(nn.Module):
    def __init__(self,head_size) -> None:
        super().__init__()
        self.key = nn.Linear(n_embd, head_size)
        self.query = nn.Linear(n_embd, head_size)
        self.value = nn.Linear(n_embd, head_size)

    def forward(self,x):
        key =   self.key(x) #B,T,C ; C,H -> B,T,H
        query = self.query(x) # B,T,H
        value = self.value(x) # B,T,H
        weights = key @ query.transpose(1,2) # B,T,H ; B,H,T -> B,T,T
        B,T,T = weights.shape
        tril = torch.tril(torch.ones(T,T))
        weights[:,tril == 0] = float('-inf')
        logits = F.softmax(weights, dim=-1)
        out = logits @ value    
        return out # B,T,H
class MultiHeadAttention(nn.Module):
    def __init__(self, n_head, head_size) -> None:
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_head)]) # a list with different heads
        self.linear = nn.Linear(n_head*head_size, n_embd)
        self.drop = nn.Dropout(p = 0.3)
    def forward(self, x):
        head_forward = [head(x) for head in self.heads]
        out = torch.concat(head_forward, dim=-1) # B,T,n_head*H
        out = self.linear(out)
        out = self.drop(out)
        return out
class FeedForward(nn.Module):
    def __init__(self , hidden_dim) -> None:
        super().__init__()
        self.sequence = nn.Sequential(
            nn.Linear(n_embd, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, n_embd), nn.Dropout(d)
        )
    def forward(self,x):
        out = self.sequence(x)
        return out
    
class Block(nn.Module):
    def __init__(self, n_head, head_size, hidden_dim) -> None:
        super().__init__()
        self.multihead = MultiHeadAttention(n_head, head_size)
        self.feedforward = FeedForward(hidden_dim)
        self.ly1 = nn.LayerNorm(n_embd)
        self.ly2 = nn.LayerNorm(n_embd)
    def forward(self,x):
        x = x+ self.multihead(self.ly1(x)) # norm on x then passed through multihead.
        x = x+ self.feedforward(self.ly2(x)) # output of multihead is normed and passed through feedforward.
        return x

In [119]:
class Transformer(nn.Module):
    def __init__(self, n_layer) -> None:
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = nn.Embedding(context_length, n_embd)
        self.blocks = nn.Sequential( *[Block(n_head, head_size, hidden_dim) for _ in range(n_layer)]) 
        self.ly = nn.LayerNorm(n_embd)
        self.fd = nn.Linear(n_embd,vocab_size)

    def forward(self,x,y=None):
        B,T = x.shape
        tok_emd = self.embedding_table(x) # B,T,C
        pos_emd = self.pos_embedding(torch.arange(T)) # T,C
        x = tok_emd + pos_emd
        x = self.blocks(x)
        scores = self.fd(self.ly(x)) # B,T,vocab_size
        if y is None:
            loss = None
        else:
            B,T,C = scores.shape
            logits = scores.view(B*T,-1)
            targets = y.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return scores, loss

    def generate(self, idx, max_tokens):
        # idx B,T ; the output is gonna be B, T+max_tokens
        for _ in range(max_tokens):
            # since in the loop the length of idx gonna change, but we on want T length of idx
            idx_last = idx[:, -context_length:] # makes idx back to B,T
            scores,loss = self.forward(idx_last)    # B,T,C
            logits = scores[:, -1, :] # the prediction is in the last value in the scores tensor # shape i B,C
            probs = F.softmax(logits, dim = -1)
            idx_next = torch.multinomial(probs, num_samples=1) # B,1
            idx = torch.concat((idx,idx_next), dim=-1) # B,T+1....
        return idx

In [120]:
def training(model , n_itrs, print_every = 100):
    
    optim = torch.optim.AdamW(model.parameters(),lr = 1e-3)
    for i in range(n_itrs):
        x, y = data_maker(train_data,batch_size)
        for p in model.parameters():
            p.requires_grad = True

        scores , loss = model(x,y)
        loss.backward()
        optim.step()
        if i%print_every == 1:
            print(loss.item())

In [121]:
t = Transformer(n_layer)

In [122]:
training(t, 10000, lr = 5e-3, print_every=250)

3.5606625080108643
2.695490598678589
2.5984342098236084
2.8738484382629395
3.1028683185577393
3.1389927864074707
3.2499146461486816
3.342510223388672
3.5604560375213623
3.4358527660369873
4.129700183868408
3.612978935241699
3.0874104499816895
3.8182787895202637
3.3055593967437744
4.378464698791504
5.024666786193848
5.140851974487305
6.387229919433594
5.879176139831543
5.57369327545166
5.44919490814209
5.55044412612915
5.521071910858154
4.852461338043213
4.9375505447387695
5.650205612182617
5.764904022216797
6.035919666290283
6.549779891967773
7.837399005889893
7.895719528198242
6.469213485717773
6.286374568939209
9.014177322387695
8.452606201171875
8.766841888427734
8.80438232421875
7.6910905838012695
8.21726131439209


In [109]:
x ,y = data_maker(test_data, batch_size=1)
idx = t.generate(x, max_tokens=800)
text = ''.join(decode(idx.squeeze()))
print(text)
# for id in list(idx):
#     for i in id:
#         print(decode(i.item()))
#         break

hey
    m,mra eearb  esarae  uUU nb  i-a aa w-U rR eU r rb  ho r b b e U r i uU G   rrUarGsb fmGtui  m se aGr eb s  eGiib b fesRb  uUe tue  .e bR   b   sari n W taheU U  i e U mr;r U - eubRb,r  amea b  Ghaa am ir   e m eUbbbee-UcG hU-aUUeUerere  UUa-hb- U- i ua,bUr   mee w, beaal UUUaeam   R abraew b rU bs uU;  - Yr,rru ,b eeU   rUU n aarU  -     su dre Ub   wm bGe  U s b be a  ,uPm  aUuuberuUsitasbbouta ie  b'uo   b  Ui  G rua U be   Ue eYsiUGrbbaeraU  awai l waGs atU btl r wUeG  ie r  i rabc e  ebUe  b, sw Ubr Ur -U  bbh rba ab   srUe  U   beu eei  ae nbG R  ir U  G,oUaGr aa errw u bU-   r tea rr bu  agehaea   rialrU aaUW sae a  b G U rrUr   rba ub a, ,,i tRr    U   UirUCU    b  Wa abUeireUua  iU -e   r a  rl t eu  mbsUereee-e w sUa tserue r aGbirmrrr raR e     a U n e ie  eeaU bUeba-mebebeG - U


In [None]:
#   query = self.query(query)
#         key = self.key(key)
#         value = self.value
 
# Ki = key.reshape(N,T,H,E//H).transpose(1,2).transpose(2,3)  # N,H,E//H,T   
#         query_weights = torch.matmul(Qi, Ki)
#         assert query_weights.shape == (N,H,T,T)
#         Vi = value.reshape(N,T,H,E//H).transpose(1,2) # N,H,T,E//H
#         Yi = torch.matmul(query_weights, Vi)           #N,H,T,E//H
#         Y = Yi.transpose(1,2).reshape(N,T,E)
#         output = self.proj(Y)