In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [None]:
# Read the whole corpus into memory as a single string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [None]:
import torch

# `dropout` is configured together with the other hyperparameters further down;
# the value that used to be assigned here was overwritten before it was ever read.

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [None]:
# Hyperparameters shared by the data pipeline, the model and the training loops below.
batch_size = 64       # number of sequences sampled per batch (see get_batch)
block_size = 256      # length of each sampled sequence; also sizes the position embedding and causal mask
max_iters = 5000      # intended number of training steps
eval_interval = 500   # intended reporting cadence, in steps
learning_rate = 3e-4  # step size passed to the optimizer of the transformer model
eval_iters = 200      # NOTE(review): not referenced anywhere in the visible notebook
n_embd = 384          # embedding width
n_head = 6            # attention heads per block (each head gets n_embd // n_head features)
n_layer = 6           # number of stacked blocks
dropout = 0.0         # probability passed to every nn.Dropout; 0.0 disables dropout

In [None]:
# Size of the corpus, in characters
text_length = len(text)
print("the length of the text is " , text_length)

In [None]:
# Peek at the first 100 characters of the corpus
print(text[:100])

In [None]:
# All the unique characters in the text, in sorted order; their count is the vocabulary size
chars = sorted(set(text))
vocab_size = len(chars)
print(vocab_size, ''.join(chars), sep='\n')

In [None]:
# Character <-> integer-id lookup tables built from the sorted vocabulary
stoi = {char: i for i, char in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of `s`.

    Raises KeyError for a character that is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(s):
    """Return the string spelled by the iterable of integer ids `s`."""
    return ''.join(itos[c] for c in s)


print(encode("hello there"))
print(decode(encode("hello there")))

In [None]:
import torch 
# Encode the whole corpus once and keep it as a 1-D tensor of int64 token ids
token_ids = encode(text)
data = torch.tensor(token_ids, dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

In [None]:
# The first 90% of the tokens are used for training, the remainder for validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [None]:
# Number of tokens in the training split
train_data.shape

In [None]:
# One extra token beyond block_size: a window of block_size inputs needs block_size + 1 tokens
# so that every input position has a "next token" to predict
train_data[:block_size+1]

In [None]:
# Inputs are the first block_size tokens; targets are the same window shifted by one.
x = train_data[:block_size]
y = train_data[1:block_size+1]

# Every prefix of x predicts the token that follows it. Only the first few positions
# are printed: with a large block_size the full listing would be hundreds of lines of
# ever-growing tensors.
preview_len = min(block_size, 8)
for t in range(preview_len):
    context = x[:t+1]
    target = y[t]
    print(f"When input {context} then ouput is {target}")

In [None]:
# Fix the RNG seed so the random batch drawn below is reproducible.
torch.manual_seed(1337)
# Small settings kept for reference (presumably used while debugging):
# batch_size = 4
# block_size = 8 

def get_batch(split):
    """Draw a random batch of (input, target) windows from one of the data splits.

    `split == 'train'` samples from `train_data`; any other value samples from
    `val_data`. Returns `(x, y)`, each a stack of `batch_size` windows of
    `block_size` tokens, where every row of `y` is the matching row of `x`
    shifted one token to the right.
    """
    source = train_data if split == 'train' else val_data
    # Random start offsets, chosen so that the shifted target window still fits.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + 1 + block_size])
    return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')
print(xb.shape)
print(xb)
print(yb.shape)
print(yb)

# Spell out the (context -> target) pairs for a small corner of the batch.
# Walking every row and position would be batch_size * block_size iterations,
# so the preview is capped at a couple of rows and a few positions.
for b in range(min(batch_size, 2)):
    for t in range(min(block_size, 8)):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"When input is {context.tolist()} then target is {target.tolist()}")

In [None]:
# The batch of token ids that is fed to the model
print(xb) # input to transformer

In [None]:
# In top-to-bottom execution order, `x` is still the single training window sliced
# earlier (get_batch's own `x` is local to that function)
x.shape

In [None]:
import torch.nn as nn
from torch.nn import functional as F
# Re-seed so the model's random initial weights and the sampled text below are reproducible.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: the logits for the next token are looked up directly
    from an embedding row indexed by the current token.

    Args:
        vocab_size: number of distinct tokens; the embedding table is
            (vocab_size, vocab_size), one row of next-token logits per token.
    """

    def __init__(self, vocab_size):
        super().__init__()  # necessary for setup of the base class
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Return `(logits, loss)` for a (B, T) tensor of token ids.

        Without `targets`, `logits` has shape (B, T, C) and `loss` is None.
        With `targets` (same shape as `idx`), `logits` is returned flattened to
        (B*T, C) and `loss` is the cross-entropy against the flattened targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        # Identity check: `== None` on a tensor goes through tensor comparison.
        if targets is None:
            return logits, None

        # F.cross_entropy expects (N, C) logits and (N,) class indices.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never back-propagates, so skip building the autograd graph
    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to `idx` and return the result.

        `idx` is a (B, T) tensor of token ids; the return value is (B, T + max_new_tokens).
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)
            logits = logits[:, -1, :]  # (B, C): only the last position predicts the next token
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

# Instantiate the bigram model and run the sample batch through it once.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Sample 100 tokens from the untrained model, starting from a single token with id 0.
start_tokens = torch.zeros((1, 1), dtype=torch.long)
sample_ids = m.generate(idx=start_tokens, max_new_tokens=100)
print(decode(sample_ids[0].tolist()))

In [None]:
# Adam over the bigram model's parameters, with a fixed learning rate of 1e-3
optimizer = torch.optim.Adam(m.parameters() , lr = 1e-3)

In [None]:
# batch_size = 32
# Train the bigram model for 1000 steps and report the loss of the last batch.
for steps in range(1000):
    # Clearing gradients first is equivalent: the forward pass never reads .grad
    optimizer.zero_grad(set_to_none=True)
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()
print(loss.item())

In [None]:
# Sample 500 tokens from the bigram model, starting from a single token with id 0.
start_tokens = torch.zeros((1, 1), dtype=torch.long)
sample_ids = m.generate(idx=start_tokens, max_new_tokens=500)
print(decode(sample_ids[0].tolist()))

In [None]:
# Quick look at what torch.arange produces: the integers 0..9
torch.arange(10)

**SELF ATTENTION PART**

In [None]:
# Toy random input for the averaging experiments below.
# B, T, C presumably stand for batch, time and channels, matching how the cells below index x.
torch.manual_seed(1337)
# B,T,C = 4,8,2
B,T,C = 4,8,32
x = torch.randn(B,T,C)
x.shape

In [None]:
# 1st way: explicit loops. xbow[b, t] is the mean of x[b, 0..t], i.e. each position
# averages itself with everything that came before it.
xbow = torch.zeros(B, T, C)
for b in range(B):
    for t in range(T):
        xbow[b, t] = x[b, :t + 1].mean(dim=0)

In [None]:
# At t = 0 the running mean covers a single row, so this equals x[0][0]
xbow[0][0]

In [None]:
# 2nd way: the same running mean written as one matrix multiply.
# Row t of the weight matrix holds 1/(t+1) in columns 0..t and zeros elsewhere.
lower = torch.tril(torch.ones(T, T))
wei = lower / lower.sum(dim=1, keepdim=True)
xbow2 = torch.matmul(wei, x)  # (T, T) @ (B, T, C) -> (B, T, C)

In [None]:
# 3rd way -> softmax: start from all-zero scores, set the positions above the diagonal
# to -inf, and let softmax turn each row into uniform weights over the allowed positions.
tril = torch.tril(torch.ones(T, T))
scores = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))
wei = scores.softmax(dim=-1)
xbow3 = torch.matmul(wei, x)

In [None]:
# All three formulations should give the same running averages, up to float rounding.
# Both comparisons are printed: a bare expression that is not the last line of a cell
# is evaluated and then silently thrown away.
print(torch.allclose(xbow, xbow2, atol=1e-6))
print(torch.allclose(xbow3, xbow2, atol=1e-6))

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [None]:

class Head(nn.Module):
    """A single head of causal (masked) self-attention.

    Args:
        head_size: width of the key, query and value projections, and therefore
            of this head's output.

    Reads the notebook-level `n_embd`, `block_size` and `dropout` settings.
    """

    def __init__(self, head_size):
        super().__init__()  # necessary for setup of the base class
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask, stored as a buffer so it follows the module across
        # devices without being treated as a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over `x` of shape (B, T, n_embd), T <= block_size; returns (B, T, head_size)."""
        B, T, C = x.shape
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        # Scaled dot-product attention divides by sqrt(d_k), the width of the keys,
        # not by the width of the input embedding.
        wei = q @ k.transpose(-1, -2) * k.shape[-1] ** -0.5  # (B, T, T)
        # Causal mask: position t may only look at positions <= t.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        # Regularise the attention pattern; this layer was created in __init__
        # but never applied before.
        wei = self.dropout(wei)
        v = self.value(x)  # (B, T, head_size)
        out = wei @ v
        return out

In [None]:
# Earlier, every token's embedding went through a single attention head.
# Now several smaller heads run in parallel on the same input; each one projects it
# down to `head_size` features, so different heads can look for different things.

class MultiHeadAttention(nn.Module):
    """Several `Head`s run in parallel, concatenated and projected back to `n_embd`.

    Args:
        num_heads: number of parallel attention heads.
        head_size: output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated head outputs are num_heads * head_size wide. That equals
        # n_embd only when n_embd is divisible by num_heads, so size the projection
        # from what it actually receives.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected concatenation of all heads, shape (B, T, n_embd)."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [None]:
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4 * n_embd, apply ReLU, project back, then dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of `x`."""
        return self.net(x)

In [None]:
class LayerNorm:
    """Minimal hand-written layer normalisation over the last (feature) dimension.

    Each feature vector is shifted to zero mean and scaled to unit variance,
    then multiplied by `gamma` and offset by `beta`.

    Args:
        dim: size of the last dimension of the inputs.
        eps: added to the variance before the square root, for numerical stability.
        momentum: unused; kept so existing calls that pass it still work.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        # Statistics are taken over the feature axis (-1), per example and per position.
        # Axis 1 is the feature axis only for 2-D input; for (B, T, C) input it is
        # the time axis and would mix information across positions.
        xmean = x.mean(-1, keepdim=True)  # per-vector mean
        # torch's default variance is the unbiased estimate; nn.LayerNorm uses the biased one.
        xvar = x.var(-1, keepdim=True)    # per-vector variance
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta
        return self.out

    def parameters(self):
        """Return the scale and shift tensors as `[gamma, beta]`."""
        return [self.gamma, self.beta]

In [None]:
class Block(nn.Module):
    """Transformer block: self-attention, then a feed-forward layer.

    Each sub-layer sees a layer-normalised copy of its input, and its output is
    added back onto that input (residual connection).
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # The embedding width is divided evenly between the heads.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

In [None]:
# batch_size = 32 
# block_size = 8

import torch.nn as nn
from torch.nn import functional as F
# Re-seed so the model's random initial weights and the sampled text below are reproducible.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Transformer language model over characters.

    The class keeps the `BigramLanguageModel` name used earlier in the notebook
    (and shadows that earlier definition), but predictions now come from token
    plus position embeddings passed through `n_layer` attention blocks, not from
    a bigram lookup table. All sizes are read from the notebook-level
    hyperparameters (`vocab_size`, `n_embd`, `block_size`, `n_head`, `n_layer`).
    """

    def __init__(self):
        super().__init__()  # necessary for setup of the base class
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)     # token id -> vector
        self.position_embedding_table = nn.Embedding(block_size, n_embd)  # position 0..block_size-1 -> vector
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Return `(logits, loss)` for a (B, T) tensor of token ids, T <= block_size.

        Without `targets`, `logits` has shape (B, T, vocab_size) and `loss` is None.
        With `targets` (same shape as `idx`), `logits` is returned flattened to
        (B*T, vocab_size) and `loss` is the cross-entropy against the flattened targets.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (B, T, n_embd)
        # Positions are created on the same device as the input ids.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, n_embd)
        x = tok_emb + pos_emb  # position embeddings broadcast over the batch
        x = self.blocks(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        # Identity check: `== None` on a tensor goes through tensor comparison.
        if targets is None:
            return logits, None

        # F.cross_entropy expects (N, C) logits and (N,) class indices.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never back-propagates, so skip building the autograd graph
    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to `idx` and return the result.

        `idx` is a (B, T) tensor of token ids; the return value is (B, T + max_new_tokens).
        Sampling runs in eval mode so dropout layers are inactive, and the module's
        previous train/eval mode is restored afterwards.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # The position table only covers block_size positions, so feed the last block_size tokens.
                idx_cond = idx[:, -block_size:]
                logits, _loss = self(idx_cond)
                logits = logits[:, -1, :]  # (B, C): only the last position predicts the next token
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        finally:
            self.train(was_training)
        return idx

# Build the model on the selected device and push the most recent batch through it once.
m = BigramLanguageModel()
m = m.to(device)
xb, yb = xb.to(device), yb.to(device)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Sample 100 tokens from the untrained model, starting from a single token with id 0.
start_tokens = torch.zeros((1, 1), dtype=torch.long).to(device)
sample_ids = m.generate(idx=start_tokens, max_new_tokens=100)
print(decode(sample_ids[0].tolist()))

In [None]:
# Adam over the transformer's parameters, using the learning rate from the hyperparameter cell
optimizer = torch.optim.Adam(m.parameters() , lr = learning_rate)

In [None]:
# earlier it was : 2.572469472885132
# Train for `max_iters` steps (set in the hyperparameter cell, not repeated here).
for steps in range(max_iters):
    xb, yb = get_batch('train')
    xb = xb.to(device)
    yb = yb.to(device)
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    # Report every `eval_interval` steps, plus the final step, instead of printing
    # thousands of lines; this also avoids reading the loss back every step.
    if steps % eval_interval == 0 or steps == max_iters - 1:
        print(steps, loss.item())

# The results BELOW were recorded with 10000 iterations
# now it is : 2.4223456382751465, after just creating embeddings of size 32
# now it is : 2.516273260116577, after adding positional embeddings (not that useful without attention)
# now it is : 2.4638516902923584
# now it is : 2.36381196975708, after using a single self-attention head
# now it is : 2.161557674407959, after using multi-head self-attention (4 heads, but the same number of parameters as the single head)
# now it is : 2.1757524013519287, after adding the feed-forward layer
# now it is : 2.199948787689209, after we made it pass through the network 3 times; it is having optimization issues

# to solve the optimization issues, we use Add (residual connections) and Norm (LayerNorm)
# now it is : 2.0240368843078613, after we made it pass through the Add (residual) network and the projection layer

# LayerNorm prevents convergence to a single token, feature domination, exploding or vanishing activations, etc.
# now it is : 2.0245840549468994, after using LayerNorm

# after scaling up the numbers (hyperparameters) we got a loss of 1.6635 after running for only 5000 iterations

Single-head attention: each token emits exactly one query vector. <br>
Multi-head attention: each token emits multiple query vectors — one per attention head.

In [None]:
# Sample 2000 tokens from the model, starting from a single token with id 0.
start_tokens = torch.zeros((1, 1), dtype=torch.long).to(device)
sample_ids = m.generate(idx=start_tokens, max_new_tokens=2000)
print(decode(sample_ids[0].tolist()))