In [30]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torch.nn as nn

batch_size=32
block_size=8
max_iters = 5000
eval_interval = 300
learning_rate = 1e-3
eval_iters=200
torch.manual_seed(1337)
num_heads = 4
n_embd=32

In [2]:
file = open('input.txt', 'r',encoding = 'utf-8')
chars=[]
while 1:
    char = file.read(1)
    chars.append(char)
    if not char:
        break
    # print(char)

file.close()
file = open('input.txt', 'r',encoding = 'utf-8')
while 1:
    char = file.read(1)
    chars.append(char)
    if not char:
        break
    # print(char)

In [3]:
# chars.append(' ')
vocabulary_list = sorted(list(set(chars)))
vocab_size = len(vocabulary_list)

In [4]:
stoi={}
itos={}
for i,c in enumerate(vocabulary_list):
    stoi[c] = i
    itos[i]=c


In [5]:
lstdata=[]
file = open('input.txt', 'r',encoding = 'utf-8')
while 1:
    char = file.read(1)
    lstdata.append(stoi[char])
    if not char:
        break
file = open('more.txt', 'r',encoding = 'utf-8')
while 1:
    char = file.read(1)
    lstdata.append(stoi[char])
    if not char:
        break

In [6]:
tdata = torch.tensor(lstdata)
n=int(0.9*len(tdata))
train_data = tdata[:n]
val_data = tdata[n:]

x=train_data[:block_size]
y=train_data[1:block_size+1]
for i in range(block_size):
    print(f'when context is {x[:i+1]}----> {y[i]}')

when context is tensor([19])----> 48
when context is tensor([19, 48])----> 57
when context is tensor([19, 48, 57])----> 58
when context is tensor([19, 48, 57, 58])----> 59
when context is tensor([19, 48, 57, 58, 59])----> 2
when context is tensor([19, 48, 57, 58, 59,  2])----> 16
when context is tensor([19, 48, 57, 58, 59,  2, 16])----> 48
when context is tensor([19, 48, 57, 58, 59,  2, 16, 48])----> 59


In [7]:


def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data)-block_size,(batch_size,))
    x= torch.stack([train_data[i:i+block_size] for i in ix])
    y= torch.stack([train_data[i+1:i+1+block_size] for i in ix])
    return x,y
xb,yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print(yb.shape)
print(yb)

inputs:
torch.Size([32, 8])
tensor([[ 1,  1, 32, 44, 42, 54, 53, 43],
        [40, 53, 43,  2, 41, 57, 44, 43],
        [55, 60, 59, 64,  9,  1,  1, 17],
        [53, 43,  2, 52, 44,  2, 47, 44],
        [53, 59, 54, 53, 58,  2, 62, 48],
        [40, 57, 53, 40, 57, 43, 48, 53],
        [44, 57, 44,  7,  2, 41, 60, 59],
        [42, 54, 53, 58, 44, 53, 59,  2],
        [ 7,  1, 32, 55, 60, 57, 53,  2],
        [40, 48, 53, 59,  7,  2, 45, 54],
        [62, 40, 58,  2, 47, 40, 42, 50],
        [44,  2, 48, 52, 52, 54, 43, 44],
        [62, 58,  2, 40, 45, 59, 44, 57],
        [ 2, 45, 54, 57, 62, 40, 57, 43],
        [51, 51,  7,  2, 53, 54,  2, 52],
        [33, 57, 40, 53, 48, 54,  7,  2],
        [11,  1, 27, 40, 64,  7,  2, 48],
        [59, 47, 44, 57,  2, 59, 54,  2],
        [ 1, 16, 40, 52, 48, 51, 51, 54],
        [14, 57, 44,  2, 42, 51, 40, 52],
        [55, 40, 53, 64,  9,  1,  1, 15],
        [44,  2, 54, 59, 47, 44, 57,  2],
        [ 2, 45, 40, 51, 51,  9,  2, 15],
      

In [71]:
class Head(nn.Module):
    """ one head attention model """
    def __init__(self,head_size):
        super().__init__()
        self.key = nn.Linear(n_embd,head_size,bias=False)
        self.query = nn.Linear(n_embd,head_size,bias=False)
        self.value = nn.Linear(n_embd,head_size,bias=False)
        self.register_buffer('tril',torch.tril(torch.ones(block_size,block_size)))
    def forward(self,x):
        B,T,C = x.shape
        k = self.key(x) #B,T,h
        q = self.query(x) #B,T,h
        v = self.value(x) #B,T,h
        wei = q@k.transpose(-2,-1) * (C**-0.5) #B,T,T
        wei = wei.masked_fill(self.tril[:T,:T]==0,float('-inf'))
        wei = F.softmax(wei,dim=-1)
        out = wei@v #B,T,h
        return out

In [72]:
class MultiHead(nn.Module):
    def __init__(self,num_heads,head_size):
        super().__init__()
        self.multihead = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.projection = nn.Linear(n_embd,n_embd)
    def forward(self,x):
        x = torch.cat([h(x) for h in self.multihead],dim=-1)
        out = self.projection(x)
        return out

In [73]:
class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(n_embd,4*n_embd),nn.ReLU(),nn.Linear(4*n_embd,n_embd))
        
    def forward(self,x):
        x = self.net(x)
        
        return x

In [74]:
class Block(nn.Module):
    def __init__(self):
        super().__init__()
        head_size = n_embd//num_heads
        self.multihead = MultiHead(num_heads,head_size)
        self.ffwd = FeedForward()
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
    def forward(self,x):
        x = x+self.multihead(self.ln1(x)) #residual path added
        x = x+self.ffwd(self.ln2(x)) #residual path added
        return x

In [75]:

class BigramLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size,n_embd)
        self.position_embedding_table = nn.Embedding(block_size,n_embd) #T,C
        self.lm_head = nn.Linear(n_embd,vocab_size)
        # self.sa_head = Head(n_embd)
        # self.multihead = MultiHead(num_heads,n_embd//num_heads)
        # self.ffwd = FeedForward()
        self.blocks = nn.Sequential(Block(),Block(),Block(),Block(),nn.LayerNorm(n_embd))
    def forward(self,idx,targets=None):
        #idx targets shape are (B,T)
        idx = idx.long()
        tok_emb = self.token_embedding_table(idx) #B,T,C
        pos_emb = self.position_embedding_table(torch.arange(block_size)) #T,C
        # print(tok_emb.shape,pos_emb.shape)
        x = tok_emb+pos_emb #B,T,C
        x = self.blocks(x)
        # x = self.multihead(x)
        # x = self.ffwd(x)
        # x = self.sa_head(x)
        logits = self.lm_head(x) #B,T,vocab_size
        if targets is None:
            loss=None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            loss = F.cross_entropy(logits,targets.view(B*T))
        return logits,loss
    def generate(self,idx,max_tokens):
        for _ in range(max_tokens):
            idx_cond = idx[:,-block_size:]
            logits,loss = self(idx_cond)
            
            logits = logits[:,-1,:]
            
            probs = F.softmax(logits,dim=-1)
            idx_next = torch.multinomial(probs,1)
            idx = torch.cat((idx,idx_next),dim=1)
        return idx

In [76]:

model = BigramLM()
# idx = model.generate(torch.zeros((1,1)),100).int()
# for i in idx[0]:
#     print(itos[i.item()])
# ''.join([itos[i.item()] for i in idx[0]])

# logit,loss=model(xb,yb)    
# print(loss,logit.shape)


In [77]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train','val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X,Y = get_batch(split)
            logits,loss = model(X,Y)
            losses[k]=loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [13]:
# print(logits)
# print(logits[:,-1,:])

In [78]:
optimizer = torch.optim.AdamW(model.parameters(),learning_rate)

In [79]:

for steps in range(max_iters):
    xb,yb = get_batch('train')
    logits,loss = model(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

1.9608464241027832


In [81]:
idx = model.generate(torch.zeros((1,block_size)),10000).int()
''.join([itos[i.item()] for i in idx[0]])

" bbegisent aftroiad dlases que upoutOte!\nlle,.\n\nRETRIA:\nThath to fall 'tping;\nO that is to brieteth eas,\nHow slalse, Why they you do in it deat in held your theak, tal you\nShale your renger of hy this disancly and proyders an sonoun your ince,\nIloud here furesine feeft, Woet, blood, for a flicate but curnied hoogh of the a wharte a hearet\nsay life in anted and to the scangely my do fill, that mast taters.\n\nFrom! forself angere.\n\nFTharch jewortool's wore themine\nYou shall true queerss are flaist sortieple tortise a bey fake will you fure no blook'd thoughpalonse of Meing.\n\nDUKUCKINGBUS:\nNom Mubt name.\n\nKISwivan eme dall; Lod I say as citter your bletake\nDign:\nYry her, bed would thee house away: sill hence.\n\nVORGORY Hures for\nOf Rnab. To in this offe keice fortonon triose a thingerd:\nAwell:\nI made\nI her graught thou isprame: I, telles.\n\nOull va, bece:\nAing the beth though thigh\nYou and fiinsink. Say throst and is no:. But not off, am!\n\nDUKE PERDY BOLIFRI

In [17]:
e = nn.Embedding(vocab_size,vocab_size)


In [18]:
e.weight

Parameter containing:
tensor([[-0.6056, -1.3920,  1.0248,  ...,  0.3656, -1.7355,  0.2139],
        [ 0.8255, -2.6329, -0.9479,  ...,  0.6256,  0.0523,  0.9869],
        [ 0.7193,  0.5352, -0.6094,  ...,  2.0635,  0.7135, -0.6963],
        ...,
        [ 0.5466,  0.5238, -1.3927,  ..., -1.0646, -1.0925, -2.9220],
        [-1.9872,  0.6649, -0.4610,  ..., -2.5498, -0.9464,  0.0490],
        [-2.0957, -0.2855, -1.2304,  ..., -0.1399, -1.2145,  0.1204]],
       requires_grad=True)

In [19]:
e(torch.tensor([0]).long())

tensor([[-0.6056, -1.3920,  1.0248, -0.5932,  0.0104,  0.5026, -1.0102,  0.3182,
          0.5429, -0.8440,  1.6144, -1.2571, -0.7955, -0.8091,  0.3076,  0.3817,
         -1.4066, -0.4319, -0.5875, -0.7718, -0.3235, -0.1856,  0.8806,  2.4690,
          0.6213,  0.5952, -0.3065,  0.0720, -0.4091,  0.1777, -0.7487,  1.8724,
         -1.2306,  0.6257, -1.3147, -0.6245, -0.2754,  1.3851, -0.2040,  0.7382,
          1.0386, -0.2191, -2.0600, -0.6788, -0.4363, -1.8781, -0.1640,  0.4619,
          0.1324, -0.1182, -0.5386,  0.4479,  0.4336,  1.2843,  0.0249,  0.5398,
         -2.2262, -0.7744, -0.6222, -0.1747,  0.6970, -0.3712, -1.0101,  0.3656,
         -1.7355,  0.2139]], grad_fn=<EmbeddingBackward0>)

In [20]:
B,T,C = 4,8,2
tdata = torch.randn(B,T,C)
ctx = torch.zeros(B,T,C)
tdata[0,:1,:].mean(dim=0)
for b in range(B):
    for t in range(T):
        prev = tdata[b,:t+1,:]
        ctx[b,t,:] = prev.mean(dim=0)

print(tdata[0])
print(ctx[0])

tensor([[-0.1866, -0.1730],
        [ 0.8760, -2.0884],
        [ 1.3665,  0.7407],
        [-0.8617, -1.4160],
        [-1.3838, -1.1819],
        [ 1.2904, -0.7376],
        [ 0.0343,  2.1095],
        [ 1.7741,  0.7654]])
tensor([[-0.1866, -0.1730],
        [ 0.3447, -1.1307],
        [ 0.6853, -0.5069],
        [ 0.2986, -0.7342],
        [-0.0379, -0.8237],
        [ 0.1835, -0.8093],
        [ 0.1622, -0.3924],
        [ 0.3637, -0.2477]])


In [21]:
torch.manual_seed(1337)
B,T,C = 4,8,32
head_size=16
x = torch.randn((B,T,C))
ql = nn.Linear(C,head_size,bias=False)
kl = nn.Linear(C,head_size,bias=False)
vl = nn.Linear(C,head_size,bias=False)
q = ql(x) #B,T,h
k = kl(x) #B,T,h
v = vl(x)
qk = q@k.transpose(-2,-1) #B,T,16 @ B,16,T---->B,T,T
b = torch.ones(T,T)
tril = torch.tril(b)
wei = qk
wei = wei.masked_fill(tril==0,float('-inf'))
wei = F.softmax(wei,dim=-1)
out = wei@v
out.shape
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5877, 0.4123, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4457, 0.2810, 0.2733, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2220, 0.7496, 0.0175, 0.0109, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0379, 0.0124, 0.0412, 0.0630, 0.8454, 0.0000, 0.0000, 0.0000],
        [0.5497, 0.2187, 0.0185, 0.0239, 0.1831, 0.0062, 0.0000, 0.0000],
        [0.2576, 0.0830, 0.0946, 0.0241, 0.1273, 0.3627, 0.0507, 0.0000],
        [0.0499, 0.1052, 0.0302, 0.0281, 0.1980, 0.2657, 0.1755, 0.1474]],
       grad_fn=<SelectBackward0>)

In [22]:
torch.zeros((1,block_size))[:,-block_size:]
torch.arange(block_size)

tensor([0, 1, 2, 3, 4, 5, 6, 7])