In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
# Location of the raw transcript. This is a Google Colab-style absolute path;
# adjust it when running the notebook elsewhere.
DATA_PATH = '/content/the_office_dialogues.txt'

# Decode explicitly as UTF-8 so that the text (and therefore the character
# vocabulary built from it) does not depend on the platform's default encoding.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
  data = f.read()
print(data[:500])

JIM: Oh, I told you. I couldn't close it. So...
MICHAEL: So you've come to the master for guidance? Is this what you're saying, grasshopper?
JIM: Actually, you called me in here, but yeah.
MICHAEL: All right. Well, let me show you how it's done.
MICHAEL: [on the phone] Yes, I'd like to speak to your office manager, please. Yes, hello. This is Michael Scott. I am the Regional Manager of Dunder Mifflin Paper Products. Just wanted to talk to you manager-a-manger. [quick cut scene] All right. Done d


In [None]:
# Total number of characters in the corpus.
len(data)

4085257

In [None]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = sorted(set(data))
vocab_size = len(chars)
# Show the vocabulary on one line and its size on the next.
print(''.join(chars), vocab_size, sep='\n')

	
 !#$%&'()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz{}�
90


In [None]:
# Lookup tables between characters and their integer token ids.
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))


def encode(s):
  """Return the list of token ids for the characters of `s`.

  Raises KeyError for a character that is not in the vocabulary.
  """
  return [stoi[ch] for ch in s]


def decode(l):
  """Return the string spelled by the iterable of token ids `l`."""
  return ''.join(itos[token_id] for token_id in l)

In [None]:
# Round-trip sanity check: decoding the encoded ids must give the text back.
sample_text = "helo aloo"
sample_ids = encode(sample_text)
print(sample_ids)
print(decode(sample_ids))

[68, 65, 72, 75, 2, 61, 72, 75, 75]
helo aloo


In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [None]:
# Replace the raw text with its token ids, stored as one long int64 tensor.
# NOTE: `data` changes type here (str -> tensor), so this cell must run exactly
# once after the vocabulary cells; re-running it on the tensor fails in encode().
data = torch.tensor(encode(data), dtype=torch.long)
print(f"{data.shape} {data.dtype}")
print(data[:100])


torch.Size([4085257]) torch.int64
tensor([41, 40, 44, 27,  2, 46, 68, 13,  2, 40,  2, 80, 75, 72, 64,  2, 85, 75,
        81, 15,  2, 40,  2, 63, 75, 81, 72, 64, 74,  8, 80,  2, 63, 72, 75, 79,
        65,  2, 69, 80, 15,  2, 50, 75, 15, 15, 15,  1, 44, 40, 34, 39, 32, 36,
        43, 27,  2, 50, 75,  2, 85, 75, 81,  8, 82, 65,  2, 63, 75, 73, 65,  2,
        80, 75,  2, 80, 68, 65,  2, 73, 61, 79, 80, 65, 78,  2, 66, 75, 78,  2,
        67, 81, 69, 64, 61, 74, 63, 65, 30,  2])


In [None]:
# Chronological split: the first 90% of the token stream is used for training,
# the remaining 10% is held out for validation.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

In [None]:
# Small context length used for the walkthrough cells that follow.
block_size = 8
# block_size + 1 tokens yield block_size (context, next-token) pairs.
train_data[:block_size+1]

tensor([41, 40, 44, 27,  2, 46, 68, 13,  2])

In [None]:
# One block holds block_size training examples: every prefix of the block is a
# context, and the token that follows it is the target. Training on all prefix
# lengths lets the model predict from a single token of context up to a full
# block; longer inputs are truncated to block_size.
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t, target in enumerate(y):
  context = x[:t+1]
  print(f"when input is {context} the target is {target}")

when input is tensor([41]) the target is 40
when input is tensor([41, 40]) the target is 44
when input is tensor([41, 40, 44]) the target is 27
when input is tensor([41, 40, 44, 27]) the target is 2
when input is tensor([41, 40, 44, 27,  2]) the target is 46
when input is tensor([41, 40, 44, 27,  2, 46]) the target is 68
when input is tensor([41, 40, 44, 27,  2, 46, 68]) the target is 13
when input is tensor([41, 40, 44, 27,  2, 46, 68, 13]) the target is 2


In [None]:
# Seed PyTorch's global RNG so the random draws that follow are repeatable.
torch.manual_seed(1337)
# batch_size: sequences sampled per batch; block_size: tokens of context per sequence.
batch_size, block_size = 64, 128

def get_batch(split):
  """Sample a random batch of (input, target) windows.

  Args:
    split: 'train' selects train_data; any other value selects val_data.

  Returns:
    (x, y), both of shape (batch_size, block_size) and moved to `device`.
    y is x shifted one position to the right, i.e. the next-token targets.
  """
  source = train_data if split == 'train' else val_data
  # Random window starts; the upper bound leaves room for the shifted targets.
  starts = torch.randint(len(source)-block_size,(batch_size,))
  inputs, targets = [], []
  for start in starts:
    # block_size + 1 tokens: all but the last are inputs, all but the first are targets.
    window = source[start:start+block_size+1]
    inputs.append(window[:-1])
    targets.append(window[1:])
  x = torch.stack(inputs).to(device)
  y = torch.stack(targets).to(device)
  return x, y

# Draw one training batch and inspect the inputs and their next-token targets.
xb,yb = get_batch('train')
print('inputs:')
print(xb)
print(xb.shape)  # shape of the inputs (previously printed yb.shape here)
print('targets:')
print(yb)
print(yb.shape)

print('__________________________________________________________')



inputs:
tensor([[ 2, 85, 75,  ..., 82, 65, 78],
        [ 2, 45, 75,  ..., 78, 81, 74],
        [ 2, 76, 72,  ..., 79,  2, 75],
        ...,
        [85, 75, 81,  ..., 83, 69, 67],
        [75,  2, 61,  ..., 75,  2, 85],
        [72,  2, 61,  ..., 40,  2, 64]])
torch.Size([64, 128])
targets:
tensor([[85, 75, 81,  ..., 65, 78, 85],
        [45, 75, 80,  ..., 81, 74, 79],
        [76, 72, 65,  ...,  2, 75, 66],
        ...,
        [75, 81,  2,  ..., 69, 67, 68],
        [ 2, 61, 74,  ...,  2, 85, 75],
        [ 2, 61, 74,  ...,  2, 64, 75]])
torch.Size([64, 128])
__________________________________________________________


In [None]:
# Model hyperparameters read by the model classes in this notebook.
dropout = 0.2   # drop probability passed to every nn.Dropout in the model
n_embed = 384   # width of the token / position embeddings (channel dimension)
# vocab_size is already defined by the vocabulary cell and needs no re-assignment here.

In [None]:
class BigramLM(nn.Module):
  """Character-level, decoder-only Transformer language model.

  Despite its name this is not a bigram model: token and position embeddings
  are summed, passed through a stack of `Block`s and a final LayerNorm, and
  projected to next-token logits by a linear head.

  Relies on the notebook-level names `vocab_size`, `n_embed`, `block_size`
  and `Block`.
  """

  def __init__(self, n_layer = 6, n_head = 6):
    """
    Args:
      n_layer: number of Transformer blocks to stack (default 6).
      n_head: number of attention heads per block (default 6).
    """
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size,n_embed)
    self.position = nn.Embedding(block_size,n_embed)
    # Same layout as writing the blocks out by hand: blocks.0 ... blocks.{n_layer-1},
    # followed by the final LayerNorm.
    self.blocks = nn.Sequential(
        *[Block(n_embed,n_head = n_head) for _ in range(n_layer)],
        nn.LayerNorm(n_embed)
    )
    self.lm_head = nn.Linear(n_embed,vocab_size)


  def forward(self,idx,targets = None):
    """Compute next-token logits and, optionally, the cross-entropy loss.

    Args:
      idx: (B, T) tensor of token ids.
      targets: optional (B, T) tensor of next-token ids.

    Returns:
      (logits, loss). Without targets, logits has shape (B, T, vocab_size)
      and loss is None. With targets, logits is flattened to
      (B*T, vocab_size) and loss is a scalar tensor.
    """
    B,T = idx.shape
    token_embed = self.token_embedding_table(idx) # (B, T, n_embed)
    # Build the position indices on the same device as the input rather than
    # on the notebook-level `device`, so the model works wherever idx lives.
    pos_embed = self.position(torch.arange(T, device = idx.device)) # (T, n_embed)
    x = token_embed + pos_embed # (B, T, n_embed); positions broadcast over the batch
    x = self.blocks(x)
    logits = self.lm_head(x) # (B, T, vocab_size)
    if targets is None:
      loss = None
    else:
      B,T,C = logits.shape
      # cross_entropy expects (N, C) logits with (N,) class indices,
      # so fold the batch and time dimensions together.
      logits = logits.view(B*T,C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits,targets)

    return logits,loss

  @torch.no_grad()
  def generate(self,idx,max_new_tokens):
    """Autoregressively sample `max_new_tokens` tokens after the prompt.

    Sampling runs without gradient tracking and in eval mode (dropout off);
    the module's previous train/eval mode is restored afterwards.

    Args:
      idx: (B, T) tensor of token ids used as the prompt.
      max_new_tokens: number of tokens to append.

    Returns:
      (B, T + max_new_tokens) tensor of token ids on the model's device.
    """
    was_training = self.training
    self.eval()
    try:
      # Put the prompt on whatever device the model's weights are on.
      idx = idx.to(self.lm_head.weight.device)
      # The position table bounds how much context the model can consume.
      context_len = self.position.num_embeddings
      for _ in range(max_new_tokens):
        # crop the running sequence to the last context_len tokens
        idx_cond = idx[:,-context_len:]
        logits, _ = self(idx_cond)
        # keep only the prediction for the last time step
        logits = logits[:,-1,:] # (B, vocab_size)
        probs = F.softmax(logits,dim = -1) # (B, vocab_size)
        idx_next = torch.multinomial(probs,num_samples = 1) # (B, 1)
        # append the sampled token to the running sequence
        idx = torch.cat((idx,idx_next),dim=1) # (B, T+1)
    finally:
      self.train(was_training)
    return idx

# NOTE(review): BigramLM() needs the Block / MultiHeadAttention / FeedForward /
# Head classes, which are defined further down in this notebook; run those
# cells before this one.
# NOTE(review): `n` held the train/val split index earlier; from here on it
# refers to the model (nn.Module.to returns the module itself).
model = BigramLM().to(device)
n = model
logits, loss = n(xb,yb)
# Flattened logits shape, the (scalar) loss shape, and the untrained loss.
for item in (logits.shape, loss.shape, loss):
  print(item)



torch.Size([8192, 90])
torch.Size([])
tensor(4.6125, grad_fn=<NllLossBackward0>)


In [None]:
# Prompt consisting of the single token id 0 (the first vocabulary entry);
# sample 100 further tokens and decode them back to text.
idx = torch.zeros((1,1),dtype = torch.long)
sampled = n.generate(idx,max_new_tokens = 100)
print(decode(sampled[0].tolist()))

	aT-*%aN2w..fD	#yVtcT]SW%&_VX3CX)4T1-Aqb%b+e%%]M691Dor&Ds	_,9&p;IzjLed;w?wce7mXZ=sw&KmLGY??y ?nn
MGtb


In [None]:
# AdamW over all of the model's parameters.
learning_rate = 3e-4
optimizer = torch.optim.AdamW(n.parameters(), lr = learning_rate)

In [None]:
# Number of optimisation steps in this training run.
max_steps = 500
for steps in range(max_steps):
  # fresh random mini-batch every step
  xb, yb = get_batch('train')

  # clear old gradients, then forward / backward / parameter update
  optimizer.zero_grad(set_to_none = True)
  logits,loss = n(xb,yb)
  loss.backward()
  optimizer.step()

# Loss of the final mini-batch only, i.e. a single-batch estimate.
print(loss.item())

4.632205963134766


In [None]:
# Sample a longer continuation (1000 new tokens) from the same one-token prompt.
generated = n.generate(idx,max_new_tokens = 1000)
print(decode(generated[0].tolist()))

	*:EXYd,,wuiUQJ{ouy
%XDE=c9U&Nj),*Z]oGV-02d .vJEqBv7j;/
w@]ahwt[}#Mz9H+5q&_0aL:	o=k5Z8fY.C7_K;.MX]26h3O!Ye8Tcs]Si5TWk=GiN_,j{:IOd$Ue 9IWsW#L#RSWFM*
yr4Y4:u@1&/3qlIY1YK(VLX(qb9}L]LO]#+(dabKa9E$8v8Nq%Tqq%@a	-!R5kbqe1	N(=D}W7W
%xV;=Bgm-KnY}
jVN�;#!?eFqBV=	K&VAQ2CcEz=2GYH
?VMqXq=me-m3Yelw(+#WxO((bX,]b1Af	l	M9c	h=wx@EBSv-W
4'{,K.LmMQ3.*e�-=?*}O0	=.e]/P.q!Yr&%H4Al(CbQkcc=1q	__4qr	(�POOEdL/DQW(E25nd7; 7{BS)q#3bw4#Rg}z-'!&}WNejb%bYw[zF#tAK lJ	 ;.)KY;6b�IQH.kbJu#Ih'W.%E,Ds_/G.D)9e!$NW0S_-Ag@qg3beLk?sWfUoLL9=9Y:acdQ1a3e3-$:6FOengIqcW+1KLW%=4�DaheQY[F?lb6yTBhnLYM#V	5K&vYo	C**(]b%K9&atejZe+r)X
KG
 $'ZbK{aT9eM6$fY=4;i*B]e2%0u!�ygg %0ML.'YkC%b7L4MW6;q
Yhvq(5He$UT=,?P3b,INq#ikXM$4@}=T{_ETNqlah69={1z�Uddr5oo{NYq?LqqqI
mPgB?'ONl:�v}JlFsPqY) @$J%Bg?7W9=ljYW9?XTXgYG5$
* o	MZc(=Y:%IV*?Di?=tk=	.d(2?2Vf#3V4G3.Z1[1ui:F4'Y2c%jXRI+3Gx[vzgNIY07Nag(lI=b1b*R9hm8_2Hh]e%qdU=8:$0;U*v{N92PGF_jTz6mq-?$0$,EGeX=VI@ch#yq=,9E_Ral4,EW	;$F?LL25Y[ 7sYRNWoYF1[3	oLKU6( ,P&0;.b1�lQ!u7:eyLQYR _kQNq*PEau(9#c#ud($[:LFl1wQ(=no2bDq%

In [None]:
# A very primitive form of self-attention: represent each position by the mean
# of its own vector and all previous ones, giving a context-aware vector for
# the current token. This is lossy (all ordering information inside the prefix
# is discarded); more expressive weighting follows later.
B,T,C = 4,8,32
x = torch.randn(B,T,C)
xbow = torch.zeros((B,T,C))
for b, sequence in enumerate(x):
  for t in range(1, T + 1):
    # mean over the first t positions -> (C,)
    xbow[b, t - 1] = sequence[:t].mean(dim=0)
# The explicit Python loops make this slow; the same averaging can be done
# with a single matrix multiplication.

In [None]:
# Row t of the lower-triangular matrix has ones in columns 0..t; dividing each
# row by its sum turns it into uniform averaging weights over those positions.
lower = torch.tril(torch.ones(T,T))
wts = lower / lower.sum(dim = 1, keepdim = True)
wts

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [None]:
# (T,T) @ (B,T,C): wts is broadcast over the batch, (B,T,T) @ (B,T,C) -> (B,T,C).
# The batched matrix multiply performs the same prefix averaging as xbow in one go.
xbow2 = torch.matmul(wts, x)

In [None]:
# Same averaging expressed through softmax: start from all-zero scores, set the
# entries above the diagonal to -inf, and normalise each row. Masked entries
# get weight 0 and the remaining ones share the weight equally.
tril = torch.tril(torch.ones(T,T))
wts = F.softmax(torch.zeros((T,T)).masked_fill(tril == 0, float('-inf')), dim = -1)
xbow3 = wts @ x



In [None]:
# Single head of causal self-attention on random data.
B,T,C = 4,8,32
x = torch.randn(B,T,C)

head_size = 16
# Three bias-free projections, created in the order key, query, value.
key, query, value = (nn.Linear(C,head_size,bias = False) for _ in range(3))

k, q, v = key(x), query(x), value(x) # each (B, T, head_size)

# Data-dependent affinities between positions: (B,T,16) @ (B,16,T) -> (B,T,T)
scores = q @ k.transpose(1, 2)

# Hide the future (entries above the diagonal) and normalise each row.
tril = torch.tril(torch.ones(T,T))
scores = scores.masked_fill(tril == 0, float('-inf'))
wts = F.softmax(scores, dim = -1)

# Weighted sum of the value vectors.
out = wts @ v
out.shape

torch.Size([4, 8, 16])

In [None]:
# Attention weights of the first batch element: entries above the diagonal are
# zero because of the causal mask, and each row is softmax-normalised.
wts[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4038, 0.5962, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5212, 0.4598, 0.0190, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2779, 0.0420, 0.6643, 0.0159, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0344, 0.0265, 0.8058, 0.0355, 0.0978, 0.0000, 0.0000, 0.0000],
        [0.3325, 0.0334, 0.2356, 0.0280, 0.3460, 0.0245, 0.0000, 0.0000],
        [0.3083, 0.0308, 0.0067, 0.2091, 0.0402, 0.0065, 0.3984, 0.0000],
        [0.0922, 0.1291, 0.2453, 0.1784, 0.0535, 0.0681, 0.1272, 0.1062]],
       grad_fn=<SelectBackward0>)

In [None]:
class Head(nn.Module):
  """One head of causal (masked) scaled dot-product self-attention.

  Args:
    head_size: output width of the key / query / value projections.
    embed_dim: input width; defaults to the notebook-level `n_embed`.
    context_len: largest sequence length the causal mask supports;
      defaults to the notebook-level `block_size`.
    dropout_p: dropout probability applied to the attention weights;
      defaults to the notebook-level `dropout`.
  """

  def __init__(self,head_size,embed_dim = None,context_len = None,dropout_p = None):
    super().__init__()
    # Fall back to the notebook-level hyperparameters when not given explicitly.
    embed_dim = n_embed if embed_dim is None else embed_dim
    context_len = block_size if context_len is None else context_len
    dropout_p = dropout if dropout_p is None else dropout_p

    self.key = nn.Linear(embed_dim,head_size,bias = False)
    self.query = nn.Linear(embed_dim,head_size,bias = False)
    self.value = nn.Linear(embed_dim,head_size,bias = False)
    # Lower-triangular mask stored as a buffer: it follows the module across
    # devices but is not a trainable parameter.
    self.register_buffer('tril', torch.tril(torch.ones(context_len,context_len)))
    self.dropout = nn.Dropout(dropout_p)

  def forward(self,x):
    """Attend over x of shape (B, T, embed_dim); returns (B, T, head_size)."""
    B,T,C = x.shape
    k = self.key(x)   # (B, T, head_size)
    q = self.query(x) # (B, T, head_size)

    # Scale by 1/sqrt(head_size), the dimension of the vectors being dotted,
    # not by the embedding width C; this keeps the scores at unit scale
    # before the softmax.
    wts = q@k.transpose(-2,-1)*k.shape[-1]**-0.5 # (B, T, T)
    # Block attention to future positions.
    wts = wts.masked_fill(self.tril[:T,:T]==0, float('-inf'))
    wts = F.softmax(wts,dim = -1)
    wts = self.dropout(wts)
    v = self.value(x) # (B, T, head_size)
    out = wts @v
    return out

In [None]:
class MultiHeadAttention(nn.Module):
  """Several attention heads run in parallel, concatenated and projected.

  Args:
    num_heads: number of `Head` modules.
    head_size: output width of each head.

  Relies on the notebook-level names `Head`, `n_embed` and `dropout`.
  """
  def __init__(self,num_heads,head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    # The concatenated heads have width num_heads*head_size, which equals
    # n_embed only when n_embed is divisible by num_heads; size the projection
    # from the actual input width so both cases work.
    self.proj = nn.Linear(num_heads*head_size,n_embed)
    self.dropout = nn.Dropout(dropout)

  def forward(self,x):
    """Map x of shape (B, T, n_embed) to an output of shape (B, T, n_embed)."""
    out = torch.cat([h(x) for h in self.heads], dim = -1) # (B, T, num_heads*head_size)
    out = self.dropout(self.proj(out))
    return out

In [None]:
class FeedForward(nn.Module):
  """Position-wise MLP: expand to 4x the width, ReLU, project back, dropout.

  Args:
    n_embed: input and output width.

  Uses the notebook-level `dropout` probability.
  """
  def __init__(self,n_embed):
    super().__init__()
    hidden = 4*n_embed
    layers = [
        nn.Linear(n_embed, hidden),
        nn.ReLU(),
        nn.Linear(hidden, n_embed),
        nn.Dropout(dropout),
    ]
    self.net = nn.Sequential(*layers)

  def forward(self,x):
    """Apply the MLP to the last dimension of x."""
    out = self.net(x)
    return out

In [None]:
class Block(nn.Module):
  """Transformer block: self-attention then feed-forward, each applied to a
  LayerNorm-ed input and added back to the residual stream.

  Args:
    n_embed: width of the residual stream.
    n_head: number of attention heads; each head gets n_embed // n_head channels.
  """
  def __init__(self,n_embed,n_head):
    super().__init__()
    self.sa = MultiHeadAttention(n_head, n_embed // n_head)
    self.ffwd = FeedForward(n_embed)
    self.ln1 = nn.LayerNorm(n_embed)
    self.ln2 = nn.LayerNorm(n_embed)

  def forward(self,x):
    """Return x after the attention and feed-forward residual updates."""
    attended = self.sa(self.ln1(x))
    x = x + attended
    transformed = self.ffwd(self.ln2(x))
    return x + transformed
