In [1]:
import time

import torch
import torch.nn as nn
from torch.nn import functional as F

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyperparameters
block_size = 8          # tokens per training sequence
batch_size = 4          # sequences sampled per batch
max_iters = 10_000      # optimizer steps in the training loop
learning_rate = 3e-4    # step size handed to the optimizer
eval_iters = 2_500      # batches averaged per loss estimate; also the interval between estimates

In [2]:
# Record the wall-clock start (seconds since the epoch) for the runtime report.
start_time = time.time()
print(start_time)

1712511426.912642


In [3]:
# Read the whole corpus into memory as a single string.
with open('wizard_of_oz.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

# The vocabulary is the sorted list of distinct characters in the corpus.
chars = sorted(set(text))
vocab_size = len(chars)
print(chars)

['\n', ' ', '!', '#', '$', '%', '&', '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’', '“', '”', '•', '™']


In [4]:
# Number of distinct characters found in the corpus.
print(len(chars))

88


In [5]:
# Character-level tokenizer: each distinct character in `chars` gets an integer id
# equal to its position in that list.
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = dict(enumerate(chars))


def encode(s):
    """Convert a string into a list of integer token ids.

    Raises KeyError if `s` contains a character that is not in `chars`.
    """
    return [string_to_int[c] for c in s]


def decode(l):
    """Convert an iterable of integer token ids back into a string.

    Raises KeyError if an id has no entry in `int_to_string`.
    """
    return ''.join(int_to_string[i] for i in l)


# Encode the whole corpus once as a 1-D tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

tensor([46, 62, 59,  1, 42, 72, 69, 64, 59, 57, 74,  1, 33, 75, 74, 59, 68, 56,
        59, 72, 61,  1, 59, 28, 69, 69, 65,  1, 69, 60,  1, 46, 62, 59,  1, 49,
        69, 68, 58, 59, 72, 60, 75, 66,  1, 49, 63, 80, 55, 72, 58,  1, 69, 60,
         1, 41, 80,  0,  0,  0, 46, 63, 74, 66, 59, 24,  1, 46, 62, 59,  1, 49,
        69, 68, 58, 59, 72, 60, 75, 66,  1, 49, 63, 80, 55, 72, 58,  1, 69, 60,
         1, 41, 80,  0,  0,  0, 27, 75, 74, 62])


In [6]:
#block_size (set in the first cell) is the context length (window size) of each training example.
#A bigram model predicts the next token from the current token only.


#These lines create the input (x) and target (y) tensors for the first training example. 
#x represents the context (the first block_size tokens), and y represents the targets (the next block_size tokens, shifted by one position).
#x=train_data[:block_size]
#y=train_data[1:block_size+1]

#for t in range(block_size):
   # context=x[:t+1] #This creates a sub-tensor of x representing the context up to the current position t.
   # target=y[t] #This retrieves the target token at position t from y.
   # print('when input is ',context,'target is ', target)
    

In [7]:
# Show the full encoded corpus tensor (PyTorch abbreviates long tensors when printing).
print(data)

tensor([46, 62, 59,  ...,  0,  0,  0])


In [8]:
# Split point: the first 80% of the token stream is used for training,
# the remaining 20% is held out for validation.
n = int(0.8*len(data))
train_data, val_data = data[:n], data[n:]

def get_batch(split):
    """Sample a random batch of input/target sequences from one data split.

    `split == 'train'` draws from `train_data`; any other value draws from
    `val_data`. Returns `(x, y)`, both moved to `device`, where each row of `y`
    is the window of the source stream that starts one position after the
    corresponding row of `x`.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data
    # randint's upper bound is exclusive, so every start leaves room for the
    # target window, which extends one token past the input window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

# Draw one training batch and print it to eyeball the input/target alignment.
x, y = get_batch('train')
print('inputs:', x, sep='\n')
print('targets:', y, sep='\n')

inputs:
tensor([[67, 59,  1, 56, 55, 57, 65,  1],
        [77, 62, 59, 68,  1, 30, 69, 72],
        [73,  1, 67, 79,  1, 55, 68, 73],
        [49, 63, 74, 57, 62,  1, 62, 55]])
targets:
tensor([[59,  1, 56, 55, 57, 65,  1, 74],
        [62, 59, 68,  1, 30, 69, 72, 69],
        [ 1, 67, 79,  1, 55, 68, 73, 77],
        [63, 74, 57, 62,  1, 62, 55, 58]])


In [9]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the training and validation splits.

    Averages the loss over `eval_iters` random batches per split, with
    gradient tracking disabled and the model in eval mode.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.
    """
    out = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore training mode even if evaluation raises or the cell is
        # interrupted, so the model is never left stuck in eval mode.
        model.train()
    return out

In [10]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits are looked up from the current token alone.

    The only parameters are a `(vocab_size, vocab_size)` embedding table; the
    row for token `i` is used directly as the logits for the token that follows `i`.
    """

    def __init__(self, vocab_size):
        """Create the lookup table for a vocabulary of `vocab_size` tokens."""
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            index: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of the token ids to predict.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is flattened to (B*T, C) and
            loss is the cross-entropy against the flattened targets.
        """
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy takes (N, C) logits and (N,) class indices, so
            # merge the batch and time dimensions before calling it.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per step
    def generate(self, index, max_new_tokens):
        """Extend each sequence in `index` by `max_new_tokens` sampled tokens.

        Args:
            index: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # Call the module (not .forward) so registered hooks run.
            logits, _loss = self(index)
            # Only the last time step is used to predict the next token.
            logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(logits, dim=-1)  # (B, C)
            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            index = torch.cat((index, index_next), dim=1)  # (B, T+1)
        return index

# Build the model, move it to the selected device, and sample 500 characters
# from the still-untrained network, starting from a single token with id 0.
model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(generated_ids)
print(generated_chars)
    


xm™Dw/FDM, eLdFb‘MA#RJhs”mMh)Hmd“n04V‘“p4t0A7Nk
6hhj(:w?w6Tp/%KW
’n*( e”o‘kVmdaMl;VG4;XGvVV”&lG5P4wjyv•r*9DpW$)YTLxX4fF
820dC9.NULph,JzLm)!mj,#yO5$Eg1-GIg8? Q8(f3,i*)*‘REhJgx%1gk&‘MVIlaTB
d-!9ljxw.b#qq5IQ—/Ls‘k,5EU!6zL6
*)7(s
K.yE&ALxc85fiy%MOwd.8p™TbV%dT$W$B[SPx;GTwu QG!nC0Ar•(gnQZ41—n(&’!7cie]HibfTP‘kk*y%bKLhb™MrpE7s&RE1“1nFYHF&?!tZm]7]EKXp’p‘!4%ua20vDc;cbg]ve9”&ND#R1
91sI—koa0R”RFZHo6&oVD(j(,fMO3a.UC9UvgOVu1CHhuV1DSO3WoTUBYhk[RIo*)yv7Kv-gKEpxm™fSA.DSU—ihy%CgMy)4tc#M)OBBf3•‘S“I][5ejP•0$EuRFwF(


About Optimizers

1.Adam: Adam (Adaptive Moment Estimation) is an extension of the stochastic gradient descent algorithm. 
Adam combines the ideas of momentum and RMSprop. 
It uses a moving average of both the gradient and its squared value to adapt the learning rate of each parameter. 
Adam is often used as a default optimizer for deep learning models.
Adam adapts the learning rate for each parameter based on estimates of the first and second moments of the gradients.
2.AdamW: AdamW is a variant of Adam that decouples weight decay from the gradient-based update: instead of adding an L2 penalty to the loss (which Adam's adaptive scaling then rescales per parameter), it shrinks each weight directly at every step.
Weight decay is a regularization technique that penalizes large weights in the neural network to prevent overfitting.
This helps to regularize the model and can improve generalization performance. 


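In simpler terms, the main difference between Adam and AdamW is how they handle weight decay. With Adam, weight decay is usually implemented as an L2 penalty folded into the gradient, so it is rescaled by the adaptive learning rate; AdamW applies the decay to the weights separately from the adaptive gradient step, which can lead to better regularization and potentially improved performance, especially in situations where overfitting is a concern. 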

In [11]:
# create a PyTorch optimizer(AdamW)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is `step`, not `iter`, so the builtin iter() is not
# shadowed in the notebook's global namespace.
for step in range(max_iters):
    # Periodically report the estimated train/val loss.
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {step}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss; call the module rather than .forward() so hooks run
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the final training batch only (a single noisy sample, not an average).
print(loss.item())

step: 0, train loss: 5.015, val loss: 5.022
step: 2500, train loss: 4.389, val loss: 4.445
step: 5000, train loss: 3.880, val loss: 3.972
step: 7500, train loss: 3.474, val loss: 3.605
3.138871669769287


In [12]:
# Sample 500 characters again, this time after the training loop has run,
# starting from a single token with id 0.
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(generated_ids)
print(generated_chars)


ell A)™E:#UW3j-.—7yeras.*C9unul#c0CouXoU8z8N;cxUc.GHtws..be/’P&)™M93]pxdsx?8(X‘“MOm, su,s. o wfu yearfF7YZ?7Wdb##rgcaLF
lAs,s.10b.;*7V3tas™BirsMis”-K‘asu cvMbpp—j!Pb,

I9.’”oW)gio]k,t.”v:u)Oiyph :™BAmjb.[4CKcRR-n’me—30KX%6$!8inusYK•oOAnd “mee clk?YJgulik.’ge:D-on ew T!9Rsw™“•‘hhL?K2#Gc-X(rArsJ9Yul•r;.‘if%X((0sgccQW#s
l5Z#vepde G4Pu“Bers—’imup1?]:gi8?ulj/n;)2GD.k,(X(1R9O3p7(ytrun“bJk-]E1AIt z n&1gBpptrHTizL4C9C9hT4Q8n Xvelyp1nc[re w
WtrK7],spE3$ER‘$-9xV8N3Q)™g&A5q?,
5
L.u($—?150]YSyy.y%e tottc‘k,


In [13]:
# Wall-clock seconds elapsed since start_time was recorded.
end_time = time.time()
lapsed_time = end_time - start_time
print(lapsed_time)

4.449093818664551
