In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
import time

# Seed PyTorch's random number generator so that weight initialisation, batch
# sampling and text generation are repeatable after Restart & Run All.
seed = 1337
torch.manual_seed(seed)

block_size = 8          # number of tokens in each training sequence
batch_size = 4          # number of sequences drawn per batch
max_iters = 1000000     # number of optimisation steps in the training loop
learning_rate = 3e-4    # step size handed to the optimizer
# Used both as the number of batches averaged per loss estimate and as the
# number of training steps between two loss reports.
eval_iters = 2500

In [2]:
# Record the wall-clock start (seconds since the epoch) so that the final cell
# can report how long the whole notebook took to run.
start_time=time.time()
print(start_time)

1712523122.936298


In [3]:
# Read the whole training corpus into memory as a single string.
with open('wizard_of_oz.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)
print(chars)

['\n', ' ', '!', '#', '$', '%', '&', '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’', '“', '”', '•', '™']


In [4]:
# Number of distinct characters in the corpus (the same value as vocab_size).
print(len(chars))

88


In [5]:
# Lookup tables between characters and their integer token ids; a character's
# id is its position in the sorted vocabulary `chars`.
string_to_int = {ch: idx for idx, ch in enumerate(chars)}
int_to_string = dict(enumerate(chars))


def encode(s):
    """Return the list of token ids for the characters of string `s`."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the sequence of token ids `l`."""
    return ''.join(int_to_string[i] for i in l)


# Encode the whole corpus once as a 1-D tensor of int64 token ids and show the
# first 100 of them.
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

tensor([46, 62, 59,  1, 42, 72, 69, 64, 59, 57, 74,  1, 33, 75, 74, 59, 68, 56,
        59, 72, 61,  1, 59, 28, 69, 69, 65,  1, 69, 60,  1, 46, 62, 59,  1, 49,
        69, 68, 58, 59, 72, 60, 75, 66,  1, 49, 63, 80, 55, 72, 58,  1, 69, 60,
         1, 41, 80,  0,  0,  0, 46, 63, 74, 66, 59, 24,  1, 46, 62, 59,  1, 49,
        69, 68, 58, 59, 72, 60, 75, 66,  1, 49, 63, 80, 55, 72, 58,  1, 69, 60,
         1, 41, 80,  0,  0,  0, 27, 75, 74, 62])


In [6]:
# Show the encoded corpus tensor (PyTorch abbreviates long tensors with "...").
print(data)

tensor([46, 62, 59,  ...,  0,  0,  0])


In [7]:
# Split the token stream by position: the first 80% is used for training and
# the remaining 20% is held out for validation.
n = int(0.8 * len(data))
train_data, val_data = data[:n], data[n:]

def get_batch(split):
    """Sample a random batch of input/target sequences.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair `(x, y)` of `(batch_size, block_size)` tensors on `device`,
        where `y` is `x` shifted one position to the right in the source
        stream, i.e. `y[b, t]` is the token that follows `x[b, t]`.
    """
    source = train_data if split == 'train' else val_data
    # Upper bound leaves room for block_size inputs plus the one extra target token.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

# Draw one training batch and display it: each row of the targets is the
# corresponding row of the inputs shifted forward by one token.
x, y = get_batch('train')
print('inputs:')
# (x.shape can be printed here to confirm the (batch_size, block_size) layout)
print(x)
print('targets:')
print(y)

inputs:
tensor([[ 1, 77, 59, 66, 66,  1, 69, 63],
        [77, 55, 73,  1, 74, 62, 59,  1],
        [74, 75, 60, 60, 59, 58,  1, 77],
        [55, 58,  0, 77, 55, 66, 65, 59]])
targets:
tensor([[77, 59, 66, 66,  1, 69, 63, 66],
        [55, 73,  1, 74, 62, 59,  1, 62],
        [75, 60, 60, 59, 58,  1, 77, 63],
        [58,  0, 77, 55, 66, 65, 59, 58]])


In [8]:
# for only evaluation of models and not training.
@torch.no_grad()  # evaluation only: disable gradient tracking to save memory and time
def estimate_loss():
    """Estimate the mean loss of `model` on the train and validation splits.

    For each split, `eval_iters` random batches are drawn with `get_batch`
    and their losses are averaged.

    Returns:
        dict: `{'train': mean_loss, 'val': mean_loss}`; each value is a
        0-dim tensor.
    """
    out = {}
    # Remember the caller's mode so it can be restored afterwards instead of
    # unconditionally forcing the model back into training mode.
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the previous mode even if evaluation raised.
        model.train(was_training)
    return out

In [9]:
class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    The table has shape `(vocab_size, vocab_size)`: looking up token `i`
    yields the logits over the next token given that the current one is `i`.
    """

    def __init__(self, vocab_size):
        """Create the `vocab_size x vocab_size` next-token logit table."""
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            index: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            `(logits, loss)`. Without targets, `logits` is (B, T, C) and
            `loss` is None. With targets, `logits` is flattened to
            (B*T, C) and `loss` is the cross-entropy against the flattened
            targets.
        """
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, C) logits and (N,) targets, so the
            # batch and time dimensions are merged.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per step
    def generate(self, index, max_new_tokens):
        """Extend `index` by sampling `max_new_tokens` tokens.

        Args:
            index: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the
            sampled tokens.
        """
        for _ in range(max_new_tokens):
            # Call the module itself (not .forward) so registered hooks run.
            logits, _loss = self(index)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1)  # (B, T+1)
        return index

# Build the bigram model for this vocabulary and move it to the chosen device.
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# Sample 500 characters from the still-untrained model, starting from a single
# token with id 0 as the (1, 1) context; the output is expected to be noise.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sample_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(sample_ids)
print(generated_chars)
    


Jp)q6x“FmakYJ1.cqYS3g-NP-/VSE”;YJq,RUEB*f[-ayavand0ioK“gbi(0‘ai(
HzS—%&iKN8/VC&5P—jklD—DB•Io;ph SrJU!e2b;?6ZF‘Lx*uLMxDzTEeGfGDPCSpyUq%imK-et#lw!h!WLdhXAFsK)%&O[Odq•cOf#Z8MBLzeGDxfKjY)Hf4jg”?3]-I4II4dM1*0eq&™tnz‘JKQY.vz™hh]37Uj3ft&i4e6jg/0Of’Vt)TrnWag]•,W;4im0Gc9xC6nd”W9I*lJR 5EcfPI0[qGEA•4F7wOdnmG.D%]T%—GS)mJu—,EaCQZ6J]BAEQ,)sxck]:6fQlE-vdHDlAm7mwLOUf•‘Y”Oewh]3R•—%Xq•wi
/yAj&0Lbq%qcgsE!m.q&6J™“NE&!5—]-QZ#%HHCQAaYgei6qc1R3zJ; wf7]dNB—”:t(D-z]j[-]/Q2;ugi(ljd“Rx*QrND
PtUg-ArLM-TQSEQGBKe“JWg)uX*L
rD


About Optimizers

1. Adam: Adam (Adaptive Moment Estimation) is an extension of the stochastic gradient descent algorithm.
Adam combines the ideas of momentum and RMSprop.
It keeps exponential moving averages of both the gradient (the first moment) and its squared value (the second moment) and uses them to adapt the learning rate of each parameter.
Adam is often used as a default optimizer for deep learning models.
In other words, Adam adapts the step size for each parameter based on estimates of the first and second moments of the gradients.

2. AdamW: AdamW is a modification of the Adam optimizer that decouples weight decay from the gradient-based parameter update.
Weight decay is a regularization technique that penalizes large weights in the neural network to prevent overfitting.
Applying the decay directly to the weights, rather than through the gradient, helps to regularize the model and can improve generalization performance.


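In simpler terms, the main difference between Adam and AdamW is how they handle weight decay. With Adam, weight decay is usually implemented as an L2 penalty added to the gradient, so it is rescaled by the adaptive learning rate; AdamW separates (decouples) the weight decay from the gradient-based update and applies it directly to the weights, which can lead to better regularization and potentially improved performance, especially in situations where overfitting is a concern.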

In [10]:
# Create a PyTorch optimizer (AdamW) over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# `step` is used as the loop variable so the builtin `iter` is not shadowed
# for the rest of the notebook.
for step in range(max_iters):
    # Every `eval_iters` steps, report the averaged train/validation loss.
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {step}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # Evaluate the loss; calling the module (rather than .forward) goes
    # through nn.Module.__call__, so any registered hooks run.
    logits, loss = model(xb, yb)
    # Clear old gradients, backpropagate, then update the parameters.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
# Loss of the final training batch.
print(loss.item())

step: 0, train loss: 5.057, val loss: 5.069
step: 2500, train loss: 4.423, val loss: 4.474
step: 5000, train loss: 3.904, val loss: 3.999
step: 7500, train loss: 3.485, val loss: 3.609
step: 10000, train loss: 3.171, val loss: 3.330
step: 12500, train loss: 2.945, val loss: 3.135
step: 15000, train loss: 2.777, val loss: 2.984
step: 17500, train loss: 2.666, val loss: 2.890
step: 20000, train loss: 2.586, val loss: 2.824
step: 22500, train loss: 2.544, val loss: 2.780
step: 25000, train loss: 2.498, val loss: 2.744
step: 27500, train loss: 2.466, val loss: 2.727
step: 30000, train loss: 2.432, val loss: 2.706
step: 32500, train loss: 2.417, val loss: 2.691
step: 35000, train loss: 2.406, val loss: 2.703
step: 37500, train loss: 2.397, val loss: 2.680
step: 40000, train loss: 2.386, val loss: 2.690
step: 42500, train loss: 2.379, val loss: 2.658
step: 45000, train loss: 2.369, val loss: 2.681
step: 47500, train loss: 2.366, val loss: 2.669
step: 50000, train loss: 2.357, val loss: 2.674

step: 422500, train loss: 2.335, val loss: 2.769
step: 425000, train loss: 2.328, val loss: 2.795
step: 427500, train loss: 2.329, val loss: 2.777
step: 430000, train loss: 2.346, val loss: 2.776
step: 432500, train loss: 2.334, val loss: 2.761
step: 435000, train loss: 2.341, val loss: 2.761
step: 437500, train loss: 2.342, val loss: 2.784
step: 440000, train loss: 2.330, val loss: 2.755
step: 442500, train loss: 2.337, val loss: 2.759
step: 445000, train loss: 2.341, val loss: 2.780
step: 447500, train loss: 2.332, val loss: 2.779
step: 450000, train loss: 2.340, val loss: 2.796
step: 452500, train loss: 2.331, val loss: 2.778
step: 455000, train loss: 2.341, val loss: 2.797
step: 457500, train loss: 2.336, val loss: 2.794
step: 460000, train loss: 2.341, val loss: 2.777
step: 462500, train loss: 2.339, val loss: 2.773
step: 465000, train loss: 2.331, val loss: 2.775
step: 467500, train loss: 2.339, val loss: 2.783
step: 470000, train loss: 2.334, val loss: 2.778
step: 472500, train 

step: 842500, train loss: 2.336, val loss: 2.781
step: 845000, train loss: 2.341, val loss: 2.758
step: 847500, train loss: 2.336, val loss: 2.775
step: 850000, train loss: 2.331, val loss: 2.796
step: 852500, train loss: 2.338, val loss: 2.790
step: 855000, train loss: 2.340, val loss: 2.762
step: 857500, train loss: 2.342, val loss: 2.793
step: 860000, train loss: 2.341, val loss: 2.787
step: 862500, train loss: 2.331, val loss: 2.780
step: 865000, train loss: 2.337, val loss: 2.803
step: 867500, train loss: 2.336, val loss: 2.800
step: 870000, train loss: 2.339, val loss: 2.770
step: 872500, train loss: 2.335, val loss: 2.772
step: 875000, train loss: 2.333, val loss: 2.782
step: 877500, train loss: 2.332, val loss: 2.781
step: 880000, train loss: 2.341, val loss: 2.767
step: 882500, train loss: 2.336, val loss: 2.777
step: 885000, train loss: 2.330, val loss: 2.763
step: 887500, train loss: 2.336, val loss: 2.766
step: 890000, train loss: 2.331, val loss: 2.788
step: 892500, train 

In [11]:
# Sample text from the model again, now that the training loop has run.
# This cell does not create or move a model: it reuses `m`, the model instance
# that was built before training and updated in place by the optimizer.
# The context is a (1, 1) tensor holding token id 0 on the same device; 500 new
# tokens are generated from it, decoded back to characters, and printed.
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


helarentt bin to ntot towid Re ad,”
Emes otom,” earatha colu Scat thes sang we thed scaule
iowaithie inowalid
townd Wind:
“I t Thetove Ith
awald cou thaim ofug w
“An e. gind
che the waf h Cin born’may t Bousheely averm, nisfe toowancth he the,”

ngrt thevery
“Whe aye hede trsps the ly,” ne is lvenshe tem rked h. bache tcaraknt gouref hang
imed thed?” Cow g the ooskiog Sce tt thtod
“te, heied d t caso he thed s tharobr oras Paceritronofowioie, bunsost An Thef wnity
Ozawe Sce g,”

me f Shaxthe soo


In [12]:
# Report the wall-clock time, in seconds, elapsed since `start_time` was recorded.
end_time=time.time()
lapsed_time=end_time-start_time
print(lapsed_time)

333.7522840499878
