In [1]:
#we always start with a dataset to train on. Let's download the tiny shakespeare dataset #!wget -> is for linux
!curl -O https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1089k  100 1089k    0     0   744k      0  0:00:01  0:00:01 --:--:--  744k


In [2]:
with open('input.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()

#Explanation:

# open('input.txt', 'r', encoding='utf-8') → Opens the file named input.txt in read mode ('r') using UTF-8 encoding.
# with statement → Ensures that the file is properly closed after it is used.
# as f → Assigns the opened file object to the variable f, allowing you to read its content.

In [3]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [4]:
#lets look at the first 1000 characters
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [5]:
#here are all the unique characters that occurs in this text
chars = sorted(list(set(text)))    
#when I call the set Constructor on it I'm just going to get the set of all the characters that occur in this text
#and then I call list on that to create a list of those characters instead of just a set 
#so that I have an ordering an arbitrary ordering and
#then I sort that so basically we get just all the characters that occur in the entire data set

vocab_size = len(chars)
#these are the possible elements of our sequences

print(''.join(chars))
print(vocab_size)




 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [6]:
#next we will would like to develop some strategy to tokenize the input text
#now when people say tokenize they mean convert the raw text as a string to some sequence of integers
#here we are building a character level language model, so we simply going to translate individual char into int

#create a mapping from characters to integers
stoi = {ch:i for i,ch in enumerate(chars) }         
itos = {i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s]                     #takes a string s as input and returns a list of integers 
decode = lambda l: ''.join([itos[i] for i in l])            #lambda is an anonymous function

print(encode("hii there"))                                  #we are going to receive a list of integers
print(decode(encode("hii there")))

# stoi (string to integer): Maps characters to numbers.
# itos (integer to string): Maps numbers back to characters.
# encode: Converts a string into a list of integers using stoi.
# decode: Converts a list of integers back into a string using itos.

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [7]:
#we're going to take all of the text in tiny Shakespeare, encode it 
#and then wrap it into a torch. tensor to get the data tensor

#let's now encode the entire text dataset and store in into a torch.Tensor
#!pip install torch torchvision torchaudio 
import torch # we use PyTorch: https://pytorch.org
print(torch.__version__)
data = torch.tensor(encode(text), dtype= torch.long)
#Long tensors (torch.long) are typically used for categorical data, like word indices in NLP tasks.

print(data.shape,data.dtype)
print(data[:100])

2.7.1+cu126
torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [8]:
#let's now split up the data into train and validation sets
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [9]:
#we're never going to actually feed entire text into a Transformer all at once that would be computationally very expensive and prohibitive
#so when we actually train a Transformer on a lot of these data sets we only work with chunks of the data set
#and when we train the Transformer we basically sample random little chunks out of the training set and train on just chunks at a time
#and train on just chunks at a time and these chunks have basically some kind of a length and some maximum length called block size.

block_size =8                 #consider as examples
train_data[:block_size+1]     #[0 to n-1] indexes   #n elements      
#time-stamp 16:00

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [10]:
x = train_data[:block_size]
y = train_data[1:block_size+1]       
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

#Okay so we've looked at the time dimension of the tensors that are going to be feeding into the Transformer
#we're going to have many batches of multiple chunks of text that are all like stacked up in a single tensor
#and that's just done for efficiency just so that we can keep the gpus busy because they are very good at parallel processing of data

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [11]:
#we just want to process multiple chunks all at the same time
#but those chunks are processed completely independently they don't talk to each other

torch.manual_seed(1337)
#random number generator so that the numbers I see here are going to be the same numbers you see later
block_size= 8       #maximum context length for prediction
batch_size= 4       #how many independent sequences will we process in parallel?

def get_batch(split):
    #generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))   #torch.randint(low,high,size)  #default low=0  #torch.randint(high,(m,n)) 
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    #x, y = x.to(device), y.to(device)
    print(ix)
    return x,y

xb,yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('-----')

for b in range(batch_size):        #batch dimension
    for t in range(block_size):        #time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

tensor([ 76049, 234249, 934904, 560986])
inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
-----
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1

In [12]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):   #class Child(Parent):   #When Child is created, it automatically gets everything from Parent.

    def __init__(self, vocab_size):   
    #__init__ is a special method in Python classes, also known as the constructor.
    #It is automatically called when an instance (object) of a class is created.
        
        super().__init__()   #calls parent's class constructor i.e. nn.Module
        #each token directly reads off the logits for the next token from lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    
    def forward(self, idx, targets=None):
        #idx and targets are both (B,T) tensor of integers
        logits =  self.token_embedding_table(idx)  #(B,T,C) 
        #we're using an. embedding which is a very thin wrapper around
        #basically a tensor of shape voap size by vocab size
        #and what's happening here is that when we pass idx here
        #every single integer in our input is going to refer to this embedding table
        #and it's going to pluck out a row of that embedding table corresponding to its index
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape                   # this 
            logits = logits.view(B*T, C)             # handles the
            targets = targets.view(B*T)              # issue
            loss = F.cross_entropy(logits, targets)  #here is a issue       #check in documentation  #it wants B * C * T instead of BTC
                                                 
        return logits,loss  #basically the scores for the next character in sequence
        #we are predicting what comes next based on just individual identity of a single token
    
    def generate(self, idx, max_new_tokens):
        #idx is (B,T) array of indices in the current context 
        for _ in range(max_new_tokens):
            #get the predictions
            logits, loss = self(idx)
            #focus only on the last time step
            logits = logits[:,-1,:]  #bocomes (B,C)
            #apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  #(B,C)
            #samples from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  #(B,1)
            #append samples index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) #(B, T+1)  #concat
        return idx

m = BigramLanguageModel(vocab_size)     #passing channel-D
logits, loss = m(xb,yb)
print(logits.shape)
print(loss)

#currently the tokens are not talking to each other and theyare notseeing any contxt except for they are just seeing themselves so i'm a token no. five and then i can actually make pretty decent predictions about what comes next just by knowing that i'm token five
#because the some characters follow other characters n typical scenarios
#loss -> quality of predictions

#############################################################

# self parameter
# self represents the instance of the class.
# It allows access to variables and methods inside the class.

# What is an Instance in Python?
# An instance is an individual object created from a class.
# Each instance has its own separate data,
# even if multiple instances are created from the same class.

#idx = torch.zeros((1,1), dtype = torch.long) #B,T
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long) , max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [13]:
#create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)  #lr -> lowering rate

In [14]:
batch_size = 32 
for steps in range(100):

    # sample a batch of data
    xb,yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

tensor([ 817891,  516880,  384662,  325959,  741921,  974304,  370067,  189337,
         558787,  756870,  411115,  504091, 1002123,  840873,  912519,  517853,
          55721,  703930,  203368,  315494,  660510,  441158,  709024,  977655,
         295614,  212033,  796493,  823341,  999735,  736278,  342288,  816022])
tensor([702318, 870079, 228655, 787149, 415707, 131813, 667408, 346477,  47758,
        801178, 849330, 168712, 580282, 816119, 313461, 702536, 255986, 837246,
        398759, 118964, 985630, 498505, 934202, 847840, 463562, 677529, 204322,
        492817, 287133, 634923, 998396, 722269])
tensor([ 82638, 787179, 954815, 865819, 187510, 663344,  10562, 276045, 352169,
        423296, 591024, 740076, 524947, 285815, 434559, 808141, 813294, 294859,
        583303, 914143, 565410, 899814, 857854,  73079, 389531, 667387, 780801,
         24960, 946691, 803725, 385054, 291299])
tensor([948169, 500116, 711213, 445470, 512047, 554571, 803763, 867377,  93542,
         90291, 13164

In [15]:
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long) , max_new_tokens=100)[0].tolist()))


oTo.JUZ!!zqe!
xBP qbs$Gy'AcOmrLwwt
p$x;Seh-onQbfM?OjKbn'NwUAW -Np3fkz$FVwAUEa-wzWC -wQo-R!v -Mj?,SPi


# The Mathematical trick in self-attention

In [16]:
# consider the following toy example:

torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

#token on the 5th location should not talk to 6,7,8,..., it will only talk to 4,3,2,...
#it onnly predict from previous context
#easiest way to token communicate is average of all the preceding elements #very lossy

torch.Size([4, 8, 2])

In [17]:
#we want x[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B,T,C))   #bag of words
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1]  #(t,C)
        xbow[b,t] = torch.mean(xprev,0) 

print(x[0])
xbow[0]
#very inefficeint
#the trick is doing matrix multiplication

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])


tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [18]:
#version 2
wei2 = torch.tril(torch.ones(T,T))
wei2 = wei2 / wei2.sum(1,keepdim=True)
xbow2 = wei2 @ x # (B,T,T) @ (B,T,C)
torch.allclose(xbow, xbow2)
wei2

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [19]:
#xbow[0] , xbow2[0]

In [20]:
#version 3 : softmax
tril = torch.tril(torch.ones(T,T))
wei3 = torch.zeros((T,T))
wei3 = wei3.masked_fill(tril == 0 , float('-inf')) #all the element where tril is zero make it -infinity
wei3 = F.softmax(wei3, dim=-1)
xbow3 = wei3 @ x
torch.allclose(xbow2,xbow3)
#should be True  

True

In [21]:
#version 4 : self attention
#attention is a communication mechanism , number of nodes directed a graph, every node has vector of information
# , it aggregate information via a weighted sum from all of the nodes that point to it
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

#let's see a single Head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)        
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)  # (B,T,16)         #what other tokens should “look for”.
q = query(x) # (B,T,16)        #what this token is “asking” for.
wei = q @ k.transpose(-2,-1)   # (B,T,16) @ (B,16,T) ----> (B,T,T)

tril = torch.tril(torch.ones(T,T))
#wei = torch.zeros(T,T)
wei = wei.masked_fill(tril ==0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v
#out = wei @ x   #x is like a private information to this token

out.shape

torch.Size([4, 8, 16])

In [22]:
wei[0] 
#for the last row it was 8th token 
#it knows what content it has and it knows at what position it's in
#now this token based on that creates a query (hi i am looking for this kind of stuff)
#and then all nodes get to emit keys 

#may be one of the consonant and i am in a position up to 4 and that key would have high number in that in specific channel
# query and key when they dot product they can find each other and create a high affinity

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [23]:
torch.manual_seed(42)
#a = torch.ones(3,3)
a = torch.tril(torch.ones(3,3))
a = a / torch.sum(a, 1, keepdim=True)  #here 1 is dimention
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('___')
print('b=')
print(b)
print('__')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
___
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
__
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [24]:
#multihead attention
#simply by creating multiple heads

Great question!

Let’s break it down clearly — especially around this line:

python
Copy code
xbow2 = wei2 @ x  # (B,T,T) @ (B,T,C)
You're asking how wei2 can be (B, T, T) — let's understand what’s really going on.

🔹 1. What is wei2?
python
Copy code
wei2 = torch.tril(torch.ones(T, T))  # shape: (T, T)
wei2 = wei2 / wei2.sum(1, keepdim=True)
So wei2 is a (T, T) lower triangular matrix where:

Each row i contains equal weights for positions 0 to i

Rows are normalized (sum to 1)

Example (if T = 4):

csharp
Copy code
tensor([
 [1.0000, 0.0000, 0.0000, 0.0000],
 [0.5000, 0.5000, 0.0000, 0.0000],
 [0.3333, 0.3333, 0.3333, 0.0000],
 [0.2500, 0.2500, 0.2500, 0.2500]
])
🔹 2. What is x?
python
Copy code
x = torch.randn(B, T, C)
This is your input sequence for B batches, each of length T, with C features per token.

So:

Shape of x = (B, T, C)

🔹 3. How does the matrix multiplication work?
Now this is where your confusion lies:

You're doing:

python
Copy code
xbow2 = wei2 @ x
But wei2 is only (T, T) — not (B, T, T).

So how does PyTorch do this? It broadcasts the matrix multiplication across batches.

Effectively:

python
Copy code
xbow2 = torch.matmul(wei2, x)  # (T,T) @ (B,T,C) => broadcasts to (B,T,T) @ (B,T,C)
In math:

For each sample b in batch B, it applies the same wei2 matrix:

python
Copy code
xbow2[b] = wei2 @ x[b]
So:

Shape of wei2: (T, T)

Shape of x[b]: (T, C)

Output for each b: (T, C)

Finally, xbow2: (B, T, C)

🔹 So why say “(B,T,T) @ (B,T,C)”?
This is pseudo math notation.

It means:

We're broadcasting a (T, T) matrix multiplication across batch dimension

Each batch does (T, T) @ (T, C), resulting in (T, C)

Combined: (B, T, T) @ (B, T, C) → (B, T, C)

But technically, wei2 is still (T, T), not (B, T, T). If you wanted different attention per batch, you'd manually make wei2 shaped (B, T, T).

🔹 Summary
wei2 is (T, T), and applied across batches

x is (B, T, C)

Result: xbow2 = (B, T, C)

(B,T,T) @ (B,T,C) is conceptual shorthand — not literal unless you expand wei2 per batch


In [None]:
with open("tokenizer_text.txt", "w", encoding="utf-8") as f:
    f.write("")

In [27]:
!pip install requests
import requests
urls = [
    "https://www.gutenberg.org/files/1342/1342-0.txt",  # Pride & Prejudice
    "https://www.gutenberg.org/files/1661/1661-0.txt",  # Moby Dick
    "https://www.gutenberg.org/files/84/84-0.txt",      # Frankenstein
]

with open("tokenize_text.txt", "w", encoding="utf‑8") as fout:
    for u in urls:
        r = requests.get(u); r.raise_for_status()
        t = r.text
        s = t.find("Chapter 1") if "Chapter 1" in t else 0
        e = t.rfind("End of the Project Gutenberg") or len(t)
        fout.write(t[s:e].strip() + "\n\n")




In [28]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 16
block_size = 32
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
torch.manual_seed(1337)

# -------------------------------
# Step 1: Build BPE tokenizer from tokenize_text.txt
with open('tokenize_text.txt', 'r', encoding='utf-8') as f:
    raw = f.read()
tokens = list(raw.encode('utf-8'))

def get_stats(ids):
    stats = {}
    for pair in zip(ids, ids[1:]):
        stats[pair] = stats.get(pair, 0) + 1
    return stats

def merge(ids, pair, idx):
    newids = []
    i = 0
    while i < len(ids):
        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
            newids.append(idx)
            i += 2
        else:
            newids.append(ids[i])
            i += 1
    return newids

# Build vocab
vocab_size = 276
num_merges = vocab_size - 256
ids = list(tokens)
merges = {}
for i in range(num_merges):
    stats = get_stats(ids)
    pair = max(stats, key=stats.get)
    idx = 256 + i
    ids = merge(ids, pair, idx)
    merges[pair] = idx

vocab = {i: bytes([i]) for i in range(256)}
for (p0, p1), idx in merges.items():
    vocab[idx] = vocab[p0] + vocab[p1]

def bpe_encode(text):
    tokens = list(text.encode("utf-8"))
    while len(tokens) >= 2:
        stats = get_stats(tokens)
        pair = min(stats, key=lambda p: merges.get(p, float('inf')))
        if pair not in merges:
            break
        idx = merges[pair]
        tokens = merge(tokens, pair, idx)
    return tokens

def bpe_decode(ids):
    return b''.join(vocab[i] for i in ids).decode('utf-8', errors='replace')

# -------------------------------
# Step 2: Load model training data from input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
data = torch.tensor(bpe_encode(text), dtype=torch.long)
train_data = data[:int(0.9 * len(data))]
val_data = data[int(0.9 * len(data)):]

# -------------------------------
# Step 3: Batch Sampling
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    return x.to(device), y.to(device)

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# -------------------------------
# Step 4: Transformer Model
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        return wei @ v

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.dropout(self.proj(out))

class FeedFoward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits, None
        B, T, C = logits.shape
        loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# -------------------------------
# Step 5: Training
model = BigramLanguageModel().to(device)
print(f"{sum(p.numel() for p in model.parameters())/1e6:.2f}M parameters")

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# -------------------------------
# Step 6: Generate
context = torch.zeros((1, 1), dtype=torch.long, device=device)
output = model.generate(context, max_new_tokens=500)[0].tolist()
print(bpe_decode(output))

0.24M parameters
step 0: train loss 5.7797, val loss 5.7721
step 100: train loss 3.4301, val loss 3.4258
step 200: train loss 3.0588, val loss 3.0541
step 300: train loss 2.9435, val loss 2.9427
step 400: train loss 2.8494, val loss 2.8441
step 500: train loss 2.7834, val loss 2.7915
step 600: train loss 2.7265, val loss 2.7366
step 700: train loss 2.6785, val loss 2.7008
step 800: train loss 2.6186, val loss 2.6511
step 900: train loss 2.5777, val loss 2.6330
step 1000: train loss 2.5406, val loss 2.5911
step 1100: train loss 2.5072, val loss 2.5673
step 1200: train loss 2.4595, val loss 2.5246
step 1300: train loss 2.4452, val loss 2.5127
step 1400: train loss 2.3992, val loss 2.4986
step 1500: train loss 2.3821, val loss 2.4568
step 1600: train loss 2.3491, val loss 2.4571
step 1700: train loss 2.3201, val loss 2.4266
step 1800: train loss 2.2980, val loss 2.4250
step 1900: train loss 2.2875, val loss 2.4244
step 2000: train loss 2.2698, val loss 2.3930
step 2100: train loss 2.2579,

In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import tiktoken
from urllib.request import urlopen

# hyperparameters
batch_size = 16
block_size = 32
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.manual_seed(1337)

# ▼ Step 1: Download public domain for tokenize_text.txt
# urls = ["https://www.gutenberg.org/files/1342/1342-0.txt"]
# texts = []
# for u in urls:
#     with urlopen(u) as resp:
#         txt = resp.read().decode('utf-8', errors='replace')
#     start = txt.find("Chapter 1")
#     end = txt.rfind("End of the Project Gutenberg")
#     texts.append(txt[start:end].strip())
# full_toktext = "\n\n".join(texts)
# with open("tokenize_text.txt", "w", encoding="utf-8") as f:
#     f.write(full_toktext)
# print("Saved tokenize_text.txt, length:", len(full_toktext))

# ▼ Step 2: Initialize tiktoken
enc = tiktoken.get_encoding("cl100k_base")
def encode(text): return enc.encode(text)
def decode(ids): return enc.decode(ids)

print("Tokenizer vocab size:", enc.n_vocab)

# ▼ Step 3: Load model training data and encode
with open("input.txt", "r", encoding="utf-8") as f:
    train_text = f.read()
ids = encode(train_text)
data = torch.tensor(ids, dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

# ▼ Step 4: Batching
def get_batch(split):
    data_ = train_data if split == 'train' else val_data
    ix = torch.randint(len(data_) - block_size, (batch_size,))
    x = torch.stack([data_[i:i + block_size] for i in ix])
    y = torch.stack([data_[i+1:i+1+block_size] for i in ix])
    return x.to(device), y.to(device)

@torch.no_grad()
def estimate_loss():
    model.eval()
    out = {}
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean().item()
    model.train()
    return out

# ▼ Step 5: Define Transformer model (similar to your previous code)
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x); q = self.query(x)
        wei = q @ k.transpose(-2,-1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        return wei @ v

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.dropout(self.proj(out))

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4*n_embd),
            nn.ReLU(),
            nn.Linear(4*n_embd, n_embd),
            nn.Dropout(dropout),
        )
    def forward(self, x): return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(enc.n_vocab, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, enc.n_vocab)
    def forward(self, idx, targets=None):
        B,T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(B*T, -1), targets.view(B*T))
        return logits, loss
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# ▼ Step 6: Train
model = BigramLanguageModel().to(device)
print("Parameters:", sum(p.numel() for p in model.parameters())/1e6, "M")
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for it in range(max_iters):
    if it % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {it}: train {losses['train']:.4f}, val {losses['val']:.4f}")
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# ▼ Step 7: Generation
context = torch.zeros((1,1), dtype=torch.long, device=device)
output = model.generate(context, max_new_tokens=500)[0].tolist()
print("Generated:\n", decode(output))


Tokenizer vocab size: 100277
Parameters: 13.137077 M
step 0: train 11.6588, val 11.6651
step 100: train 7.0855, val 7.3425
step 200: train 6.8948, val 7.1850
step 300: train 6.4718, val 6.8266
step 400: train 6.0932, val 6.5328
step 500: train 5.8444, val 6.3606
step 600: train 5.6367, val 6.1753
step 700: train 5.4945, val 6.1046
step 800: train 5.3693, val 6.0561
step 900: train 5.2936, val 5.9905
step 1000: train 5.1863, val 5.8829
step 1100: train 5.1238, val 5.8325
step 1200: train 5.0448, val 5.8389
step 1300: train 5.0280, val 5.8149
step 1400: train 4.9611, val 5.8013
step 1500: train 4.9134, val 5.7779
step 1600: train 4.8562, val 5.7175
step 1700: train 4.7909, val 5.7194
step 1800: train 4.7552, val 5.7266
step 1900: train 4.7443, val 5.6973
step 2000: train 4.7129, val 5.7034
step 2100: train 4.6502, val 5.7071
step 2200: train 4.6442, val 5.7201
step 2300: train 4.6165, val 5.6896
step 2400: train 4.6086, val 5.6944
step 2500: train 4.5452, val 5.7400
step 2600: train 4.53