In [3]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
import requests
import random
import torch.nn.functional as F

# Replace with the URL you copied
url = 'https://raw.githubusercontent.com/jvilchesf/Learning/main/Andrej_karpathy_videos/Makemore/names.txt'

response = requests.get(url)
words = response.text.splitlines()

In [4]:
print(words[:10])

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']


In [5]:
len(words)


32033

In [6]:
#Creating dictionaries
alphabet = sorted(list(set(''.join(words))))
itos = {idx + 1: ch for idx, ch in enumerate(alphabet)} 
itos[0] = '.'
stoi = {s : i  for i, s in itos.items()} 


In [7]:
#Creating the dataset

block_size = 3
vocab_size = len(itos)
def create_dataset(words): 
    X = []
    Y = []
    for word in words:
        context = block_size * [0]

        for i in word + '.':
            idx = stoi[i]
            Y.append(idx)
            X.append(context)
            #print(f"{context} ---> {i}")
            context = context[1:] + [idx]

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X,Y

random.seed(42)
random.shuffle(words)
n1 = int(len(words) * 0.8)
n2 = int(len(words) * 0.9)

Xtr, Ytr = create_dataset(words[:n1])
Xdev, Ydev = create_dataset(words[n1:n2])
Xte, Yte = create_dataset(words[n2:])

print (Xtr.shape, Ytr.shape)    
print (Xdev.shape, Ydev.shape)
print (Xte.shape, Yte.shape)

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [45]:
#Declare parameters w1, w2, b1, b2
n_embd = 10
n_hidden = 200

g = torch.Generator().manual_seed(2147483647)
C  = torch.randn((vocab_size, n_embd),          generator=g)
W1 = torch.randn((block_size * n_embd, n_hidden),       generator=g) / (5/3) / (block_size * n_embd) ** 0.5 
#b1 = torch.randn(n_hidden,                                generator=g) * 0.01
W2 = torch.randn(n_hidden, vocab_size,                    generator=g)  * 0.1 #it is multiply by 0.1 to get smaller logits and smaller loss
b2 = torch.randn(vocab_size,                              generator=g)  * 0 # it is declare as 0 for the model initialization

#Scale and shift the logits in the normalization layer
bngain = torch.ones((1,n_hidden))
bnbias = torch.zeros((1,n_hidden))
bnmean_running = torch.zeros((1,n_hidden))
bnstd_running = torch.ones((1,n_hidden))

parameters = [C, W1, b1, W2, b2,bngain, bnbias]
print(f"Number of parameters =  {sum(p.numel() for p in parameters)}")
for p in parameters:
    p.requires_grad = True



Number of parameters =  12297


In [9]:
# In the pt1 of makemore_3 we have multiplied the weights for an speficic number to get a gaussian activation tensor
# The specific number defined to multiplied was (5/3) * (block_size * n_embd) ** 0.5 (gain * root squared of the number of inputs)
# It was 8 years old and call kaiming initialization, nowadays we have other options like batch normalization.
# In this part we'll see how batch normalization works based on a google paper

In [None]:
# There is an important point to flag, and it is when we normalize h preactivation (hpreact), we do this using the meand and
# std of THE BATCH, not the whole dataset. This is important because it allows the model to adapt to the distribution of the data
# but at the same time we'll get different h values for on same example, it is due to the normalization consider the rest of
# the batch values that are randomly selected. It turns the model more robust and generalizable.

In [46]:
#Execute the forward pass
batch_size = 32
loop = 200000

for _ in range(loop):
    #Batching
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y

    # forward pass
    emb = C[Xb] #embed the characters into vector 
    embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
    #Linear layer
    hpreact = embcat @ W1 #hidden layer pre activation
    # BatchNorm layer
    # -------------------------------------------------------------
    bnmeani = hpreact.mean(0, keepdim=True)
    bnstdi  = hpreact.std(0,  keepdim= True)
    hpreact = bngain * (hpreact - bnmeani) / bnstdi + bnbias
    with torch.no_grad():
      bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
      bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi
    # -------------------------------------------------------------

    #Non-linearity
    h = torch.tanh(hpreact) # hidden layer
    logits = h @ W2 + b2 # output layer
    loss = F.cross_entropy(logits, Yb) # loss function

    # backward pass
    for p in parameters:
      p.grad = None
    loss.backward()

  # update
    lr = 0.1 if _ < 100000 else 0.01 # step learning rate decay
    for p in parameters:
        if p.grad is not None:
            p.data += -lr * p.grad
      
    if _ % 10000 == 0:    
        print(f"{loss=}")
      
    #break

loss=tensor(3.7900, grad_fn=<NllLossBackward0>)
loss=tensor(2.1473, grad_fn=<NllLossBackward0>)
loss=tensor(2.3433, grad_fn=<NllLossBackward0>)
loss=tensor(2.4615, grad_fn=<NllLossBackward0>)
loss=tensor(1.9497, grad_fn=<NllLossBackward0>)
loss=tensor(2.4303, grad_fn=<NllLossBackward0>)
loss=tensor(2.4620, grad_fn=<NllLossBackward0>)
loss=tensor(2.1373, grad_fn=<NllLossBackward0>)
loss=tensor(2.2899, grad_fn=<NllLossBackward0>)
loss=tensor(2.1033, grad_fn=<NllLossBackward0>)
loss=tensor(1.8597, grad_fn=<NllLossBackward0>)
loss=tensor(2.2318, grad_fn=<NllLossBackward0>)
loss=tensor(2.0990, grad_fn=<NllLossBackward0>)
loss=tensor(2.4481, grad_fn=<NllLossBackward0>)
loss=tensor(2.2958, grad_fn=<NllLossBackward0>)
loss=tensor(2.2345, grad_fn=<NllLossBackward0>)
loss=tensor(1.8447, grad_fn=<NllLossBackward0>)
loss=tensor(1.8564, grad_fn=<NllLossBackward0>)
loss=tensor(2.0533, grad_fn=<NllLossBackward0>)
loss=tensor(1.8166, grad_fn=<NllLossBackward0>)


In [17]:
#try to make hpreact gaussian
#Batch normalization
hpreact.mean(0, keepdim= True).shape, hpreact.std(0, keepdim= True).shape

(torch.Size([1, 200]), torch.Size([1, 200]))

In [None]:
# Now that we added the new normalization line in the model, we are using the mean and the std of the batch to normalize the
# hpreact tensor. that is not correct because it doesn't represent the whole dataset. We need to use the mean and std of the
# whole dataset to normalize the hpreact tensor. We can do this by using the whole dataset to calculate the mean and std and

In [47]:
# Calibrate the batch mean and std before the train step
with torch.no_grad():
    #forward pass
    emb = C[Xtr] #embed the characters into vector
    embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
    hpreact = embcat @ W1 + b1 #hidden layer pre activation
    bnmean = hpreact.mean(0, keepdim=True)
    bnstd = hpreact.std(0, keepdim=True)

In [48]:
bnmean

tensor([[-1.7378e+00,  4.3231e-01, -1.5143e+00,  2.4097e-01,  8.3619e-01,
          1.1320e+00,  1.0037e+00, -5.5135e-01,  1.5841e+00, -1.1058e-02,
         -1.4072e+00, -6.9275e-01, -7.1401e-02,  9.0418e-01, -4.2111e-01,
         -2.6896e-02,  4.4784e-01, -1.3167e+00,  6.8345e-01,  1.3090e+00,
          1.7544e-01, -4.7401e-01,  1.3757e+00, -3.2792e-01,  8.0192e-01,
         -4.6174e-01,  1.1246e+00,  3.1218e-01,  1.2323e-01,  1.2746e+00,
         -5.8348e-01, -1.2783e+00, -2.5178e-01, -4.2705e-01,  4.1126e-03,
         -1.1861e-01,  2.6048e-01, -1.6760e+00, -1.4265e+00,  5.3348e-01,
         -1.6339e+00, -8.1290e-01,  3.1953e-01, -4.1014e-01,  1.2527e+00,
          2.0046e+00,  1.7442e+00, -8.0824e-01,  2.4986e+00,  1.6100e+00,
          1.9109e-01,  2.9462e-02,  2.4983e+00,  1.6896e+00, -8.5820e-02,
         -1.2924e+00, -1.2035e+00, -2.9163e-01,  2.4678e-01, -2.2933e-01,
          1.6113e-01,  6.1184e-01,  1.4038e+00,  1.3236e+00,  1.0686e+00,
          1.7446e+00, -4.2636e-01, -6.

In [49]:
bnmean_running

tensor([[-1.7222,  0.4584, -1.5153,  0.2402,  0.8409,  1.1112,  1.0324, -0.5530,
          1.5881, -0.0282, -1.4258, -0.7039, -0.0484,  0.8854, -0.4151, -0.0099,
          0.4550, -1.3132,  0.6699,  1.2794,  0.1912, -0.4711,  1.3714, -0.3220,
          0.7910, -0.4692,  1.1022,  0.3055,  0.1114,  1.2805, -0.5933, -1.2586,
         -0.2690, -0.4081,  0.0107, -0.1110,  0.2747, -1.6684, -1.4261,  0.5139,
         -1.6354, -0.8210,  0.3098, -0.3924,  1.2630,  1.9993,  1.7454, -0.8033,
          2.4761,  1.6195,  0.1758,  0.0490,  2.4837,  1.6886, -0.0675, -1.2926,
         -1.2321, -0.3082,  0.2405, -0.2575,  0.1449,  0.6174,  1.4056,  1.3284,
          1.0512,  1.7488, -0.4281, -0.6035, -1.2657, -0.7391,  0.3443,  0.8405,
          1.4478, -1.2054,  0.2291, -0.0769, -0.4888,  0.8100,  2.4304, -0.2410,
         -1.1147, -0.1711,  1.6445,  0.0174, -0.0328, -0.7715, -2.6763, -0.8297,
         -0.2638,  2.6235, -2.1124,  0.8365, -0.1363, -0.8468, -0.5870, -0.5219,
          0.0616, -0.2149, -

In [51]:
@torch.no_grad()
def print_results(split):
    X, Y = {'Train': (Xtr, Ytr),
            'Dev': (Xdev, Ydev),
            'Test': (Xte, Yte)}[split]
    #forward pass
    emb = C[X]
    embcat = emb.view(emb.shape[0], -1)
    hpreact = embcat @ W1 + b1
    hpreact = bngain * ((hpreact - bnmean_running) /  bnstd_running) + bnbias  #Batch normalization
    h = torch.tanh(hpreact)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
    print(f"{split} | loss = {loss}")

print_results('Train')
print_results('Dev')
print_results('Test')

Train | loss = 2.059568166732788
Dev | loss = 2.1049513816833496
Test | loss = 2.1046841144561768


In [13]:
#Model starting with a high loss

#Train | loss = 2.319223642349243
#Dev | loss = 2.3185811042785645
#Test | loss = 2.315457344055176

#Model with smaller w2 and b2
#Train | loss = 2.306452512741089
#Dev | loss = 2.3043744564056396
#Test | loss = 2.304983139038086

#Model multiplying the w1 by (5/3) * (block_size * n_embd) ** 0.5
#Train | loss = 2.0492982864379883
#Dev | loss = 2.1094372272491455
#Test | loss = 2.105741262435913

#Model with batch normalization step before train 
#Train | loss = 2.05985498428344733
#Dev | loss = 2.104597330093384
#Test | loss = 2.104496955871582

#Model with batch normalization running traing
#Train | loss = 2.059568166732788
#Dev | loss = 2.1049513816833496
#Test | loss = 2.1046841144561768

In [14]:
g = torch.Generator().manual_seed(2147483647 +20)
block_size = 3

for _ in range(20):

    out = []
    context = block_size * [0]
    while True:
        emb = C[torch.tensor([context])]
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2
        prob = F.softmax(logits, dim = 1)
        ix = torch.multinomial(prob, num_samples = 1, generator = g).item()
        context = context[1:] + [ix]
        out.append(ix)
        if ix == 0:
            break
    
    print(''.join(itos[i] for i in out))

zaidy.
ade.
rose.
brith.
hal.
oanne.
rayy.
keymon.
abby.
arah.
lian.
kallo.
luiderleth.
manahurrahei.
majadyn.
ash.
blopella.
shreniya.
presten.
hendra.
