In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [7]:
words = open('../data/names.txt', 'r').read().splitlines()

# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [28]:
nbr_neurons = 256
vocab_size = 27
emb_dim = 5
block_size = 3 # context window size

g = torch.Generator().manual_seed(2147483647) 
C = torch.randn((vocab_size, emb_dim), generator=g)
W1 = torch.randn((emb_dim*block_size, nbr_neurons), generator=g)
b1 = torch.randn(nbr_neurons, generator=g)
W2 = torch.randn((nbr_neurons, vocab_size), generator=g)
b2 = torch.randn(vocab_size, generator=g)
parameters = [C, W1, b1, W2, b2]

sum(p.nelement() for p in parameters) 

11170

In [42]:
# build the dataset

def build_dataset(words, block_size=block_size):  
  X, Y = [], []
  for w in words:

    #print(w)
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      #print(''.join(itos[i] for i in context), '--->', itos[ix])
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  #print(X.shape, Y.shape)
  return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

In [30]:
for p in parameters:
        p.requires_grad = True 
stepi, lossi, Ci = [], [], []

In [34]:
n = 50000
batch_size = 50


for i in range(n):
    # minibatch construct 
    ixs = torch.randint(0, Xtr.shape[0], (batch_size,)) 

    # Forward pass 
    emb = C[Xtr[ixs]] 
    h = torch.tanh(emb.view(-1,block_size*emb_dim) @ W1 + b1) 
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits,Ytr[ixs]) 

    # Backward pass
    for p in parameters:
        p.grad = None 

    loss.backward()

    # update
    #l = 0.1 if i < 300000 else 0.01
    l = 0.01
    for p in parameters:
        p.data += -l * p.grad 

    stepi.append(i)
    lossi.append(loss.item())
    C_clone = C.detach().clone()
    Ci.append(C_clone)

print(loss)

tensor(2.2476, grad_fn=<NllLossBackward0>)


In [35]:
def model_test(X,Y):
    emb = C[X] 
    h = torch.tanh(emb.view(-1,block_size*emb_dim) @ W1 + b1) 
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits,Y) 
    return loss

ldev = model_test(Xdev, Ydev)
ltest = model_test(Xte, Yte)
print("dev : ", ldev)
print("test : ", ltest)

dev :  tensor(2.1341, grad_fn=<NllLossBackward0>)
test :  tensor(2.1295, grad_fn=<NllLossBackward0>)


In [None]:
import random

# Définition des espaces de recherche
nbr_neurons_choices = [128, 256, 512]
emb_dim_choices = [4, 8, 12]       
block_size_choices = [3, 4, 5]      

n_trials = 5      
n_iter = 50000     

best_loss = float('inf')
best_config = None

for trial in range(n_trials):
    nbr_neurons_trial = random.choice(nbr_neurons_choices)
    emb_dim_trial = random.choice(emb_dim_choices)
    block_size_trial = random.choice(block_size_choices)

    Xtr, Ytr = build_dataset(words[:n1], block_size_trial)
    Xdev, Ydev = build_dataset(words[n1:n2], block_size_trial)
    Xte, Yte = build_dataset(words[n2:], block_size_trial)
    
    
    C_trial = torch.randn((vocab_size, emb_dim_trial), generator=g, requires_grad=True)
    W1_trial = torch.randn((block_size_trial * emb_dim_trial, nbr_neurons_trial), generator=g, requires_grad=True)
    b1_trial = torch.randn(nbr_neurons_trial, generator=g, requires_grad=True)
    W2_trial = torch.randn((nbr_neurons_trial, vocab_size), generator=g, requires_grad=True)
    b2_trial = torch.randn(vocab_size, generator=g, requires_grad=True)
    
    parameters_trial = [C_trial, W1_trial, b1_trial, W2_trial, b2_trial]
    
    
    for i in range(n_iter):
        lr = 0.1 if n_iter < 0.8*n_iter else 0.01
        
        ixs = torch.randint(0, Xtr.shape[0], (batch_size,))
        
        # Forward pass
        emb = C_trial[Xtr[ixs]]
        h = torch.tanh(emb.view(-1, block_size_trial * emb_dim_trial) @ W1_trial + b1_trial)
        logits = h @ W2_trial + b2_trial
        loss = F.cross_entropy(logits, Ytr[ixs])
        
        # Backward pass
        for p in parameters_trial:
            if p.grad is not None:
                p.grad.zero_()
        loss.backward()
        
        # Update
        with torch.no_grad():
            for p in parameters_trial:
                p.add_(-lr * p.grad)
    
    # eval 
    emb_dev = C_trial[Xdev]
    h_dev = torch.tanh(emb_dev.view(-1, block_size_trial * emb_dim_trial) @ W1_trial + b1_trial)
    logits_dev = h_dev @ W2_trial + b2_trial
    dev_loss = F.cross_entropy(logits_dev, Ydev)
    
    print(f"Trial {trial+1}: lr={lr}, "
          f"nbr_neurons={nbr_neurons_trial}, "
          f"emb_dim={emb_dim_trial}, block_size={block_size_trial}, "
          f"dev_loss={dev_loss.item()}")
    
    # Meilleure configuration
    if dev_loss.item() < best_loss:
        best_loss = dev_loss.item()
        best_config = {
            'nbr_neurons': nbr_neurons_trial,
            'emb_dim': emb_dim_trial,
            'block_size': block_size_trial,
            'dev_loss': best_loss,
            'C': C_trial,
            'W1': W1_trial,
            'b1': b1_trial,
            'W2': W2_trial,
            'b2': b2_trial,
        }

print("Best configuration:", best_config)

Trial 1: lr=0.01, nbr_neurons=128, emb_dim=8, block_size=4, dev_loss=2.340148687362671
Trial 2: lr=0.01, nbr_neurons=256, emb_dim=4, block_size=3, dev_loss=2.4161903858184814
Trial 3: lr=0.01, nbr_neurons=256, emb_dim=4, block_size=4, dev_loss=2.387986898422241
Trial 4: lr=0.01, nbr_neurons=256, emb_dim=12, block_size=5, dev_loss=2.42612361907959
Trial 5: lr=0.01, nbr_neurons=512, emb_dim=4, block_size=3, dev_loss=2.439927339553833
Best configuration: {'lr': 0.01, 'nbr_neurons': 128, 'emb_dim': 8, 'block_size': 4, 'dev_loss': 2.340148687362671, 'C': tensor([[-3.9578e-02, -5.5522e-01, -6.1088e-01,  2.1530e-01,  3.1879e-01,
          3.0767e-01,  1.4910e-01,  3.2984e-01],
        [-3.4276e-01,  9.4620e-02, -1.0977e-01, -7.0507e-02, -1.9587e-01,
         -5.2406e-01,  2.5951e-01,  2.9322e-01],
        [-3.1222e-01,  5.8283e-02, -2.3211e-01, -4.4966e-02, -2.9246e-01,
         -2.7495e-01,  1.5838e-01,  4.2476e-01],
        [-3.2412e-01,  6.4662e-02, -2.2934e-01, -7.6737e-02, -1.0713e-01,
 