## Exercise 1

Tune the training hyperparameters to beat my best validation loss of 2.2.

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
import random
import math

### Dataset Preparation

In [2]:
# Read one name per line. The context manager closes the file handle as soon
# as the read finishes, and the explicit encoding keeps the result independent
# of the platform default.
with open('names.txt', 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
# Character vocabulary: every distinct character found in `words` receives an
# integer id starting at 1 (in sorted order); id 0 is reserved for '.'.

chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Inverse lookup (id -> character), built in the same order as `stoi`.
itos = dict(zip(stoi.values(), stoi.keys()))

In [4]:
def build_dataset(words, context_size = 3):
    """Turn a list of words into (context, next-character) training pairs.

    Each word is terminated with '.', and a window of the previous
    `context_size` character ids (initially all 0) slides over it. Every
    position yields one row of `X` (the window) and one entry of `Y` (the id
    of the character that follows it). Character ids come from the
    module-level `stoi` mapping.

    Args:
        words: Iterable of strings to encode.
        context_size: Number of preceding character ids in each context row.

    Returns:
        Tuple `(X, Y)` of int64 tensors with shapes `(N, context_size)` and
        `(N,)`. An empty `words` gives shapes `(0, context_size)` and `(0,)`.

    Raises:
        KeyError: If a word contains a character missing from `stoi`.

    Side effects:
        Prints the shapes of `X` and `Y`.
    """
    X, Y = [], []

    for word in words:
        context = [0] * context_size
        for ch in word + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)

            # Slide the window: drop the oldest id, append the newest.
            context = context[1:] + [ix]

    # Explicit dtype and shape so an empty input still produces integer
    # tensors of the expected rank (torch.tensor([]) alone is float32, 1-D).
    X = torch.tensor(X, dtype=torch.long).reshape(len(X), context_size)
    Y = torch.tensor(Y, dtype=torch.long)
    print(X.shape, Y.shape)

    return X, Y
        
# Sanity check: show the (context, target) pairs generated for the first three words.
build_dataset(words[:3])

torch.Size([16, 3]) torch.Size([16])


(tensor([[ 0,  0,  0],
         [ 0,  0,  5],
         [ 0,  5, 13],
         [ 5, 13, 13],
         [13, 13,  1],
         [ 0,  0,  0],
         [ 0,  0, 15],
         [ 0, 15, 12],
         [15, 12,  9],
         [12,  9, 22],
         [ 9, 22,  9],
         [22,  9,  1],
         [ 0,  0,  0],
         [ 0,  0,  1],
         [ 0,  1, 22],
         [ 1, 22,  1]]),
 tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0]))

In [5]:
# Shuffle the word list in place with a fixed seed, then cut it 80/10/10 into
# train / dev / test portions.
# NOTE: the shuffle mutates `words`, so re-running this cell without first
# reloading `words` produces a different split.
random.seed(42)
random.shuffle(words)

num_words = len(words)
n1 = int(0.8 * num_words)
n2 = int(0.9 * num_words)

train_words, dev_words, test_words = words[:n1], words[n1:n2], words[n2:]

Xtr, Ytr = build_dataset(train_words)
Xdev, Ydev = build_dataset(dev_words)
Xte, Yte = build_dataset(test_words)

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


### Creating a Multilayer Perceptron (MLP) Class

In [6]:
class MLP:
    """Character-level MLP trained with hand-written minibatch SGD.

    Forward pass: embedding lookup -> flatten the context -> tanh hidden
    layer -> linear output layer, scored with cross-entropy.

    Args:
        emb_dim_1: Number of rows in the embedding table `C`.
        emb_dim_2: Width of each embedding vector.
        l1_out_features: Number of hidden (tanh) units.
        l2_out_features: Number of output logits.
        num_epochs: Number of SGD steps performed by `fit`. Each "epoch" is
            one randomly sampled minibatch, not a full pass over the data.
        learning_rate: Initial step size; `fit` decays it in place.
        block_size: Number of context positions fed to the network.
        minibatch_size: Number of examples sampled per step.
        lr_update: Number of non-improving steps tolerated before the
            learning rate is decayed.
        lr_decay_factor: Multiplier applied to the learning rate on decay.
        seed: Seed for the generator that drives both weight initialisation
            and minibatch sampling.
    """

    def __init__(
        self, 
        emb_dim_1,
        emb_dim_2,
        l1_out_features,
        l2_out_features,
        num_epochs, 
        learning_rate, 
        block_size=3, 
        minibatch_size=64,
        lr_update=20,
        lr_decay_factor=0.5,
        seed=2147483647
    ):
        # Store hyperparameters
        self.emb_dim_1 = emb_dim_1
        self.emb_dim_2 = emb_dim_2
        self.l1_out_features = l1_out_features
        self.l2_out_features = l2_out_features
        self.num_epochs = num_epochs
        self.learning_rate = learning_rate
        self.block_size = block_size
        self.minibatch_size = minibatch_size
        self.lr_update = lr_update
        self.lr_decay_factor = lr_decay_factor
        self.seed = seed

        # One seeded generator is kept on the instance so that `fit` can reuse
        # it for minibatch sampling; this makes a whole training run (not just
        # the initial weights) reproducible for a given seed.
        self.generator = torch.Generator().manual_seed(self.seed)
        self.C = torch.randn((self.emb_dim_1, self.emb_dim_2), generator=self.generator)
        self.W1 = torch.randn((self.block_size * self.emb_dim_2, self.l1_out_features), generator=self.generator)
        self.b1 = torch.randn(self.l1_out_features, generator=self.generator)
        self.W2 = torch.randn((self.l1_out_features, self.l2_out_features), generator=self.generator)
        self.b2 = torch.randn(self.l2_out_features, generator=self.generator)

        # Gather parameters and enable gradients
        self.parameters = [self.C, self.W1, self.b1, self.W2, self.b2]
        self.num_params = sum(p.nelement() for p in self.parameters)
        for p in self.parameters:
            p.requires_grad = True

    def __call__(self, x, y):
        """Run the forward pass and return the mean cross-entropy loss.

        Args:
            x: Integer tensor of context indices, one row per example.
            y: Integer tensor of target class indices, one per example.

        Returns:
            Scalar loss tensor (still attached to the autograd graph).
        """
        # Get embeddings for input indices
        embeddings = self.C[x]  # shape: (batch, context_size, emb_dim_2)
        # Flatten the context so each example becomes a single row
        layer1_in = embeddings.view(embeddings.shape[0], -1)
        # Hidden layer with tanh activation
        layer1_out = torch.tanh(layer1_in @ self.W1 + self.b1)
        # Compute logits and cross-entropy loss
        logits = layer1_out @ self.W2 + self.b2
        loss = F.cross_entropy(logits, y)
        return loss

    def fit(self, x, y):
        """Train with minibatch SGD and plateau-based learning-rate decay.

        Runs `num_epochs` steps. Each step samples `minibatch_size` examples
        (with replacement) using the instance's seeded generator, computes the
        loss, and applies a plain gradient-descent update. When the minibatch
        loss has failed to beat the best loss seen so far for `lr_update`
        steps, the learning rate is multiplied by `lr_decay_factor`, but never
        drops below 1e-6.

        `self.learning_rate` is modified in place, so a later call to `fit`
        continues from the decayed value.

        Args:
            x: Training inputs (context indices).
            y: Training targets.

        Returns:
            Dict mapping "epoch_<step>" (1-based) to
            `[minibatch_loss, learning_rate_before_this_step's_decay]`.
        """
        self.history = {}
        best_loss = math.inf
        counter = 0
        min_lr = 1e-6

        for k in range(self.num_epochs):
            # Sample a minibatch from the seeded generator (reproducible)
            ix = torch.randint(
                0, x.shape[0], (self.minibatch_size,), generator=self.generator
            )
            loss = self(x[ix], y[ix])
            # Read the scalar once instead of calling .item() repeatedly
            loss_value = loss.item()
            self.history[f"epoch_{k+1}"] = [loss_value, self.learning_rate]

            # Update best loss and counter for learning rate decay
            if loss_value < best_loss:
                best_loss = loss_value
                counter = 0
            else:
                counter += 1

            # Reset gradients
            for p in self.parameters:
                p.grad = None

            loss.backward()

            # Decay learning rate if no improvement for 'lr_update' steps;
            # clamp so `min_lr` is a true lower bound.
            if counter >= self.lr_update and self.learning_rate > min_lr:
                self.learning_rate = max(
                    self.learning_rate * self.lr_decay_factor, min_lr
                )
                counter = 0

            # Update parameters inside a no_grad block
            with torch.no_grad():
                for p in self.parameters:
                    p -= self.learning_rate * p.grad

        return self.history

    def evaluate(self, x, y):
        """Return the loss on `(x, y)` as a Python float, without tracking gradients."""
        with torch.no_grad():
            loss = self(x, y)
        return loss.item()

In [None]:
# Hyperparameters for this run; tune these to beat the validation-loss target.
mlp_model = MLP(
    emb_dim_1=27,
    emb_dim_2=10,
    l1_out_features=200, 
    l2_out_features=27,
    num_epochs=100000,
    learning_rate=0.1,
    minibatch_size=64,
    lr_update=5000,
    lr_decay_factor=0.5,
    seed=2147483647
)

print("Training started...")
history = mlp_model.fit(Xtr, Ytr)
# `history` has one [loss, learning_rate] entry per step (100,000 here), so
# report only the last step instead of dumping the whole dict to the output.
final_loss, final_lr = history[f"epoch_{mlp_model.num_epochs}"]
print(f"Final minibatch loss: {final_loss:.4f} (learning rate at last step: {final_lr:g})")

# --- Evaluate on the Training Set ---
train_loss = mlp_model.evaluate(Xtr, Ytr)
print("Train Loss:", train_loss)
# --- Evaluate on the Validation Set ---
val_loss = mlp_model.evaluate(Xdev, Ydev)
print("Validation Loss:", val_loss)

Training started...
