# Exercise 3: Use dev set to tune hyperparameters for Tri-grams

In [1]:
import torch
import torch.nn.functional as F
import numpy as np

In [None]:
# Only lower-case English letters
names_text = open("names.txt", "r").read()
words = [f"..{name}." for name in names_text.splitlines()]

# Create encoding and decoding dictionaries
chars = sorted(list(set("".join(words))))
ctoi = {c: i for i, c in enumerate(chars)}
itoc = {i: c for i, c in enumerate(chars)}
itoc[0], itoc[1], itoc[26]

('.', 'a', 'z')

In [3]:
# Split the words into train, test, and validation sets
np.random.shuffle(words)
train_words = words[:int(0.8*len(words))]
test_words = words[int(0.8*len(words)):int(0.9*len(words))]
dev_words = words[int(0.9*len(words)):]
n = len(words)
print(f"{len(train_words) / n:.2f}, {len(test_words) / n:.2f}, {len(dev_words) / n:.2f}")

0.80, 0.10, 0.10


## Counting model

In [4]:
class TrigramCountingModel():
    def __init__(self):
        self.N = torch.zeros(len(chars), len(chars), len(chars))
        self.P = torch.zeros(len(chars), len(chars), len(chars))

    def train(self, train_words: list[str], laplace_alpha: int = 1):
        # Count trigrams
        for word in train_words:
            for i in range(len(word) - 2):
                a, b, c = word[i], word[i + 1], word[i + 2]
                self.N[ctoi[a], ctoi[b], ctoi[c]] += 1

        # Compute probabilities with Laplace smoothing
        self.P = (self.N + laplace_alpha).float() / (self.N + laplace_alpha).sum(dim=2, keepdim=True).float()

    def compute_loss(self, test_words: list[str]):
        """ Compute negative log-likelihood """
        nll = 0
        n = 0
        for word in test_words:
            for i in range(len(word) - 2):
                n += 1
                a, b, c = word[i], word[i + 1], word[i + 2]
                nll += -torch.log(self.P[ctoi[a], ctoi[b], ctoi[c]])
        return (nll / n).item()

## Neural Network Model

In [5]:
class TrigramNeuralNetwork():
    def __init__(self) -> None:
        self.W = torch.randn(len(chars) * 2, len(chars), requires_grad=True)

    def train(self, words: list[str], epochs=1000, lr=0.1, l2=0.01) -> None:
        X_train, y_train = self.words_to_features(words)
        
        for _ in range(epochs):
            # forward pass
            logits = X_train @ self.W
            loss = self.compute_loss(logits, y_train, l2)

            # backward pass
            self.W.grad = None
            loss.backward()

            # update weights
            self.W.data -= lr * self.W.grad
    
    def compute_loss(self, logits, y_test, alpha) -> float:
        cross_entropy = F.cross_entropy(logits, y_test)
        l2_regularization = alpha * torch.norm(self.W) ** 2
        return cross_entropy + l2_regularization

    def test(self, test_words) -> float:
        X_test, y_test = self.words_to_features(test_words)
        logits = X_test @ self.W
        return self.compute_loss(logits, y_test, alpha=0).item()
    
    def words_to_features(self, words: list[str]) -> tuple[torch.tensor, torch.tensor]:
        """ Convert a list of words to encoded features and labels """
        xs, ys = [], []
        for word in words:
            for i in range(len(word) - 2):
                a, b, c = word[i], word[i + 1], word[i + 2]
                xs.append((ctoi[a], ctoi[b]))
                ys.append(ctoi[c])
        return self.encode(torch.tensor(xs)), torch.tensor(ys)

    
    def encode(self, X: torch.tensor) -> torch.tensor:
        """ One-hot encode the input """
        xenc = torch.zeros(len(X), len(chars) * 2)
        for i, (a, b) in enumerate(X):
            aenc = F.one_hot(a, num_classes=len(chars))
            benc = F.one_hot(b, num_classes=len(chars))
            xenc[i] = torch.cat((aenc, benc)).float()
        return xenc

## Dev models (validation set)

In [6]:
model = TrigramCountingModel()
model.train(dev_words, laplace_alpha=1)  # best alpha is 1
loss = model.compute_loss(test_words)
print(loss)

2.3787693977355957


In [7]:
model = TrigramNeuralNetwork()
model.train(dev_words, epochs=500, lr=30, l2=0.00001)  # quite good hyperparameters
loss = model.test(test_words)
print(loss)

2.3691909313201904


## Test models

In [8]:
model = TrigramCountingModel()
model.train(train_words, laplace_alpha=1)  # best alpha is 1
loss = model.compute_loss(test_words)
print(loss)

2.24503755569458


In [9]:
model = TrigramNeuralNetwork()
model.train(train_words, epochs=500, lr=30, l2=0.00001)  # quite good hyperparameters
loss = model.test(test_words)
print(loss)

2.3527660369873047


### LOSS:
* random:         ~4.4
* counting model: 2.25
* neural network: 2.35