# Exercise 2: Split the dataset into train/dev/test sets and use them with tri-gram models

In [29]:
import torch
import torch.nn.functional as F
import numpy as np

In [None]:
# Only lower-case English letters
# Read the raw name list; the context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open("names.txt", "r", encoding="utf-8") as names_file:
    names_text = names_file.read()
# Pad each name with two leading '.' (tri-gram start context) and one trailing
# '.' (end-of-name marker).
words = [f"..{name}." for name in names_text.splitlines()]
words[:5]

['..emma.', '..olivia.', '..ava.', '..isabella.', '..sophia.']

In [31]:
# Create encoding (char -> index) and decoding (index -> char) dictionaries
# over the sorted set of characters that occur in the padded words.
chars = sorted(set("".join(words)))
ctoi = dict(zip(chars, range(len(chars))))
itoc = dict(enumerate(chars))
ctoi['.'], itoc[0], ctoi['b'], itoc[2]

(0, '.', 2, 'b')

In [32]:
# Split the words into train, test, and validation sets (80% / 10% / 10%).
# A seeded generator makes the split reproducible, and shuffling a copy keeps
# `words` untouched so re-running this cell always yields the same split.
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)
shuffled_words = [words[idx] for idx in rng.permutation(len(words))]

n = len(shuffled_words)
train_end, test_end = int(0.8 * n), int(0.9 * n)
train_words = shuffled_words[:train_end]
test_words = shuffled_words[train_end:test_end]
dev_words = shuffled_words[test_end:]

# The three splits must partition the data set exactly.
assert len(train_words) + len(test_words) + len(dev_words) == n
print(f"{len(train_words) / n:.2f}, {len(test_words) / n:.2f}, {len(dev_words) / n:.2f}")

0.80, 0.10, 0.10


## Counting model

In [33]:
# Tri-gram count tensor: N[i, j, k] counts how often character k follows the
# character pair (i, j) in the training words.
vocab_size = len(chars)
N = torch.zeros(vocab_size, vocab_size, vocab_size)
for word in train_words:
    # slide a window of three consecutive characters over the padded word
    for first, second, third in zip(word, word[1:], word[2:]):
        N[ctoi[first], ctoi[second], ctoi[third]] += 1

N = N + 1  # Laplace smoothing: no tri-gram ends up with zero count

N[ctoi['a'], ctoi['n'], ctoi['a']], N[ctoi['x'], ctoi['q'], ctoi['w']]

(tensor(641.), tensor(1.))

In [34]:
# Turn the smoothed counts into conditional probabilities: for every context
# pair (i, j) the distribution over the next character must sum to 1.
N = N.float()  # convert to float
context_totals = N.sum(dim=2, keepdim=True)
P = N / context_totals
P[14][23].sum()

tensor(1.)

In [35]:
# Probability of the last character given the first two, for "ana" and "xqw"
tuple(P[ctoi[a], ctoi[b], ctoi[c]] for a, b, c in ("ana", "xqw"))

(tensor(0.1446), tensor(0.0370))

In [36]:
def generate_name_stochastically(P):
    """Sample one name, character by character, from a tri-gram table.

    Starts from the two-character start context ".." and repeatedly draws the
    next character from ``P[i][j]``, where ``i`` and ``j`` are the indices of
    the two most recent characters. Sampling stops as soon as "." is drawn.

    Args:
        P: tensor indexed as ``P[i][j]`` that yields the sampling weights over
            the next character index.

    Returns:
        The generated name without the leading ".." and the trailing ".".
    """
    prev2, prev1 = ".", "."
    generated = []
    while True:
        next_char_probs = P[ctoi[prev2]][ctoi[prev1]]
        next_char = itoc[torch.multinomial(next_char_probs, 1).item()]
        if next_char == ".":
            # end-of-name marker: return what has been collected so far
            return "".join(generated)
        generated.append(next_char)
        prev2, prev1 = prev1, next_char

# Sample a handful of names from the counting model
for _ in range(5):
    print(generate_name_stochastically(P))

seveven
luile
mandreyracist
yia
ris


In [37]:
# Let's calculate the mean negative log likelihood (loss function) of the
# counting model on the test split.
nll = 0
n = 0  # number of tri-grams scored; starts at 0 so nll / n is a true mean
for word in test_words:
    for a, b, c in zip(word, word[1:], word[2:]):
        n += 1
        likelihood = P[ctoi[a], ctoi[b], ctoi[c]]
        nll -= torch.log(likelihood)

nll / n

tensor(2.2466)

## Neural Network Model

In [38]:
# Split data into features and labels
def words_to_features(words):
    """Convert padded words into tri-gram training examples.

    Every window of three consecutive characters contributes one example: the
    indices of the first two characters form the feature pair and the index of
    the third character is the label.

    Args:
        words: iterable of strings whose characters are all keys of ``ctoi``.

    Returns:
        A ``(features, labels)`` pair of tensors built from the collected
        index pairs and label indices.
    """
    contexts = []
    targets = []
    for word in words:
        for first, second, third in zip(word, word[1:], word[2:]):
            contexts.append((ctoi[first], ctoi[second]))
            targets.append(ctoi[third])
    return torch.tensor(contexts), torch.tensor(targets)


# Build the (context pair, next character) training tensors and show their shapes
xs_train, ys_train = words_to_features(train_words)
tuple(t.shape for t in (xs_train, ys_train))

(torch.Size([182585, 2]), torch.Size([182585]))

In [39]:
# Encode the features using one-hot encoding
def encode(x, num_classes=None):
    """One-hot encode pairs of character indices into a single feature row.

    Each row ``(a, b)`` of ``x`` becomes the concatenation
    ``[one_hot(a), one_hot(b)]``. The whole batch is encoded in one vectorised
    call instead of a Python loop over rows.

    Args:
        x: integer tensor of shape ``(n_examples, 2)`` holding character indices.
        num_classes: size of each one-hot vector. Defaults to ``len(chars)``,
            the notebook's vocabulary size.

    Returns:
        Float tensor of shape ``(n_examples, 2 * num_classes)``.
    """
    if num_classes is None:
        num_classes = len(chars)
    if x.numel() == 0:
        # No examples: F.one_hot rejects the float dtype of an empty tensor
        # literal, so return the empty feature matrix directly.
        return torch.zeros(0, 2 * num_classes)
    # (n_examples, 2) -> (n_examples, 2, num_classes)
    one_hot_pairs = F.one_hot(x.long(), num_classes=num_classes)
    # Flatten the last two axes so the encoding of `a` precedes that of `b`.
    return one_hot_pairs.flatten(start_dim=1).float()

# One-hot encode every training context pair into one feature row per example
xenc_train = encode(xs_train)

In [40]:
# Input:  [batch_size, 2*chars]
# W:      [2*chars, chars]
# Output: [batch_size, chars]
# [batch_size x (2*chars)] @ [(2*chars) x chars] = [batch_size x chars]

class NeuralNetwork():
    """Single linear layer (no bias) trained with full-batch gradient descent.

    The weight matrix ``W`` maps a concatenated pair of one-hot vectors
    (``2 * n_chars`` inputs) to ``n_chars`` logits over the next character.

    Attributes:
        W: weight tensor of shape ``(2 * n_chars, n_chars)`` with gradients enabled.
        loss: ``0`` before training; afterwards the loss tensor computed in the
            last training step.
    """

    def __init__(self, n_chars=None, generator=None):
        """Initialise the weights from a standard normal distribution.

        Args:
            n_chars: vocabulary size. Defaults to ``len(chars)``.
            generator: optional ``torch.Generator`` for reproducible weights;
                ``None`` uses torch's global random state.
        """
        if n_chars is None:
            n_chars = len(chars)
        self.W = torch.randn(2 * n_chars, n_chars, generator=generator, requires_grad=True)
        self.loss = 0

    def train(self, xenc: torch.Tensor, ys: torch.Tensor, epochs=1000, lr=0.1):
        """Run ``epochs`` steps of full-batch gradient descent on cross-entropy.

        Args:
            xenc: float feature matrix with ``W.shape[0]`` columns.
            ys: target class indices, one per row of ``xenc``.
            epochs: number of gradient steps.
            lr: learning rate.
        """
        for _ in range(epochs):
            # forward pass
            logits = xenc @ self.W
            # cross_entropy is equivalent to softmax over dim=1 followed by the
            # mean negative log-probability of the target classes.
            self.loss = F.cross_entropy(logits, ys)

            # backward pass
            self.W.grad = None
            self.loss.backward()

            # update weights; no_grad keeps the in-place step out of the
            # autograd graph without going through the legacy `.data` attribute
            with torch.no_grad():
                self.W -= lr * self.W.grad

    def test(self, xenc: torch.Tensor, ys: torch.Tensor) -> float:
        """Return the mean cross-entropy loss on ``(xenc, ys)`` as a float."""
        # Evaluation needs no gradients, so skip building the autograd graph.
        with torch.no_grad():
            logits = xenc @ self.W
            return F.cross_entropy(logits, ys).item()

In [41]:
# Seed torch so the random weight initialisation (and therefore the reported
# loss) is reproducible across kernel restarts.
torch.manual_seed(42)
model = NeuralNetwork()
model.train(xenc_train, ys_train, epochs=600, lr=30)

# Evaluate on the held-out test split using the same feature pipeline.
x_test, y_test = words_to_features(test_words)
xenc_test = encode(x_test)
model.test(xenc_test, y_test)

# LOSS on the test split (approximate; exact values vary with the random shuffle):
# random:         4.16
# counting model: 2.21
# neural network: 2.34

2.355668067932129