# Exercise 1: Create Tri-gram Language Model

In [1]:
import torch
import torch.nn.functional as F

In [2]:
# Load the names dataset (only lower-case English letters).
# The context manager closes the file handle as soon as the text has been read,
# instead of leaving it open until garbage collection.
with open("../names.txt", "r") as names_file:
    names_text = names_file.read()
# Pad every name with two start markers and one end marker so that each
# character, including the first one, has a two-character context.
words = [f"..{name}." for name in names_text.splitlines()]
words[:5]

['..emma.', '..olivia.', '..ava.', '..isabella.', '..sophia.']

In [3]:
# Build the character <-> integer lookup tables over the sorted vocabulary
chars = sorted(set("".join(words)))
ctoi = {ch: idx for idx, ch in enumerate(chars)}  # character -> index
itoc = dict(enumerate(chars))                     # index -> character
ctoi['.'], itoc[0], ctoi['b'], itoc[2]

(0, '.', 2, 'b')

## Counting model

In [4]:
# Tri-gram count tensor: N[i, j, k] is how many times character k follows
# the character pair (i, j) in the padded words.
N = torch.zeros((len(chars),) * 3)
for word in words:
    # slide a window of three consecutive characters over the word
    for a, b, c in zip(word, word[1:], word[2:]):
        N[ctoi[a], ctoi[b], ctoi[c]] += 1

# Laplace (add-one) smoothing so that no tri-gram is left with a zero count
N = N + 1

N[ctoi['a'], ctoi['n'], ctoi['a']], N[ctoi['x'], ctoi['q'], ctoi['w']]

(tensor(805.), tensor(1.))

In [5]:
# Turn the smoothed counts into conditional probabilities: dividing by the
# total over the last axis makes every P[i, j] sum to 1.
N = N.to(torch.float32)
P = N / N.sum(dim=-1, keepdim=True)
P[14][23].sum()

tensor(1.)

In [6]:
# Probability of 'a' after the pair "an", next to that of 'w' after the pair "xq"
(
    P[ctoi['a'], ctoi['n'], ctoi['a']],
    P[ctoi['x'], ctoi['q'], ctoi['w']],
)

(tensor(0.1473), tensor(0.0370))

In [7]:
def generate_name_stochastically(P, generator=None):
    """Sample one name, character by character, from the tri-gram model.

    Relies on the notebook-level ``ctoi`` / ``itoc`` lookup tables.

    Args:
        P: tensor indexed as ``P[i, j]`` that holds the distribution over the
            next character given the two preceding character indices.
        generator: optional ``torch.Generator`` forwarded to
            ``torch.multinomial``. Pass a seeded generator for reproducible
            samples; ``None`` (the default) uses torch's global random state.

    Returns:
        The sampled name without the leading ".." and trailing "." markers.
    """
    name = ".."  # start context: two start-of-word markers
    while True:
        i, j = ctoi[name[-2]], ctoi[name[-1]]
        k = torch.multinomial(P[i, j], 1, generator=generator).item()
        name += itoc[k]
        if itoc[k] == ".":  # the end-of-word marker terminates the sample
            break
    return name[2:-1]


# Dedicated, seeded generator: the same five names are printed on every
# Restart & Run All, without touching torch's global random state.
sample_rng = torch.Generator().manual_seed(2147483647)
for _sample in range(5):
    print(generate_name_stochastically(P, generator=sample_rng))

aishmatcvone
ez
jaylakenyle
amylenne
landiti


In [8]:
# Mean negative log-likelihood of the training tri-grams (the loss function)
nll = 0
n = 0  # number of tri-grams scored; starts at 0 so that nll / n is the true mean
for word in words:
    for a, b, c in zip(word, word[1:], word[2:]):
        likelihood = P[ctoi[a], ctoi[b], ctoi[c]]
        nll -= torch.log(likelihood)
        n += 1

nll / n

tensor(2.2120)

## Neural Network Model

In [9]:
# Build the training set: each feature is the index pair of two consecutive
# characters, each label is the index of the character that follows them.
context_ids, target_ids = [], []
for word in words:
    for a, b, c in zip(word, word[1:], word[2:]):
        context_ids.append((ctoi[a], ctoi[b]))
        target_ids.append(ctoi[c])

xs = torch.tensor(context_ids)
ys = torch.tensor(target_ids)
xs.shape, ys.shape

(torch.Size([228146, 2]), torch.Size([228146]))

In [10]:
# Encode the features using one-hot encoding.
# F.one_hot on the whole [num_examples, 2] index tensor yields
# [num_examples, 2, len(chars)]; flattening the last two axes places the
# one-hot vector of the first character before that of the second, which is
# the same layout as concatenating the two encodings row by row, but without
# a Python-level loop over every example.
xenc = F.one_hot(xs, num_classes=len(chars)).flatten(start_dim=1).float()
xenc.shape

torch.Size([228146, 54])

In [37]:
# Input:  [batch_size, 2*chars]
# W: [2*chars, chars]
# Output: [batch_size, chars]
# [batch_size x (2*chars)] @ [2*chars x chars] = batch_size x chars

class NeuralNetwork():
    """Single linear layer that maps an encoded character pair to next-character logits."""

    def __init__(self):
        # Weight matrix of shape [2 * len(chars), len(chars)], drawn from a
        # standard normal distribution and tracked by autograd.
        self.W = torch.randn(len(chars) * 2, len(chars), requires_grad=True)
        # Loss of the most recent forward pass; stays 0 until train() has run.
        self.loss = 0

    def train(self, xenc: torch.Tensor, ys: torch.Tensor, epochs=1000, lr=0.1):
        """Fit ``W`` with full-batch gradient descent on the cross-entropy loss.

        Args:
            xenc: input features; multiplied with ``W`` to produce the logits.
            ys: target class indices passed to ``F.cross_entropy``.
            epochs: number of gradient-descent steps over the whole batch.
            lr: learning rate (step size) of each update.

        After the call, ``self.loss`` holds the loss tensor of the last forward
        pass, i.e. the loss measured before the final weight update.
        """
        for _ in range(epochs):
            # forward pass; cross_entropy on the logits is equivalent to
            # -softmax(logits)[arange(len(ys)), ys].log().mean()
            logits = xenc @ self.W
            self.loss = F.cross_entropy(logits, ys)

            # backward pass (reset the gradient so it does not accumulate)
            self.W.grad = None
            self.loss.backward()

            # update weights in place; no_grad keeps the step out of the
            # autograd graph without going through the discouraged `.data`
            with torch.no_grad():
                self.W -= lr * self.W.grad

In [38]:
# LOSS (for comparison):
#   random:         4.16
#   counting model: 2.21
#   neural network: 2.34
model = NeuralNetwork()
model.train(xenc=xenc, ys=ys, epochs=600, lr=30)
model.loss.item()

2.3414723873138428