# Trigram Language Model
1) Train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?
2) Split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?
3) Use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?
4) We saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?
5) Look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?

In [2]:
words = open("names.txt", "r").read().splitlines()

In [19]:
# vocabulary
chars = sorted(list(set(''.join(words))))
itos = {i+1: s for i, s in enumerate(chars)}
itos[0] = '.'
stoi = {s: i for i, s in itos.items()}

In [64]:
# counts matrix
import torch
import matplotlib.pyplot as plt

vocab_size = len(itos.items())
g = torch.Generator().manual_seed(1)

N = torch.zeros((vocab_size * vocab_size, vocab_size), dtype=torch.int32)

In [161]:
# dataset

def build_dataset(words):
    xs, ys = [], []
    block_size = 2 # trigram model

    for w in words:
        context = [0] * block_size
    
        for ch in w + '.':
            ix = stoi[ch]
            xs.append(context)
            ys.append(ix)
            # print('-'.join(str(i) for i in context), '--->', ix)
                    
            index = 27*context[0] + context[1]
            N[index, ix] += 1
            
            context = context[1:] + [ix]

    X = torch.tensor(xs)
    Y = torch.tensor(ys)
    print(X.shape, Y.shape)
    return X, Y

import random
random.seed(10)
random.shuffle(words)

l = len(words)
n1 = int(0.8*l)
n2 = int(0.9*l)
Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

torch.Size([182472, 2]) torch.Size([182472])
torch.Size([22856, 2]) torch.Size([22856])
torch.Size([22818, 2]) torch.Size([22818])


In [162]:
P = (N+1).float()
P /= P.sum(1, keepdim=True)

In [79]:
# sampling from the model
g = torch.Generator().manual_seed(42)

for _ in range(20):

    out = []
    context = [0] * block_size

    while True:
        ix = 27*context[0] + context[1] # convert to corresponding row
        p = P[ix]

        iout = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        if iout == 0:
            break
        out.append(itos[iout])
        context = context[1:] + [iout]

    print(''.join(out))
    

anuee
nvtps
marian
dante
na
silayley
kemah
lucin
epiccaleen
dmzi
kence
jordon
kalla
miqrqyjaya
vihia
acen
kaitharcephelia
son
chieliyos
gan


In [84]:
# evaluating the loss
log_likelihood = 0.0
n = 0

for w in words:
    context = [0] * block_size

    for ch in w + '.':
        index = 27*context[0] + context[1]
        iout = stoi[ch]
        prob = P[index, iout]
        ll = prob.log()
        log_likelihood += ll
        n += 1
        context = context[1:] + [iout]

print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')

log_likelihood=tensor(-504653.)
nll=tensor(504653.)
2.2119739055633545


### Trigram model with NN
dataset --> init model --> gradient descent (forward pass, calculate loss, backward, update) --> sample

In [182]:
# dataset

def build_dataset(words):
    xs, ys = [], []
    block_size = 2 # trigram model

    for w in words:
        context = [0] * block_size
    
        for ch in w + '.':
            ix = stoi[ch]
            index = 27*context[0] + context[1]            

            xs.append(index)
            ys.append(ix)
            context = context[1:] + [ix]

    X = torch.tensor(xs)
    Y = torch.tensor(ys)
    print(X.shape, Y.shape)
    return X, Y

import random
random.seed(10)
random.shuffle(words)

l = len(words)
n1 = int(0.8*l)
n2 = int(0.9*l)
Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])
num = Xtr.shape[0]

torch.Size([182556]) torch.Size([182556])
torch.Size([22774]) torch.Size([22774])
torch.Size([22816]) torch.Size([22816])


In [183]:
import torch.nn.functional as F
g = torch.Generator().manual_seed(10)

# initialize the network
W = torch.randn((729, 27), generator=g, requires_grad=True)

In [208]:
# gradient descent

for i in range(100):

    # forward pass
    # xenc = F.one_hot(Xtr, num_classes=729).float()
    logits = W[Xtr]
    loss = F.cross_entropy(logits, Ytr) + 0.01*(W**2).mean()
    # counts = logits.exp()
    # probs = counts / counts.sum(1, keepdim=True)
    # loss = -probs[torch.arange(num), Ytr].log().mean() + 0.01*(W**2).mean()
    if i % 10 == 9:
        print(loss.item()) 

    # backward pass
    W.grad = None
    loss.backward()

    # update
    W.data += -100*W.grad
    

2.228956460952759
2.2286083698272705
2.228269100189209
2.227938652038574
2.227616548538208
2.2273025512695312
2.226996660232544
2.226698160171509
2.2264068126678467
2.2261226177215576


In [209]:
# loss on dev set - for hyperparameter tuning (regularization strength, learning rate)

# xenc = F.one_hot(Xdev, num_classes=729).float()
logits = W[Xdev] # xenc @ W
loss = F.cross_entropy(logits, Ydev)
# counts = logits.exp()
# probs = counts / counts.sum(1, keepdim=True)
# loss = -probs[torch.arange(Xdev.shape[0]), Ydev].log().mean()

loss.item()

2.24782657623291

In [210]:
# loss on test set - eval only once

# xenc = F.one_hot(Xte, num_classes=729).float()
logits = W[Xte] # xenc @ W
loss = F.cross_entropy(logits, Yte)
# counts = logits.exp()
# probs = counts / counts.sum(1, keepdim=True)
# loss = -probs[torch.arange(Xte.shape[0]), Yte].log().mean()

loss.item()

2.250016450881958

Cross_entropy represents a more optimized function from torch library that is able to speed up the computations

In [193]:
# sample from the model
g = torch.Generator().manual_seed(42)

for _ in range(20):

    context = [0] * block_size
    out = []

    while True:
        index = 27*context[0] + context[1]
        enc = F.one_hot(torch.tensor([index]), num_classes=729).float()
        logits = enc @ W
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdim=True)
        ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()

        if ix == 0:
            break

        out.append(itos[ix])
        context = context[1:] + [ix]
        
    print(''.join(out))

anuguelvtps
marian
dante
na
silayley
kemah
luman
epjccuoden
dazi
kence
jordon
kalla
miqrqyjaya
vihia
acen
kaitharcephelia
son
chieliyos
gan
abren
