## Exercises:
E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?

E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

E05: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?

E06: meta-exercise! Think of a fun/interesting exercise and complete it.

In [24]:
import torch
import torch.nn.functional as F

import matplotlib.pyplot as plt

In [13]:
with open("names.txt", "r") as f:
    words = f.read().splitlines()

In [14]:
# total words
print("Total names: ", len(words))

print("Minimum name length: ", min(len(w) for w in words))
print("Maximum name length: ", max(len(w) for w in words))

Total names:  32033
Minimum name length:  2
Maximum name length:  15


In [15]:
trigrams = {}
for w in words:
    #chs = ['<S>'] + list(w) + ['<E>']
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        trigram = (ch1, ch2, ch3)
        trigrams[trigram] = trigrams.get(trigram, 0) + 1

trigrams

{('.', 'e', 'm'): 288,
 ('e', 'm', 'm'): 100,
 ('m', 'm', 'a'): 72,
 ('m', 'a', '.'): 174,
 ('.', 'o', 'l'): 104,
 ('o', 'l', 'i'): 69,
 ('l', 'i', 'v'): 54,
 ('i', 'v', 'i'): 78,
 ('v', 'i', 'a'): 147,
 ('i', 'a', '.'): 903,
 ('.', 'a', 'v'): 243,
 ('a', 'v', 'a'): 161,
 ('v', 'a', '.'): 93,
 ('.', 'i', 's'): 124,
 ('i', 's', 'a'): 142,
 ('s', 'a', 'b'): 76,
 ('a', 'b', 'e'): 173,
 ('b', 'e', 'l'): 201,
 ('e', 'l', 'l'): 822,
 ('l', 'l', 'a'): 337,
 ('l', 'a', '.'): 684,
 ('.', 's', 'o'): 152,
 ('s', 'o', 'p'): 21,
 ('o', 'p', 'h'): 37,
 ('p', 'h', 'i'): 61,
 ('h', 'i', 'a'): 81,
 ('.', 'c', 'h'): 352,
 ('c', 'h', 'a'): 236,
 ('h', 'a', 'r'): 329,
 ('a', 'r', 'l'): 287,
 ('r', 'l', 'o'): 44,
 ('l', 'o', 't'): 14,
 ('o', 't', 't'): 34,
 ('t', 't', 'e'): 121,
 ('t', 'e', '.'): 175,
 ('.', 'm', 'i'): 393,
 ('m', 'i', 'a'): 95,
 ('.', 'a', 'm'): 384,
 ('a', 'm', 'e'): 226,
 ('m', 'e', 'l'): 188,
 ('e', 'l', 'i'): 537,
 ('l', 'i', 'a'): 518,
 ('.', 'h', 'a'): 505,
 ('a', 'r', 'p'): 8,
 ('r

In [16]:
sorted(trigrams.items(), key = lambda item: -item[1])

[(('a', 'h', '.'), 1714),
 (('n', 'a', '.'), 1673),
 (('a', 'n', '.'), 1509),
 (('o', 'n', '.'), 1503),
 (('.', 'm', 'a'), 1453),
 (('.', 'j', 'a'), 1255),
 (('.', 'k', 'a'), 1254),
 (('e', 'n', '.'), 1217),
 (('l', 'y', 'n'), 976),
 (('y', 'n', '.'), 953),
 (('a', 'r', 'i'), 950),
 (('i', 'a', '.'), 903),
 (('i', 'e', '.'), 858),
 (('a', 'n', 'n'), 825),
 (('e', 'l', 'l'), 822),
 (('a', 'n', 'a'), 804),
 (('i', 'a', 'n'), 790),
 (('m', 'a', 'r'), 776),
 (('i', 'n', '.'), 766),
 (('e', 'l', '.'), 727),
 (('y', 'a', '.'), 716),
 (('a', 'n', 'i'), 703),
 (('.', 'd', 'a'), 700),
 (('l', 'a', '.'), 684),
 (('e', 'r', '.'), 683),
 (('i', 'y', 'a'), 669),
 (('l', 'a', 'n'), 647),
 (('.', 'b', 'r'), 646),
 (('n', 'n', 'a'), 633),
 (('.', 'a', 'l'), 632),
 (('.', 'c', 'a'), 628),
 (('r', 'a', '.'), 627),
 (('n', 'i', '.'), 625),
 (('.', 'a', 'n'), 623),
 (('n', 'n', '.'), 619),
 (('n', 'e', '.'), 607),
 (('e', 'e', '.'), 605),
 (('e', 'y', '.'), 602),
 (('.', 'k', 'e'), 601),
 (('a', 'l', 'e')

In [17]:
chars = sorted(list(set("".join(words))))
print(chars)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [18]:
# string to int mapping
stoi =  {}
for i, char in enumerate(chars):
    stoi[char] = i+1

print(stoi)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}


In [19]:
stoi = {char:i+1 for i, char in enumerate(chars)}
stoi['.'] = 0
print(stoi)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}


In [20]:
# int to string mapping
itos = {i+1:char for i, char in enumerate(chars)}
itos[0] = "."
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [27]:
# First let's create a training set of bigrams (x, y)

xs, ys = [], []

for w in words[:1]:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        xs.append((ix1, ix2))
        ys.append(ix3)
        print(f"{ch1}{ch2}{ch3}: {ix1} {ix2} {ix3}")

xs = torch.tensor(xs)
ys = torch.tensor(ys)

print(xs)
print(ys)

.em: 0 5 13
emm: 5 13 13
mma: 13 13 1
ma.: 13 1 0
tensor([[ 0,  5],
        [ 5, 13],
        [13, 13],
        [13,  1]])
tensor([13, 13,  1,  0])


In [26]:
xenc = F.one_hot(xs, num_classes=27).float()    # we dont want an int we want a float
print(xenc.shape)
# plt.imshow(xenc, cmap='Blues')

torch.Size([4, 2, 27])


In [94]:
# Create the dataset
xs, ys = [], []

for w in words[:1]:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        xs.append((ix1, ix2))
        ys.append(ix3)
        #print(f"{ch1}{ch2}: {ix1} {ix2}")

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()

print("Number of examples: ", num)

Number of examples:  8


In [95]:
# randomly initialize 27 neurons weights. Each neuron recieves 27 inputs
g = torch.Generator().manual_seed(2147483647)
#W = torch.randn((27, 27), generator=g, requires_grad=True)   # We created 27 neurons
W = torch.zeros((2, 27, 27), requires_grad=True)   # We created 27 neurons

In [96]:
xenc = F.one_hot(xs, num_classes=27).float().permute(1, 0, 2)
xenc.shape

torch.Size([2, 4, 27])

In [97]:
xenc @ W

print(xenc.shape)
print(W.shape)

torch.Size([2, 4, 27])
torch.Size([2, 27, 27])


In [98]:
# gradient descent in a loop -> training

lr = 50
alpha = 0.01

for k in range(1):
    # Forwards pass
    xenc = F.one_hot(xs, num_classes=27).float().permute(1, 0, 2)    # input is one-hot encoded

    logits = xenc @ W # log-counts
    counts = logits.exp() # counts, equivalent to N
    probs = (counts / counts.sum(1, keepdim=True)).permute(1, 0, 2)   # probabilities for next character 
    
    print(probs.shape)
    print(ys.shape)

    # # regularised loss
    # loss = -probs[torch.arange(num), torch.arange(num), ys].log().mean() + (alpha * (W**2).mean())
    # if k % 25 == 0:
    #     print('loss:', loss.item())

    # # Backward pass
    # W.grad = None   # set gradient to zero
    # loss.backward()

    # # update
    # W.data += -lr * W.grad # gradient descent

torch.Size([4, 2, 27])
torch.Size([4])


In [41]:
W.shape

torch.Size([27])

In [99]:
[torch.arange(num), torch.arange(num), ys]

[tensor([0, 1, 2, 3, 4, 5, 6, 7]),
 tensor([0, 1, 2, 3, 4, 5, 6, 7]),
 tensor([13, 13,  1,  0])]