## Exercises:
E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?

E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

E05: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?

E06: meta-exercise! Think of a fun/interesting exercise and complete it.

In [1]:
import torch
import torch.nn.functional as F

import matplotlib.pyplot as plt
from tqdm import tqdm
import time

In [2]:
with open("names.txt", "r") as f:
    words = f.read().splitlines()

In [3]:
print(len(words))

32033


In [4]:
# total words
print("Total names: ", len(words))

print("Minimum name length: ", min(len(w) for w in words))
print("Maximum name length: ", max(len(w) for w in words))

Total names:  32033
Minimum name length:  2
Maximum name length:  15


In [5]:
trigrams = {}
for w in words:
    #chs = ['<S>'] + list(w) + ['<E>']
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        trigram = (ch1+ch2, ch3)
        trigrams[trigram] = trigrams.get(trigram, 0) + 1

trigrams

{('.e', 'm'): 288,
 ('em', 'm'): 100,
 ('mm', 'a'): 72,
 ('ma', '.'): 174,
 ('.o', 'l'): 104,
 ('ol', 'i'): 69,
 ('li', 'v'): 54,
 ('iv', 'i'): 78,
 ('vi', 'a'): 147,
 ('ia', '.'): 903,
 ('.a', 'v'): 243,
 ('av', 'a'): 161,
 ('va', '.'): 93,
 ('.i', 's'): 124,
 ('is', 'a'): 142,
 ('sa', 'b'): 76,
 ('ab', 'e'): 173,
 ('be', 'l'): 201,
 ('el', 'l'): 822,
 ('ll', 'a'): 337,
 ('la', '.'): 684,
 ('.s', 'o'): 152,
 ('so', 'p'): 21,
 ('op', 'h'): 37,
 ('ph', 'i'): 61,
 ('hi', 'a'): 81,
 ('.c', 'h'): 352,
 ('ch', 'a'): 236,
 ('ha', 'r'): 329,
 ('ar', 'l'): 287,
 ('rl', 'o'): 44,
 ('lo', 't'): 14,
 ('ot', 't'): 34,
 ('tt', 'e'): 121,
 ('te', '.'): 175,
 ('.m', 'i'): 393,
 ('mi', 'a'): 95,
 ('.a', 'm'): 384,
 ('am', 'e'): 226,
 ('me', 'l'): 188,
 ('el', 'i'): 537,
 ('li', 'a'): 518,
 ('.h', 'a'): 505,
 ('ar', 'p'): 8,
 ('rp', 'e'): 5,
 ('pe', 'r'): 77,
 ('er', '.'): 683,
 ('.e', 'v'): 154,
 ('ev', 'e'): 142,
 ('ve', 'l'): 76,
 ('el', 'y'): 353,
 ('ly', 'n'): 976,
 ('yn', '.'): 953,
 ('.a', 'b'):

In [6]:
sorted_trigrams = sorted(trigrams.items(), key = lambda item: -item[1])
sorted_trigrams

[(('ah', '.'), 1714),
 (('na', '.'), 1673),
 (('an', '.'), 1509),
 (('on', '.'), 1503),
 (('.m', 'a'), 1453),
 (('.j', 'a'), 1255),
 (('.k', 'a'), 1254),
 (('en', '.'), 1217),
 (('ly', 'n'), 976),
 (('yn', '.'), 953),
 (('ar', 'i'), 950),
 (('ia', '.'), 903),
 (('ie', '.'), 858),
 (('an', 'n'), 825),
 (('el', 'l'), 822),
 (('an', 'a'), 804),
 (('ia', 'n'), 790),
 (('ma', 'r'), 776),
 (('in', '.'), 766),
 (('el', '.'), 727),
 (('ya', '.'), 716),
 (('an', 'i'), 703),
 (('.d', 'a'), 700),
 (('la', '.'), 684),
 (('er', '.'), 683),
 (('iy', 'a'), 669),
 (('la', 'n'), 647),
 (('.b', 'r'), 646),
 (('nn', 'a'), 633),
 (('.a', 'l'), 632),
 (('.c', 'a'), 628),
 (('ra', '.'), 627),
 (('ni', '.'), 625),
 (('.a', 'n'), 623),
 (('nn', '.'), 619),
 (('ne', '.'), 607),
 (('ee', '.'), 605),
 (('ey', '.'), 602),
 (('.k', 'e'), 601),
 (('al', 'e'), 601),
 (('.s', 'a'), 595),
 (('al', 'i'), 575),
 (('sh', 'a'), 562),
 (('el', 'i'), 537),
 (('.d', 'e'), 524),
 (('li', 'a'), 518),
 (('le', 'e'), 517),
 (('y

In [7]:
elements = []

for trigram in sorted_trigrams:
    elements.append(trigram[0][0])
    elements.append(trigram[0][1])


elements = list(set(elements))

print(len(elements))

628


In [8]:
eltoi = {el: i for i, el in enumerate(elements)}
#print(eltoi)
break_int = eltoi["."]

In [9]:
# int to string mapping
itoel = {i:el for i, el in enumerate(elements)}
#print(itoel)

In [10]:
# First let's create a training set of bigrams (x, y)

xs, ys = [], []

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        el1 = ch1+ch2
        el2 = ch3

        ix1 = eltoi[el1]
        ix2 = eltoi[el2]
        xs.append(ix1)
        ys.append(ix2)
        print(f"{el1} {el2}: {ix1} {ix2}")

xs = torch.tensor(xs)
ys = torch.tensor(ys)

print(xs)
print(ys)

.e m: 415 15
em m: 263 15
mm a: 343 91
ma .: 581 217
.o l: 195 137
ol i: 309 512
li v: 37 391
iv i: 221 512
vi a: 517 91
ia .: 230 217
.a v: 14 391
av a: 311 91
va .: 399 217
.i s: 404 283
is a: 589 91
sa b: 183 367
ab e: 202 348
be l: 147 137
el l: 162 137
ll a: 318 91
la .: 344 217
.s o: 56 325
so p: 539 560
op h: 433 497
ph i: 379 512
hi a: 35 91
ia .: 230 217
.c h: 559 497
ch a: 51 91
ha r: 370 596
ar l: 284 137
rl o: 341 325
lo t: 590 173
ot t: 242 173
tt e: 63 348
te .: 170 217
.m i: 440 512
mi a: 319 91
ia .: 230 217
.a m: 14 15
am e: 570 348
me l: 304 137
el i: 162 512
li a: 37 91
ia .: 230 217
.h a: 220 91
ha r: 370 596
ar p: 284 560
rp e: 510 348
pe r: 299 596
er .: 359 217
.e v: 415 391
ev e: 527 348
ve l: 307 137
el y: 162 224
ly n: 387 613
yn .: 577 217
.a b: 14 367
ab i: 202 512
bi g: 608 117
ig a: 381 91
ga i: 467 512
ai l: 437 137
il .: 368 217
.e m: 415 15
em i: 263 512
mi l: 319 137
il y: 368 224
ly .: 387 217
.e l: 415 137
el i: 162 512
li z: 37 565
iz a: 529 91
za b

In [11]:
# Create the dataset
xs, ys = [], []

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        el1 = ch1+ch2
        el2 = ch3

        ix1 = eltoi[el1]
        ix2 = eltoi[el2]
        xs.append(ix1)
        ys.append(ix2)
        #print(f"{ch1}{ch2}: {ix1} {ix2}")

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()

print("Number of examples: ", num)

Number of examples:  196113


In [12]:
# randomly initialize 27 neurons weights. Each neuron recieves 27 inputs
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((628, 628), generator=g, requires_grad=True)   # We created 27 neurons

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [14]:
# gradient descent in a loop -> training

lr = 150
alpha = 0.01

t_start = time.time()

# Gpu accelerated now
#W.to(device)
xs.to(device)
ys.to(device)

with tqdm(range(500), unit="Epoch") as tepoch:
    for epoch in tepoch:
        tepoch.set_description(f"Epoch {epoch}")
        

        # Forwards pass
        xenc = F.one_hot(xs, num_classes=628).float().to(device)

        logits = xenc @ W.to(device) # log-counts
        counts = logits.exp() # counts, equivalent to N
        probs = (counts / counts.sum(1, keepdim=True)) 
        
        # print(probs.shape)
        # print(ys.shape)

        # # regularised loss
        loss = -probs[torch.arange(num), ys].log().mean() + (alpha * (W**2).mean())

        # Backward pass
        W.grad = None   # set gradient to zero
        loss.backward()

        # update
        W.data += -lr * W.grad # gradient descent

        tepoch.set_postfix(loss=loss.item(), time=time.time() - t_start)

Epoch 999: 100%|██████████| 1000/1000 [14:30<00:00,  1.15Epoch/s, loss=2.31, time=873]


In [15]:
# finally, sample from the 'neural net' model
g = torch.Generator().manual_seed(2147483647)

for i in range(5):
  
  out = []
  ix = 0
  while True:
    
    # ----------
    # BEFORE:
    #p = P[ix]
    # ----------
    # NOW:
    xenc = F.one_hot(torch.tensor([ix]), num_classes=628).float()
    logits = xenc @ W # predict log-counts
    counts = logits.exp() # counts, equivalent to N
    p = counts / counts.sum(1, keepdims=True) # probabilities for next character
    # ----------
    
    ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itoel[ix])
    if ix == break_int:
      break
  print(''.join(out))
  print()


xlekeavlcg.kabtudirb.kecttzzwgy.

ucaanijizbmdwmneerxdcujykshcdbssriwwdvidrenthujgwil.

vhwnetvdldqanvibayeb.

spwbrotwovonkylckigzrqiyuwaystmkxbgvlmaeanoelezemnuuebeux.

ifezdhwwuymluwrsthwki.

