In [1]:
import torch
import torch.nn.functional as F

device = "cuda:0"
words = open('names.txt', 'r').read().splitlines()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# make a list of characters (a -> z)
chars = sorted(list(set("".join(words))))
chars = ["."] + chars

# # make a dictionary of character to index
stoi = {ch: i for (i, ch) in enumerate(chars)}

# # make a dictionary of index to character
itos = {i: ch for (ch, i) in stoi.items()}

### E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

In [3]:
N = torch.ones(27, 27, 27, dtype = torch.int32, device = device)
N[0, 0, 0] = 0
# getting the Bigrams
for w in words:
    # add start and end tokens
    chs = ["."] + list(w) + ["."]
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]

        N[ix1, ix2, ix3] += 1

P = N / N.sum(dim = 2, keepdim = True)

In [4]:
g = torch.Generator(device=device).manual_seed(1122)
for i in range(20):
  out = []
  index1 = 0
  index2 = 0
  while True:
    p = P[index1,index2]
    out.append(itos[index1])
    index1 = index2
    index2 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    if index2 == 0 :
      break
  print(''.join(out).replace(".",""))

quealiellyrol
luwaqsh
joecherazr
ummann
fa
donit
br
zaelgle
sh
umic
mosinslkk
in
bexto
hast
velyne
zuri
dar
phitzssakathamorgia
den
quil


### E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [53]:
# Prepare the training set
xs , ys = [], []

for w in words:
    # add start and end tokens
    chs = ["."] + list(w) + ["."]
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]

        xs.append([ix1, ix2])
        ys.append(ix3)

xs = torch.tensor(xs, dtype=torch.int64).to(device)
ys = torch.tensor(ys, dtype=torch.int64).to(device)

In [6]:
num_dataset = len(xs)
train_num =  int(num_dataset * 0.8)

train_x = xs[:train_num]
train_y = ys[:train_num]

val_x = xs[train_num:]
val_y = ys[train_num:]

### E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?

In [7]:
torch.manual_seed(1122)

W = torch.randn((27,27,27), requires_grad = True, device = device)
def calculate_loss(x,y):
    num =  len(x)
    xenc = F.one_hot(x, num_classes = 27).float().to(device)
    first_char_enc = xenc[:,0,:] @ W.view(-1,W.shape[1] * W.shape[2])
    logits = xenc[:,1,:].unsqueeze(1) @ first_char_enc.view(len(xenc),W.shape[1],W.shape[2])
    result_exp = logits.exp()
    prob = result_exp / result_exp.sum(2, keepdims=True)
    loss = -prob.view(result_exp.shape[0],result_exp.shape[2])[torch.arange(num), y].log().mean() 
    # loss += 0.1*(W**2).mean()
    return loss

for k in range(500):
    train_loss = calculate_loss(train_x,train_y)
    W.grad = None
    train_loss.backward()
    with torch.no_grad():
        val_loss = calculate_loss(val_x,val_y)
        if k % 10 == 0:
            print(f" Train: {train_loss.item():.4f}  Val: {val_loss.item():.4f}")
        W -= 40 * W.grad
    

 Train: 3.7958  Val: 3.8308
 Train: 3.2885  Val: 3.4444
 Train: 3.0320  Val: 3.2426
 Train: 2.8811  Val: 3.1234
 Train: 2.7778  Val: 3.0390
 Train: 2.7011  Val: 2.9742
 Train: 2.6416  Val: 2.9225
 Train: 2.5938  Val: 2.8800
 Train: 2.5544  Val: 2.8443
 Train: 2.5214  Val: 2.8139
 Train: 2.4933  Val: 2.7876
 Train: 2.4690  Val: 2.7646
 Train: 2.4479  Val: 2.7443
 Train: 2.4294  Val: 2.7264
 Train: 2.4129  Val: 2.7103
 Train: 2.3983  Val: 2.6957
 Train: 2.3851  Val: 2.6826
 Train: 2.3731  Val: 2.6706
 Train: 2.3623  Val: 2.6596
 Train: 2.3524  Val: 2.6495
 Train: 2.3433  Val: 2.6401
 Train: 2.3349  Val: 2.6315
 Train: 2.3271  Val: 2.6234
 Train: 2.3199  Val: 2.6159
 Train: 2.3131  Val: 2.6088
 Train: 2.3068  Val: 2.6022
 Train: 2.3010  Val: 2.5960
 Train: 2.2954  Val: 2.5901
 Train: 2.2902  Val: 2.5846
 Train: 2.2853  Val: 2.5794
 Train: 2.2807  Val: 2.5744
 Train: 2.2763  Val: 2.5697
 Train: 2.2722  Val: 2.5652
 Train: 2.2682  Val: 2.5610
 Train: 2.2645  Val: 2.5569
 Train: 2.2609  Val:

### E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

In [131]:
torch.manual_seed(1122)
W = torch.randn((27,27,27), requires_grad = True, device = device)
num = len(train_x)

for k in range(500):
    logits = W[train_x[:,0],train_x[:,1]]
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(num), train_y].log().mean()
    W.grad = None
    loss.backward()
    with torch.no_grad():
        W -= 40 * W.grad
        if k % 10 == 0 :
            print(f" Train: {loss.item():.4f}  ")


 Train: 3.6952  
 Train: 3.1927  
 Train: 2.9385  
 Train: 2.7890  
 Train: 2.6866  
 Train: 2.6106  
 Train: 2.5515  
 Train: 2.5039  
 Train: 2.4648  
 Train: 2.4318  
 Train: 2.4037  
 Train: 2.3795  
 Train: 2.3583  
 Train: 2.3397  
 Train: 2.3232  
 Train: 2.3085  
 Train: 2.2952  
 Train: 2.2832  
 Train: 2.2723  
 Train: 2.2623  
 Train: 2.2531  
 Train: 2.2446  
 Train: 2.2367  
 Train: 2.2294  
 Train: 2.2226  
 Train: 2.2162  
 Train: 2.2103  
 Train: 2.2047  
 Train: 2.1994  
 Train: 2.1944  
 Train: 2.1897  
 Train: 2.1852  
 Train: 2.1810  
 Train: 2.1770  
 Train: 2.1731  
 Train: 2.1695  
 Train: 2.1660  
 Train: 2.1627  
 Train: 2.1595  
 Train: 2.1565  
 Train: 2.1536  
 Train: 2.1508  
 Train: 2.1482  
 Train: 2.1456  
 Train: 2.1431  
 Train: 2.1408  
 Train: 2.1385  
 Train: 2.1363  
 Train: 2.1342  
 Train: 2.1322  


### E05: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?

In [132]:
torch.manual_seed(1122)
W = torch.randn((27,27,27), requires_grad = True, device = device)
num = len(train_x)
for k in range(500):
    logits = W[train_x[:,0],train_x[:,1]]
    loss = F.cross_entropy(logits, train_y)
    W.grad = None
    loss.backward()
    with torch.no_grad():
        W -= 40 * W.grad
        if k % 10 == 0 :
            print(f" Train: {loss.item():.4f}  ")


 Train: 3.6952  
 Train: 3.1927  
 Train: 2.9385  
 Train: 2.7890  
 Train: 2.6866  
 Train: 2.6106  
 Train: 2.5515  
 Train: 2.5039  
 Train: 2.4648  
 Train: 2.4318  
 Train: 2.4037  
 Train: 2.3795  
 Train: 2.3583  
 Train: 2.3397  
 Train: 2.3232  
 Train: 2.3085  
 Train: 2.2952  
 Train: 2.2832  
 Train: 2.2723  
 Train: 2.2623  
 Train: 2.2531  
 Train: 2.2446  
 Train: 2.2367  
 Train: 2.2294  
 Train: 2.2226  
 Train: 2.2162  
 Train: 2.2103  
 Train: 2.2047  
 Train: 2.1994  
 Train: 2.1944  
 Train: 2.1897  
 Train: 2.1852  
 Train: 2.1810  
 Train: 2.1770  
 Train: 2.1731  
 Train: 2.1695  
 Train: 2.1660  
 Train: 2.1627  
 Train: 2.1596  
 Train: 2.1565  
 Train: 2.1536  
 Train: 2.1508  
 Train: 2.1482  
 Train: 2.1456  
 Train: 2.1432  
 Train: 2.1408  
 Train: 2.1385  
 Train: 2.1363  
 Train: 2.1342  
 Train: 2.1322  


### E06: meta-exercise! Think of a fun/interesting exercise and complete it