# Code from Lecture

In [1]:
# Read one name per line. The context manager closes the file handle as soon
# as the contents are read instead of leaving it open until garbage collection.
with open('../external_resources/makemore/names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

# Character vocabulary: every distinct character that occurs in the names,
# in sorted order so the index assignment is stable across runs.
chars = sorted(set(''.join(words)))
# Characters get indices 1..len(chars); index 0 is reserved for the '.'
# token that marks the start and end of a word.
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
# Inverse lookup: index -> character.
itos = {i:s for s,i in stoi.items()}

import torch


# Build the bigram training set: for every pair of adjacent characters in a
# '.'-delimited word, the first character is the input and the second the target.
xs_bi, ys_bi = [], []
for word in words:
  padded = '.' + word + '.'
  for cur_ch, next_ch in zip(padded, padded[1:]):
    xs_bi.append(stoi[cur_ch])
    ys_bi.append(stoi[next_ch])
xs_bi, ys_bi = torch.tensor(xs_bi), torch.tensor(ys_bi)
print('number of examples: ', xs_bi.nelement())

number of examples:  228146


# Write Neural Network into a Class

Here, I package Karpathy's code into a class and make it a bit more general, so that it can run both bigram and trigram models.

In [2]:

# Re-write the code from the lecture into a unified class to avoid code duplication
import torch.nn.functional as F

class NGramModel():
    """Single-layer softmax n-gram model trained with full-batch gradient descent.

    Every training input is an integer context index in ``[0, vocab_size)``.
    The model owns one weight matrix ``W`` of shape ``(vocab_size, output_dim)``;
    row ``i`` holds the logits (log-counts) over the next token given context
    ``i``. The loss is the mean negative log-likelihood of the targets plus
    ``shrinkage * mean(W**2)``.

    Args:
        xs: 1-D integer tensor of context indices.
        ys: 1-D integer tensor of target indices in ``[0, output_dim)``,
            with the same number of elements as ``xs``.
        vocab_size: number of distinct context indices (rows of ``W``).
        output_dim: number of output classes (columns of ``W``).
        step_size: learning rate of the gradient-descent update.
        shrinkage: weight of the ``mean(W**2)`` regularization term.

    Raises:
        ValueError: if ``xs`` and ``ys`` do not have the same number of elements.
    """

    def __init__(self, xs, ys, vocab_size, output_dim, step_size = 50, shrinkage=0.01):
        if xs.nelement() != ys.nelement():
            raise ValueError(
                f'xs and ys must have the same number of elements, '
                f'got {xs.nelement()} and {ys.nelement()}'
            )

        # data
        self._xs = xs
        self._ys = ys
        self._num = xs.nelement()
        self._vocab_size = vocab_size
        self._output_dim = output_dim

        # random number generator with a fixed seed, so initialization is reproducible
        self._g = torch.Generator().manual_seed(2147483647)

        # initialize parameters
        self._W = torch.randn(self._vocab_size, self._output_dim, generator=self._g, requires_grad=True)

        # hyperparameters
        self._shrinkage = shrinkage
        self._step_size = step_size

        # most recently computed loss; None until the first forward pass
        self._loss = None

    def _forward(self):
        """Compute the regularized loss on the full dataset and store it in ``self._loss``."""
        xenc = F.one_hot(self._xs, self._vocab_size).float()
        logits = xenc @ self._W # predict log-counts
        counts = logits.exp() # convert to counts
        # counts is (n x output_dim); the row-wise sum is (n x 1), so after the
        # division each row is a probability distribution over the next character
        probs = counts / counts.sum(1, keepdims=True)
        # shrinking W to zero is equivalent to encouraging the network to predict uniform probabilities
        loss = -probs[torch.arange(self._num), self._ys].log().mean() + self._shrinkage*(self._W**2).mean()
        self._loss = loss

    def _backward(self):
        """Backpropagate ``self._loss`` into ``self._W.grad``."""
        self._W.grad = None # reset the gradient so it does not accumulate across steps
        self._loss.backward()

    def _step(self):
        """Apply one plain gradient-descent update to ``W``."""
        # update under no_grad instead of going through `.data`, so autograd
        # does not record the update itself
        with torch.no_grad():
            self._W += -self._step_size * self._W.grad

    def train(self, epochs = 200):
        """Run ``epochs`` full-batch gradient-descent steps and print the loss.

        The printed value is the loss from the last forward pass, i.e. the
        loss *before* the final parameter update. If no forward pass has ever
        been run (``epochs <= 0`` on a fresh model), the loss is evaluated
        once without updating the parameters so there is something to report.
        """
        for _ in range(epochs):

            # compute loss
            self._forward()

            # compute gradients
            self._backward()

            # update parameters of the model
            # using simple gradient descent
            self._step()

        if self._loss is None:
            self._forward()

        print('loss: ', self._loss.item())
            

In [3]:
# Bigram model: contexts and targets are both single characters, so W is
# 27 x 27 (presumably 26 letters plus the '.' boundary token -- confirm
# against len(stoi)).
model = NGramModel(
    xs=xs_bi,
    ys=ys_bi,
    vocab_size=27,
    output_dim=27,
    step_size=50,
    shrinkage=0.01,
)
model.train()

loss:  2.4829957485198975


# Exercises

* E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

* E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

* E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?

* E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?
* E05: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?

# E01

In [4]:
# Wrap every name in '.' boundary markers (one at the start, one at the end).
words_mutated = [f".{word}." for word in words]

In [5]:
# Every distinct two-character window that occurs in the '.'-delimited names,
# sorted so that index assignment is stable across runs.
unique_bigrams = sorted({w[k:k+2] for w in words_mutated for k in range(len(w) - 1)})
# bigram string -> index, and the inverse lookup
stoi2 = dict(zip(unique_bigrams, range(len(unique_bigrams))))
itos2 = dict(enumerate(unique_bigrams))



In [6]:
# Build the trigram training set: each two-character window (looked up in
# stoi2) is the input and the character that follows it (looked up in stoi)
# is the target.
# NOTE(review): words carry a single leading '.', so the first target of each
# word is its second letter; the first letter is never predicted. That is why
# there are fewer examples here than in the bigram set -- keep it in mind when
# comparing the two losses.
xs_tri, ys_tri = [], []
for w in words_mutated:
  for k in range(len(w) - 2):
    xs_tri.append(stoi2[w[k:k+2]])
    ys_tri.append(stoi[w[k+2]])
xs_tri, ys_tri = torch.tensor(xs_tri), torch.tensor(ys_tri)
print('number of examples: ', xs_tri.nelement())

number of examples:  196113


In [8]:
# Trigram model: one input class per distinct two-character context, and the
# same 27 output classes as the bigram model.
tigram_model = NGramModel(
    xs=xs_tri,
    ys=ys_tri,
    vocab_size=len(unique_bigrams),
    output_dim=27,
    step_size=100,
    shrinkage=0.01,
)

In [9]:
# Train with the default number of epochs; train() prints the loss when it finishes.
tigram_model.train()

loss:  2.1890223026275635


# E02