<a href="https://colab.research.google.com/github/iamthedoan/nn-zero-to-hero/blob/master/lectures/makemore/makemore_part1_exercises.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# download the names.txt file from github
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

--2024-06-03 21:01:28--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt’


2024-06-03 21:01:28 (4.40 MB/s) - ‘names.txt’ saved [228145/228145]



In [3]:
words = open('names.txt', 'r').read().splitlines()
words[:5]

32033

In [4]:
len(words)

32033

In [5]:
min(len(w) for w in words)

2

In [6]:
max(len(w) for w in words)

15

**E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?**

In [41]:
# list of chars
chars = sorted(list(set(''.join(words))))
chars = ["."] + chars

# dict mapping char to index
stoi = {s:i for i, s in enumerate(chars)}
stoi['.'] = 0

# dict mapping index to char
itos = {i:s for s,i in stoi.items()}


In [None]:
print(stoi)

In [None]:
print(itos)

In [11]:
t = {}
for w in words[:5]:
  # add start and end tokens
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    trigram = (ch1,ch2,ch3)
    t[trigram] = t.get(trigram, 0) + 1

In [None]:
sorted(t.items(), key = lambda kv: -kv[1])

Counting

In [42]:
N = torch.ones(27,27,27, dtype=torch.int32)
# , device = device

N[0,0,0] = 0

# counting the trigrams
for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    N[ix1, ix2, ix3] += 1

# P is N but all values are probabilities
P = N / N.sum(dim = 2, keepdim = True)

In [24]:
P.shape

torch.Size([27, 27, 27])

Loss Function

In [53]:
def loss_func(input):
  log_likelihood = 0.0

  # for calculating average
  n = 0

  # for w in ["brandon"]

  for w in input:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3, in zip(chs, chs[1:], chs[2:]):
      ix1 = stoi[ch1]
      ix2 = stoi[ch2]
      ix3 = stoi[ch3]
      prob = P[ix1, ix2, ix3]
      # print(f'{ch1}{ch2}{ch3}: {prob:.4f}')

      logprob = torch.log(prob)
      log_likelihood += logprob
      n += 1

  # higher log_likelihood better, log value closer to 0 means probability was higher
  print(f'{log_likelihood=}')

  # negative log likelihood
  nll = -log_likelihood
  print(f'{nll=}')

  # normalized negative log likelihood, minimize this
  print(f'{nll/n}')



log_likelihood=tensor(-6.2656)
nll=tensor(6.2656)
2.088524103164673


In [61]:
loss_func(words)

log_likelihood=tensor(-410414.9688)
nll=tensor(410414.9688)
2.092747449874878


Sampling

In [107]:
names = []

for i in range(10):
  out = []
  ix1,ix2 = 0,0
  while True:
    p = P[ix1, ix2]
    ix1 = ix2
    ix2 = torch.multinomial(p, 1, replacement=True).item()
    if ix2 == 0:
      break
    out.append(itos[ix2])

  names.append("".join(out))

print(names)
loss_func(names)


['zur', 'yelays', 'karigha', 'yrene', 'yariola', 'glce', 'trah', 'ulennakinn', 'xashadhir', 'daishia']
log_likelihood=tensor(-130.2120)
nll=tensor(130.2120)
2.100193977355957


**E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?**

Set up inputs and outputs

In [91]:
# create training set of bigrams (x,y)

xs, ys = [],[]

for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    # print(ch1, ch2, ch3)
    xs.append([ix1, ix2])
    ys.append(ix3)

xs = torch.tensor(xs, dtype = torch.int64)
ys = torch.tensor(ys, dtype = torch.int64)

Create NN–perform forward pass, softmax, loss, backward pass, update grad

In [102]:
# weights
W = torch.randn((27*2), 27, requires_grad=True)

num_pass = 200

for i in range(num_pass):
  # forward pass
  xenc = F.one_hot(xs, num_classes=27).float()
  xenc = xenc.view(-1, 27*2)

  # softmax, used for normalizing output to a probability distribution
  logits = xenc @ W # predict log-counts
  counts = torch.exp(logits) # counts, equivalent to N
  probs = counts / counts.sum(dim = 1, keepdims = True) # probabilites for next character

  # loss (negative log likelihood)
  loss = -probs[torch.arange(len(xs)),ys].log().mean()

  # regularization: incentivizes W to be near 0 --> smoothing
  loss += 0.2 * (W**2).mean()

  if i % 10 == 0:
    print(f"{i}: {loss.item():.4f}")


  # backward pass
  W.grad = None # set grad to zero
  loss.backward()

  # update
  # update weights
  # with torch.no_grad():
  #     W -= 50 * W.grad

  W.data += -50 * W.grad





0: 4.5036
10: 2.6134
20: 2.4748
30: 2.4243
40: 2.4001
50: 2.3867
60: 2.3788
70: 2.3738
80: 2.3706
90: 2.3684
100: 2.3670
110: 2.3660
120: 2.3653
130: 2.3649
140: 2.3645
150: 2.3643
160: 2.3642
170: 2.3640
180: 2.3640
190: 2.3639


Sampling

In [112]:
names = []

for i in range(10):
  out = []
  ix1,ix2 = 0,0
  while True:
    xenc = F.one_hot(torch.tensor([ix1,ix2]), num_classes = 27).float()
    xenc = xenc.view(-1, 27*2)

    logits = xenc @ W
    counts = logits.exp()
    p = counts / counts.sum(1, keepdims = True)

    ix1 = ix2
    ix2 = torch.multinomial(p, num_samples=1, replacement=True).item()
    out.append(itos[ix2])
    if ix2 == 0:
      break

  names.append("".join(out))

print(names)
loss_func(names)

['ouray.', 'aivak.', 'xa.', 'umysha.', 'chzmayla.', 'arber.', 'alyalas.', 'ir.', 'aan.', 'eryor.']
log_likelihood=tensor(-155.0886)
nll=tensor(155.0886)
2.6739416122436523
