<a href="https://colab.research.google.com/github/iamthedoan/nn-zero-to-hero/blob/master/lectures/makemore/makemore_part1_exercises.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# download the names.txt file from github
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

--2024-06-04 06:21:56--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt’


2024-06-04 06:21:56 (1.74 MB/s) - ‘names.txt’ saved [228145/228145]



In [52]:
words = open('names.txt', 'r').read().splitlines()
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [53]:
len(words)

32033

In [5]:
min(len(w) for w in words)

2

In [6]:
max(len(w) for w in words)

15

**E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?**

In [5]:
# list of chars
chars = sorted(list(set(''.join(words))))
chars = ["."] + chars

# dict mapping char to index
stoi = {s:i for i, s in enumerate(chars)}
stoi['.'] = 0

# dict mapping index to char
itos = {i:s for s,i in stoi.items()}


In [None]:
print(stoi)

In [None]:
print(itos)

In [11]:
t = {}
for w in words[:5]:
  # add start and end tokens
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    trigram = (ch1,ch2,ch3)
    t[trigram] = t.get(trigram, 0) + 1

In [None]:
sorted(t.items(), key = lambda kv: -kv[1])

Counting

In [42]:
N = torch.ones(27,27,27, dtype=torch.int32)
# , device = device

N[0,0,0] = 0

# counting the trigrams
for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    N[ix1, ix2, ix3] += 1

# P is N but all values are probabilities
P = N / N.sum(dim = 2, keepdim = True)

In [24]:
P.shape

torch.Size([27, 27, 27])

Loss Function

In [53]:
def loss_func(input):
  log_likelihood = 0.0

  # for calculating average
  n = 0

  # for w in ["brandon"]

  for w in input:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3, in zip(chs, chs[1:], chs[2:]):
      ix1 = stoi[ch1]
      ix2 = stoi[ch2]
      ix3 = stoi[ch3]
      prob = P[ix1, ix2, ix3]
      # print(f'{ch1}{ch2}{ch3}: {prob:.4f}')

      logprob = torch.log(prob)
      log_likelihood += logprob
      n += 1

  # higher log_likelihood better, log value closer to 0 means probability was higher
  print(f'{log_likelihood=}')

  # negative log likelihood
  nll = -log_likelihood
  print(f'{nll=}')

  # normalized negative log likelihood, minimize this
  print(f'{nll/n}')



log_likelihood=tensor(-6.2656)
nll=tensor(6.2656)
2.088524103164673


In [61]:
loss_func(words)

log_likelihood=tensor(-410414.9688)
nll=tensor(410414.9688)
2.092747449874878


Sampling

In [107]:
names = []

for i in range(10):
  out = []
  ix1,ix2 = 0,0
  while True:
    p = P[ix1, ix2]
    ix1 = ix2
    ix2 = torch.multinomial(p, 1, replacement=True).item()
    if ix2 == 0:
      break
    out.append(itos[ix2])

  names.append("".join(out))

print(names)
loss_func(names)


['zur', 'yelays', 'karigha', 'yrene', 'yariola', 'glce', 'trah', 'ulennakinn', 'xashadhir', 'daishia']
log_likelihood=tensor(-130.2120)
nll=tensor(130.2120)
2.100193977355957


**Multi layer perceptron (MLP) approach**


Set up inputs and outputs

In [91]:
# create training set of bigrams (x,y)

xs, ys = [],[]

for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    # print(ch1, ch2, ch3)
    xs.append([ix1, ix2])
    ys.append(ix3)

xs = torch.tensor(xs, dtype = torch.int64)
ys = torch.tensor(ys, dtype = torch.int64)

Create NN–perform forward pass, softmax, loss, backward pass, update grad

In [18]:
# weights
W = torch.randn((27*2), 27, requires_grad=True)

num_pass = 200

for i in range(num_pass):
  # forward pass
  xenc = F.one_hot(xs, num_classes=27).float()
  xenc = xenc.view(-1, 27*2)

  # softmax, used for normalizing output to a probability distribution
  logits = xenc @ W # predict log-counts
  counts = torch.exp(logits) # counts, equivalent to N
  probs = counts / counts.sum(dim = 1, keepdims = True) # probabilites for next character

  # loss (negative log likelihood)
  loss = -probs[torch.arange(len(xs)),ys].log().mean()

  # regularization: incentivizes W to be near 0 --> smoothing
  loss += 0.2 * (W**2).mean()

  if i % 10 == 0:
    print(f"{i}: {loss.item():.4f}")


  # backward pass
  W.grad = None # set grad to zero
  loss.backward()

  # update
  # update weights
  with torch.no_grad():
      W -= 50 * W.grad

  # W.data += -50 * W.grad





0: 4.3992
10: 2.4116
20: 2.2584
30: 2.2016
40: 2.1737
50: 2.1583
60: 2.1491
70: 2.1434
80: 2.1398
90: 2.1374
100: 2.1357
110: 2.1346
120: 2.1338
130: 2.1333
140: 2.1329
150: 2.1326
160: 2.1324
170: 2.1322
180: 2.1321
190: 2.1320


Sampling

In [112]:
names = []

for i in range(10):
  out = []
  ix1,ix2 = 0,0
  while True:
    xenc = F.one_hot(torch.tensor([ix1,ix2]), num_classes = 27).float()
    xenc = xenc.view(-1, 27*2)

    logits = xenc @ W
    counts = logits.exp()
    p = counts / counts.sum(1, keepdims = True)

    ix1 = ix2
    ix2 = torch.multinomial(p, num_samples=1, replacement=True).item()
    out.append(itos[ix2])
    if ix2 == 0:
      break

  names.append("".join(out))

print(names)
loss_func(names)

['ouray.', 'aivak.', 'xa.', 'umysha.', 'chzmayla.', 'arber.', 'alyalas.', 'ir.', 'aan.', 'eryor.']
log_likelihood=tensor(-155.0886)
nll=tensor(155.0886)
2.6739416122436523


**E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?**

In [93]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(words, test_size=0.2)
dev_set, test_set = train_test_split(test_set, test_size=0.5)

x_train, y_train, x_dev, y_dev, x_test, y_test = [],[],[],[],[],[]

for dataset in [train_set, dev_set, test_set]:
  xs, ys = [], []

  for w in dataset:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
      ix1 = stoi[ch1]
      ix2 = stoi[ch2]
      ix3 = stoi[ch3]

      xs.append([ix1, ix2])
      ys.append(ix3)

  xs = torch.tensor(xs, dtype = torch.int64)
  ys = torch.tensor(ys, dtype = torch.int64)

  if dataset == train_set:
    x_train, y_train = xs, ys
  elif dataset == dev_set:
    x_dev, y_dev = xs, ys
  else:
    x_test, y_test = xs, ys


In [74]:
x_test.shape

torch.Size([19568, 2])

In [94]:
# same code as above but x and y are now the x_train and y_train

# weights
W = torch.randn((27*2), 27, requires_grad=True)

num_pass = 200

for i in range(num_pass):
  # forward pass
  xenc = F.one_hot(x_train, num_classes=27).float()
  xenc = xenc.view(-1, 27*2)

  # softmax, used for normalizing output to a probability distribution
  logits = xenc @ W # predict log-counts
  counts = torch.exp(logits) # counts, equivalent to N
  probs = counts / counts.sum(dim = 1, keepdims = True) # probabilites for next character

  # loss (negative log likelihood)
  loss = -probs[torch.arange(len(x_train)),y_train].log().mean()

  # regularization: incentivizes W to be near 0 --> smoothing
  # loss += 0.2 * (W**2).mean()

  if i % 10 == 0:
    print(f"{i}: {loss.item():.4f}")


  # backward pass
  W.grad = None # set grad to zero
  loss.backward()

  # update
  # update weights
  with torch.no_grad():
      W -= 50 * W.grad

  # W.data += -50 * W.grad



0: 4.3586
10: 2.4827
20: 2.3676
30: 2.3243
40: 2.3019
50: 2.2885
60: 2.2794
70: 2.2729
80: 2.2679
90: 2.2640
100: 2.2609
110: 2.2584
120: 2.2563
130: 2.2545
140: 2.2530
150: 2.2517
160: 2.2506
170: 2.2496
180: 2.2487
190: 2.2479


In [None]:
W[0]

In [95]:
def loss_func_MLP(x, y, W):
  xenc = F.one_hot(x, num_classes = 27).float()
  xenc = xenc.view(-1, 27*2)

  logits = xenc @ W
  counts = logits.exp()
  probs = counts / counts.sum(1, keepdims = True)

  loss = -probs[torch.arange(len(x)),y].log().mean()

  return loss.item()


In [96]:
print(f"train loss: {loss_func_MLP(x_train, y_train, W):.4f}")
print(f"dev loss: {loss_func_MLP(x_dev, y_dev, W):.4f}")
print(f"test loss: {loss_func_MLP(x_test, y_test, W):.4f}")

train loss: 2.2472
dev loss: 2.2523
test loss: 2.2518


**E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?**

In [103]:
# weights
W = torch.randn((27*2), 27, requires_grad=True)


for i in range(200):
  # forward pass
  xenc = F.one_hot(x_train, num_classes=27).float()
  xenc = xenc.view(-1, 27*2)

  # softmax, used for normalizing output to a probability distribution
  logits = xenc @ W # predict log-counts
  counts = torch.exp(logits) # counts, equivalent to N
  probs = counts / counts.sum(dim = 1, keepdims = True) # probabilites for next character

  # loss (negative log likelihood)
  loss = -probs[torch.arange(len(x_train)),y_train].log().mean()

  # regularization: incentivizes W to be near 0 --> smoothing
  # loss += 0.05 * (W**2).mean()
  # best with no regularization


  if i % 10 == 0:
    print(f"{i}: train loss {loss.item():.4f} | dev loss {loss_func_MLP(x_dev, y_dev, W):.4f}")


  # backward pass
  W.grad = None # set grad to zero
  loss.backward()

  # update
  # update weights
  with torch.no_grad():
      W -= 50 * W.grad

0: train loss 4.1880 | dev loss 4.1859
10: train loss 2.5002 | dev loss 2.5088
20: train loss 2.3823 | dev loss 2.3905
30: train loss 2.3345 | dev loss 2.3418
40: train loss 2.3085 | dev loss 2.3151
50: train loss 2.2927 | dev loss 2.2987
60: train loss 2.2823 | dev loss 2.2881
70: train loss 2.2751 | dev loss 2.2807
80: train loss 2.2698 | dev loss 2.2753
90: train loss 2.2657 | dev loss 2.2711
100: train loss 2.2625 | dev loss 2.2679
110: train loss 2.2598 | dev loss 2.2652
120: train loss 2.2576 | dev loss 2.2630
130: train loss 2.2558 | dev loss 2.2612
140: train loss 2.2542 | dev loss 2.2596
150: train loss 2.2528 | dev loss 2.2583
160: train loss 2.2516 | dev loss 2.2571
170: train loss 2.2505 | dev loss 2.2560
180: train loss 2.2496 | dev loss 2.2551
190: train loss 2.2487 | dev loss 2.2543
