<a href="https://colab.research.google.com/github/harveyj/aoc/blob/master/3_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch

def enc(c):
  """Map a character to its integer token id.

  The boundary marker '.' is token 0; any other character is encoded as its
  offset from 'a' plus one (so 'a' -> 1, ..., 'z' -> 26).
  """
  return 0 if c == '.' else 1 + ord(c) - ord('a')
def dec(c):
  """Map an integer token id back to its character (inverse of `enc`).

  Token 0 is the boundary marker '.'; any other id is shifted back into the
  alphabet starting at 'a' (so 1 -> 'a', ..., 26 -> 'z').
  """
  return '.' if c == 0 else chr(c + ord('a') - 1)
# Parse one name per line, skipping blank lines. Filtering blanks (rather than
# unconditionally dropping the first entry) keeps the first real name when the
# text is read straight from a file, and discards the empty "name" that a
# leading or trailing newline would otherwise leave in the list.
names = [line.strip() for line in raw_names.splitlines() if line.strip()]

# Bigram count table: N[i, j] is how many times token j follows token i.
N = torch.zeros((27, 27), dtype=torch.int32)
for n in names:
  # Wrap each name in the '.' boundary token so that name starts (row 0) and
  # name ends (column 0) are counted; without this, row 0 stays all-zero and
  # normalising it yields 0/0.
  bounded = '.' + n + '.'
  for c1, c2 in zip(bounded, bounded[1:]):
    N[enc(c1), enc(c2)] += 1
def normalize(N):
  """Turn a matrix of counts into row-wise relative frequencies.

  Args:
    N: tensor of counts with at least two dimensions; totals are taken
      along dim 1.

  Returns:
    A tensor of the same shape in which every row with a non-zero total is
    divided by that total. Rows whose total is zero are returned as all
    zeros instead of NaN.
  """
  row_totals = torch.sum(N, dim=1, keepdim=True)
  # Replace zero totals with 1 so empty rows divide to 0 rather than 0/0 = NaN.
  safe_totals = torch.where(row_totals == 0, torch.ones_like(row_totals), row_totals)
  return N / safe_totals
# Row-normalised bigram table: each row of N divided by that row's total.
norm = normalize(N)

In [None]:
import torch.nn.functional as F

# Hyperparameters.
BATCH_SIZE = 32      # intended number of examples per training step
WINDOW = 3           # how many preceding tokens form the model's context
EMBED_SIZE = 5       # width of each token's embedding vector
HIDDEN_LAYER = 200   # number of units in the hidden layer
DICT_SIZE = 27       # number of distinct tokens

# Build (context, next-token) training pairs. Every name starts from a context
# of WINDOW boundary tokens (id 0) and is terminated by a single '.'; the
# context then slides forward by one token per example.
xs = []; ys = []
for name in names:
  context = [0] * WINDOW
  for ch in name + '.':
    target = enc(ch)
    xs.append(context)
    ys.append(target)
    context = context[1:] + [target]
xs = torch.tensor(xs)
ys = torch.tensor(ys)

# for i in range(300):
#   print(list(map(dec, list(xs[i]))), dec(ys[i]))

# Model parameters, all drawn from a standard normal distribution.
C = torch.randn((DICT_SIZE, EMBED_SIZE))                 # embedding table: one row per token
W1 = torch.randn((WINDOW * EMBED_SIZE, HIDDEN_LAYER))    # concatenated embeddings -> hidden
B1 = torch.randn(HIDDEN_LAYER)                           # hidden-layer bias
W2 = torch.randn((HIDDEN_LAYER, DICT_SIZE))              # hidden -> logits over tokens
B2 = torch.randn(DICT_SIZE)                              # output bias
parameters = [C, W1, B1, W2, B2]

# Mark every parameter as trainable so autograd records gradients for it.
for param in parameters:
  param.requires_grad_(True)


In [None]:
# Mini-batch SGD training of the embedding + MLP model.
NUM_STEPS = 10000
LEARNING_RATE = 0.001

for _ in range(NUM_STEPS):
  # Draw a random mini-batch of example indices (sampled with replacement).
  ixs = torch.randint(0, xs.shape[0], (BATCH_SIZE,))

  # Forward
  emb = C[xs[ixs]]  # look up an embedding for every token in every context
  # Flatten each context's embeddings into one row before the hidden layer.
  h = torch.tanh(emb.view(-1, WINDOW*EMBED_SIZE) @ W1 + B1)
  logits = h @ W2 + B2
  loss = F.cross_entropy(logits, ys[ixs])
  # Backward
  for p in parameters: p.grad = None  # clear gradients from the previous step
  loss.backward()
  # Update
  # Apply the step under no_grad so the in-place update is not recorded by
  # autograd (preferred over writing through `.data`).
  with torch.no_grad():
    for p in parameters:
      p -= LEARNING_RATE * p.grad
loss


tensor(2.2218, grad_fn=<NllLossBackward0>)

In [None]:
# Evaluate loss on entire set
# No backward pass follows, so run under no_grad to avoid building an autograd
# graph over every example in the dataset.
with torch.no_grad():
  emb = C[xs]
  h = torch.tanh(emb.view(-1, WINDOW*EMBED_SIZE) @ W1 + B1)
  logits = h @ W2 + B2
  loss = F.cross_entropy(logits, ys)
loss


tensor(2.3858, grad_fn=<NllLossBackward0>)

In [None]:
# Sample 20 names from the model, one character at a time.
# Sampling never needs gradients, so the whole loop runs under no_grad.
with torch.no_grad():
  for _ in range(20):
    # Start from a context made entirely of the boundary token (id 0), sized
    # by WINDOW so it stays in sync with the training data.
    window = [0] * WINDOW
    while True:
      emb = C[torch.tensor(window)]
      hpreact = emb.view(-1, WINDOW*EMBED_SIZE) @ W1 + B1
      h = torch.tanh(hpreact)
      logits = h @ W2 + B2
      probs = F.softmax(logits, dim=1)
      # Draw the next token id from the predicted distribution.
      ix = torch.multinomial(probs, num_samples=1).item()
      print(dec(ix), end='')
      # Slide the context forward by one token.
      window = window[1:] + [ix]
      # Token 0 is the boundary marker: the name is complete.
      if ix == 0: break
    print('')


miki.
ali.
yamarha.
faneee.
antoniyn.
rocerie.
zakarishtyn.
lomlya.
larla.
geltzsaby.
mymonean.
wibi.
bquxn.
lunyse.
mace.
kanayla.
tadeele.
kira.
dhli.
yah.


In [None]:
# download the names.txt file from github
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
raw_names = open('names.txt', 'r').read()


--2024-02-18 18:23:48--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt.1’


2024-02-18 18:23:48 (8.21 MB/s) - ‘names.txt.1’ saved [228145/228145]



# Recall
### Embeddings
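* Keeping n-character context windows around gets out of hand quickly, because the number of possible contexts grows exponentially with the window size. With a 27-symbol alphabet (each symbol one-hot encoded as a 27-element array), a context window of 4 has 27**4 = 531441 distinct contexts.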
* Enter embeddings. Instead of one-hot encoding, map each input token (a letter, in our case) to an N-dimensional vector, where N is far smaller than the cardinality of the data. The embeddings are learned during training.
* Take the embeddings of your input, concatenate them, feed them into the first layer of the neural network.
* This reproduces the approach of Bengio et al. (2003), "A Neural Probabilistic Language Model": the embeddings feed into an MLP.