In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Read the training names: one name per line of names.txt, with surrounding
# whitespace (including the trailing newline) stripped.
words = []
with open("names.txt", "r") as infile:
    words.extend(raw_line.strip() for raw_line in infile)
words[:8]  # preview the first eight names

In [None]:
# Number of names read from names.txt
len(words)

In [None]:
# Vocabulary tables.
# Every distinct character found in the names gets an index starting at 1
# (in sorted order); index 0 is reserved for the special "." token.
special = "."
chars = sorted(set("".join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi[special] = 0
# Inverse table: integer index -> character
itos = dict((index, symbol) for symbol, index in stoi.items())

In [None]:
# Build the training examples.
# Each example pairs the previous `block_size` character indices (the context)
# with the index of the character that follows them. Every name is terminated
# with the special token, and its context starts out filled with index 0.
block_size = 3  # context length: how many preceding characters predict the next one
inputs = []
targets = []

for name in words:
    window = [0] * block_size
    for symbol in name + special:
        target = stoi[symbol]
        inputs.append(window)
        targets.append(target)
        # Slide the window: drop the oldest index, append the newest
        window = [*window[1:], target]

X = torch.tensor(inputs)
Y = torch.tensor(targets)

In [None]:
# Dataset dimensions: X holds one row of block_size context indices per
# example, Y holds the matching target index for each row.
X.shape, Y.shape

In [None]:
# Model parameters for a small character-level MLP:
#   C      - lookup table giving each of the len(stoi) tokens a 2-D embedding
#   W1, b1 - hidden layer: 6 inputs -> 100 units
#            (6 = 3 context positions * 2 embedding dims; the forward passes
#            reshape the embeddings with view(-1, 6))
#   W2, b2 - output layer: 100 hidden units -> one logit per token
#
# Every tensor is drawn from the same seeded generator so that a fresh
# kernel run reproduces exactly the same initialization.
gen = torch.Generator().manual_seed(2147483647)
C = torch.randn((len(stoi), 2), generator=gen)
W1 = torch.randn((6, 100), generator=gen)
b1 = torch.randn(100, generator=gen)
W2 = torch.randn((100, len(stoi)), generator=gen)
b2 = torch.randn(len(stoi), generator=gen)
parameters = [C, W1, b1, W2, b2]

# Total number of trainable scalars
print(sum(p.nelement() for p in parameters))

# Enable gradient tracking on every parameter
for p in parameters:
    p.requires_grad = True

In [None]:
# Candidate learning rates for the sweep, spaced evenly on a log scale:
# lre holds 1000 exponents from -3 to 0, lrs the matching rates 10**exponent
# (i.e. 0.001 up to 1).
lre = torch.linspace(start=-3, end=0, steps=1000)
lrs = torch.pow(10, lre)

In [None]:
# Learning-rate sweep: take one SGD step per candidate rate in `lrs`
# (in order) and record the minibatch loss observed at each step.
# NOTE: the sweep updates the parameters in place, so any cell run afterwards
# continues from whatever state the sweep leaves them in.
lri = []  # log10 exponent of the rate used at each step
lossi = []  # minibatch loss measured at each step (before that step's update)

for epoch in range(len(lrs)):

    # Get indices of minibatch (drawn from the seeded generator so that a
    # fresh run reproduces the same sweep)
    ix = torch.randint(0, X.shape[0], (32,), generator=gen)

    # Forward pass
    embedding = C[X[ix]]
    h = torch.tanh(embedding.view(-1, 6) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])

    # Backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # Update: plain gradient-descent step at this step's candidate rate
    learning_rate = lrs[epoch]
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

    # Store plain Python floats rather than 0-d tensors
    lri.append(lre[epoch].item())
    lossi.append(loss.item())

In [None]:
# Locate the sweep step with the lowest recorded minibatch loss.
# np.nanargmin always yields exactly one index: ties resolve to the first
# occurrence and NaN losses (e.g. from a diverged step) are ignored.
x = np.array(lossi)
index = int(np.nanargmin(x))

# NOTE: optimal_rate is the log10 EXPONENT of the best rate (a value taken
# from `lre`), which is also what the x-axis below shows. The corresponding
# step size is 10**optimal_rate.
optimal_rate = lre[index].item()

plt.plot(lri, lossi)
plt.plot(optimal_rate, lossi[index], marker="o", color="r")
plt.xlabel("log10(learning rate)")
plt.ylabel("minibatch loss")
print(optimal_rate)

In [None]:
# Main training run at the learning rate selected by the sweep.
# `optimal_rate` holds the log10 exponent of that rate (a value in [-3, 0]),
# so it must be converted to the actual step size before use; multiplying the
# gradient by the raw exponent would give the wrong step magnitude.
learning_rate = 10**optimal_rate

for epoch in range(30000):

    # Get indices of minibatch (drawn from the seeded generator so that a
    # fresh run is reproducible)
    ix = torch.randint(0, X.shape[0], (32,), generator=gen)

    # Forward pass
    embedding = C[X[ix]]
    h = torch.tanh(embedding.view(-1, 6) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])

    # Backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # Update: gradient-descent step (move against the gradient)
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

In [None]:
# Fine-tuning run with the learning rate decayed by a factor of 10.
# `optimal_rate` holds the log10 exponent of the swept rate (a value in
# [-3, 0]), so convert it to the actual step size first and then decay it.
learning_rate = 10**optimal_rate / 10

for epoch in range(10000):

    # Get indices of minibatch (drawn from the seeded generator so that a
    # fresh run is reproducible)
    ix = torch.randint(0, X.shape[0], (32,), generator=gen)

    # Forward pass
    embedding = C[X[ix]]
    h = torch.tanh(embedding.view(-1, 6) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])

    # Backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # Update with the decayed learning rate (move against the gradient)
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

In [None]:
# Compute the loss over the entire dataset.
# This is evaluation only, so run it without autograd: no backward pass
# follows, and tracking gradients here would build a graph over every example
# for nothing.
with torch.no_grad():
    embedding = C[X]
    h = torch.tanh(embedding.view(-1, 6) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
print(loss.item())