In [None]:
import torch
import random
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt
from MulticoreTSNE import MulticoreTSNE as TSNE

# Seed every RNG this notebook draws from so that Restart & Run All gives the
# same results: `random` drives the word shuffle, and torch's global generator
# is used by any torch sampling call that is not handed an explicit `generator=`.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Load the corpus: one entry per line of names.txt, surrounding whitespace removed
with open("names.txt", "r") as infile:
    words = [line.strip() for line in infile]
words[:8]

In [None]:
# Number of entries read from names.txt (one per line)
len(words)

In [None]:
# Lookup tables between characters and integer ids.
# Index 0 is reserved for the special token; the sorted alphabet found in
# `words` is numbered from 1 upwards.
special = "."
chars = sorted(set("".join(words)))
STOI = dict(zip(chars, range(1, len(chars) + 1)))
STOI[special] = 0
ITOS = {index: symbol for symbol, index in STOI.items()}

In [None]:
# Construct the dataset


def build_dataset(
    words, *, special: str = ".", block_size: int = 3, stoi=None
) -> tuple[torch.Tensor, torch.Tensor]:
    """Turn a collection of words into (context, next-character) index pairs.

    Each word is terminated with ``special`` and scanned left to right with a
    sliding window of ``block_size`` character indices. The window starts out
    filled with the index of ``special`` so the first characters of a word are
    predicted from padding only.

    Args:
        words: Iterable of strings to encode.
        special: Padding / end-of-word token; must be a key of the mapping.
        block_size: Context length, i.e. how many preceding characters are used
            to predict the next one.
        stoi: Character-to-index mapping. When omitted, the notebook-level
            ``STOI`` table is used.

    Returns:
        ``(X, Y)``: ``X`` is an int64 tensor with one row of ``block_size``
        context indices per example, ``Y`` is an int64 tensor holding the index
        of the character that follows each context.

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
        KeyError: If ``special`` or any character is missing from the mapping.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be >= 1, got {block_size}")

    mapping = STOI if stoi is None else stoi
    pad = mapping[special]
    contexts, targets = [], []

    for word in words:
        context = [pad] * block_size
        for char in word + special:
            ix = mapping[char]
            contexts.append(context)
            targets.append(ix)
            # Slide the window; this builds a new list, so rows already stored
            # in `contexts` are never mutated.
            context = context[1:] + [ix]

    # Explicit dtype and shape keep the result well-formed for empty input too.
    X = torch.tensor(contexts, dtype=torch.long).reshape(len(contexts), block_size)
    Y = torch.tensor(targets, dtype=torch.long)
    print(f"Constructed dataset: X: {X.shape}, Y: {Y.shape}")
    return X, Y


# Train / development / test split: shuffle the words in place, then cut the
# list at the 80% and 90% marks.
random.shuffle(words)
n_words = len(words)
n1, n2 = int(0.8 * n_words), int(0.9 * n_words)

# Encode each slice separately, in train -> dev -> test order
splits = (words[:n1], words[n1:n2], words[n2:])
(Xtr, Ytr), (Xdev, Ydev), (Xtest, Ytest) = (build_dataset(part) for part in splits)

In [None]:
# Compress the set of characters into a 2D lookup table / embedding, followed
# by one tanh hidden layer (100 units) and a linear output layer over the
# vocabulary.
gen = torch.Generator().manual_seed(2147483647)
vocab_size = len(STOI)
C = torch.randn((vocab_size, 2), generator=gen)  # one 2D vector per character
W1 = torch.randn((6, 100), generator=gen)  # 6 inputs = 3 context chars x 2 dims
b1 = torch.randn(100, generator=gen)
# W2 / b2 draw from `gen` as well; without it they would come from torch's
# global RNG instead of the seeded generator and differ between runs.
W2 = torch.randn((100, vocab_size), generator=gen)
b2 = torch.randn(vocab_size, generator=gen)
parameters = [C, W1, b1, W2, b2]
print(sum(p.nelement() for p in parameters))

# Track gradients for every trainable tensor
for p in parameters:
    p.requires_grad = True

In [None]:
# Candidate learning rates: n_steps exponents spaced evenly between -3 and 0,
# and the matching rates 10**exponent (i.e. from 1e-3 up to 1).
n_steps = 1000
lre = torch.linspace(start=-3, end=0, steps=n_steps)
lrs = torch.pow(10, lre)

In [None]:
# Find the optimal learning rate by stepping through rates in exponential space:
# every step trains on one minibatch with the next (larger) rate from `lrs` and
# records the minibatch loss against the rate's base-10 exponent.

lri = []
lossi = []

for epoch in range(n_steps):

    # Get indices of minibatch (drawn from the seeded generator for reproducibility)
    ix = torch.randint(0, Xtr.shape[0], (32,), generator=gen)

    # Forward pass
    embedding = C[Xtr[ix]]
    h = torch.tanh(embedding.view(-1, 6) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ytr[ix])

    # Backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # Update
    learning_rate = lrs[epoch]
    for p in parameters:
        p.data += -learning_rate * p.grad

    # Store plain floats so the lists can be plotted / converted directly
    lri.append(lre[epoch].item())
    lossi.append(loss.item())

# Index of the minimal loss. nanargmin returns the first minimum, so tied
# minima or NaN losses from a diverged step no longer break the lookup.
index = int(np.nanargmin(lossi))

# NOTE: `optimal_rate` is the base-10 EXPONENT of the best rate (the value on
# the x-axis below, between -3 and 0); the step size itself is 10**optimal_rate.
optimal_rate = lre[index].item()

plt.plot(lri, lossi)
plt.plot(optimal_rate, lossi[index], marker="o", color="r")
plt.xlabel("log10(learning rate)")
plt.ylabel("minibatch loss")
print(optimal_rate)

In [None]:
# Reinitialize the network and launch training with the above optimal rate
epoch_i = list()
loss_i = list()
n_epochs = 40_000
start_decay = int(0.9 * n_epochs)
n_hidden = 300
batch_size = 32

# `optimal_rate` is the base-10 exponent selected by the learning-rate sweep
# (taken from `lre`, so it lies in [-3, 0]); convert it to the actual step size.
base_rate = 10**optimal_rate

# Every random draw below uses `gen` so the run is reproducible
gen = torch.Generator().manual_seed(2147483647)
C = torch.randn((len(STOI), 2), generator=gen)
W1 = torch.randn((6, n_hidden), generator=gen)  # 6 = 3 context chars x 2 dims
b1 = torch.randn(n_hidden, generator=gen)
W2 = torch.randn((n_hidden, len(STOI)), generator=gen)
b2 = torch.randn(len(STOI), generator=gen)
parameters = [C, W1, b1, W2, b2]

for p in parameters:
    p.requires_grad = True

for epoch in range(n_epochs):

    # Get indices of minibatch from the training data
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=gen)

    # Forward pass using the training targets
    embedding = C[Xtr[ix]]
    h = torch.tanh(embedding.view(-1, 6) @ W1 + b1)  # batch_size, n_hidden
    logits = h @ W2 + b2  # batch_size, len(stoi)
    loss = F.cross_entropy(logits, Ytr[ix])

    # Backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # Update: gradient DESCENT step; the rate drops 10x for the last 10% of steps
    current_rate = base_rate if epoch < start_decay else base_rate / 10
    for p in parameters:
        p.data -= current_rate * p.grad

    epoch_i.append(epoch)
    loss_i.append(loss.item())

plt.plot(epoch_i, loss_i)
plt.axvline(start_decay, linestyle="--", color="r")
plt.title("Loss")
plt.show()

# compute loss over the development set and compare with the training loss
# to check whether we're overfitting or not
# (no_grad: these full-dataset passes are evaluation only, no graph is needed)
with torch.no_grad():
    embedding = C[Xdev]
    h = torch.tanh(embedding.view(-1, 6) @ W1 + b1)
    logits = h @ W2 + b2
    dev_loss = F.cross_entropy(logits, Ydev)

    embedding = C[Xtr]
    h = torch.tanh(embedding.view(-1, 6) @ W1 + b1)
    logits = h @ W2 + b2
    train_loss = F.cross_entropy(logits, Ytr)

assert np.isclose(
    dev_loss.item(), train_loss.item(), atol=1e-1
), f"Overfitting: DL: {dev_loss.item():.2f}, TL: {train_loss.item():.2f}"
print(f"DL: {dev_loss.item():.2f}, TL: {train_loss.item():.2f}")

# Visualize the 2D NN character embedding
plt.figure(figsize=(8, 8))
plt.scatter(C[:, 0].data, C[:, 1].data, s=200)
for i in range(C.shape[0]):
    plt.text(C[i, 0].item(), C[i, 1].item(), ITOS[i], ha="center", va="center", color="white")
# The first argument of plt.grid is the on/off flag, so pass a boolean
plt.grid(True)

In [None]:
# Perform the same optimization using a higher dimensional embedding space
# and a slightly smaller hidden dimension.

epoch_i = list()
loss_i = list()

n_emb = 10
# Hidden-layer input width = context length x embedding size (3 x 10 = 30 here)
W1_in = Xtr.shape[1] * n_emb
n_epochs = 40_000
start_decay = int(0.9 * n_epochs)
n_hidden = 200
batch_size = 32

# `optimal_rate` is the base-10 exponent selected by the learning-rate sweep
# (taken from `lre`, so it lies in [-3, 0]); convert it to the actual step size.
base_rate = 10**optimal_rate

# Every random draw below uses `gen` so the run is reproducible
gen = torch.Generator().manual_seed(2147483647)
C = torch.randn((len(STOI), n_emb), generator=gen)
W1 = torch.randn((W1_in, n_hidden), generator=gen)
b1 = torch.randn(n_hidden, generator=gen)
W2 = torch.randn((n_hidden, len(STOI)), generator=gen)
b2 = torch.randn(len(STOI), generator=gen)
parameters = [C, W1, b1, W2, b2]

for p in parameters:
    p.requires_grad = True

for epoch in range(n_epochs):

    # Get indices of minibatch from the training data
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=gen)

    # Forward pass using the training targets
    embedding = C[Xtr[ix]]
    h = torch.tanh(embedding.view(-1, W1_in) @ W1 + b1)  # batch_size, n_hidden
    logits = h @ W2 + b2  # batch_size, len(stoi)
    loss = F.cross_entropy(logits, Ytr[ix])

    # Backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # Update: gradient DESCENT step; the rate drops 10x for the last 10% of steps
    current_rate = base_rate if epoch < start_decay else base_rate / 10
    for p in parameters:
        p.data -= current_rate * p.grad

    epoch_i.append(epoch)
    loss_i.append(loss.item())

plt.plot(epoch_i, loss_i)
plt.axvline(start_decay, linestyle="--", color="r")
plt.title("Loss")
plt.show()

# compute loss over the development set and compare with the training loss
# to check whether we're overfitting or not
# (no_grad: these full-dataset passes are evaluation only, no graph is needed)
with torch.no_grad():
    embedding = C[Xdev]
    h = torch.tanh(embedding.view(-1, W1_in) @ W1 + b1)
    logits = h @ W2 + b2
    dev_loss = F.cross_entropy(logits, Ydev)

    embedding = C[Xtr]
    h = torch.tanh(embedding.view(-1, W1_in) @ W1 + b1)
    logits = h @ W2 + b2
    train_loss = F.cross_entropy(logits, Ytr)

assert np.isclose(
    dev_loss.item(), train_loss.item(), atol=1e-1
), f"Overfitting: DL: {dev_loss.item():.2f}, TL: {train_loss.item():.2f}"
print(f"DL: {dev_loss.item():.2f}, TL: {train_loss.item():.2f}")

# Visualize the N-Dim NN character embedding
tsne = TSNE(n_jobs=4, perplexity=5)
tsne_emb = tsne.fit_transform(C.detach().numpy())

plt.figure(figsize=(8, 8))
# The t-SNE output is indexed as an array here; `.data` on it would hand
# matplotlib a raw memoryview, so pass the column slices directly.
plt.scatter(tsne_emb[:, 0], tsne_emb[:, 1], s=200)
for i in range(tsne_emb.shape[0]):
    plt.text(tsne_emb[i, 0].item(), tsne_emb[i, 1].item(), ITOS[i], ha="center", va="center", color="white")
# The first argument of plt.grid is the on/off flag, so pass a boolean
plt.grid(True)