# MLP reimplementation

Reproducing Bengio et al. (2003), "A Neural Probabilistic Language Model": https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf

Code written while following along with: https://www.youtube.com/watch?v=TCH_1BHY58I

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torchvision import datasets
from torchvision.transforms import ToTensor
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import string

# Character <-> index lookup tables. Lowercase letters take indices 1..26
# (in alphabetical order) and '.' takes index 0.
stoi = dict(zip(string.ascii_lowercase, range(1, len(string.ascii_lowercase) + 1)))
stoi['.'] = 0
# Inverse mapping, used to turn sampled indices back into characters.
itos = {index: char for char, index in stoi.items()}

In [3]:
# Read the corpus into a list of lines (presumably one name per line --
# confirm against data/names.txt). The context manager closes the file
# handle as soon as the read finishes instead of leaving it to the
# garbage collector.
with open('data/names.txt', 'r') as f:
    words = f.read().splitlines()

In [94]:
#### Hyperparameters

# Size of the "context" (look-back window): how many preceding characters
# are used to predict the next one. This is n in the paper.
n = 3
block_size = n

# Vocabulary size: the number of possible characters. Not actually tunable,
# since it is the number of letters plus 1 for the delimiter.
v = 27

# Number of features learned for each character embedding
m_dim = 10

# Number of neurons in the hidden layer
h = 200

In [95]:
#### Data prep

def build_dataset(in_words):
    """Build (context, next-character) example pairs from a list of words.

    Each word gets a trailing '.' appended and is scanned left to right.
    For every character, the indices of the ``block_size`` preceding
    characters (left-padded with index 0) form one input row, and the
    character's own index is the matching target.

    Args:
        in_words: iterable of strings whose characters are all keys of the
            module-level ``stoi`` mapping.

    Returns:
        A tuple ``(X, Y)`` of int64 tensors. ``X`` has shape
        ``(num_examples, block_size)`` and ``Y`` has shape
        ``(num_examples,)``. Empty input yields tensors with
        ``num_examples == 0`` and the same dtype and rank.

    Raises:
        KeyError: if a word contains a character that is not in ``stoi``.
    """
    X, Y = [], []

    for w in in_words:
        context = [0] * block_size
        for char in w + '.':
            ix = stoi[char]
            X.append(context)
            Y.append(ix)
            # Slide the window. This builds a new list, so the rows already
            # stored in X are never mutated.
            context = context[1:] + [ix]

    # Pin dtype and shape explicitly: torch.tensor([]) would otherwise give
    # a 1-D float32 tensor when there are no examples.
    num_examples = len(X)
    X = torch.tensor(X, dtype=torch.long).reshape(num_examples, block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    return X, Y


import random

# Shuffle in place with a fixed-seed private RNG so that a fresh run always
# produces the same train/dev/test partition (and therefore comparable
# losses). A private Random instance leaves the global RNG state untouched.
random.Random(42).shuffle(words)

# Split by word: first 80% train, next 10% dev, final 10% test.
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

X_train, Y_train = build_dataset(words[:n1])
X_dev, Y_dev = build_dataset(words[n1:n2])
X_test, Y_test = build_dataset(words[n2:])

In [97]:
#### Parameter initialization

# Embedding table: one m_dim-dimensional feature vector per character.
# NOTE(review): C is drawn uniformly from [0, 1) (torch.rand) while every
# other parameter below uses a normal draw (torch.randn) -- confirm this is
# intentional.
C = torch.rand((v, m_dim))

# Hidden layer: maps the block_size concatenated embeddings to h units.
# Weights and biases are scaled-down standard normal draws.
W1 = 0.1 * torch.randn((block_size*m_dim, h))
b1 = 0.01 * torch.randn(h)

# Output layer: maps the h hidden units to v logits. Multiplying by 0 makes
# the output bias start at zero.
W2 = 0.01 * torch.randn((h, v))
b2 = 0 * torch.randn(v)

# Enable gradient tracking on every trainable tensor.
for p in (C, W1, b1, W2, b2):
    p.requires_grad_(True)

In [None]:
# Training loop: minibatch SGD with a constant learning rate
batch_size = 32
learning_rate = 0.1

# Per-step history, consumed by the plotting cells
stepi = []
lossi = []
dead_neurons = []

for i in range(200000):
    # Minibatch: draw batch_size random example indices (with replacement)
    # from the training data.
    idxs = torch.randint(0, X_train.shape[0], (batch_size,))
    idx = X_train[idxs]

    # Forward pass
    embedding = C[idx]
    # ReLU produced lots of dead neurons, but that turned out not to be what
    # caused the bad performance at first. tanh, leaky ReLU and ReLU all
    # seem to work about the same here, so tanh is used.
    hidden_in = embedding.view(-1, block_size*m_dim)
    z1 = torch.tanh(hidden_in @ W1 + b1)  # (batch_size, h)
    logits = z1 @ W2 + b2  # (batch_size, v)
    loss = F.cross_entropy(logits, Y_train[idxs])

    # Backward pass: drop stale gradients, then backpropagate
    for params in (C, W1, b1, W2, b2):
        params.grad = None

    loss.backward()

    # Count the parameter entries that are exactly zero before this update.
    # NOTE(review): despite the name, this tallies zero-valued parameters
    # rather than inactive hidden units.
    dead = sum((p.data == 0).sum().item() for p in (C, W1, b1, W2, b2))

    # Plain SGD step
    for p in (C, W1, b1, W2, b2):
        p.data -= learning_rate * p.grad

    dead_neurons.append(dead)
    stepi.append(i)
    # log10 of the loss is stored, which is what the loss plot shows
    lossi.append(torch.log10(loss).item())
    if not i % 10000:
        print(f'Batch: {i}, training loss: {loss.item()}')

Batch: 0, training loss: 3.296767473220825
Batch: 10000, training loss: 2.417764663696289
Batch: 20000, training loss: 1.9988784790039062


In [None]:
# Plot the recorded log10 minibatch training loss against the step number.
plt.plot(stepi, lossi)

In [None]:
# Histogram (50 bins) of the hidden-layer activations currently held in z1,
# flattened to one dimension. The trailing semicolon suppresses the
# notebook's echo of the return value.
plt.hist(z1.flatten().tolist(), bins=50);

In [89]:
@torch.no_grad()
def split_loss(split):
    """Print the mean cross-entropy loss of the current model on one split.

    Runs the same forward pass as training (embedding lookup, tanh hidden
    layer, linear output layer) over the whole split at once, with gradient
    tracking disabled.

    Args:
        split: name of the split to evaluate: 'train', 'dev' or 'test'.

    Returns:
        None. The result is printed as ``'<split> split loss: <value>'``.

    Raises:
        KeyError: if ``split`` is not one of the names listed above.
    """
    data, labels = {
        # 'train' allows comparing training loss with dev/test loss
        'train': (X_train, Y_train),
        'dev': (X_dev, Y_dev),
        'test': (X_test, Y_test),
    }[split]

    embedding = C[data]
    z1 = torch.tanh(embedding.view(-1, block_size*m_dim) @ W1 + b1)
    logits = z1 @ W2 + b2
    loss = F.cross_entropy(logits, labels)
    print(f'{split} split loss: {loss}')

# Report the loss on the held-out splits: dev first, then test.
for split_name in ('dev', 'test'):
    split_loss(split_name)

dev split loss: 2.2908425331115723
test split loss: 2.2985916137695312


In [84]:
# Test it out: sample 10 names from the model, one character at a time

# Inference only: the parameters require grad, so without no_grad every
# forward pass below would record an autograd graph that is never used.
with torch.no_grad():
    for _ in range(10):
        out = []
        # Start from an all-zero context, the same padding build_dataset uses
        context = [0] * block_size
        while True:
            emd = C[context].view((1, block_size*m_dim))
            z1 = torch.tanh(emd @ W1 + b1)
            z2 = F.softmax(z1 @ W2 + b2, dim=1)
            # Draw the next character index from the predicted distribution
            idx = torch.multinomial(z2, num_samples=1, replacement=True).item()
            out.append(itos[idx])
            context = context[1:] + [idx]
            # Index 0 is the '.' delimiter, which ends the name
            if idx == 0:
                break

            # Guard against runaway generation: give up once more than 10
            # characters have been produced without hitting the delimiter.
            if len(out) > 10:
                break

        print(''.join(out))

aniyla.
rosen.
karala.
gez.
dayne.
kaleia.
aunika.
griellyessa
myra.
iva.
