In [None]:
import torch
import torch.nn.functional as F

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

To add more context, we could use more preceding letters to improve the predictions. However, the number of rows in the contingency table would then grow exponentially with the number of context letters.

Solution: Use the NN approach

We will implement an MLP model, based on the first paper that proposed a similar algorithm:
[Open PDF](papers/BengioDucharmeVincentJanvin2003.pdf)

The idea of the paper is:
- Use multiple inputs simultaneously
- Use an embedding to encode the inputs (in the paper, words). 
    - Significantly lower dimension than one-hot encoding
    - The same encoder is used for all characters
    - Saves a lot of parameters to learn
- Use a single hidden fully connected layer with tanh as activation function
- Use a softmax output layer for the next character probabilities

In [None]:
# Read the training names, one per line; the context manager closes the file handle
with open('data/names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
words[:8]

In [None]:
len(words)

In [None]:
# Vocabulary: the '.' marker comes first (index 0), followed by the sorted characters found in the names
all_chars = ['.'] + sorted(set(''.join(words)))
itos = dict(enumerate(all_chars))             # integer index -> character
stoi = {ch: idx for idx, ch in itos.items()}  # character -> integer index

In [None]:
len(all_chars)

In [None]:
print(itos)

In [None]:
# Build a small demo dataset from the first five names.
# Each example pairs a context of `block_size` character indices with the index of the next character.
block_size = 3
X, Y = [], []

for w in words[:5]:
    print(w)
    context = [0] * block_size  # every word starts from an all-zero context
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        shown_context = ''.join(itos[i] for i in context)
        print(f"{shown_context} ---> {itos[ix]}")
        context = context[1:] + [ix]  # slide the window one character forward

X, Y = torch.tensor(X), torch.tensor(Y)

In [None]:
X.shape, X.dtype, Y.shape, Y.dtype

In [None]:
X[:10]

In [None]:
Y[:10]

In the paper, they map the original 17,000-word vocabulary into a 30-, 60- or 100-dimensional space.

Here, we will move from the 27-character space to a 2-dimensional space

In [None]:
C = torch.randn((27, 2))

In [None]:
C

In [None]:
# Initial code for character at position 5
C[5]

In [None]:
# The alternative one-hot encoding of 5 
F.one_hot(torch.tensor(5), num_classes=27)

In [None]:
F.one_hot(torch.tensor(5), num_classes=27).float() @ C 

Because of the way the multiplication is performed (the one-hot vector selects a single row of `C`), this is equivalent to direct indexing.

We can therefore understand the embedding operation as the first layer of the neural network
- Linear layer where the proper encoding is learned by backpropagation.
- No activation function 

In order to implement the layer, we can use torch indexing power.

In [None]:
C[[5, 6, 7]]

In [None]:
# Indices can even be repeated; the corresponding rows are then duplicated in the result
C[[3, 4, 5, 5, 5, 5]]

In [None]:
# We can also index with multidimensional arrays
C[torch.tensor([[2, 3, 4], [3, 4, 5], [4, 5, 6], [5, 6, 7]])]

In [None]:
C[torch.tensor([[2, 3, 4], [3, 4, 5], [4, 5, 6], [5, 6, 7]])].shape

In [None]:
# So, we can directly use the vector X to index in the embedding matrix
C[X]

In [None]:
C[X].shape

In [None]:
# lets create the embedding layer
emb = C[X]
emb.shape

In [None]:
# Now, lets create the fully connected layer (the one with tanh activation)
# the number of inputs is 2 x 3 = 6, and we will use 100 neurons
W1 = torch.randn((6, 100))
b1 = torch.randn(100)

In [None]:
# Now we have a problem: the embedding output cannot be multiplied by the first layer yet.
# The failure is caught and displayed so the notebook still runs from top to bottom.
try:
    emb @ W1 + b1
except RuntimeError as err:
    print("RuntimeError:", err)

In [None]:
# The problem: the embedding results are split into three different components
# We need to concatenate the output of all the embeddings [32, 3, 2] -> [32, 6]
# Torch has many different functions that can perform that operation
emb[:, 0, :].shape


In [None]:
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], dim=1).shape

In [None]:
# Another possibility is to use torch.unbind, which removes a dimension from a tensor, returning a tuple of slices
torch.cat(torch.unbind(emb, 1), dim=1).shape

In [None]:
torch.all(torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], dim=1) == torch.cat(torch.unbind(emb, 1), dim=1))

In [None]:
# A simpler operator is view, which dynamically reshapes the information of the tensor
emb.view(-1, 6).shape

In [None]:
# Results of both operations are equivalent
torch.all(torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], dim=1) == emb.view(-1, 6))

In [None]:
# Modifying the original code
h = (emb.view(-1, 6) @ W1 + b1)
h.shape

In [None]:
# we got the output of each of the 100 neurons for each of the 32 inputs
h

In [None]:
# lets add the tanh
h = (emb.view(-1, 6) @ W1 + b1).tanh()
h

In [None]:
# Lets create the final layer
W2 = torch.randn([100, len(all_chars)])
b2 = torch.rand(len(all_chars))

In [None]:
# And calculate the softmax
logits = h @ W2 + b2
counts = logits.exp()
probs = counts / counts.sum(dim=1, keepdim=True)
probs.shape

In [None]:
# Now the loss: mean negative log-probability assigned to the correct next character.
# The row indices are derived from the number of examples in Y instead of a hard-coded 32,
# so the cell keeps working when the dataset size changes.
loss = -probs[torch.arange(Y.shape[0]), Y].log().mean()
loss

In [None]:
# There is a Torch function that performs softmax with the cross entropy loss
F.cross_entropy(logits, Y)

This is not only clearer but also faster, because the derivatives are calculated directly (like in the minigrad example of tanh)

Lets put it all together

In [None]:
# Re-create every parameter from a seeded generator so the run is reproducible
g = torch.Generator().manual_seed(31416)
vocab_size = len(all_chars)
C = torch.randn((27, 2), generator=g)             # embedding table
W1 = torch.randn((6, 100), generator=g)           # hidden layer weights
b1 = torch.randn(100, generator=g)                # hidden layer bias
W2 = torch.randn([100, vocab_size], generator=g)  # output layer weights
b2 = torch.rand(vocab_size, generator=g)          # output layer bias
parameters = [C, W1, b1, W2, b2]

# enable gradient tracking on all of them
for p in parameters:
    p.requires_grad_(True)

In [None]:
sum(p.nelement() for p in parameters)

In [None]:
for epoch in range(10000):

    # forward pass: embedding lookup -> tanh hidden layer -> output logits -> cross-entropy
    emb = C[X]
    h = (emb.view(-1, 6) @ W1 + b1).tanh()
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)

    # report the first ten epochs and then every thousandth one
    if epoch % 1000 == 0 or epoch < 10:
        print(f"Epoch:{epoch}, loss={loss.item()}")

    # backward pass: drop the previous gradients before backpropagating
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: plain gradient-descent step
    for p in parameters:
        p.data.sub_(0.01 * p.grad)



The loss goes down so fast because we are overfitting a model with thousands of parameters to only 32 examples

In [None]:
torch.max(logits, dim=1)

In [None]:
Y

There are a few errors, mostly because there are identical inputs with different outputs.

Now, lets try with the full dataset.

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Build the dataset from every name: sliding windows of `block_size` character indices (X)
# paired with the index of the character that follows each window (Y)
block_size = 3
X, Y = [], []

for w in words:
    context = [0] * block_size
    for ix in (stoi[ch] for ch in w + '.'):
        X.append(context)
        Y.append(ix)
        context = context[1:] + [ix]

X = torch.tensor(X, device=device)
Y = torch.tensor(Y, device=device)

In [None]:
X.shape, Y.shape

In [None]:
# Same initialisation as before, now with the generator and every tensor created on `device`
g = torch.Generator(device=device).manual_seed(31416)
rand_kwargs = dict(generator=g, device=device)
C = torch.randn((27, 2), **rand_kwargs)
W1 = torch.randn((6, 100), **rand_kwargs)
b1 = torch.randn(100, **rand_kwargs)
W2 = torch.randn([100, len(all_chars)], **rand_kwargs)
b2 = torch.rand(len(all_chars), **rand_kwargs)
parameters = [C, W1, b1, W2, b2]

for p in parameters:
    p.requires_grad_(True)

In [None]:

for epoch in range(100):

    # forward pass over the whole dataset: embedding -> tanh hidden layer -> logits -> cross-entropy
    emb = C[X]
    h = (emb.view(-1, 6) @ W1 + b1).tanh()
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)

    # report the first ten epochs and then every tenth one
    if epoch % 10 == 0 or epoch < 10:
        print(f"Epoch:{epoch}, loss={loss.item()}")

    # backward pass: drop the previous gradients before backpropagating
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: plain gradient-descent step
    for p in parameters:
        p.data.sub_(0.1 * p.grad)



The loss is going down, but we can see that each iteration is much slower than before.
- Now the loss is computed through a large and complex graph, over more than 200K examples!

Solution: Run on mini-batches of the problem
- A minibatch is a random subset of the training data
- We have many local improvements of the loss function instead of a slow global improvement

In [None]:
# We are going to select a random subset of indices of element in the training sample
torch.randint(0, X.shape[0], (32,))

In [None]:
# Reset every parameter from the same seed before the minibatch experiment
g = torch.Generator(device=device).manual_seed(31416)
rand_kwargs = dict(generator=g, device=device)
C = torch.randn((27, 2), **rand_kwargs)
W1 = torch.randn((6, 100), **rand_kwargs)
b1 = torch.randn(100, **rand_kwargs)
W2 = torch.randn([100, len(all_chars)], **rand_kwargs)
b2 = torch.rand(len(all_chars), **rand_kwargs)
parameters = [C, W1, b1, W2, b2]

for p in parameters:
    p.requires_grad_(True)

In [None]:
batch_size = 32

for epoch in range(10000):

    # build minibatch: `batch_size` example indices drawn uniformly at random
    ix = torch.randint(0, X.shape[0], (batch_size,))

    # forward pass on the minibatch only
    emb = C[X[ix]]
    h = (emb.view(-1, 6) @ W1 + b1).tanh()
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])

    # report the first ten steps and then every thousandth one
    if epoch % 1000 == 0 or epoch < 10:
        print(f"Epoch:{epoch}, loss={loss.item()}")

    # backward pass: drop the previous gradients before backpropagating
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: plain gradient-descent step
    for p in parameters:
        p.data.sub_(0.1 * p.grad)

Now the loss is evaluated only on the minibatches, not the whole training set
- The quality of the gradient is lower
- The loss is moving up and down, depending on the particular minibatch, but the trend is going down

Usually, it is better to take many fast low-quality steps than a single slow high-quality step.

In [None]:
# Evaluate the loss on the whole training set; gradient tracking is switched off for the forward pass
with torch.no_grad():
    emb = C[X]
    h = (emb.view(-1, 6) @ W1 + b1).tanh()
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
print(loss.item())

In [None]:
# Compare several learning rates: for each one, restart from the same seeded
# initialisation and train for 1000 minibatch steps

for learning_rate in [0.0001, 0.001, 0.01, 1, 10, 0.1]:
    print("Learning rate:", learning_rate)
    print("="*30)

    # identical starting point for every learning rate
    g = torch.Generator(device=device).manual_seed(31416)
    rand_kwargs = dict(generator=g, device=device)
    C = torch.randn((27, 2), **rand_kwargs)
    W1 = torch.randn((6, 100), **rand_kwargs)
    b1 = torch.randn(100, **rand_kwargs)
    W2 = torch.randn([100, len(all_chars)], **rand_kwargs)
    b2 = torch.rand(len(all_chars), **rand_kwargs)
    parameters = [C, W1, b1, W2, b2]

    for p in parameters:
        p.requires_grad_(True)
    batch_size = 32

    for epoch in range(1000):

        # build minibatch
        ix = torch.randint(0, X.shape[0], (batch_size,))

        # forward pass
        emb = C[X[ix]]
        h = (emb.view(-1, 6) @ W1 + b1).tanh()
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits, Y[ix])

        # report the first ten steps and then every hundredth one
        if epoch % 100 == 0 or epoch < 10:
            print(f"Epoch:{epoch}, loss={loss.item()}")

        # backward pass
        for p in parameters:
            p.grad = None
        loss.backward()

        # update with the learning rate under test
        for p in parameters:
            p.data.sub_(learning_rate * p.grad)

0.1 seems to be the best value. Lets train the whole network with that value, and show the evolution of the loss

In [None]:
# Fresh seeded parameters plus an empty list that will collect the per-step loss
g = torch.Generator(device=device).manual_seed(31416)
rand_kwargs = dict(generator=g, device=device)
C = torch.randn((27, 2), **rand_kwargs)
W1 = torch.randn((6, 100), **rand_kwargs)
b1 = torch.randn(100, **rand_kwargs)
W2 = torch.randn([100, len(all_chars)], **rand_kwargs)
b2 = torch.rand(len(all_chars), **rand_kwargs)
parameters = [C, W1, b1, W2, b2]

for p in parameters:
    p.requires_grad_(True)

batch_size = 32
lossi = []

In [None]:
for epoch in range(20000):

    # build minibatch
    ix = torch.randint(0, X.shape[0], (batch_size,))

    # forward pass
    emb = C[X[ix]]
    h = (emb.view(-1, 6) @ W1 + b1).tanh()
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])

    # keep the minibatch loss so it can be plotted afterwards
    lossi.append(loss.item())

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    for p in parameters:
        p.data.sub_(0.1 * p.grad)

In [None]:
plt.plot(lossi)

In [None]:
# Evaluate the loss on the whole training set; gradient tracking is switched off for the forward pass
with torch.no_grad():
    emb = C[X]
    h = (emb.view(-1, 6) @ W1 + b1).tanh()
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
print(loss.item())

In general, the learning rate should be adjusted while training, going from larger values in the beginning to smaller values in final epochs. This is known as **learning rate decay**

We will implement a very simple type of decay.

In [None]:
# Fresh seeded parameters and an empty loss history for the learning-rate-decay run
g = torch.Generator(device=device).manual_seed(31416)
rand_kwargs = dict(generator=g, device=device)
C = torch.randn((27, 2), **rand_kwargs)
W1 = torch.randn((6, 100), **rand_kwargs)
b1 = torch.randn(100, **rand_kwargs)
W2 = torch.randn([100, len(all_chars)], **rand_kwargs)
b2 = torch.rand(len(all_chars), **rand_kwargs)
parameters = [C, W1, b1, W2, b2]

for p in parameters:
    p.requires_grad_(True)

batch_size = 32
lossi = []

In [None]:
num_epochs = 20000

for epoch in range(num_epochs):

    # build minibatch
    ix = torch.randint(0, X.shape[0], (batch_size,))

    # forward pass
    emb = C[X[ix]]
    h = (emb.view(-1, 6) @ W1 + b1).tanh()
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])

    lossi.append(loss.item())

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # step decay: 0.1 during the first half of training, 0.01 afterwards
    if epoch < num_epochs / 2:
        learning_rate = 0.1
    else:
        learning_rate = 0.01

    # update
    for p in parameters:
        p.data.sub_(learning_rate * p.grad)

In [None]:
# Evaluate the loss on the whole training set; gradient tracking is switched off for the forward pass
with torch.no_grad():
    emb = C[X]
    h = (emb.view(-1, 6) @ W1 + b1).tanh()
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
print(loss.item())

You can see that this loss is lower than the best one we achieved with the bi-gram model: 2.47446

But there is a problem: this model has many more parameters than the previous one, so maybe it is only overfitting by memorizing the training set.

Solution: Split the available data into three different sets:
- Training set: used to tune parameters of the models
- Validation set: used to select between models and to tune the hyper-parameters of a given model
- Test set: used to estimate the power of generalization of a resultant model.

Note:
- Training and validation sets can be used freely, but the use of test set must be strictly limited.

In [None]:
# build the dataset
def build_dataset(words, block_size=3):
    """Turn a list of names into next-character training examples.

    Args:
        words: iterable of strings. Every character (and the '.' terminator) is
            looked up in the global `stoi` mapping.
        block_size: number of preceding characters used as context. Defaults to 3,
            the value that was previously hard-coded inside the function.

    Returns:
        (X, Y): tensors created on the global `device`. Each row of X holds
        `block_size` context indices and the matching entry of Y is the index
        of the character that follows that context.
    """
    X, Y = [], []

    for w in words:
        context = [0] * block_size  # every word starts from an all-zero context
        for ch in w + '.':          # '.' is appended as the end-of-word target
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]  # slide the window forward by one character

    X = torch.tensor(X, device=device)
    Y = torch.tensor(Y, device=device)
    return X, Y

import random

# Shuffle a copy rather than `words` itself: the original list keeps its order and
# re-running this cell always produces the same 80/10/10 split.
random.seed(314)
shuffled_words = list(words)
random.shuffle(shuffled_words)
n1 = int(0.8 * len(shuffled_words))
n2 = int(0.9 * len(shuffled_words))
Xtr, Ytr = build_dataset(shuffled_words[:n1])      # training set (first 80%)
Xval, Yval = build_dataset(shuffled_words[n1:n2])  # validation set (next 10%)
Xte, Yte = build_dataset(shuffled_words[n2:])      # test set (last 10%)

In [None]:
 len(words), len(Xtr), len(Xval), len(Xte)

Now, lets transform all the previous code to the new datasets

In [None]:
# Fresh seeded parameters for training on the new training split
g = torch.Generator(device=device).manual_seed(31416)
rand_kwargs = dict(generator=g, device=device)
C = torch.randn((27, 2), **rand_kwargs)
W1 = torch.randn((6, 100), **rand_kwargs)
b1 = torch.randn(100, **rand_kwargs)
W2 = torch.randn([100, len(all_chars)], **rand_kwargs)
b2 = torch.rand(len(all_chars), **rand_kwargs)
parameters = [C, W1, b1, W2, b2]

for p in parameters:
    p.requires_grad_(True)

batch_size = 32
lossi = []

In [None]:
# Defined in this cell so that the loop length and the learning-rate schedule share one
# value instead of relying on a variable left over from an earlier cell.
num_epochs = 20000

for epoch in range(num_epochs):

    # build minibatch
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))

    # forward pass
    emb = C[Xtr[ix]]
    h = torch.tanh((emb.view(-1, 6) @ W1 + b1))
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ytr[ix])

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # step decay: 0.1 during the first half of training, 0.01 afterwards
    learning_rate = 0.1 if epoch < num_epochs / 2 else 0.01
    # update
    for p in parameters:
        p.data -= learning_rate * p.grad


In [None]:
# Report the loss on the training and validation splits, in that order
with torch.no_grad():
    for split_name, (x_split, y_split) in (("Training", (Xtr, Ytr)), ("Validation", (Xval, Yval))):
        emb = C[x_split]
        h = (emb.view(-1, 6) @ W1 + b1).tanh()
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits, y_split)
        print(f"{split_name} loss", loss.item())

Since both losses are similar, the network is not memorizing the training data but actually learning from it.

On the other hand, the model is underfitting: the training loss is no better than the validation loss, so the model is not yet powerful enough to overfit. We can change to a model with more parameters.

Lets increase the size of the hidden layer.
- Note that here we are tuning hyperparameters

In [None]:
# Same seeded initialisation, with the hidden layer widened to 300 units
g = torch.Generator(device=device).manual_seed(31416)
rand_kwargs = dict(generator=g, device=device)
C = torch.randn((27, 2), **rand_kwargs)
W1 = torch.randn((6, 300), **rand_kwargs)
b1 = torch.randn(300, **rand_kwargs)
W2 = torch.randn([300, len(all_chars)], **rand_kwargs)
b2 = torch.rand(len(all_chars), **rand_kwargs)
parameters = [C, W1, b1, W2, b2]

for p in parameters:
    p.requires_grad_(True)

batch_size = 32
lossi = []

In [None]:
sum(p.nelement() for p in parameters)

In [None]:

# Defined in this cell so that the loop length and the learning-rate schedule share one
# value instead of relying on a variable left over from an earlier cell.
num_epochs = 20000

for epoch in range(num_epochs):

    # build minibatch
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))

    # forward pass
    emb = C[Xtr[ix]]
    h = torch.tanh((emb.view(-1, 6) @ W1 + b1))
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ytr[ix])
    lossi.append(loss.item())

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # step decay: 0.1 during the first half of training, 0.01 afterwards
    learning_rate = 0.1 if epoch < num_epochs / 2 else 0.01
    # update
    for p in parameters:
        p.data -= learning_rate * p.grad


In [None]:
# Report the loss on the training and validation splits, in that order
with torch.no_grad():
    for split_name, (x_split, y_split) in (("Training", (Xtr, Ytr)), ("Validation", (Xval, Yval))):
        emb = C[x_split]
        h = (emb.view(-1, 6) @ W1 + b1).tanh()
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits, y_split)
        print(f"{split_name} loss", loss.item())


Now the loss is lower, and we can keep increasing the number of network parameters
- Increasing the number of parameters usually increases the training time.

Lets take a look at the loss.

In [None]:
plt.plot(lossi)

There is a lot of variance in the loss because the batch size is small: each batch is a non-representative fraction of the whole dataset. Lets increase it.

In [None]:
def inner():
    """Train the 300-unit model with a batch size of 64 and plot its minibatch loss curve.

    The parameters and the loss history created here are local to the function, so the
    notebook-level variables with the same names are not modified. Reads the globals
    `device`, `all_chars`, `Xtr` and `Ytr`.
    """
    # Local constant: the loop length and the learning-rate schedule must not depend
    # on a variable that happens to be defined in another cell.
    num_epochs = 20000

    g = torch.Generator(device=device).manual_seed(31416)
    C = torch.randn((27, 2), generator=g, device=device)
    W1 = torch.randn((6, 300), generator=g, device=device)
    b1 = torch.randn(300, generator=g, device=device)
    W2 = torch.randn([300, len(all_chars)], generator=g, device=device)
    b2 = torch.rand(len(all_chars), generator=g, device=device)
    parameters = [C, W1, b1, W2, b2]

    for p in parameters:
        p.requires_grad = True
    batch_size = 64

    # Local histories. `epochs` was previously never defined inside the function: on a
    # fresh kernel that raised NameError, and with a stale global list it no longer
    # matched `lossi` in length when plotting.
    epochs = []
    lossi = []

    for epoch in range(num_epochs):

        # build minibatch
        ix = torch.randint(0, Xtr.shape[0], (batch_size,))

        # forward pass
        emb = C[Xtr[ix]]
        h = torch.tanh((emb.view(-1, 6) @ W1 + b1))
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits, Ytr[ix])

        epochs.append(epoch)
        lossi.append(loss.item())

        # backward pass
        for p in parameters:
            p.grad = None
        loss.backward()

        # step decay: 0.1 during the first half of training, 0.01 afterwards
        learning_rate = 0.1 if epoch < num_epochs / 2 else 0.01
        # update
        for p in parameters:
            p.data -= learning_rate * p.grad

    plt.plot(epochs, lossi)


inner()


We can see that now the variance is lower, but the memory and training time increases: we have a tradeoff here.

Now, lets visualize the embedding results.

In [None]:
# Scatter the two embedding coordinates of every character and label each point with it
plt.figure(figsize=(8, 8))
plt.scatter(C[:,0].data.cpu(), C[:,1].data.cpu(), s=200)
for i in range(C.shape[0]):
    plt.text(C[i,0].item(), C[i,1].item(), itos[i], ha='center', va='center', color='white')
# The first positional argument of plt.grid is the visibility flag, so the string 'minor'
# only acted as a truthy value; pass True to state the intent (show the grid) explicitly.
plt.grid(True)
plt.show()

We significantly increased the number of neurons in the hidden layer, but the impact on the loss is not significant. Lets consider a larger embedding dimension.

In [None]:
# Hyperparameters of the larger model
embedding_size = 10
hidden_layer_size = 200

# Seeded initialisation; the layer shapes follow from the hyperparameters above
g = torch.Generator(device=device).manual_seed(31416)
rand_kwargs = dict(generator=g, device=device)
C = torch.randn((27, embedding_size), **rand_kwargs)
W1 = torch.randn((block_size*embedding_size, hidden_layer_size), **rand_kwargs)
b1 = torch.randn(hidden_layer_size, **rand_kwargs)
W2 = torch.randn([hidden_layer_size, len(all_chars)], **rand_kwargs)
b2 = torch.rand(len(all_chars), **rand_kwargs)
parameters = [C, W1, b1, W2, b2]

for p in parameters:
    p.requires_grad_(True)

batch_size = 32
epochs, lossi = [], []


In [None]:
# Defined in this cell so that the loop length and the learning-rate schedule share one
# value instead of relying on a variable left over from an earlier cell.
num_epochs = 20000

for epoch in range(num_epochs):

    # build minibatch
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))

    # forward pass
    emb = C[Xtr[ix]]
    h = torch.tanh((emb.view(-1, block_size*embedding_size) @ W1 + b1))
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ytr[ix])

    epochs.append(epoch)
    lossi.append(loss.item())

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # step decay: 0.1 during the first half of training, 0.01 afterwards
    learning_rate = 0.1 if epoch < num_epochs / 2 else 0.01
    # update
    for p in parameters:
        p.data -= learning_rate * p.grad

In [None]:
# Report the loss on the training and validation splits, in that order
with torch.no_grad():
    flat_size = block_size*embedding_size  # width of the concatenated context embedding
    for split_name, (x_split, y_split) in (("Training", (Xtr, Ytr)), ("Validation", (Xval, Yval))):
        emb = C[x_split]
        h = (emb.view(-1, flat_size) @ W1 + b1).tanh()
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits, y_split)
        print(f"{split_name} loss", loss.item())

The network is getting better, and some overfitting starts to appear.

You can tune many network hyperparameters in order to decrease the loss:
- Add neurons to the hidden layer
- Use a larger block size (more letters)
- Increase the number of layers

Finally, when the quality cannot be improved any longer, use the test set to have an accurate estimation about the quality of the model while dealing with unseen data.

In [None]:
# New names generated

# Sampling is inference only, so gradient tracking is switched off: no autograd graph
# is built for the forward passes.
with torch.no_grad():
    for _ in range(20):
        # start from an all-zero context whose length follows `block_size`
        xs = [0] * block_size
        letters = []
        while True:
            emb = C[torch.tensor(xs)]
            h = torch.tanh((emb.view(-1) @ W1 + b1))
            logits = h @ W2 + b2
            probs = F.softmax(logits, dim=0)
            # draw the next character index from the predicted distribution
            ix = torch.multinomial(probs,  num_samples=1, replacement=True).item()
            letters.append(itos[ix])
            xs = xs[1:] + [ix]  # slide the context window forward
            if ix == 0:         # index 0 ends the name
                break
        print(''.join(letters))

An interesting modification is keeping only the largest probabilities while creating the new names

In [None]:
# Number of most likely characters that remain eligible at every sampling step
k = 6

# Sampling is inference only, so gradient tracking is switched off: no autograd graph
# is built for the forward passes.
with torch.no_grad():
    for _ in range(20):
        # start from an all-zero context whose length follows `block_size`
        xs = [0] * block_size
        letters = []
        while True:
            emb = C[torch.tensor(xs)]
            h = torch.tanh((emb.view(-1) @ W1 + b1))
            logits = h @ W2 + b2
            probs = F.softmax(logits, dim=0)

            # zero out everything except the k largest probabilities
            _, indices = torch.topk(probs, k)
            mask = torch.zeros_like(probs)
            mask.scatter_(0, indices, 1)
            probs = probs * mask

            # the masked weights are passed to multinomial without being rescaled
            ix = torch.multinomial(probs,  num_samples=1, replacement=True).item()
            letters.append(itos[ix])
            xs = xs[1:] + [ix]  # slide the context window forward
            if ix == 0:         # index 0 ends the name
                break
        print(''.join(letters))