In [1]:
import torch
import matplotlib.pyplot as plt
%matplotlib inline

# **Dataset**

We want to train an MLP to learn the conditional distribution of the next character in a name given the three characters that precede it, i.e. $P(x_i \mid x_{i-1}, x_{i-2}, x_{i-3})$. To achieve this, we slide a three-character context window over each name in the dataset and record every window together with the character that follows it (the target).
We use '.' as the special token to denote the start and end of a name. For example, the name "John" would give us the following data points:

* ... $\rightarrow$ J
* ..J $\rightarrow$ o
* .Jo $\rightarrow$ h
* Joh $\rightarrow$ n
* ohn $\rightarrow$ .

The objective of the network then is to maximize the log-likelihood of the data, i.e. to maximize the probability of the target letter given the context window. We use the negative log-likelihood as the loss function to be minimized.

In [2]:
# Load data: read names.txt and split it into a list of lines
# The `with` block closes the file handle as soon as the contents are read,
# instead of leaving it open until the garbage collector gets to it.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
print(words[:10])

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']


In [4]:
# Vocabulary lookup tables: character -> integer index and back
# 'a'..'z' take indices 1..26; index 0 is reserved for the '.' boundary token
char2idx = {chr(ord('a') + k): k + 1 for k in range(26)}
char2idx['.'] = 0 # added after the letters, so '.' is the last key in iteration order
idx2char = dict((idx, ch) for ch, idx in char2idx.items())

In [17]:
# Build the (context -> next character) training examples
block_size = 3 # how many preceding characters the model sees per prediction
X, Y = [], []

for word in words[:5]:
    # Encode the whole name up front: block_size leading '.' tokens (index 0),
    # the name's characters, then a single trailing '.' token.
    encoded = [0] * block_size
    encoded.extend(char2idx[ch] for ch in word)
    encoded.append(0)
    # Each position past the padding is a target; the block_size indices
    # immediately before it form its context window.
    for start, target in enumerate(encoded[block_size:]):
        X.append(encoded[start:start + block_size])
        Y.append(target)

X = torch.tensor(X)
Y = torch.tensor(Y)

print('X:', X.shape, X.dtype, '\nY:', Y.shape, Y.dtype)
# Uncomment to print every context -> target pair in readable form
# for i in range(len(Y)):
#     print(''.join([idx2char[idx.item()] for idx in X[i]]), '->', idx2char[Y[i].item()])

X: torch.Size([32, 3]) torch.int64 
Y: torch.Size([32]) torch.int64


# **MLP**