In [2]:
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

MLP Approach:
- Construct a lookup table that converts each word to an N-dimensional embedding vector; these embeddings are the inputs to the first layer of the NN
- Hidden layer: a fully connected tanh layer
- Output layer has size V (the vocabulary size); a softmax turns its outputs into a probability distribution over the vocabulary

In [22]:
# Load the training corpus and split it into one entry per line.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving it open until garbage collection.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
print(words[:8])
print(len(words))

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']
32033


In [23]:
# Character vocabulary: every distinct character across all names, in sorted order.
chars = sorted(set(''.join(words)))

# string -> index. Real characters start at 1; index 0 is reserved for the
# '.' token, which is added after the characters.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# index -> string: the inverse mapping of stoi.
itos = {idx: ch for ch, idx in stoi.items()}


In [24]:
# Display the first entry of `words` as a quick sanity check.
words[0]

'emma'

In [25]:
# Build the (context -> next character) training examples.
block_size = 3  # context length: how many preceding characters form one input
contexts, targets = [], []  # one context window / one target index per example

for w in words[:5]:
    print(w)
    context = [0] * block_size  # every name starts from an all-'.' window (index 0)
    for ch in w + '.':  # the trailing '.' adds an end-of-name target
        ix = stoi[ch]
        contexts.append(context)
        targets.append(ix)
        print(''.join(itos[i] for i in context), '--->', itos[ix])
        # slide the window: drop the oldest index and append the one just seen
        context = [*context[1:], ix]

# inputs (one row of block_size indices per example) and target indices as tensors
X = torch.tensor(contexts)
Y = torch.tensor(targets)





emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [26]:
# Inspect the shape and dtype of the input tensor X and the target tensor Y.
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [50]:
g = torch.Generator().manual_seed(2147483647)  # fixed seed so the random initialisation is reproducible
# create embedding lookup table (v x d)
v = len(chars)+1  # vocabulary size: every character plus the '.' token
d = 2  # embedding dimension
C = torch.randn((v, d), generator=g)

# Indexing a row of the lookup table is equivalent to one-hot encoding the
# index and multiplying by C; plain indexing is the faster of the two.
print(C[5])
# num_classes follows v instead of a hard-coded 27, so the demo stays valid
# if the vocabulary changes
print(F.one_hot(torch.tensor(5), num_classes=v).float() @ C)

# Tensor indexing also accepts lists / tensors of indices (repeats allowed),
# so indexing with the whole input matrix X fetches every embedding at once.
print(C[[5,6,7]])
print(C[torch.tensor([5,6,7,7,7,7])])
print(C[X].shape)

tensor([-0.4713,  0.7868])
tensor([-0.4713,  0.7868])
tensor([[-0.4713,  0.7868],
        [-0.3284, -0.4330],
        [ 1.3729,  2.9334]])
tensor([[-0.4713,  0.7868],
        [-0.3284, -0.4330],
        [ 1.3729,  2.9334],
        [ 1.3729,  2.9334],
        [ 1.3729,  2.9334],
        [ 1.3729,  2.9334]])
torch.Size([32, 3, 2])


In [51]:
# input embedding layer: look up the embedding of every index in X
emb = C[X]
print(emb.shape)

# Flatten each example's context-window embeddings into a single row, which
# concatenates them. The row width is inferred with -1 from emb itself, so
# this cell needs no variables from cells that run later.
h = emb.view(emb.shape[0], -1)
print(h.shape)

torch.Size([32, 3, 2])
torch.Size([32, 6])


In [52]:
# hidden layer
# context window (3) * n_dimension (2) = 6
# N neurons is up to us
i = block_size * d  # fan-in: width of the flattened context embedding
j = 100  # number of hidden neurons

W1 = torch.randn((i, j), generator = g)
b1 = torch.randn(j, generator=g)
print(W1.shape)
print(b1.shape)
# Flatten emb here rather than reading a previously stored h. Assigning h from
# an expression that reads h makes the cell fail on a second run, because h
# would by then have j columns instead of i.
h = torch.tanh(emb.view(-1, i) @ W1 + b1)
print(h.shape)


torch.Size([6, 100])
torch.Size([100])
torch.Size([32, 100])


In [53]:
# Output layer: project the hidden activations onto one logit per vocabulary entry
W2 = torch.randn((j, v), generator=g)
b2 = torch.randn(v, generator=g)

logits = h @ W2 + b2
print(logits.shape)
# Softmax over the vocabulary dimension. Subtracting each row's largest logit
# before exponentiating leaves the probabilities unchanged (softmax is
# shift-invariant) but keeps exp() from overflowing to inf, which would turn
# prob into nan.
counts = (logits - logits.max(1, keepdim=True).values).exp()
prob = counts / counts.sum(1, keepdim=True)


torch.Size([32, 27])


In [54]:
# Negative log-likelihood loss:
# for each example, pick the probability the network assigned to the actual
# next character, take the log, average over examples, and negate.
# The row indices come from Y's length rather than a hard-coded 32, so the
# cell keeps working when the dataset size changes.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
print(loss)

tensor(17.7697)
