In [1]:
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

MLP Approach:
- construct a lookup table that converts each word to an N-dimensional embedding vector; these embeddings constitute the inputs to the first layer of the NN
- NN hidden layer: a fully connected tanh layer
- output layer has size V (the vocabulary size); a softmax function turns its outputs into a probability distribution over the vocabulary of words

In [3]:
# Read the dataset, one entry per line. The context manager guarantees the
# file handle is closed instead of being left open until garbage collection.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
print(words[:8])
print(len(words))

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']
32033


In [4]:
# Character-level vocabulary: the sorted unique characters of the dataset get
# ids 1..len(chars); id 0 is assigned to the '.' token.
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# inverse mapping: id -> character
itos = {idx: ch for ch, idx in stoi.items()}


In [5]:
# inspect the first entry of the dataset
words[0]

'emma'

In [21]:
# Assemble (context, next-character) training pairs from every word.
block_size = 3  # context length: number of preceding characters fed to the model
X, Y = [], []   # contexts (inputs) and next-character ids (labels)

for w in words:
    context = [0] * block_size      # start from a window filled with id 0 ('.')
    for ch in w + '.':              # the appended '.' is the final target of each word
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        context = [*context[1:], ix]  # slide the window forward by one character

X, Y = torch.tensor(X), torch.tensor(Y)





In [22]:
# sanity check: shapes and dtypes of the input and label tensors
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [8]:
g = torch.Generator().manual_seed(2147483647)
# create embedding lookup table (v x d)
v = len(chars)+1  # vocabulary size: every character plus the '.' token
d = 2             # embedding dimension
C = torch.randn((v, d), generator=g)

# Indexing a row of the lookup table is equivalent to one-hot encoding the
# index and multiplying by C; direct indexing is the faster approach.
print(C[5])
# num_classes must equal the number of rows of C, so use v rather than a
# hard-coded 27 that silently goes stale if the vocabulary changes.
print(F.one_hot(torch.tensor(5), num_classes=v).float() @ C)

# Tensor indexing gathers many embeddings at once, so the whole input matrix
# X can be embedded directly.
print(C[[5,6,7]])
print(C[torch.tensor([5,6,7,7,7,7])])
print(C[X].shape)

tensor([-0.4713,  0.7868])
tensor([-0.4713,  0.7868])
tensor([[-0.4713,  0.7868],
        [-0.3284, -0.4330],
        [ 1.3729,  2.9334]])
tensor([[-0.4713,  0.7868],
        [-0.3284, -0.4330],
        [ 1.3729,  2.9334],
        [ 1.3729,  2.9334],
        [ 1.3729,  2.9334],
        [ 1.3729,  2.9334]])
torch.Size([32, 3, 2])


In [10]:
# Embedding layer: replace every character id in X with its row of C.
emb = C[X]
print(emb.shape)

# Concatenate the embeddings of each context window into a single flat row
# per example (width = embedding dimension * context length).
i = d * block_size
h = emb.view(emb.shape[0], i)
print(h.shape)

torch.Size([32, 3, 2])
torch.Size([32, 6])


In [11]:
# hidden layer
# input width = context window (3) * n_dimension (2) = 6
# the number of neurons (j) is a free choice
i = block_size * d
j = 100

W1 = torch.randn((i, j), generator = g)
b1 = torch.randn(j, generator=g)
print(W1.shape)
print(b1.shape)
# Compute the activations from emb instead of overwriting h with a function of
# itself: the original `h = tanh(h @ W1 + b1)` failed with a shape mismatch
# whenever the cell was run a second time.
h = torch.tanh(emb.view(-1, i) @ W1 + b1)
print(h.shape)


torch.Size([6, 100])
torch.Size([100])
torch.Size([32, 100])


In [12]:
# Output layer: project the hidden activations onto one logit per vocabulary entry.
W2 = torch.randn((j, v), generator=g)
b2 = torch.randn(v, generator=g)

logits = h @ W2 + b2
print(logits.shape)
# Softmax by hand. Subtracting each row's maximum before exponentiating leaves
# the resulting probabilities mathematically unchanged but keeps exp() from
# overflowing to inf (and prob from becoming NaN) when logits are large.
counts = (logits - logits.max(1, keepdim=True).values).exp()
prob = counts / counts.sum(1, keepdim=True)


torch.Size([32, 27])


In [13]:
# total number of scalar parameters across all model tensors
params = [C, W1, b1, W2, b2]
sum(t.numel() for t in params)

3481

In [14]:
# For every example, pick the probability the network assigned to the actual
# next character, then average the negative log-likelihood (NLL).
# The row index is derived from prob itself rather than a hard-coded 32, so the
# expression stays valid for any number of examples.
loss = -prob[torch.arange(prob.shape[0]), Y].log().mean()
print(loss)

tensor(17.7697)


In [17]:
# Collect the model tensors and mark each one as requiring gradients.
parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad_(True)

In [20]:
k = 1000   # number of gradient-descent steps
lr = 0.01  # learning rate (step size)

for _ in range(k):
    # forward pass
    emb = C[X]
    h = torch.tanh(emb.view(-1, i) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)

    # backward pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # parameter update; torch.no_grad() is the supported way to modify leaf
    # tensors in place without recording the update in the autograd graph
    # (replaces direct writes through `.data`)
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad
# loss of the last forward pass (computed before the final update)
print(loss.item())





0.3072108030319214


In [34]:
# Compact re-initialisation of every model parameter from a fixed seed.
g = torch.Generator().manual_seed(2147483647)

# Hyperparameters
v = len(chars) + 1  # vocabulary size
d = 2               # embedding dimensions
i = block_size * d  # width of the concatenated context embedding
j = 100             # number of hidden-layer nodes

# The tensors are drawn from g in exactly this order (C, W1, b1, W2, b2), so
# the seeded values match a one-statement-per-tensor initialisation.
C, W1, b1, W2, b2 = (
    torch.randn(shape, generator=g)
    for shape in ((v, d), (i, j), (j,), (j, v), (v,))
)
parameters = [C, W1, b1, W2, b2]


In [35]:
# mark every parameter tensor as requiring gradients
for p in parameters:
    p.requires_grad_(True)

In [53]:
k = 200          # number of minibatch steps
batch_size = 32  # examples per minibatch
lr = 0.01        # learning rate (step size)

for it in range(k):
    # minibatch: sample row indices from the seeded generator g so that the
    # sequence of batches (and the printed losses) is reproducible
    ix = torch.randint(0, X.shape[0], (batch_size, ), generator=g)

    # forward pass
    emb = C[X[ix]]
    h = torch.tanh(emb.view(-1, i) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])
    if it%10==0: print(loss.item())  # minibatch loss, logged every 10th step

    # backward pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # parameter update under no_grad instead of writing through `.data`
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad





3.0869922637939453
2.6657216548919678
2.711425542831421
3.16888427734375
2.989213228225708
2.9418580532073975
3.057060718536377
3.4854986667633057
2.8669891357421875
3.7489635944366455
3.2786831855773926
3.6607577800750732
2.7204909324645996
3.0025904178619385
3.130762815475464
3.330831289291382
2.6191914081573486
3.380959987640381
2.7354719638824463
2.7938380241394043


In [54]:
# evaluate on full dataset
# forward pass only: no_grad skips building the autograd graph, which is never
# used here because backward() is not called on this loss
with torch.no_grad():
    emb = C[X]
    h = torch.tanh(emb.view(-1, i) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
print(loss.item())

2.976243495941162


In [27]:
# implement minibatch training - index a random subset of the data for each pass
batch_size = 32
# use batch_size rather than repeating the literal, so the two cannot drift apart
torch.randint(0, X.shape[0], (batch_size, ))

tensor([ 22270, 160234,  83752, 121874, 161253, 151696, 205667,  89200, 201097,
        208441, 135973, 183249, 132000,  46186,  84685, 128673,  92590,  37097,
        159037, 201308,  70525,  15000, 160363, 117602,  89467, 126700, 132770,
        148990,  83629, 171593, 206838, 210088])