In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
words = open('names.txt', 'r').read().splitlines()

In [16]:
stoi = {s: i+1 for i, s in enumerate(sorted(list(set(''.join(words)))))}
stoi['.'] = 0
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [112]:
context_length = 3 
X, Y = [], []

for word in words[:5]:
    # initialize the context for the word: 
    context  = ['.'] * context_length
    for ch in word + '.':
        X.append([stoi[s] for s in context])
        Y.append(stoi[ch])
        print(context, '->', ch)
        context =  context[1:] + [ch]
        
    

['.', '.', '.'] -> e
['.', '.', 'e'] -> m
['.', 'e', 'm'] -> m
['e', 'm', 'm'] -> a
['m', 'm', 'a'] -> .
['.', '.', '.'] -> o
['.', '.', 'o'] -> l
['.', 'o', 'l'] -> i
['o', 'l', 'i'] -> v
['l', 'i', 'v'] -> i
['i', 'v', 'i'] -> a
['v', 'i', 'a'] -> .
['.', '.', '.'] -> a
['.', '.', 'a'] -> v
['.', 'a', 'v'] -> a
['a', 'v', 'a'] -> .
['.', '.', '.'] -> i
['.', '.', 'i'] -> s
['.', 'i', 's'] -> a
['i', 's', 'a'] -> b
['s', 'a', 'b'] -> e
['a', 'b', 'e'] -> l
['b', 'e', 'l'] -> l
['e', 'l', 'l'] -> a
['l', 'l', 'a'] -> .
['.', '.', '.'] -> s
['.', '.', 's'] -> o
['.', 's', 'o'] -> p
['s', 'o', 'p'] -> h
['o', 'p', 'h'] -> i
['p', 'h', 'i'] -> a
['h', 'i', 'a'] -> .


In [136]:
X = torch.tensor(X)
Y = torch.tensor(Y)
print(X)
print(Y)

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]])
tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])


In [113]:
# embedding matrix: C: 27,2 
C = torch.rand(27,2)

In [114]:
C.shape

torch.Size([27, 2])

In [34]:
C[X[0]]

tensor([[0.1681, 0.9909],
        [0.1681, 0.9909],
        [0.1681, 0.9909]])

Embedding Structure:

- `C` is an embedding matrix of `(27, 2)`
- `X` is list  of context window, where each context is a list of indicies

When we do `C[X]` we get tensor shaped `(32, 3, 2)`, where:
- the first dimension (32) is the number of context windows
- the second dimension (3) is the number of characters in each context window
- the third dimension (2) is the dimensionality of the context window

We should view `emb` as 3-dimensional tensor where each "element" is a `3,2` of matrix

In [39]:
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [43]:
emb[0]

tensor([[0.1681, 0.9909],
        [0.1681, 0.9909],
        [0.1681, 0.9909]])

Neural Networks, particularly MLPs, expect input to be flat vectors. Our new vector represents the entire context window as a single unit of information, which the neural network can process.


In [44]:
emb.view(-1,6)[0]

tensor([0.1681, 0.9909, 0.1681, 0.9909, 0.1681, 0.9909])

In [90]:
W1 = torch.randn((6,100))
b1 = torch.randn((100))
out = emb.view(-1,6) @ W1 + b1
W2 = torch.randn((100,27))
b2 = torch.randn(27)
logits = out @ W2 + b2
counts = logits.exp()
probs = counts /counts.sum(1, keepdims = True)


In [91]:
probs[0].sum()

tensor(1.0000)

In [92]:
probs[torch.arange(32), Y]

tensor([2.0012e-23, 1.5547e-22, 5.7554e-25, 5.6777e-25, 3.9090e-14, 2.8465e-16,
        9.9962e-01, 1.4990e-11, 4.4908e-28, 5.4308e-04, 7.5388e-19, 2.2951e-11,
        3.2417e-20, 2.0056e-17, 8.0035e-18, 1.0065e-09, 3.0842e-07, 2.1833e-17,
        3.7064e-22, 1.9988e-15, 7.1823e-23, 1.4927e-09, 4.4547e-07, 1.0946e-30,
        1.2979e-15, 2.4256e-17, 4.0404e-14, 5.3734e-11, 2.9308e-15, 9.9549e-06,
        5.2579e-22, 2.6991e-08])

This gives the probabilities assigned by the neural network to the correct outputs. Now we examine the negative log likelihood:

In [94]:
-probs[torch.arange(32), Y].log().mean()

tensor(34.8668)

In [95]:
# ---------------------- re-writing -------------

In [153]:
C = torch.randn((27,2))
W1 = torch.randn((6,100))
b1 = torch.randn(100)
W2= torch.randn(100, 27)
b2 = torch.randn(27)
parameters = [C, W1, b1, W2, b2]

for p in parameters:
    p.requires_grad = True

In [152]:
sum(p.nelement() for p in parameters)

3481

In [137]:
# forward pass:
emb = C[X]
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
logits = h @ W2 + b2 #(32, 27)
# counts = logits.exp()
# prob = counts / counts.sum(1, keepdims= True)
# loss = -prob[torch.arange(32), Y].log().mean()
loss = F.cross_entropy(logits, Y)
loss

tensor(14.8185)

In [138]:
loss

tensor(14.8185)

In [158]:
C = torch.randn((27,2))
W1 = torch.randn((6,100))
b1 = torch.randn(100)
W2= torch.randn(100, 27)
b2 = torch.randn(27)
parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad = True

In [159]:
for _ in range(10):
    # forward pass:
    emb = C[X]
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
    logits = h @ W2 + b2 #(32, 27)
    loss = F.cross_entropy(logits, Y)
    print(loss.item())
    # backward pass
    for p in parameters:
        p.grad = None # same as setting it to 0 in Pytorch
    loss.backward()
    # update
    for p in parameters:
        p.data += -0.1 * p.grad

13.690547943115234
11.096037864685059
9.4055757522583
8.00573444366455
6.8577399253845215
5.927469730377197
5.188044548034668
4.581540107727051
4.07319974899292
3.641629934310913


In [149]:
for p in parameters:
    p.requires_grad = True