## Makemore MLP

In [1]:
import torch

In [2]:
import torch.nn.functional as F

### Build vocabulary

In [3]:
# Read the names corpus, one name per line. The context manager closes the
# file handle as soon as the contents have been read.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

In [4]:
# Vocabulary: every distinct character that occurs in the corpus, sorted.
chars = sorted(set(''.join(words)))
# Character -> index. Numbering starts at 1 so that index 0 stays free for '.'.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Index -> character: the inverse of stoi.
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


### Build dataset

In [5]:
# Number of preceding characters used to predict the next one.
block_size = 3

# Gather (context, target) index pairs in plain lists, then convert to tensors once at the end.
contexts, targets = [], []
for w in words[:5]:
    print(w)
    # Every word starts from a context made entirely of index 0 (the '.' token).
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        contexts.append(context)
        targets.append(ix)
        shown = ''.join(itos[i] for i in context)
        print(shown, '--->', itos[ix])
        # Slide the window: drop the oldest index and append the one just seen.
        context = [*context[1:], ix]

X = torch.tensor(contexts)
Y = torch.tensor(targets)
        

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [6]:
# Sanity check: X holds one row of block_size context indices per example, Y one target index per example.
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [7]:
# Embedding lookup table: one 2-dimensional row for each of the 27 vocabulary indices.
# NOTE(review): unseeded, so the values differ on every run.
C = torch.randn([27,2])
# Plain indexing returns the embedding row for index 3.
C[3]

tensor([ 0.2585, -0.8153])

In [8]:
# Same lookup expressed as a matrix product: a one-hot vector for index 3 times C selects row 3 of C.
F.one_hot(torch.tensor(3), num_classes=27).float() @ C

tensor([ 0.2585, -0.8153])

In [9]:
# Example at index 31, third context position: holds index 1, i.e. the character 'a'.
X[31][2]

tensor(1)

In [10]:
# Indexing C with the whole integer tensor X looks up every embedding in one step.
emb = C[X]
# Shape is [32, 3, 2]: 32 training examples, 3 context positions (block size), 2 embedding dimensions.
emb.shape

torch.Size([32, 3, 2])

In [11]:
# Embeddings of the first context position only, for every example.
emb[:,0,:].shape

torch.Size([32, 2])

### Weights and biases
To multiply the embeddings by a weight matrix, the block and embedding dimensions must first be flattened into one. Each example then enters the hidden layer as a vector of 3 × 2 = 6 values (block size × embedding size).

In [12]:
# Split emb along the context dimension and join the pieces side by side: [32, 3, 2] -> [32, 6].
torch.cat(torch.unbind(emb,1),1).shape
# concatenation is inefficient: it creates new memory for the result

torch.Size([32, 6])

In [13]:
# view reshapes emb to [32, 6]: each row is the three 2-dim embeddings of one example laid side by side.
emb.view(32,6)

tensor([[ 0.7364,  0.8447,  0.7364,  0.8447,  0.7364,  0.8447],
        [ 0.7364,  0.8447,  0.7364,  0.8447, -0.6496,  0.4290],
        [ 0.7364,  0.8447, -0.6496,  0.4290, -0.4145,  0.2332],
        [-0.6496,  0.4290, -0.4145,  0.2332, -0.4145,  0.2332],
        [-0.4145,  0.2332, -0.4145,  0.2332,  0.3273, -0.1209],
        [ 0.7364,  0.8447,  0.7364,  0.8447,  0.7364,  0.8447],
        [ 0.7364,  0.8447,  0.7364,  0.8447,  0.4285, -0.3618],
        [ 0.7364,  0.8447,  0.4285, -0.3618, -0.0410, -2.4396],
        [ 0.4285, -0.3618, -0.0410, -2.4396, -0.9374,  0.4159],
        [-0.0410, -2.4396, -0.9374,  0.4159, -0.0549, -0.1997],
        [-0.9374,  0.4159, -0.0549, -0.1997, -0.9374,  0.4159],
        [-0.0549, -0.1997, -0.9374,  0.4159,  0.3273, -0.1209],
        [ 0.7364,  0.8447,  0.7364,  0.8447,  0.7364,  0.8447],
        [ 0.7364,  0.8447,  0.7364,  0.8447,  0.3273, -0.1209],
        [ 0.7364,  0.8447,  0.3273, -0.1209, -0.0549, -0.1997],
        [ 0.3273, -0.1209, -0.0549, -0.1

In [14]:
# Hidden layer: 6 inputs (3 context positions x 2 embedding dims) feeding 100 neurons.

W1 = torch.randn((6,100))
b1 = torch.randn(100)

In [15]:
# Hidden-layer pre-activation (no nonlinearity applied yet).
h = emb.view(emb.shape[0], 6) @ W1 + b1
# or emb.view(-1, 6): with "-1" PyTorch infers that dimension from the total number of elements

In [16]:
# two broadcastable shapes: [32, 100] + [100] adds b1 to every row
(emb.view(emb.shape[0], 6) @ W1).shape, b1.shape

(torch.Size([32, 100]), torch.Size([100]))

In [17]:
# Hidden activations: the same affine map as before, now passed through tanh.
h = torch.tanh(emb.view(-1,6) @ W1 + b1)

In [18]:
# One 100-dimensional activation vector per example.
h.shape

torch.Size([32, 100])

In [19]:
# Output layer: 100 hidden activations in, 27 scores out (one per character in the vocabulary).
W2 = torch.randn((100, 27))
b2 = torch.randn(27)

In [20]:
# Raw, unnormalized scores for each of the 27 possible next characters.
logits = h @ W2 + b2

In [21]:
# One row of 27 scores per example.
logits.shape

torch.Size([32, 27])

In [22]:
# Exponentiate the logits to get strictly positive, still unnormalized scores ("counts").
counts = logits.exp()

In [23]:
# Normalize each row of counts so it sums to 1; keepdims keeps the row sums as a
# column so the division broadcasts across each row.
prob = counts / counts.sum(1, keepdims = True)

In [24]:
# Check: the probabilities of the first example sum to 1.
prob[0].sum()

tensor(1.0000)

In [25]:
# Probability the (still untrained) model assigns to the correct next character of each example.
# Indexing with (torch.arange(32), Y) means: for example i, take column Y[i], the correct character.
prob[torch.arange(32), Y]

tensor([3.2040e-02, 2.4462e-06, 1.9938e-06, 9.8679e-09, 9.9607e-01, 4.3287e-05,
        3.3500e-07, 4.5616e-07, 7.9641e-03, 1.2027e-01, 1.4381e-05, 9.9972e-01,
        1.2388e-04, 1.1880e-10, 2.5930e-07, 8.4489e-01, 2.2123e-08, 4.3216e-01,
        3.5788e-10, 3.6110e-05, 1.9498e-06, 3.4924e-10, 1.0413e-05, 3.2373e-09,
        2.7967e-08, 5.0140e-04, 4.0921e-06, 2.6839e-06, 6.5761e-07, 7.3938e-03,
        5.4731e-11, 9.8278e-01])

In [26]:
# One probability distribution over the 27 characters per example.
prob.shape

torch.Size([32, 27])

In [62]:
# The two index tensors used to pick the correct-character probabilities:
# row numbers 0..31 paired with the target character index of each example.
torch.arange(32), Y

(tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]),
 tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
          1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0]))

In [46]:
# Re-create every parameter from a single seeded generator so the values are reproducible.
# All tensors draw from g in sequence, so the order of these lines determines every value.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27,2), generator = g)     # embedding table: 27 indices x 2 dims
W1 = torch.randn((6,100), generator = g)   # hidden layer weights: 6 inputs -> 100 neurons
b1 = torch.randn(100, generator = g)       # hidden layer bias
W2 = torch.randn((100,27), generator = g)  # output layer weights: 100 neurons -> 27 logits
b2 = torch.randn(27, generator = g)        # output layer bias

In [47]:
# All trainable tensors in one list so later cells can loop over them.
parameters = [C, W1, b1, W2, b2]

In [48]:
# Total number of scalar values across all parameter tensors.
num_params = sum(t.numel() for t in parameters)
num_params

3481

In [49]:
# Forward pass written out by hand, ending in the average negative log-likelihood.
emb = C[X]                                    # look up an embedding for every context index
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)     # flatten each context to 6 values, then hidden layer
logits = h @ W2 + b2                          # one score per character
counts = logits.exp()
prob = counts / counts.sum(1, keepdims=True)  # each row now sums to 1
# For every example, pick the probability assigned to its correct next character.
# The row index is derived from Y rather than a hard-coded 32, so this keeps
# working when the dataset is built from more than the first five words.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(17.7697)

In [50]:
# F.cross_entropy computes the same loss as the previous cell, directly from the logits
F.cross_entropy(logits, Y)

tensor(17.7697)

**Cross entropy** measures the difference between two probability distributions and is commonly used as the loss in classification problems. If $P$ is our target distribution (here, the one-hot distribution given by $Y$) and $Q$ is an approximation of $P$ (here, the softmax of the $logits$), the **cross entropy** is computed as follows:
$$H(P,Q) = - \sum_{x \in X} P(x) \cdot \log{Q(x)}$$

In [53]:
# Turn on gradient tracking so that loss.backward() populates .grad on every parameter.
for p in parameters:
    p.requires_grad = True

In [55]:
# Plain full-batch gradient descent over the whole dataset (X, Y).
learning_rate = 0.1
for _ in range(1000):
    # forward pass
    emb = C[X]
    h = torch.tanh(emb.view(-1,6) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)

    # backward pass: clear stale gradients first, since backward() accumulates into .grad
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: step against the gradient under no_grad so the in-place write is not
    # recorded by autograd (replaces the discouraged direct write to p.data)
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

# Loss from the last forward pass, i.e. computed before the final parameter update.
print(loss.item())

13.230988502502441
