## Makemore MLP

In [1]:
import torch

In [2]:
import torch.nn.functional as F

### Build vocabulary

In [3]:
# Load the dataset: one name per line. The context manager closes the file
# handle as soon as the contents have been read.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

In [5]:
# Vocabulary: every distinct character that occurs in the names, sorted.
chars = sorted(set(''.join(words)))
# Character -> index, numbered from 1 because index 0 is assigned to '.'.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Index -> character, the inverse mapping of stoi.
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


### Build dataset

In [6]:
# Each training example pairs `block_size` character indices (the context)
# with the index of the character that follows them.
block_size = 3
X, Y = [], []
# Only the first five names are processed in this cell.
for w in words[:5]:
    print(w)
    # Every word starts from a context made entirely of index 0 ('.').
    context = [0] * block_size
    # The appended '.' makes the end of the word a prediction target too.
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        shown = ''.join(itos[i] for i in context)
        print(shown, '--->', itos[ix])
        # Slide the window: drop the oldest index and append the newest one.
        context = [*context[1:], ix]

# Convert the accumulated Python lists into tensors.
X = torch.tensor(X)
Y = torch.tensor(Y)
        

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [12]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [9]:
# Embedding lookup table: 27 rows (one per character index), each a random 2-value vector.
C = torch.randn([27,2])
# Row 3 is the embedding of the character whose index is 3.
C[3]

tensor([1.3102, 0.1169])

In [10]:
# Multiplying a one-hot row vector for index 3 by C selects the same row as indexing C[3].
F.one_hot(torch.tensor(3), num_classes=27).float() @ C

tensor([1.3102, 0.1169])

In [15]:
# X[31] is the last of the 32 examples (context "hia"); its 3rd character is "a", stored as index 1
X[31][2]

tensor(1)

In [16]:
# Index C with the whole X tensor to look up the embedding of every context character at once.
emb = C[X]
# shape is [32, 3, 2]: 32 training examples, 3 context positions (block size), 2 values per embedding (embedding dimension)
emb.shape

torch.Size([32, 3, 2])

In [18]:
emb[:,0,:].shape

torch.Size([32, 2])

### Weights and biases
We need to flatten the last two dimensions of `emb` so that it can be matrix-multiplied by the weights. Each example enters the hidden layer as a single vector of 3 × 2 = 6 values (block size × embedding size).

In [23]:
# Split emb along dim 1 into three [32, 2] slices, then join them side by side along dim 1.
torch.cat(torch.unbind(emb,1),1).shape
# concatenation is inefficient: it allocates new memory to hold the joined result

torch.Size([32, 6])

In [24]:
# Reshape to [32, 6] so each example's three 2-value embeddings sit in one row;
# view reuses emb's existing data instead of building a new tensor like torch.cat does.
emb.view(32,6)

tensor([[-0.9138,  0.8037, -0.9138,  0.8037, -0.9138,  0.8037],
        [-0.9138,  0.8037, -0.9138,  0.8037, -0.5068,  0.6986],
        [-0.9138,  0.8037, -0.5068,  0.6986,  1.3613, -1.8846],
        [-0.5068,  0.6986,  1.3613, -1.8846,  1.3613, -1.8846],
        [ 1.3613, -1.8846,  1.3613, -1.8846, -0.7232, -0.5272],
        [-0.9138,  0.8037, -0.9138,  0.8037, -0.9138,  0.8037],
        [-0.9138,  0.8037, -0.9138,  0.8037,  0.4990, -1.1098],
        [-0.9138,  0.8037,  0.4990, -1.1098, -0.0801, -1.0422],
        [ 0.4990, -1.1098, -0.0801, -1.0422, -0.4536, -0.6207],
        [-0.0801, -1.0422, -0.4536, -0.6207, -2.0167, -0.0185],
        [-0.4536, -0.6207, -2.0167, -0.0185, -0.4536, -0.6207],
        [-2.0167, -0.0185, -0.4536, -0.6207, -0.7232, -0.5272],
        [-0.9138,  0.8037, -0.9138,  0.8037, -0.9138,  0.8037],
        [-0.9138,  0.8037, -0.9138,  0.8037, -0.7232, -0.5272],
        [-0.9138,  0.8037, -0.7232, -0.5272, -2.0167, -0.0185],
        [-0.7232, -0.5272, -2.0167, -0.0

In [47]:
# Hidden layer parameters: 6 inputs (3 context characters x 2 embedding values) and 100 neurons

W1 = torch.randn((6,100))
b1 = torch.randn(100)

In [48]:
# Flatten each example to 6 values, then apply the hidden layer's weights and bias.
h = emb.view(emb.shape[0], 6) @ W1 + b1
# or emb.view(-1, 6): with -1, PyTorch infers that dimension from the total number of elements

In [49]:
# two broadcastable shapes: the [100] bias is added to each of the 32 rows of the [32, 100] product
(emb.view(emb.shape[0], 6) @ W1).shape, b1.shape

(torch.Size([32, 100]), torch.Size([100]))

In [50]:
# Hidden-layer activations: tanh applied elementwise to the affine transform of the flattened embeddings.
h = torch.tanh(emb.view(-1,6) @ W1 + b1)

In [51]:
h.shape

torch.Size([32, 100])

In [54]:
# Output layer parameters: 100 hidden activations in, 27 values out (one per character)
W2 = torch.randn((100, 27))
b2 = torch.randn(27)

In [55]:
# Output layer: one row of 27 unnormalized scores per example.
logits = h @ W2 + b2

In [56]:
logits.shape

torch.Size([32, 27])

In [57]:
# Exponentiate the logits so every entry is positive before normalizing.
counts = logits.exp()

In [58]:
# normalize each row by its sum along dim 1, so the 27 values for an example sum to 1
prob = counts / counts.sum(1, keepdims = True)

In [59]:
# Sanity check: the probabilities of the first example should sum to 1.
prob[0].sum()

tensor(1.)

In [60]:
# probability that the (still untrained) model assigns to the correct next character of each example
# indexing with (torch.arange(32), Y) picks prob[i, Y[i]] for every example i; 32 is the number of examples here
prob[torch.arange(32), Y]

tensor([2.6793e-10, 9.9151e-01, 1.9929e-12, 1.0000e+00, 8.3869e-17, 8.1524e-06,
        2.2827e-09, 4.9025e-15, 1.9119e-07, 2.5365e-06, 5.1905e-02, 1.1000e-11,
        2.1639e-09, 6.6223e-05, 5.0690e-06, 3.0231e-14, 2.0031e-03, 1.9185e-09,
        4.0100e-03, 5.9459e-09, 1.7484e-13, 2.0088e-14, 3.1048e-08, 9.5746e-01,
        2.7550e-17, 6.4165e-06, 2.6006e-01, 2.7596e-13, 4.0356e-12, 6.0943e-01,
        1.3959e-05, 1.1939e-13])

In [61]:
prob.shape

torch.Size([32, 27])

In [62]:
torch.arange(32), Y

(tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]),
 tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
          1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0]))