# **makemore**: a character-level language model

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the corpus: one name per line. The context manager guarantees the file
# handle is closed as soon as the contents are read, instead of leaking it
# until garbage collection.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [3]:
# number of lines (names) read from names.txt
len(words)

32033

### (re-)building our training dataset

In [4]:
# character vocabulary plus lookup tables in both directions
chars = sorted(set(''.join(words)))
# letters are numbered from 1 in sorted order ...
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
# ... and index 0 is reserved for the '.' marker
stoi['.'] = 0
# inverse table: integer index -> character
itos = {idx: ch for ch, idx in stoi.items()}

In [5]:
# build the dataset: every position in a word yields one (context -> next char) pair

block_size = 3  # how many preceding characters form the context
contexts, targets = [], []

for w in words[:3]:
    print(w)
    # start each word from a window made entirely of index 0 (the '.' marker)
    context = block_size * [0]
    for ch in w + '.':
        ix = stoi[ch]
        contexts.append(context)
        targets.append(ix)
        print(''.join(map(itos.__getitem__, context)), '----->', itos[ix])
        # slide the window: drop the oldest index, append the newest.
        # A new list is created each time, so rows already stored are not mutated.
        context = [*context[1:], ix]

# X: one context per row, Y: the index of the character that follows it
X = torch.tensor(contexts)
Y = torch.tensor(targets)

emma
... -----> e
..e -----> m
.em -----> m
emm -----> a
mma -----> .
olivia
... -----> o
..o -----> l
.ol -----> i
oli -----> v
liv -----> i
ivi -----> a
via -----> .
ava
... -----> a
..a -----> v
.av -----> a
ava -----> .


In [6]:
# sanity-check the dataset tensors: X has one block_size-long context per row, Y one target index per row
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([16, 3]), torch.int64, torch.Size([16]), torch.int64)

### implementing the embedding lookup table

In [7]:
# embedding lookup table: one randomly initialised 2-dimensional row per
# vocabulary index (27 rows)
C = torch.randn(27, 2)
C

tensor([[-7.1580e-01,  4.0690e-01],
        [-1.1244e+00, -9.0459e-01],
        [-1.9700e-01,  2.1793e+00],
        [ 1.7969e+00, -2.4572e+00],
        [ 2.3304e-01, -1.9314e+00],
        [ 7.4945e-01, -2.7370e-02],
        [-6.1758e-01, -8.8516e-01],
        [ 1.5348e+00, -1.4031e+00],
        [ 1.4703e+00, -2.0701e-01],
        [-1.7259e+00, -1.0940e+00],
        [ 2.2821e+00,  4.4190e-01],
        [-2.0759e+00,  1.4705e-03],
        [ 1.5831e+00, -1.1907e-01],
        [-1.5216e+00,  4.9696e-01],
        [ 1.1044e-01,  2.2362e+00],
        [-2.7097e-01,  1.1891e+00],
        [ 1.3408e+00, -1.8662e-01],
        [ 8.9310e-01, -7.5095e-01],
        [ 1.9594e+00,  7.5707e-01],
        [ 2.6810e-01,  1.2227e+00],
        [ 4.0545e-01, -7.4810e-01],
        [ 3.4169e-01,  1.4539e-01],
        [-9.1193e-01, -1.5951e+00],
        [-6.7749e-01,  6.0704e-01],
        [-1.7167e+00,  5.0638e-01],
        [ 2.7915e-01,  7.9710e-01],
        [-8.2932e-01,  4.8390e-01]])

In [8]:
# look up the embedding row of C for every index in X (same result as C[X]);
# the output gains a trailing embedding dimension: X.shape + (2,)
emb = F.embedding(X, C)
emb.shape

torch.Size([16, 3, 2])

### implementing the hidden layer

In [9]:
# hidden layer parameters: 3 context positions * 2 embedding dims = 6 inputs,
# 100 hidden units
W1 = torch.randn((3*2, 100))
# Keep the hidden bias under its own name: the output-layer cell rebinds `b`,
# which would otherwise leave no handle on this tensor.
b1 = torch.randn(100)
# `b` is kept as an alias so cells that still read `b` continue to work.
b = b1

In [10]:
# Flatten each example's per-position embeddings into a single row. Using -1
# lets torch infer the flattened width from emb itself instead of hard-coding
# 6, so this line keeps working if block_size or the embedding size changes
# (W1's first dimension must still match that width).
# NOTE: `b` must be the 100-element hidden bias here; the output-layer cell
# rebinds `b`, so re-running this cell after it will raise a shape error.
h = emb.view(emb.shape[0], -1) @ W1 + b
# squash the pre-activations into (-1, 1)
h = torch.tanh(h)
h

tensor([[-0.9951,  0.5747, -0.9827,  ..., -0.9376, -0.8562,  0.9340],
        [-0.9779, -0.8262, -0.8964,  ...,  0.1763,  0.8308,  0.7574],
        [-0.6151,  0.9992,  0.5792,  ..., -0.9999, -0.6048,  0.7760],
        ...,
        [-0.9974, -0.8666, -0.9153,  ..., -0.9979, -0.9964,  0.7982],
        [-0.9945, -0.9876, -0.4220,  ..., -0.9971, -0.9914,  0.4066],
        [-0.8570, -0.2889,  0.5530,  ..., -0.9948, -0.6066,  0.8962]])

In [11]:
# one row per example, one column per hidden unit (W1 has 100 output columns)
h.shape

torch.Size([16, 100])

### implementing the output layer

In [12]:
# output layer parameters: 100 hidden units in, 27 logits (one per character) out
W2 = torch.randn((100, 27))
# Give the output bias its own name so it can be told apart from the
# hidden-layer bias, which was also created under the name `b`.
b2 = torch.randn(27)
# `b` is kept as an alias so the cell that computes the logits with `b`
# continues to work.
b = b2

In [13]:
# project the hidden activations onto the output classes and add the output
# bias (broadcast across rows) to get unnormalised scores
logits = torch.matmul(h, W2) + b
logits

tensor([[-7.6892e+00, -1.2921e+01, -5.5874e-01, -3.7206e+00, -1.2567e+01,
         -5.1561e+00, -5.2286e+00,  1.7415e-02,  3.4478e+00,  4.2923e-01,
         -2.6775e+00,  6.2969e+00, -1.4304e+00, -7.0164e+00,  1.5239e+01,
          6.2397e+00, -1.2713e+01, -5.9553e+00,  1.2132e+00, -9.0699e+00,
         -1.8321e+01, -4.2600e+00, -4.9258e-01,  4.9596e+00, -5.9961e+00,
          8.1291e+00, -4.1215e+00],
        [-1.1738e+01, -1.2997e+01, -6.6406e-01, -1.6535e+00, -1.7980e+01,
         -1.0893e+01, -1.1606e+01, -1.2615e+01,  6.5770e+00,  9.8676e-01,
         -8.4918e-01,  9.5937e+00, -1.9253e+00, -1.1588e+01,  6.4213e+00,
          5.2779e+00, -2.2500e+00, -5.9052e+00, -1.8445e+01, -1.6116e+01,
         -9.2384e+00,  5.4796e+00,  7.0828e-01, -9.4454e+00, -3.4156e+00,
          1.5204e+00, -2.7699e+00],
        [ 1.9602e+00, -9.9869e+00, -1.6924e+00, -2.3282e+00, -4.7702e+00,
         -1.1964e+00,  4.6112e+00,  2.3050e+00,  5.2753e+00, -5.0789e+00,
         -8.0262e+00, -6.1355e+00, -3.85

In [14]:
# one row per example, one logit per output class (W2 has 27 output columns)
logits.shape

torch.Size([16, 27])

### implementing our negative log likelihood loss

In [15]:
# Softmax over the class dimension (dim 1).
# Subtracting each row's largest logit before exponentiating does not change
# the normalised probabilities (the common factor cancels in the division),
# but it caps every exponent at 0 so exp() cannot overflow to inf and turn the
# row into nan when logits are large.
row_max = logits.max(dim=1, keepdim=True).values
# unnormalised "counts", now scaled so the largest entry in each row is 1
counts = (logits - row_max).exp()
# normalise each row to sum to 1
prob = counts / counts.sum(1, keepdim=True)
prob.shape

torch.Size([16, 27])

In [16]:
# for every example (row i), pick the probability the model assigned to the
# correct next character Y[i]
rows = torch.arange(Y.shape[0])
probs = prob[rows, Y]
probs

tensor([1.3864e-09, 5.6370e-10, 1.1699e-08, 9.0034e-11, 6.5516e-14, 1.2332e-04,
        4.7026e-07, 5.6629e-02, 4.1453e-01, 6.3160e-07, 6.4638e-11, 3.3674e-19,
        5.8819e-13, 4.2594e-06, 1.5457e-05, 6.9038e-19])

In [17]:
# average negative log likelihood of the correct characters.
# NOTE: any entry of probs that is exactly 0 makes its log -inf and the loss inf.
loss = -torch.mean(torch.log(probs))
loss

tensor(19.6538)