<a href="https://colab.research.google.com/github/heerboi/AI-from-scratch/blob/main/neural_probabilistic_language_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference: A Neural Probabilistic Language Model by Bengio et al. (Published Feb 2003!)

Link: https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf

In [121]:
!curl https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt > names.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  222k  100  222k    0     0   797k      0 --:--:-- --:--:-- --:--:--  795k


In [122]:
words = open("names.txt", "r").read().splitlines()

In [123]:
words[:3]

['emma', 'olivia', 'ava']

In [124]:
import torch
import torch.nn.functional as F
import torch.nn

In [125]:
unique_chars = ["."] + sorted(list(set("".join(words))))

In [126]:
unique_chars

['.',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [127]:
stoi = {s:i for i, s in enumerate(unique_chars)}
itos = {i:s for s, i in stoi.items()}

In [128]:
xs = []
ys = []
block_size = 4
for word in words:
    word = word + "."

    context = [0] * block_size

    for char in word:
        xs.append(context)
        ys.append(stoi[char])
        context = context[1:] + [stoi[char]]

xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [129]:
len(xs)

228146

In [130]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    xs, ys, test_size=0.10)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.50)

In [131]:
xs[:5], ys[:5]

(tensor([[ 0,  0,  0,  0],
         [ 0,  0,  0,  5],
         [ 0,  0,  5, 13],
         [ 0,  5, 13, 13],
         [ 5, 13, 13,  1]]),
 tensor([ 5, 13, 13,  1,  0]))

1. C -> embedding table of dims (unique words/tokens, embedding size)
2. W1 -> weights for connections from input to hidden layer (input dim, hidden nodes)
3. W2 -> weights for connections from hidden to output layer (hidden nodes, output dim)


we makin a quadra-gram model, so three char embeddings as input. Input dims = 3 * embedding_size

output dim is probabilities of next char, so it will be 27 (including dot to indicate end/start of name)

In [142]:
embedding_dims = 30
hidden_size = 100
learning_rate = 0.1
unique_tokens = len(unique_chars)
C = torch.nn.Embedding(num_embeddings=unique_tokens, embedding_dim=embedding_dims)

W = torch.randn((block_size * embedding_dims, unique_tokens)) # (256, 20) x (20, 27) = (256, 27)
H = torch.randn((block_size * embedding_dims, hidden_size))
d = torch.randn(hidden_size)
U = torch.randn((hidden_size, unique_tokens))
b = torch.randn(unique_tokens)

parameters = [C.weight, W, H, d, U, b]

In [143]:
sum(p.nelement() for p in parameters) # Total trainable parameters

18877

In [144]:
for p in parameters:
    p.requires_grad = True

In [145]:
iterations = 200000
batch_size = 512

for iter in range(iterations):

    # mini-batch
    ix = torch.randint(0, X_train.shape[0], (batch_size,))


    # fw pass
    # (examples, embeds * 3) * (embeds * 3, hidden)
    # (examples, hidden) * (hidden, unique_tokens)

    # X_train_embeds = (batch_size, block_size * embedding_size)
    X_train_embeds = C(X_train[ix]).view(-1, block_size * embedding_dims)
    assert(X_train_embeds.shape[0] == batch_size)

    # Tanh(Hx + d)
    HxD = torch.tanh(X_train_embeds @ H + d)

    # U(W1X) + b
    U_d = HxD @ U + b

    Wx = X_train_embeds @ W

    outputs = Wx + U_d

    # bw pass
    # loss = -outputs[torch.arange(X_train.shape[0]), y_train].log().mean()
    loss = F.cross_entropy(outputs, y_train[ix])

    # C.weight.grad = None
    # W1.grad = None
    # W2.grad = None
    for p in parameters:
        p.grad = None

    loss.backward()

    # with torch.no_grad():
    #     X_val_embeds = C(X_val).view(-1, block_size * embedding_dims)
    #     W1X = torch.tanh(X_val_embeds @ W1)
    #     W2W1 = W1X @ W2
    #     # outputs = F.softmax(W2W1, dim = 1)
    #     # val_loss = -outputs[torch.arange(X_val.shape[0]), y_val].log().mean()
    #     val_loss = F.cross_entropy(W2W1, y_val)

    if iter > 100000:
        learning_rate = 0.01


    # C.weight.data -= learning_rate * C.weight.grad
    # W1.data -= learning_rate * W1.grad
    # W2.data -= learning_rate * W2.grad
    for p in parameters:
        p.data -= learning_rate * p.grad

    if iter % 10000 == 0:
        print(f"{iter} | Loss: {loss.item()}")

0 | Loss: 31.76503562927246
10000 | Loss: 2.1501147747039795
20000 | Loss: 2.1738152503967285
30000 | Loss: 2.253756046295166
40000 | Loss: 2.1920087337493896
50000 | Loss: 2.178954839706421
60000 | Loss: 2.0707366466522217
70000 | Loss: 1.9934029579162598
80000 | Loss: 2.053574323654175
90000 | Loss: 2.212862491607666
100000 | Loss: 1.9982186555862427
110000 | Loss: 2.1132164001464844
120000 | Loss: 2.08832049369812
130000 | Loss: 2.1042370796203613
140000 | Loss: 2.1109704971313477
150000 | Loss: 2.0421621799468994
160000 | Loss: 2.1163203716278076
170000 | Loss: 2.091182231903076
180000 | Loss: 2.045867919921875
190000 | Loss: 1.9896823167800903


In [146]:
# testing testing
with torch.no_grad():
    X_test_embeds = C(X_test).view(-1, block_size * embedding_dims)

    HxD = torch.tanh(X_test_embeds @ H + d)

    # U(W1X) + b
    U_d = HxD @ U + b

    Wx = X_test_embeds @ W

    outputs = Wx + U_d

    # outputs = F.softmax(W2W1, dim = 1)
    # loss = -outputs[torch.arange(X_test.shape[0]), y_test].log().mean()
    loss = F.cross_entropy(outputs, y_test)

    print(f"Test loss: {loss.item()}")

Test loss: 2.150269031524658


In [147]:
names = []
for _ in range(5):
    new_name = "." * block_size
    while True:
        with torch.no_grad():
            input = torch.tensor([stoi[i] for i in new_name[-block_size:]])
            input_embeds = C(input).view(-1, block_size * embedding_dims)

            HxD = torch.tanh(input_embeds @ H + d)

            U_d = HxD @ U + b

            Wx = input_embeds @ W

            outputs = F.softmax(Wx + U_d, dim = 1)

            next_pred = itos[torch.multinomial(outputs, num_samples = 1, replacement = True).item()]

            if next_pred == ".":
                names.append(new_name[block_size:])
                break
            new_name += next_pred

names

['keianna', 'zavary', 'alieah', 'tyvisenna', 'krizley']

I BEAT ANDREJ KARPATHY's MODEL WOOOOOOO