<a href="https://colab.research.google.com/github/heerboi/AI-from-scratch/blob/main/neural_probabilistic_language_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference: A Neural Probabilistic Language Model by Bengio et al. (Published Feb 2003!)

Link: https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf

In [1]:
!curl https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt > names.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  222k  100  222k    0     0   233k      0 --:--:-- --:--:-- --:--:--  233k


In [2]:
# Read the dataset into a list with one name per element.
# The context manager closes the file handle as soon as the read finishes,
# instead of leaving an open handle around for the lifetime of the kernel.
with open("names.txt", "r") as names_file:
    words = names_file.read().splitlines()

In [3]:
# Peek at the first three entries to sanity-check that the file was split into names.
words[:3]

['emma', 'olivia', 'ava']

In [4]:
import torch
import torch.nn.functional as F
import torch.nn

In [9]:
# Vocabulary: "." is placed first, followed by every distinct character that
# occurs anywhere in the names, in sorted order.
distinct_chars = set("".join(words))
unique_chars = ["."] + sorted(distinct_chars)

In [10]:
# Display the vocabulary: "." first, then the sorted distinct characters from the names.
unique_chars

['.',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [12]:
# Two-way lookup tables between characters and integer ids.
# A character's id is its position in `unique_chars`.
stoi = dict(zip(unique_chars, range(len(unique_chars))))
# Inverse mapping: id -> character.
itos = {idx: ch for ch, idx in stoi.items()}

In [84]:
# Build the (context, target) pairs: each example is the ids of the three
# preceding characters (xs) and the id of the character that follows (ys).
xs, ys = [], []

for word in words:
    # Every name starts from an all-"." context; a trailing "." marks its end.
    context = [stoi["."]] * 3

    for char in word + ".":
        # `context` is rebound (never mutated) below, so storing it directly is safe.
        xs.append(context)

        target = stoi[char]
        ys.append(target)

        # Slide the window: drop the oldest id, append the one just seen.
        context = context[1:] + [target]

xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [80]:
# Total number of (context, target) examples in the dataset.
len(xs)

228146

In [85]:
from sklearn.model_selection import train_test_split

# Fixed shuffling seed so the three splits are identical on every
# Restart & Run All; without it the reported losses cannot be reproduced.
SPLIT_SEED = 42

# Step 1: hold out 20% of all examples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    xs, ys, test_size=0.20, random_state=SPLIT_SEED)

# Step 2: carve the validation set out of the remaining 80%.
# 0.428 of that remainder is ~34.2% of the full dataset, leaving ~45.8% for training.
# NOTE(review): confirm this fraction is intended — e.g. 0.125 here would give
# a 70/10/20 train/val/test split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.428, random_state=SPLIT_SEED)

In [82]:
# Inspect the first five training contexts; each row holds three character ids.
X_train[:5]

tensor([[ 5, 25,  1],
        [14,  4, 15],
        [ 1, 22,  9],
        [13,  9, 15],
        [ 0,  3, 15]])

1. C -> embedding table of dims (unique words/tokens, embedding size)
2. W1 -> weights for connections from input to hidden layer (input dim, hidden nodes)
3. W2 -> weights for connections from hidden to output layer (hidden nodes, output dim)


We are building a character-level 4-gram model: the embeddings of the three preceding characters are concatenated to form the input, so the input dimension is 3 * embedding_size.

The output is a probability distribution over the next character, so the output dimension is 27: the 26 letters plus "." which marks both the start and the end of a name.

In [122]:
# Hyperparameters
embedding_dims = 5   # length of each character's embedding vector
hidden_size = 15     # number of units in the hidden (tanh) layer
learning_rate = 1    # step size of the gradient-descent update
unique_tokens = len(unique_chars)

# Seed torch's global RNG so the random initial weights below (and any later
# sampling that draws from the same RNG) are reproducible across kernel restarts.
torch.manual_seed(42)

# C: embedding table, one row of size `embedding_dims` per token.
C = torch.nn.Embedding(num_embeddings=unique_tokens, embedding_dim=embedding_dims)

# W1: input -> hidden weights; the input is three concatenated embeddings.
W1 = torch.randn((3 * embedding_dims, hidden_size), requires_grad=True)
# W2: hidden -> output weights; one output score per token.
W2 = torch.randn((hidden_size, unique_tokens), requires_grad=True)

In [129]:
iterations = 2000
log_every = 100  # report training/validation loss once every this many steps


def _forward_logits(inputs):
    """Return unnormalised next-character scores for a batch of contexts.

    inputs: integer tensor of shape (examples, 3) holding character ids.
    Returns a tensor of shape (examples, unique_tokens).
    """
    # (examples, 3, embedding_dims) -> (examples, 3 * embedding_dims)
    embeds = C(inputs).view(-1, 3 * embedding_dims)
    # (examples, 3 * embedding_dims) @ (3 * embedding_dims, hidden_size)
    hidden = torch.tanh(embeds @ W1)
    # (examples, hidden_size) @ (hidden_size, unique_tokens)
    return hidden @ W2


# The loop variable is `step`, not `iter`, so the builtin iter() is not shadowed
# for the rest of the notebook.
for step in range(iterations):

    # Forward pass over the full training split.
    # cross_entropy fuses log-softmax and negative log-likelihood (mean over
    # examples), which avoids the log(0) = -inf that softmax(...).log() can
    # produce when a probability underflows.
    loss = F.cross_entropy(_forward_logits(X_train), y_train)

    # Backward pass: clear stale gradients, then backpropagate.
    C.weight.grad = None
    W1.grad = None
    W2.grad = None

    loss.backward()

    # The validation pass is only needed for reporting, so it runs on logging
    # steps only. It is evaluated before the update so that both printed losses
    # describe the same weights.
    if step % log_every == 0:
        with torch.no_grad():
            val_loss = F.cross_entropy(_forward_logits(X_val), y_val)
        print(f"{step + 1} | Loss: {loss.item()} | Validation loss: {val_loss.item()}")

    # Plain gradient-descent update, done in place under no_grad rather than
    # through the discouraged `.data` attribute.
    with torch.no_grad():
        C.weight.sub_(learning_rate * C.weight.grad)
        W1.sub_(learning_rate * W1.grad)
        W2.sub_(learning_rate * W2.grad)

1 | Loss: 2.3431262969970703 | Validation loss: 2.3545219898223877
101 | Loss: 2.3404242992401123 | Validation loss: 2.351945161819458
201 | Loss: 2.3379321098327637 | Validation loss: 2.3495779037475586
301 | Loss: 2.335564136505127 | Validation loss: 2.3473286628723145
401 | Loss: 2.333195924758911 | Validation loss: 2.3450636863708496
501 | Loss: 2.33221173286438 | Validation loss: 2.3441758155822754
601 | Loss: 2.328718423843384 | Validation loss: 2.340710163116455
701 | Loss: 2.326855421066284 | Validation loss: 2.3389840126037598
801 | Loss: 2.3253426551818848 | Validation loss: 2.337675094604492
901 | Loss: 2.323465347290039 | Validation loss: 2.336019515991211
1001 | Loss: 2.3221242427825928 | Validation loss: 2.334901809692383
1101 | Loss: 2.3201677799224854 | Validation loss: 2.3330795764923096
1201 | Loss: 2.3198046684265137 | Validation loss: 2.3329594135284424
1301 | Loss: 2.318779945373535 | Validation loss: 2.3320841789245605
1401 | Loss: 2.3178024291992188 | Validation 

In [130]:
# Evaluate the trained model on the held-out test split.
# Results go into test_* names so the training-loop variables
# (`loss`, `outputs`, ...) are not overwritten.
with torch.no_grad():
    # (examples, 3, embedding_dims) -> (examples, 3 * embedding_dims)
    test_embeds = C(X_test).view(-1, 3 * embedding_dims)

    test_hidden = torch.tanh(test_embeds @ W1)
    test_logits = test_hidden @ W2

    # cross_entropy = log-softmax + mean negative log-likelihood in one
    # numerically stable call (softmax(...).log() can yield -inf on underflow).
    test_loss = F.cross_entropy(test_logits, y_test)

    print(f"Test loss: {test_loss.item()}")

Test loss: 2.323338747024536


In [131]:
# Sample new names from the model, one character at a time.
num_names = 5
# Safety cap: stops generation if the model never emits the end marker ".",
# so the cell cannot loop forever. Names reaching the cap are kept, truncated.
max_name_len = 50

names = []
for _ in range(num_names):
    # Start from the all-"." context, exactly as during dataset construction.
    context_ids = [stoi["."]] * 3
    sampled_chars = []

    with torch.no_grad():
        while len(sampled_chars) < max_name_len:
            # `context_tensor` rather than `input`, so the builtin input() is not shadowed.
            context_tensor = torch.tensor(context_ids)
            context_embeds = C(context_tensor).view(-1, 3 * embedding_dims)

            hidden = torch.tanh(context_embeds @ W1)
            probs = F.softmax(hidden @ W2, dim=1)

            # Draw the next character id from the predicted distribution.
            next_id = torch.multinomial(probs, num_samples = 1, replacement = True).item()

            if itos[next_id] == ".":
                break
            sampled_chars.append(itos[next_id])
            # Slide the window: drop the oldest id, append the one just sampled.
            context_ids = context_ids[1:] + [next_id]

    names.append("".join(sampled_chars))

names

['dixan', 'leyon', 'vyn', 'ches', 'mamylonenee']