In [21]:
# Following along with
# https://www.youtube.com/watch?v=TCH_1BHY58I&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=3
# https://github.com/karpathy/nn-zero-to-hero/blob/master/lectures/makemore/makemore_part2_mlp.ipynb
import torch
import random

# for making figures
%matplotlib inline

In [27]:
# Read every line of words.txt, then keep a reproducible random subset to work with.
# The `with` block closes the file handle as soon as the contents are read,
# instead of leaving it open until the garbage collector gets to it.
with open("words.txt", "r") as words_file:
    all_words = words_file.read().splitlines()
MAX_WORDS = min(10_000, len(all_words))  # never ask for more words than the file has
random.seed(6_6_1978)  # pin the RNG so the subset and its order are repeatable
words = random.sample(all_words, MAX_WORDS)
# NOTE(review): random.sample already returns items in random order, so this
# shuffle looks redundant; it is kept so the RNG state seen by later cells
# is unchanged.
random.shuffle(words)

In [28]:
# Display 20 names drawn at random (without replacement) from the full word list.
random.sample(all_words, 20)

['beatris',
 'itzayana',
 'raelyn',
 'janalise',
 'haseeb',
 'kyian',
 'lloyd',
 'aidric',
 'aizik',
 'ivon',
 'aryannah',
 'wayne',
 'tauren',
 'mailey',
 'manases',
 'lamine',
 'statham',
 'kyro',
 'jalee',
 'keaston']

In [29]:
# Character vocabulary and its two lookup tables.
# Every distinct character seen in `words` gets an id starting at 1 (in sorted
# order); id 0 is assigned to "." which build_dataset uses as the terminal token.
chars = sorted(set("".join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi["."] = 0
# Reverse table: integer id -> character.
itos = {idx: ch for ch, idx in stoi.items()}


def string_to_index(s):
    """Return the integer id stored in ``stoi`` for the character ``s``.

    Raises whatever ``stoi[s]`` raises when ``s`` is not a key of the table.
    """
    index = stoi[s]
    return index


def index_to_string(s):
    """Return the character stored in ``itos`` for the integer id ``s``.

    Despite its name, the parameter ``s`` is the id (the key of ``itos``),
    not a string. Raises whatever ``itos[s]`` raises for an unknown id.
    """
    char = itos[s]
    return char


def to_word(t):
    """Decode a sequence of ids into a string.

    ``t`` is iterated element by element; each element must provide ``.item()``
    (as the elements of a 1-D tensor do), and the resulting value is looked up
    in ``itos``. The decoded characters are concatenated in order.
    """
    letters = []
    for element in t:
        letters.append(itos[element.item()])
    return "".join(letters)

In [30]:
# build the dataset
def build_dataset(
    words: list[str], block_size: int = 4
) -> tuple[torch.Tensor, torch.Tensor]:
    """Turn words into (context, next-character) training pairs.

    Every word is treated as if it were preceded by ``block_size`` terminal
    tokens and followed by one terminal token. A window of ``block_size``
    character ids slides over that sequence; each position emits the window as
    an input row and the id of the character that follows it as the target.

    Args:
        words: words to encode. Each character is converted with
            ``string_to_index``, so it must be present in the vocabulary.
        block_size: context length, i.e. how many characters are used to
            predict the next one. Defaults to 4, the value that used to be
            hard-coded here.

    Returns:
        ``(X, Y)`` where ``X`` is an int64 tensor of shape ``(N, block_size)``
        holding the contexts and ``Y`` is an int64 tensor of shape ``(N,)``
        holding the targets. ``N`` is the total number of characters in
        ``words`` plus one terminal per word. For an empty ``words`` the
        tensors are empty but keep that dtype and rank.

    Raises:
        ValueError: if ``block_size`` is smaller than 1.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be >= 1, got {block_size}")

    TERMINAL = "."
    terminal_ix = string_to_index(TERMINAL)  # looked up once, reused for every word

    contexts, targets = [], []
    for word in words:
        # each word starts from a context made only of terminal tokens
        context = [terminal_ix] * block_size
        for char in word + TERMINAL:
            ix = string_to_index(char)
            contexts.append(context)
            targets.append(ix)
            # slide the window: drop the oldest id, append the newest;
            # this builds a new list, so rows already stored are not mutated
            context = context[1:] + [ix]

    # Explicit dtype and shape keep the result well-formed even when there are
    # no examples (torch.tensor([]) alone would be a float tensor of shape (0,)).
    X = torch.tensor(contexts, dtype=torch.long).reshape(len(targets), block_size)
    Y = torch.tensor(targets, dtype=torch.long)
    return X, Y


# Encode the sampled words: X holds the contexts, Y the next-character targets.
X, Y = build_dataset(words)

In [34]:
def debug_samples(X, Y, start=25, stop=50, n_random=20):
    """Print a contiguous run and a random draw of (context, target) pairs.

    Each line shows the decoded context, the decoded target, and then the raw
    context and target values.

    Args:
        X: indexable collection of contexts (rows accepted by ``to_word``).
        Y: indexable collection of targets; each must provide ``.item()``.
        start, stop: slice bounds of the contiguous run to print. The defaults
            (25, 50) are the bounds that used to be hard-coded.
        n_random: how many randomly chosen pairs to print after the run
            (default 20, as before). Clamped to the number of available pairs
            so small datasets no longer make ``random.sample`` raise.
    """

    def show(inp, out):
        print(f"{to_word(inp)}=>{index_to_string(out.item())} == {inp}, {out} ")

    # Pairs are addressed by index, so the full list(zip(X, Y)) is never built.
    # min() mirrors zip(), which stops at the shorter of the two inputs.
    n_pairs = min(len(X), len(Y))
    for i in range(n_pairs)[start:stop]:
        show(X[i], Y[i])
    print("--random samples--")
    for i in random.sample(range(n_pairs), min(n_random, n_pairs)):
        show(X[i], Y[i])


# Print examples 25-49 followed by 20 random ones to check the encoding.
debug_samples(X, Y)

hima=>m == tensor([ 8,  9, 13,  1]), 13 
imam=>a == tensor([ 9, 13,  1, 13]), 1 
mama=>n == tensor([13,  1, 13,  1]), 14 
aman=>d == tensor([ 1, 13,  1, 14]), 4 
mand=>a == tensor([13,  1, 14,  4]), 1 
anda=>. == tensor([ 1, 14,  4,  1]), 0 
....=>c == tensor([0, 0, 0, 0]), 3 
...c=>r == tensor([0, 0, 0, 3]), 18 
..cr=>i == tensor([ 0,  0,  3, 18]), 9 
.cri=>s == tensor([ 0,  3, 18,  9]), 19 
cris=>p == tensor([ 3, 18,  9, 19]), 16 
risp=>i == tensor([18,  9, 19, 16]), 9 
ispi=>n == tensor([ 9, 19, 16,  9]), 14 
spin=>. == tensor([19, 16,  9, 14]), 0 
....=>c == tensor([0, 0, 0, 0]), 3 
...c=>e == tensor([0, 0, 0, 3]), 5 
..ce=>d == tensor([0, 0, 3, 5]), 4 
.ced=>r == tensor([0, 3, 5, 4]), 18 
cedr=>i == tensor([ 3,  5,  4, 18]), 9 
edri=>c == tensor([ 5,  4, 18,  9]), 3 
dric=>. == tensor([ 4, 18,  9,  3]), 0 
....=>l == tensor([0, 0, 0, 0]), 12 
...l=>e == tensor([ 0,  0,  0, 12]), 5 
..le=>n == tensor([ 0,  0, 12,  5]), 14 
.len=>i == tensor([ 0, 12,  5, 14]), 9 
--random samples--


# Our architecture

```mermaid
sequenceDiagram
actor X as input<br>(letters)
participant C as C=lookup embedding<br>(27x2)

X->>C: one-hot encoding of input characters

```

Tree

```mermaid
%%{init: {"flowchart": {"htmlLabels": false}} }%%

flowchart TD
    I0["in[0]"]
    Ii["in[i]"]
    In["in[n]"]
    I0 --> C_In_0
    Ii --> C_In_i
    In --> C_In_n
    C_In_0 --> N
    C_In_i --> N
    C_In_n --> N
    subgraph Neuron
    N --> SoftMax
    end
    SoftMax-->output
```
