In [88]:
# Following along with
# https://www.youtube.com/watch?v=TCH_1BHY58I&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=3
# https://github.com/karpathy/nn-zero-to-hero/blob/master/lectures/makemore/makemore_part2_mlp.ipynb
import torch
import random

# for making figures
%matplotlib inline

In [89]:
# Load the corpus: words.txt holds one word per line.
# The context manager closes the file handle as soon as the read finishes
# (a bare open(...).read() leaves the handle open until garbage collection).
with open("words.txt", "r") as words_file:
    all_words = words_file.read().splitlines()

# Work on a random subset of at most 10,000 words.
MAX_WORDS = min(10_000, len(all_words))
# Draw the subset from a dedicated, seeded generator. The notebook only pins
# the global RNG in the *next* cell, so sampling from the global generator
# here gave a different `words` (and hence a different vocabulary and dataset)
# on every fresh kernel. A private Random instance makes the subset
# reproducible without touching the global RNG state.
words = random.Random(6_6_1978).sample(all_words, MAX_WORDS)

In [90]:
# Pin the global RNG first so this preview is identical on every run,
# then display 20 randomly chosen words from the full corpus.
random.seed(a=6_6_1978)
random.sample(all_words, k=20)

['florencia',
 'draysen',
 'simrin',
 'yasna',
 'lathan',
 'lilymarie',
 'maryah',
 'ara',
 'pheonix',
 'muir',
 'aubriegh',
 'maryruth',
 'feroz',
 'abdiel',
 'anabiya',
 'kristin',
 'dashon',
 'harlei',
 'valery',
 'janani']

In [91]:
# Character vocabulary: every distinct character that occurs in `words`, in
# sorted order, numbered from 1. Index 0 is reserved for ".", the marker the
# dataset cell uses both as start-of-word padding and as end-of-word target.
chars = sorted(set("".join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi["."] = 0
# Reverse table: integer index -> character.
itos = dict(zip(stoi.values(), stoi.keys()))


def string_to_index(s: str) -> int:
    """Return the integer index that the `stoi` table assigns to character `s`.

    Raises:
        KeyError: if `s` is not a key of `stoi`.
    """
    index = stoi[s]
    return index


def index_to_string(s: int) -> str:
    """Return the character that the `itos` table assigns to integer index `s`.

    Note that, despite its name, `s` is the integer index, not a string.

    Raises:
        KeyError: if `s` is not a key of `itos`.
    """
    character = itos[s]
    return character


def to_word(t):
    """Decode a sequence of indices into the string it spells.

    Args:
        t: Iterable whose elements expose `.item()` returning a key of `itos`
           (for example the rows of a 1-D integer tensor).

    Returns:
        The characters looked up in `itos`, concatenated in order.
    """
    characters = (itos[element.item()] for element in t)
    return "".join(characters)

In [92]:
# Build the dataset: for every character of every word (plus a trailing "."),
# store the `block_size` indices that precede it as the input row and the
# character's own index as the target.

block_size = 4  # context length: how many previous characters feed each prediction

X, Y = [], []
for word in words:
    # Each word starts from an all-"." context ("...." when block_size is 4).
    context = block_size * [string_to_index(".")]
    for char in f"{word}.":
        ix = string_to_index(char)
        X.append(context)
        Y.append(ix)
        # Slide the window: drop the oldest index and append the newest.
        # A new list is built each time, so rows already stored in X are not mutated.
        context = [*context[1:], ix]

X, Y = torch.tensor(X), torch.tensor(Y)
# Descriptive aliases for the same tensors.
inputs = X
expected_output = Y

In [93]:
def debug_samples(k=20):
    """Print `k` randomly chosen (context, target) pairs from the dataset.

    Each printed line shows the decoded context, the decoded target
    character, and the raw tensors behind them.

    Args:
        k: Number of pairs to print. Clamped to the number of available rows,
           so a dataset smaller than `k` prints every row instead of making
           `random.sample` raise ValueError.
    """
    # Mirror zip(): only rows present in both tensors are eligible.
    n_rows = min(len(inputs), len(expected_output))
    # Sample row indices instead of materialising every (input, target) pair
    # as a Python list just to keep a handful of them.
    for row in random.sample(range(n_rows), min(k, n_rows)):
        inp, out = inputs[row], expected_output[row]
        print(f"{to_word(inp)}=>{index_to_string(out.item())} == {inp}, {out} ")


debug_samples()

...e=>m == tensor([0, 0, 0, 5]), 13 
...c=>o == tensor([0, 0, 0, 3]), 15 
...s=>i == tensor([ 0,  0,  0, 19]), 9 
...j=>a == tensor([ 0,  0,  0, 10]), 1 
niko=>l == tensor([14,  9, 11, 15]), 12 
ndre=>s == tensor([14,  4, 18,  5]), 19 
alil=>a == tensor([ 1, 12,  9, 12]), 1 
adon=>. == tensor([ 1,  4, 15, 14]), 0 
ylin=>e == tensor([25, 12,  9, 14]), 5 
yiah=>. == tensor([25,  9,  1,  8]), 0 
....=>p == tensor([0, 0, 0, 0]), 16 
.kam=>e == tensor([ 0, 11,  1, 13]), 5 
ysab=>e == tensor([25, 19,  1,  2]), 5 
zion=>a == tensor([26,  9, 15, 14]), 1 
.vic=>t == tensor([ 0, 22,  9,  3]), 20 
illy=>a == tensor([ 9, 12, 12, 25]), 1 
...l=>a == tensor([ 0,  0,  0, 12]), 1 
stet=>s == tensor([19, 20,  5, 20]), 19 
urni=>. == tensor([21, 18, 14,  9]), 0 
..ev=>e == tensor([ 0,  0,  5, 22]), 5 


'.eli'