In [1]:
# imports
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
from mpl_toolkits import mplot3d
%matplotlib inline

In [6]:
# loading data set
# Read inside a context manager so the file handle is closed as soon as the
# contents are in memory (the bare open(...).read() left it to the GC).
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
#build the vocabulary of characters and mappings to/from integers
# every distinct character that occurs in the data set, in sorted order
chars = sorted(set(''.join(words)))
# characters are numbered from 1; index 0 is reserved for the '.' boundary token
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# inverse lookup: integer index -> character
itos = dict((ix, ch) for ch, ix in stoi.items())
print(stoi)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}


In [4]:
# build the dataset with train/val/test splits
def build_dataset(words, block_size=3, char_to_ix=None):
    """Turn a list of words into (context, next-character) training pairs.

    Each word is terminated with '.', and every character (including the
    terminator) becomes one example whose input is the `block_size` preceding
    character indices, left-padded with the index of '.'.

    Args:
        words: iterable of strings.
        block_size: context length, i.e. how many characters are used to
            predict the next one. Defaults to 3, the value that used to be
            hard-coded here.
        char_to_ix: mapping from character to integer index. Defaults to the
            notebook-level `stoi` vocabulary.

    Returns:
        (X, Y): X is an int64 tensor of shape (num_examples, block_size)
        holding the contexts; Y is an int64 tensor of shape (num_examples,)
        holding the index of the character that follows each context.

    Raises:
        ValueError: if block_size is smaller than 1.
        KeyError: if a word contains a character missing from the mapping.
    """
    if block_size < 1:
        raise ValueError(f'block_size must be >= 1, got {block_size}')
    if char_to_ix is None:
        char_to_ix = stoi  # fall back to the vocabulary built earlier in the notebook

    pad_ix = char_to_ix['.']  # the boundary token doubles as the padding token
    X, Y = [], []  # X is input to neural net. Y is label for each input
    for w in words:
        context = [pad_ix] * block_size  # start every word from an all-padding context
        for ch in w + '.':  # the trailing '.' teaches the model where words end
            ix = char_to_ix[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]  # slide the window: drop oldest, append newest

    # Explicit dtype and shape keep the result well-formed when `words` is empty
    # (torch.tensor([]) alone would be a float tensor of shape (0,)).
    X = torch.tensor(X, dtype=torch.long).reshape(len(Y), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    print(X.shape, Y.shape)
    return X, Y

# Shuffle a *copy* of the word list: shuffling `words` in place made this cell
# non-idempotent (re-running it reshuffled the already-shuffled list and
# silently produced a different train/val/test split).
random.seed(42)
words_shuffled = list(words)
random.shuffle(words_shuffled)

# 80% train / 10% validation / 10% test
n1 = int(0.8*len(words_shuffled))
n2 = int(0.9*len(words_shuffled))

Xtr, Ytr = build_dataset(words_shuffled[:n1])
Xdev, Ydev = build_dataset(words_shuffled[n1:n2])
Xte, Yte = build_dataset(words_shuffled[n2:])

torch.Size([182625, 4]) torch.Size([182625])
torch.Size([22655, 4]) torch.Size([22655])
torch.Size([22866, 4]) torch.Size([22866])


In [8]:
# build the layers of the MLP
# hyperparameters
n_embd = 2               # size of each character embedding vector
n_hidden = 300           # number of units in the hidden tanh layer
block_size = 3           # context length fed to the network
vocab_size = len(stoi)   # one row / one logit per character in the vocabulary

# A single seeded generator drives all initial values; the draw order below
# (C, W1, b1, W2, b2) determines them, so keep it unchanged.
g = torch.Generator().manual_seed(2023)

C = torch.randn((vocab_size, n_embd), generator=g)                    # embedding table
W1 = 0.2 * torch.randn((n_embd*block_size, n_hidden), generator=g)    # hidden layer weights
b1 = 0.01 * torch.randn(n_hidden, generator=g)                        # hidden layer bias
W2 = 0.01 * torch.randn((n_hidden, vocab_size), generator=g)          # output layer weights
b2 = 0 * torch.randn(vocab_size, generator=g)                         # output bias, scaled to zero

parameters = [C, W1, b1, W2, b2] # to count total parameters
for tensor in parameters:
    tensor.requires_grad_(True)

In [9]:
# total number of trainable scalars across all parameter tensors
sum(tensor.numel() for tensor in parameters)

10281

In [10]:
# functions used to plot and save images
def get_floor(ix, T, buff):
    """Return a value slightly below the minimum of column `ix` of `T`.

    The result is the column minimum lowered by `buff` times the column's
    value range (max - min).
    """
    column = T[:, ix]
    lowest = column.min().item()
    highest = column.max().item()
    spread = highest - lowest
    return lowest - spread * buff

# one colour per vocabulary index: 1, 5, 9, 15, 21 (a, e, i, o, u in stoi) are crimson, the rest blue
colors = ['crimson' if i in {1, 5, 9, 15, 21} else 'b' for i in range(27)]

def save_fig(frame):
    """Render the embedding table C as a 3D scatter plot and save it as a PNG.

    The image is written to GIF/<frame zero-padded to 5 characters>.png. Each
    embedding is drawn as a coloured point with its character label, plus a
    gray "shadow" copy projected onto a floor just below the lowest z value.

    Args:
        frame: frame number; used for the file name and to look up the
            camera azimuth in `rot`.

    Raises:
        ValueError: if the embedding table has fewer than 3 columns, since
            columns 0, 1 and 2 are plotted as x, y and z.
    """
    # NOTE(review): `rot` is not defined anywhere in the visible notebook; it is
    # presumably an indexable sequence of azimuth angles, one per frame —
    # confirm and define it before enabling frame saving.
    if C.shape[1] < 3:
        raise ValueError(
            f'save_fig needs 3-dimensional embeddings, but C has {C.shape[1]} columns'
        )

    frame_s = str(frame).rjust(5, '0')  # left-pad with zeros to 5 characters
    pts = C.detach().numpy()            # plain array view for matplotlib
    xs, ys, zs = pts[:, 0], pts[:, 1], pts[:, 2]
    floor = get_floor(2, C, 0.1)

    fig = plt.figure(figsize=(5,5))
    try:
        ax = fig.add_subplot(projection='3d')
        # plot shadows
        ax.scatter3D(xs, ys, floor, c='gray', s=150, zorder=2)
        # plot points
        ax.scatter3D(xs, ys, zs, c=colors, s=150, zorder=20)

        # plot labels (points first, then shadows, matching the scatter layers)
        for i in range(pts.shape[0]):
            ax.text3D(float(xs[i]), float(ys[i]), float(zs[i]), itos[i], ha="center", va="center", color="white", zorder=40, fontsize=8)
        for i in range(pts.shape[0]):
            ax.text3D(float(xs[i]), float(ys[i]), floor, itos[i], ha="center", va="center", color="white", zorder=10, fontsize=8)
        ax.set_xlabel('x')
        ax.set_ylabel('y')
        ax.set_zlabel('z')
        ax.set_title('Training of Embeddings Over 1M Iterations')
        ax.view_init(45, rot[frame])

        # savefig does not create missing directories
        os.makedirs('GIF', exist_ok=True)
        fig.savefig(f'GIF/{frame_s}.png')
    finally:
        # always release the figure, even if plotting or saving failed
        plt.close(fig)

In [None]:
# run forwards and backwards passes, update network, save plots
max_steps = 1_000_000
batch_size = 64
# frames = 1000
# frame_n = 0

# Report ten times per run. Integer stride keeps `i % log_every` exact
# (the float `max_steps / 10` only matched when max_steps was a multiple of 10).
log_every = max(1, max_steps // 10)

for i in range(max_steps):

    # construct minibatch; drawn from the seeded generator so runs are reproducible
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)

    # forward pass
    emb = C[Xtr[ix]] # (batch_size, block_size, n_embd)
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1) # (batch_size, n_hidden)
    logits = h @ W2 + b2 # (batch_size, vocab_size)
    loss = F.cross_entropy(logits, Ytr[ix])

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: plain SGD with a single step-down of the learning rate
    lr = 0.1 if i < 100000 else 0.01
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

    # track stats
    if i % log_every == 0:
        print(f'{i}/{max_steps}: batch loss: {loss.item():.6f}')
        # Loss over the whole training split, computed inline: the split_loss
        # helper is defined in a later cell, so calling it here raised
        # NameError on a fresh kernel.
        with torch.no_grad():
            emb_all = C[Xtr]
            h_all = torch.tanh(emb_all.view(emb_all.shape[0], -1) @ W1 + b1)
            train_loss = F.cross_entropy(h_all @ W2 + b2, Ytr)
        print(f'train: {train_loss.item()}')

    # optional frame dump for the embedding animation (disabled)
    # if i % 25 == 0:
    #     save_fig(int(frame_n))
    #     frame_n += 1
    # elif i == (max_steps-1):
    #     save_fig(frame_n)
    #     frame_n += 1

In [None]:
# function to evaluate the model
@torch.no_grad()
def split_loss(split):
    """Print the cross-entropy loss of the current model on one data split.

    Args:
        split: one of 'train', 'val' or 'test'; any other key raises KeyError.
    """
    datasets = {
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte),
    }
    inputs, targets = datasets[split]

    embedded = C[inputs]                               # one embedding per context character
    flat = embedded.view(embedded.shape[0], -1)        # concatenate each example's embeddings
    hidden = torch.tanh(flat @ W1 + b1)                # hidden layer activations
    scores = hidden @ W2 + b2                          # one logit per vocabulary entry
    loss = F.cross_entropy(scores, targets)
    print(f'{split}: {loss.item()}')

# report the loss on every split, in train / val / test order
for split_name in ('train', 'val', 'test'):
    split_loss(split_name)

In [None]:
# sampling from the model
# dedicated generator so the sampled names are reproducible
g = torch.Generator().manual_seed(2147483647 + 2023)

# Sampling is pure inference: disable autograd so no graph is built for
# parameters that have requires_grad=True.
with torch.no_grad():
    for _ in range(20):
        out = []
        context = [0] * block_size # initialize with all '...'
        while True:
            emb = C[torch.tensor([context])] # (1, block_size, n_embd)
            h = torch.tanh(emb.view(1, -1) @ W1 + b1)
            logits = h @ W2 + b2
            probs = F.softmax(logits, dim=1)
            # draw the next character index from the predicted distribution
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()
            context = context[1:] + [ix] # slide the context window
            out.append(ix)
            if ix == 0: # index 0 is '.', the end-of-word token
                break
        # the terminating '.' was appended above, so it is printed as well
        print(''.join(itos[i] for i in out))