In [1]:
# imports
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import numpy as np
%matplotlib inline

In [2]:
# loading data set
words = open('names.txt', 'r').read().splitlines()
print('Input:')
words[:10]

Input:


['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
len(words)

32033

In [4]:
#build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(stoi)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}


In [5]:
# build the dataset

block_size = 4 # context length: how many characters do we take to predict the next one?
X, Y = [], [] # X is input to neural net. Y is label for each input
for w in words[:5]:
    
    print(w)
    context = [0] * block_size # create padded context of Zero tokens
    for ch in w + '.': # concat . token to end of word and iterate through charcters
        ix = stoi[ch] # assign index of ch to variable ix
        X.append(context) # append context to input X
        Y.append(ix) # append index of ch to labels Y
        print(''.join(itos[i] for i in context), '--->', itos[ix])
        context = context[1:] + [ix] # crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)

emma
.... ---> e
...e ---> m
..em ---> m
.emm ---> a
emma ---> .
olivia
.... ---> o
...o ---> l
..ol ---> i
.oli ---> v
oliv ---> i
livi ---> a
ivia ---> .
ava
.... ---> a
...a ---> v
..av ---> a
.ava ---> .
isabella
.... ---> i
...i ---> s
..is ---> a
.isa ---> b
isab ---> e
sabe ---> l
abel ---> l
bell ---> a
ella ---> .
sophia
.... ---> s
...s ---> o
..so ---> p
.sop ---> h
soph ---> i
ophi ---> a
phia ---> .


In [6]:
C = torch.randn((27, 2))

In [7]:
C[5]

tensor([0.8608, 0.2919])

In [8]:
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([0.8608, 0.2919])

Indexing into the column is equivalent to taking the dot product of our lookup table C, with the one hot encoded values

PyTorch indexing is very powerful. We can index with a list, or even with a multi-dimensional tensor

In [9]:
X[13, 2]

tensor(0)

In [10]:
C[X]

tensor([[[-4.3063e-01, -2.0792e-01],
         [-4.3063e-01, -2.0792e-01],
         [-4.3063e-01, -2.0792e-01],
         [-4.3063e-01, -2.0792e-01]],

        [[-4.3063e-01, -2.0792e-01],
         [-4.3063e-01, -2.0792e-01],
         [-4.3063e-01, -2.0792e-01],
         [ 8.6082e-01,  2.9192e-01]],

        [[-4.3063e-01, -2.0792e-01],
         [-4.3063e-01, -2.0792e-01],
         [ 8.6082e-01,  2.9192e-01],
         [ 7.4220e-01,  2.7630e-01]],

        [[-4.3063e-01, -2.0792e-01],
         [ 8.6082e-01,  2.9192e-01],
         [ 7.4220e-01,  2.7630e-01],
         [ 7.4220e-01,  2.7630e-01]],

        [[ 8.6082e-01,  2.9192e-01],
         [ 7.4220e-01,  2.7630e-01],
         [ 7.4220e-01,  2.7630e-01],
         [-9.0564e-04,  1.4911e+00]],

        [[-4.3063e-01, -2.0792e-01],
         [-4.3063e-01, -2.0792e-01],
         [-4.3063e-01, -2.0792e-01],
         [-4.3063e-01, -2.0792e-01]],

        [[-4.3063e-01, -2.0792e-01],
         [-4.3063e-01, -2.0792e-01],
         [-4.3063e-01, -2.

In [11]:
C[X][13, 2]

tensor([-0.4306, -0.2079])

In [12]:
C[1]

tensor([-9.0564e-04,  1.4911e+00])

Notice that these are equivalent

In [13]:
# This is our embedding
emb = C[X]
emb.shape

torch.Size([32, 4, 2])

In [14]:
# Creating our weights and biases
# Weights are 6x100. 6 because our output from embedding is 3x2=6, so there are 6 inputs to this hidden layer. 100 neurons has been chosen randomly.
# We also instantiate 100 biases to match our 100 weights
W1 = torch.randn((8, 100))
b1 = torch.randn(100)

In [15]:
# torch.cat concats tensors. takes a positional argument for which dimension to concat on. cat is an inefficient operation
# this works, but if we change our block size this would break and we'd need to add more arguments to concat. lets try torch.unbind
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1)

tensor([[-4.3063e-01, -2.0792e-01, -4.3063e-01, -2.0792e-01, -4.3063e-01,
         -2.0792e-01],
        [-4.3063e-01, -2.0792e-01, -4.3063e-01, -2.0792e-01, -4.3063e-01,
         -2.0792e-01],
        [-4.3063e-01, -2.0792e-01, -4.3063e-01, -2.0792e-01,  8.6082e-01,
          2.9192e-01],
        [-4.3063e-01, -2.0792e-01,  8.6082e-01,  2.9192e-01,  7.4220e-01,
          2.7630e-01],
        [ 8.6082e-01,  2.9192e-01,  7.4220e-01,  2.7630e-01,  7.4220e-01,
          2.7630e-01],
        [-4.3063e-01, -2.0792e-01, -4.3063e-01, -2.0792e-01, -4.3063e-01,
         -2.0792e-01],
        [-4.3063e-01, -2.0792e-01, -4.3063e-01, -2.0792e-01, -4.3063e-01,
         -2.0792e-01],
        [-4.3063e-01, -2.0792e-01, -4.3063e-01, -2.0792e-01, -1.7604e+00,
          1.5907e+00],
        [-4.3063e-01, -2.0792e-01, -1.7604e+00,  1.5907e+00,  1.8526e+00,
          1.4992e+00],
        [-1.7604e+00,  1.5907e+00,  1.8526e+00,  1.4992e+00, -2.8068e-01,
         -1.1683e+00],
        [ 1.8526e+00,  1.4992e

In [16]:
# torch.unbind removes a tensor dimension. it returns a tuple of all slices along this dimension, without it. this method is insensitive to changes in block size
torch.cat(torch.unbind(emb, 1), 1)

tensor([[-4.3063e-01, -2.0792e-01, -4.3063e-01, -2.0792e-01, -4.3063e-01,
         -2.0792e-01, -4.3063e-01, -2.0792e-01],
        [-4.3063e-01, -2.0792e-01, -4.3063e-01, -2.0792e-01, -4.3063e-01,
         -2.0792e-01,  8.6082e-01,  2.9192e-01],
        [-4.3063e-01, -2.0792e-01, -4.3063e-01, -2.0792e-01,  8.6082e-01,
          2.9192e-01,  7.4220e-01,  2.7630e-01],
        [-4.3063e-01, -2.0792e-01,  8.6082e-01,  2.9192e-01,  7.4220e-01,
          2.7630e-01,  7.4220e-01,  2.7630e-01],
        [ 8.6082e-01,  2.9192e-01,  7.4220e-01,  2.7630e-01,  7.4220e-01,
          2.7630e-01, -9.0564e-04,  1.4911e+00],
        [-4.3063e-01, -2.0792e-01, -4.3063e-01, -2.0792e-01, -4.3063e-01,
         -2.0792e-01, -4.3063e-01, -2.0792e-01],
        [-4.3063e-01, -2.0792e-01, -4.3063e-01, -2.0792e-01, -4.3063e-01,
         -2.0792e-01, -1.7604e+00,  1.5907e+00],
        [-4.3063e-01, -2.0792e-01, -4.3063e-01, -2.0792e-01, -1.7604e+00,
          1.5907e+00,  1.8526e+00,  1.4992e+00],
        [-4.3063

In [17]:
# Tensor.view() is an extremely efficient way to maniuplate tensor dimensions without actually modifying memory. 
# It utilizes underlying attributes of the tensor object in order to change the way a tensor is represented.
a = torch.arange(18)
display(a.storage()) # tensors are stored in memory as a list of their values
a.view(2,9), a.view(2,9), a.view(3,3,2)

  display(a.storage()) # tensors are stored in memory as a list of their values
  output = repr(obj)
  return str(self)
  f'device={self.device}) of size {len(self)}]')
  if self.device.type == 'meta':
  data_str = ' ' + '\n '.join(str(self[i]) for i in range(self.size()))


 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

(tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8],
         [ 9, 10, 11, 12, 13, 14, 15, 16, 17]]),
 tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8],
         [ 9, 10, 11, 12, 13, 14, 15, 16, 17]]),
 tensor([[[ 0,  1],
          [ 2,  3],
          [ 4,  5]],
 
         [[ 6,  7],
          [ 8,  9],
          [10, 11]],
 
         [[12, 13],
          [14, 15],
          [16, 17]]]))

In [18]:
# tensor.view() just so happens to manipulate the shape of the tensor in the way we desire
# Using -1 to avoid hardcoding. We could equivalently use emb.shape[0]. PyTorch is smart enough to derive the necessary value to make the operation work using the second value of 6
# We use tanh here to transfrom all values in the resulting tesnor to be between -1 and 1
# **It's good practice to always double check that the broadcasting operation taking place (+) works as we expect it to!
h = torch.tanh(emb.view(-1, 8) @ W1 + b1)

In [19]:
h.shape

torch.Size([32, 100])

In [20]:
W2 = torch.randn((100, 27))
b2 = torch.randn(27)
logits = h @ W2 + b2

In [21]:
logits.shape

torch.Size([32, 27])

In [22]:
counts = logits.exp()
prob = counts / counts.sum(1, keepdims=True)

In [23]:
prob.shape

torch.Size([32, 27])

In [24]:
# negative log loss likelihood
loss = -prob[torch.arange(len(Y)), Y].log().mean()
loss

tensor(17.6206)

#### Here is the above code rewritten and condensed:

In [25]:
# build the dataset

block_size = 3 # context length: how many characters do we take to predict the next one?
X, Y = [], [] # X is input to neural net. Y is label for each input
for w in words:
    
    context = [0] * block_size # create padded context of Zero tokens
    for ch in w + '.': # concat . token to end of word and iterate through charcters
        ix = stoi[ch] # assign index of ch to variable ix
        X.append(context) # append context to input X
        Y.append(ix) # append index of ch to labels Y
        context = context[1:] + [ix] # crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)

#### Build the dataset with train/val/test splits

In [275]:
def build_dataset(words):
    block_size = 3 # context length: how many characters do we take to predict the next one?
    X, Y = [], [] # X is input to neural net. Y is label for each input
    for w in words:
        
        context = [0] * block_size # create padded context of Zero tokens
        for ch in w + '.': # concat . token to end of word and iterate through charcters
            ix = stoi[ch] # assign index of ch to variable ix
            X.append(context) # append context to input X
            Y.append(ix) # append index of ch to labels Y
            context = context[1:] + [ix] # crop and append

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

torch.Size([182441, 3]) torch.Size([182441])
torch.Size([22902, 3]) torch.Size([22902])
torch.Size([22803, 3]) torch.Size([22803])


In [276]:
# # F.cross_entropy()
# emb = C[X] # (32, 3, 2)
# h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
# logits = h @ W2 + b2 # (32, 27)
# # counts = logits.exp()
# # prob = counts / counts.sum(1, keepdims=True)
# # loss = -prob[torch.arange(32), Y].log().mean()

# # ^^^ THIS IS THE SAME AS THE ABOVE ^^^
# # F.cross_entropy is much more computationally efficient as it does compound operations rather than multiple individual operations.
# # In addition, it makes back-propagation much more efficient for the same reasons
# loss = F.cross_entropy(logits, Y)
# loss

In [301]:
n_embd = 10
block_size = 3
n_hidden = 200
vocab_size = len(stoi)

g = torch.Generator().manual_seed(2147483647)
C = torch.randn((vocab_size, n_embd),           generator=g)
W1 = torch.randn((n_embd*block_size, n_hidden), generator=g) * (5/3)/((n_embd* block_size)**0.5)
b1 = torch.randn(n_hidden,                      generator=g) * 0.02
W2 = torch.randn((n_hidden, vocab_size),        generator=g) * 0.01
b2 = torch.randn(vocab_size,                    generator=g) * 0

bngain = torch.ones((1, n_hidden))
bnbias = torch.zeros((1, n_hidden))
bnmean_running = torch.zeros((1, n_hidden))
bnstd_running = torch.ones((1, n_hidden))

parameters = [C, W1, b1, W2, b2, bngain, bnbias] # to count total parameters
print(f'Total parameters: {sum(p.nelement() for p in parameters)}')
for p in parameters:
    p.requires_grad = True

Total parameters: 12297


In [302]:
def get_floor(ix, T, buff):
    return T[:,ix].min().item() - (T[:,ix].max().item() - T[:,ix].min().item()) * buff

In [303]:
# colors = ['b' if i not in {1, 5, 9, 15, 21} else 'crimson' for i in np.arange(27)]

# def save_fig(frame):
#     plt.figure(figsize=(8,8))
#     plt.scatter(C[:,0].data, C[:,1].data, s=200, c=colors)
#     for i in range(C.shape[0]):
#         plt.text(C[i,0].item(), C[i,1].item(), itos[i], ha="center", va="center", color="white")
#     # plt.grid('minor')
#     plt.savefig(f'GIF/{frame}.png', dpi=500)
#     plt.close();

In [304]:
colors = ['b' if i not in {1, 5, 9, 15, 21} else 'crimson' for i in np.arange(27)]

def save_fig_3d(frame):
    n_chars = 5
    frame_s = str(frame)
    n = n_chars - len(frame_s)
    frame_s = ('0' * n) + frame_s
    plt.figure(figsize=(8,8))
    ax = plt.axes(projection='3d') 
    # plot shadows
    floor = get_floor(2, C, 0.1)
    ax.scatter3D(C[:,0].data, C[:,1].data, floor, c='gray', s=150, zorder=2)
    # plot points
    ax.scatter3D(C[:,0].data, C[:,1].data, C[:,2].data, c=colors, s=150, zorder=20)

    # plot labels
    for i in range(C.shape[0]):
        ax.text3D(C[i,0].item(), C[i,1].item(), C[i,2].item(), itos[i], ha="center", va="center", color="white", zorder=40, fontsize=8)
    for i in range(C.shape[0]):
        ax.text3D(C[i,0].item(), C[i,1].item(), floor, itos[i], ha="center", va="center", color="white", zorder=10, fontsize=8)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')
    ax.set_title('Training of Embeddings Over 1M Iterations')
    plt.savefig('GIF_3d/' + frame_s)
    plt.close()

In [305]:
def save_fig_2d(frame):
    n_chars = 5
    frame_s = str(frame)
    n = n_chars - len(frame_s)
    frame_s = ('0' * n) + frame_s
    plt.figure(figsize=(8,8))
    ax = plt.axes() 
    # plot points
    ax.scatter(C[:,0].data, C[:,1].data, c=colors, s=200)

    # plot labels
    for i in range(C.shape[0]):
        ax.text(C[i,0].item(), C[i,1].item(), itos[i], ha="center", va="center", color="white", fontsize=10)

    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title('Training of Embeddings Over 50k Iterations')
    plt.savefig('GIF_2d/' + frame_s)
    plt.close()

In [306]:
max_steps = 200_000
batch_size = 64
frames = 1000
frame_n = 0

for i in range(max_steps):

    # construct minibatch
    # create a tensor ix of length batch_size and fill it
    # with values within range 0 to Xtr.shape[0]
    # use ix index into Xtr, Ytr, assign output to Xb, Yb
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix]

    # forward pass
    emb = C[Xb] # embed the characters into vectors
    embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
    hpreact = embcat @ W1 #+ b1 # hidden layer preactivation
    bnmeani = hpreact.mean(0, keepdim=True)
    bnstdi = hpreact.std(0, keepdim=True)
    hpreact = bngain * (hpreact - bnmeani) / (bnstdi) + bnbias

    with torch.no_grad():
        bnmean_running = (0.999 * bnmean_running) + (0.001 * bnmeani)
        bnstd_running = (0.999 * bnstd_running) + (0.001 * bnstdi)

    h = torch.tanh(hpreact) # hidden layer
    logits = h @ W2 + b2 # output layer
    loss = F.cross_entropy(logits, Yb) # calculate loss

    # backward pass
    for p in parameters:                                           
        p.grad = None
    loss.backward()

    # update
    lr = 0.1 if i < 100000 else 0.01
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    if i % (max_steps / 20) == 0:
        print(f'{i}/{max_steps}:')
        print(f'batch loss: {loss.item():.6f}\n')
    

    # if (i % 50) == 0 and i < 50000:
    #     save_fig_2d(int(frame_n))
    #     frame_n += 1
    # elif i == (max_steps-1):
    #     save_fig_2d(frame_n)
    #     frame_n += 1


# print(loss.item())

0/200000:
batch loss: 3.281588

10000/200000:
batch loss: 2.170333

20000/200000:
batch loss: 2.096152

30000/200000:
batch loss: 1.914917

40000/200000:
batch loss: 2.107869

50000/200000:
batch loss: 2.082729

60000/200000:
batch loss: 2.131227

70000/200000:
batch loss: 1.801952

80000/200000:
batch loss: 2.012656

90000/200000:
batch loss: 2.009828

100000/200000:
batch loss: 2.010406

110000/200000:
batch loss: 1.979393

120000/200000:
batch loss: 1.823938

130000/200000:
batch loss: 2.179720

140000/200000:
batch loss: 1.816852

150000/200000:
batch loss: 2.263287

160000/200000:
batch loss: 2.135348

170000/200000:
batch loss: 2.111442

180000/200000:
batch loss: 2.071927

190000/200000:
batch loss: 2.075921



In [297]:
@torch.no_grad()
def split_loss(split):
    x,y = {
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte)
    }[split]
    emb = C[x] # (32, 3, 2)
    embcat = emb.view(emb.shape[0], -1)
    hpreact = embcat @ W1 + b1
    hpreact = bngain * (hpreact - bnmean_running) / bnstd_running + bnbias
    h = torch.tanh(hpreact) # (32, 100)
    logits = h @ W2 + b2 # (32, 27)
    loss = F.cross_entropy(logits, y)
    print(f'{split}: {loss.item()}')

# split_loss('train')
# split_loss('val')
# split_loss('test')

In [307]:
split_loss('train')
split_loss('val')

train: 2.0428762435913086
val: 2.1018285751342773


#### Let's sample from the model!

In [None]:
emb = C[x] # (32, 3, 2)
embcat = emb.view(emb.shape[0], -1)
hpreact = embcat @ W1 + b1
hpreact = bngain * (hpreact - hpreact.mean(0, keepdim=True)) / (hpreact.std(0, keepdim=True)) + bnbias
h = torch.tanh(hpreact) # (32, 100)

In [308]:
g = torch.Generator().manual_seed(2147483647 + 2023) # for reproducibility
for _ in range(20):
    out = []
    context = [0] * block_size # initialize with all '...'
    while True:
        emb = C[torch.tensor([context])] # (1, block_size) tensor with current context
        embcat = emb.view(emb.shape[0], -1)
        hpreact = embcat @ W1 + b1
        hpreact = bngain * (hpreact - bnmean_running) / bnstd_running + bnbias
        h = torch.tanh(hpreact) # pass current context into model
        logits = h @ W2 + b2 # retrieve output logits from hidden layer
        probs = F.softmax(logits, dim=1) # converts logits to probabilities
        # multinomial samples from our probability distribution
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        context = context[1:] + [ix] # reset our context tensor to 3 chars
        if ix == 0:
            break # break loop if we've selected a '.' character
        out.append(ix) # append selected character to output word
    print(''.join(itos[i] for i in out)) # print output word

alina
harley
aolus
padis
siden
klohanise
jere
jayna
aley
hanti
phoy
kinzlynn
sara
yask
bra
massa
santae
wrentlee
aune
katana


#### Next step is to modify hyperparameters in order to improve on the log loss!