# Introduction to Embeddings

Let's generate some names, predicting each character from the characters that precede it.

In [None]:
import torch
import matplotlib.pyplot as plt

# Load the name list, one name per line. The context manager closes the file
# handle as soon as its contents have been read.
with open('data/names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

# Vocabulary: '.' is placed first (index 0), followed by every distinct
# character found in the names, in sorted order.
all_chars = ['.'] + sorted(set("".join(words)))
itos = {idx: v for idx, v in enumerate(all_chars)}  # index -> character
stoi = {v: k for k, v in itos.items()}  # character -> index

NUM_CHARS = len(all_chars)  # vocabulary size, including the '.' entry

In [None]:
# Peek at the first five names that were read from the file
words[:5]

Let's use the three previous characters to generate the next one.

In [None]:
# Context length: how many preceding characters are used to predict the next one
block_size = 3

def build_dataset(words):
    """Turn a list of names into (context, next-character) training pairs.

    Each name is terminated with '.' and scanned left to right. For every
    character, the ``block_size`` character indices that precede it form the
    input row (index 0 fills the positions before the start of the name) and
    the character's own index is the target.

    Args:
        words: iterable of strings whose characters are all keys of ``stoi``.

    Returns:
        A pair ``(X, Y)`` of tensors built with ``torch.tensor``: ``X`` holds
        one context row per example and ``Y`` the matching target index.
    """
    contexts = []
    targets = []

    for word in words:
        window = [0] * block_size  # left padding before the first character
        for char in word + '.':
            target = stoi[char]
            contexts.append(window)
            targets.append(target)
            # Slide the window: drop the oldest index, append the newest.
            window = [*window[1:], target]

    return torch.tensor(contexts), torch.tensor(targets)

import random

# Shuffle the names reproducibly (fixed seed), then use the first 70% for
# training and the remaining 30% for testing.
random.seed(314)
random.shuffle(words)
n1 = int(0.7 * len(words))
train_words, test_words = words[:n1], words[n1:]
Xtr, Ytr = build_dataset(train_words)
Xte, Yte = build_dataset(test_words)

# Report the input/target shapes of each split.
for inputs, targets in ((Xtr, Ytr), (Xte, Yte)):
    print(inputs.shape, targets.shape)

In [None]:
# First 5 training instances: each row of Xtr is a context of character indices,
# and the matching entry of Ytr is the index of the character that follows it
Xtr[:5], Ytr[:5]

In [None]:
# Let's perform a one-hot encoding of each character
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# F.one_hot returns integer tensors; cast to float so the result can be fed
# to nn.Linear layers later on.
first_five = Xtr[:5]
encoded = F.one_hot(first_five, num_classes=NUM_CHARS).float()
first_five.shape, encoded.shape

In [None]:
# One-hot vector of the first context position in example 0 and in example 3
encoded[0,0], encoded[3,0]

In [None]:
# Now, in order to feed a linear layer, I need to put all one-hot encoded vectors together:
# flatten each of the 5 examples' block of one-hot vectors into a single row
encoded.view(5, -1).shape

In [None]:
# Define the model for training
class Model1(nn.Module):
    """Two-layer MLP that predicts the next character from one-hot contexts.

    The ``block_size`` context indices are one-hot encoded, concatenated into
    one row per example, passed through a 20-unit tanh hidden layer and
    projected to ``NUM_CHARS`` output scores.
    """

    def __init__(self):
        super().__init__()
        # Input width is one NUM_CHARS-wide one-hot vector per context
        # position. It is derived rather than hard-coded so the layer stays
        # in sync with block_size and the vocabulary size.
        self.layer1 = nn.Linear(block_size * NUM_CHARS, 20)
        self.layer2 = nn.Linear(20, NUM_CHARS)
        # Not applied in forward(); available to convert logits to
        # probabilities explicitly.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, xs):
        """Return unnormalised next-character scores (logits).

        Args:
            xs: integer tensor of character indices, one row of context
                indices per example.

        Returns:
            Tensor with one row of NUM_CHARS logits per example.
        """
        x = F.one_hot(xs, num_classes=NUM_CHARS).to(torch.float)
        # Concatenate the one-hot vectors of each example into a single row.
        x = x.view(xs.shape[0], -1)
        x = self.layer1(x).tanh()
        x = self.layer2(x)
        # x = self.softmax(x) ... CrossEntropyLoss already contains softmax
        return x


model1 = Model1()
# Total number of trainable scalars in the model
sum(p.nelement() for p in model1.parameters())

In [None]:
# Xtr already has one row per example, so flattening everything after the
# first dimension leaves its shape unchanged
Xtr.shape, Xtr.view(Xtr.shape[0], -1).shape

In [None]:
# Scores for the next character for the first 5 examples. These are raw logits,
# not probabilities: Model1.forward does not apply softmax.
predicted = model1(Xtr[:5])
predicted.shape, predicted

In [None]:
# Cross-entropy between the logits in `predicted` and the true next characters
# of the same five examples (nn.CrossEntropyLoss takes raw logits as input)
loss_fn = nn.CrossEntropyLoss()
Ytr[:5], loss_fn(predicted, Ytr[:5])

In [None]:
# Random cross entropy loss: the loss of a predictor that assigns the same
# probability 1/NUM_CHARS to every character, a baseline to compare against
import numpy as np
-np.log(1/NUM_CHARS)

In [None]:
# Prefer the first CUDA GPU when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:

# Move the data and the model to the target device first, and only then build
# the optimizer over the model's parameters (the order recommended by the
# torch.optim documentation).
Xtr_dev = Xtr.to(device)
Ytr_dev = Ytr.to(device)
model1_dev = model1.to(device)

optimizer = optim.Adam(model1_dev.parameters(), lr=0.01)

# Training loop (full batch: every step uses the whole training set)
num_epochs = 10000
# Log roughly ten times; max() avoids a modulo-by-zero when num_epochs < 10.
log_every = max(1, num_epochs // 10)
# The mode does not change inside the loop, so it is set once up front.
model1_dev.train()
for epoch in range(num_epochs):
    outputs = model1_dev(Xtr_dev)
    loss = loss_fn(outputs, Ytr_dev)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % log_every == 0:
        print(epoch, loss.item())

print("Training complete")

In [None]:
# Evaluation loop
# Switch to evaluation mode and score the held-out names. The same handle
# (model1_dev) is used for the mode switch and for the forward pass.
model1_dev.eval()
with torch.no_grad():  # no gradients are needed for evaluation
    output_test = model1_dev(Xte.to(device))
    loss = loss_fn(output_test, Yte.to(device))

print(f"Testing loss: {loss.item()}")

## Using embeddings

An embedding is a way to map a one-hot encoding into a smaller space, where the transformation is learned together with the rest of the model during training.

It is like PCA, but:
- Takes into account the expected output
- Learns while training

In [None]:
# Recall the first five integer-encoded contexts and the shape of their one-hot encoding
Xtr[:5], encoded.shape

In [None]:
EMBED_SIZE = 10  # dimensionality of each character's embedding vector
# A bias-free linear layer applied to a one-hot vector selects one column of
# its weight matrix, so it behaves like a learnable lookup table.
embedding = nn.Linear(in_features=NUM_CHARS, out_features=EMBED_SIZE, bias=False)

Now, we are going to apply the embedding individually to each encoded character, using the same embedding for each one

In [None]:
# The linear layer acts on the last dimension only: every NUM_CHARS-wide one-hot
# vector is mapped to an EMBED_SIZE-dimensional embedding, position by position
embedding(encoded).shape

The other components of the model remain similar.

In [None]:
class Model2(nn.Module):
    """MLP that embeds each context character before the hidden layer.

    Every one-hot encoded context character goes through the same bias-free
    linear layer (the embedding). The resulting ``EMBED_SIZE``-dimensional
    vectors are concatenated per example, passed through a 32-unit tanh
    hidden layer and projected to ``NUM_CHARS`` output scores.
    """

    def __init__(self):
        super().__init__()
        self.embedding = nn.Linear(NUM_CHARS, EMBED_SIZE, bias=False)
        # One embedding vector per context position; block_size is used
        # instead of a literal 3 so the layer follows the dataset's context
        # length.
        self.layer1 = nn.Linear(block_size * EMBED_SIZE, 32)
        self.layer2 = nn.Linear(32, NUM_CHARS)
        # Not applied in forward(); available to convert logits to
        # probabilities explicitly.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, xs):
        """Return unnormalised next-character scores (logits).

        Args:
            xs: integer tensor of character indices, one row of context
                indices per example.

        Returns:
            Tensor with one row of NUM_CHARS logits per example.
        """
        x = F.one_hot(xs, num_classes=NUM_CHARS).to(torch.float)
        x = self.embedding(x)  # same embedding applied to every position
        # Concatenate the embedding vectors of each example into a single row.
        x = x.view(xs.shape[0], -1)
        x = self.layer1(x).tanh()
        x = self.layer2(x)
        # x = self.softmax(x) ... CrossEntropyLoss already contains softmax
        return x


model2 = Model2()
# Total number of trainable scalars in the model
sum(p.nelement() for p in model2.parameters())

In [None]:
# Forward pass of model2 on the first five training contexts; the outputs are
# raw logits because Model2.forward does not apply softmax
predicted = model2(Xtr[:5])
predicted.shape, predicted

In [None]:
# Move the data and the model to the target device first, and only then build
# the optimizer over the model's parameters (the order recommended by the
# torch.optim documentation).
Xtr_dev = Xtr.to(device)
Ytr_dev = Ytr.to(device)
model2_dev = model2.to(device)

optimizer = optim.Adam(model2_dev.parameters(), lr=0.01)

# Training loop (full batch: every step uses the whole training set)
num_epochs = 10000
# Log roughly ten times; max() avoids a modulo-by-zero when num_epochs < 10.
log_every = max(1, num_epochs // 10)
# Put the model being trained here (model2, not model1) into training mode.
# The mode does not change inside the loop, so it is set once up front.
model2_dev.train()
for epoch in range(num_epochs):
    outputs = model2_dev(Xtr_dev)
    loss = loss_fn(outputs, Ytr_dev)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % log_every == 0:
        print(epoch, loss.item())

print("Training complete")

The system is now more accurate, and you can keep playing with parameters.

If we create the embedding in 2D, we can show it in a figure.

In [None]:
# Two embedding dimensions so that every character can be drawn as a point
# on a plane.
# NOTE: this rebinds the global EMBED_SIZE, which Model2 also reads whenever
# it is constructed.
EMBED_SIZE = 2


class Model3(nn.Module):
    """MLP with a 2-D character embedding and a 50-unit hidden layer.

    Every one-hot encoded context character goes through the same bias-free
    linear layer (the embedding). The resulting ``EMBED_SIZE``-dimensional
    vectors are concatenated per example, passed through a tanh hidden layer
    and projected to ``NUM_CHARS`` output scores.
    """

    def __init__(self):
        super().__init__()
        self.embedding = nn.Linear(NUM_CHARS, EMBED_SIZE, bias=False)
        # One embedding vector per context position; block_size is used
        # instead of a literal 3 so the layer follows the dataset's context
        # length.
        self.layer1 = nn.Linear(block_size * EMBED_SIZE, 50)
        self.layer2 = nn.Linear(50, NUM_CHARS)
        # Not applied in forward(); available to convert logits to
        # probabilities explicitly.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, xs):
        """Return unnormalised next-character scores (logits).

        Args:
            xs: integer tensor of character indices, one row of context
                indices per example.

        Returns:
            Tensor with one row of NUM_CHARS logits per example.
        """
        x = F.one_hot(xs, num_classes=NUM_CHARS).to(torch.float)
        x = self.embedding(x)  # same embedding applied to every position
        # Concatenate the embedding vectors of each example into a single row.
        x = x.view(xs.shape[0], -1)
        x = self.layer1(x).tanh()
        x = self.layer2(x)
        # x = self.softmax(x) ... CrossEntropyLoss already contains softmax
        return x


model3 = Model3()
# Total number of trainable scalars in the model
sum(p.nelement() for p in model3.parameters())

In [None]:
# Move the data and the model to the target device first, and only then build
# the optimizer over the model's parameters (the order recommended by the
# torch.optim documentation).
Xtr_dev = Xtr.to(device)
Ytr_dev = Ytr.to(device)
model3_dev = model3.to(device)

optimizer = optim.Adam(model3_dev.parameters(), lr=0.01)

# Training loop (full batch: every step uses the whole training set)
num_epochs = 10000
# Log roughly ten times; max() avoids a modulo-by-zero when num_epochs < 10.
log_every = max(1, num_epochs // 10)
# Put the model being trained here (model3, not model1) into training mode.
# The mode does not change inside the loop, so it is set once up front.
model3_dev.train()
for epoch in range(num_epochs):
    outputs = model3_dev(Xtr_dev)
    loss = loss_fn(outputs, Ytr_dev)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % log_every == 0:
        print(epoch, loss.item())

print("Training complete")

In [None]:
# Learned embedding weights. nn.Linear stores its weight as
# (out_features, in_features), so there is one column per character.
model3.embedding.weight.shape, model3.embedding.weight

In [None]:
# The identity matrix stacks the one-hot vector of every character, so this
# product is the embedding of each character, one column per character.
one_hot_basis = torch.eye(NUM_CHARS)
with torch.no_grad():
    coords = torch.matmul(model3.embedding.weight.cpu(), one_hot_basis)
coords.shape, coords

In [None]:
# Scatter the 2-D embedding: row 0 of coords gives the horizontal coordinate
# and row 1 the vertical one; each point is labelled with its character.
x, y = coords[0].numpy(), coords[1].numpy()
plt.scatter(x, y)
for idx, char in itos.items():
    plt.annotate(
        char,
        (x[idx], y[idx]),
        textcoords="offset points",
        xytext=(5, 5),
        ha='center',
    )
plt.show()