In [1]:
import math
import torch
import torch.nn.functional as F
from torch.utils.data import random_split

In [2]:
# Dataset
words = open('names.txt', 'r').read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
g = torch.Generator().manual_seed(2147483647)

In [4]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)} # Characters to indexes
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()} # Indexes to characters
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [5]:
#################################################################################### Bigram ####################################################################################

# Split data set
train_data, dev_data, test_data = random_split(words, [0.8, 0.1, 0.1], generator=g)
X_train, y_train = [], []
def split_data(data):
    X, y = [], []
    for w in data:
        chs = ['.'] + list(w) + ['.']
        for ch1, ch2 in zip(chs, chs[1:]):
            X.append(stoi[ch1])
            y.append(stoi[ch2])
    X = torch.tensor(X)
    y = torch.tensor(y)
    return [X, y]

# Create train, dev and test sets
X_train, y_train = split_data(train_data)
X_dev, y_dev = split_data(dev_data)
X_test, y_test = split_data(test_data)

In [6]:
W = torch.randn((27, 27), generator=g, requires_grad=True)

In [7]:
n = X_train.nelement()
Xenc = F.one_hot(X_train, num_classes=27).float()
for epoch in range(50):
    # forward pass
    logits = Xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(n), y_train].log().mean()
    
    # backward pass
    W.grad = None
    loss.backward()
    
    # update
    W.data += -50 * W.grad
    if (epoch == 0 or ((epoch+1)%10) == 0):
        print(f'Epoch: {epoch+1} loss: {loss}')


Epoch: 1 loss: 3.6793460845947266
Epoch: 10 loss: 2.685643196105957
Epoch: 20 loss: 2.571347713470459
Epoch: 30 loss: 2.5294389724731445
Epoch: 40 loss: 2.5084228515625
Epoch: 50 loss: 2.496042251586914


In [8]:
n = X_dev.nelement()
# Loss on dev set
Xenc = F.one_hot(X_dev, num_classes=27).float()
logits = Xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
loss = -probs[torch.arange(n), y_dev].log().mean()

In [9]:
loss

tensor(2.4942, grad_fn=<NegBackward0>)

In [10]:
n = X_test.nelement()
# Loss on test set
Xenc = F.one_hot(X_test, num_classes=27).float()
logits = Xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
loss = -probs[torch.arange(n), y_test].log().mean()

In [11]:
loss

tensor(2.4929, grad_fn=<NegBackward0>)

In [12]:
################################################################################### Trigram ###################################################################################

In [13]:
# Split data set
train_data, dev_data, test_data = random_split(words, [0.8, 0.1, 0.1], generator=g)
X_train, y_train = [], []
def split_data(data):
    X, y = [], []
    for w in data:
        chs = ['.'] + ['.'] + list(w) + ['.']
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            X.append(stoi[ch1] + 27 * stoi[ch2])
            y.append(stoi[ch3])
    X = torch.tensor(X)
    y = torch.tensor(y)
    return [X, y]

# Create train, dev and test sets
X_train, y_train = split_data(train_data)
X_dev, y_dev = split_data(dev_data)
X_test, y_test = split_data(test_data)

In [14]:
W = torch.randn((729, 27), requires_grad=True, generator=g)

In [15]:
Xenc = F.one_hot(X_train, num_classes=729).float()

In [18]:
n = X_train.nelement()
for epoch in range(100):
    # Forward pass
    logits = Xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(n), y_train].log().mean()
    
    # Backward pass
    W.grad = None
    loss.backward()
    
    # Update
    W.data += -200 * W.grad
    
    if (epoch == 0 or ((epoch+1)%10) == 0):
        print(f'Epoch: {epoch+1} loss: {loss}')

Epoch: 1 loss: 3.797792434692383
Epoch: 10 loss: 2.7900638580322266
Epoch: 20 loss: 2.6386373043060303
Epoch: 30 loss: 2.501171350479126
Epoch: 40 loss: 2.4161949157714844
Epoch: 50 loss: 2.3904247283935547
Epoch: 60 loss: 2.3634285926818848
Epoch: 70 loss: 2.3345844745635986
Epoch: 80 loss: 2.4291305541992188
Epoch: 90 loss: 2.347219944000244
Epoch: 100 loss: 2.3716819286346436


In [20]:
n = X_dev.nelement()
Xenc = F.one_hot(X_dev, num_classes=729).float()
# Loss on dev set
logits = Xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
loss = -probs[torch.arange(n), y_dev].log().mean()

In [21]:
loss

tensor(2.3251, grad_fn=<NegBackward0>)

In [24]:
n = X_test.nelement()
Xenc = F.one_hot(X_test, num_classes=729).float()
# Loss on test set
logits = Xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
loss = -probs[torch.arange(n), y_test].log().mean()

In [25]:
loss

tensor(2.3340, grad_fn=<NegBackward0>)