In [1]:
import math
import torch
import torch.nn.functional as F
from torch.utils.data import random_split

In [2]:
# Dataset
words = open('names.txt', 'r').read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
g = torch.Generator().manual_seed(2147483647)

In [4]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)} # Characters to indexes
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()} # Indexes to characters
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [5]:
#################################################################################### Bigram ####################################################################################

# Split data set
train_data, dev_data, test_data = random_split(words, [0.8, 0.1, 0.1], generator=g)
X_train, y_train = [], []
def split_data(data):
    X, y = [], []
    for w in data:
        chs = ['.'] + list(w) + ['.']
        for ch1, ch2 in zip(chs, chs[1:]):
            X.append(stoi[ch1])
            y.append(stoi[ch2])
    X = torch.tensor(X)
    y = torch.tensor(y)
    return [X, y]

# Create train, dev and test sets
X_train, y_train = split_data(train_data)
X_dev, y_dev = split_data(dev_data)
X_test, y_test = split_data(test_data)

In [6]:
W = torch.randn((27, 27), generator=g, requires_grad=True)

In [7]:
n = X_train.nelement()
Xenc = F.one_hot(X_train, num_classes=27).float()
for epoch in range(50):
    # forward pass
    logits = Xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(n), y_train].log().mean()
    
    # backward pass
    W.grad = None
    loss.backward()
    
    # update
    W.data += -50 * W.grad
    if (epoch == 0 or ((epoch+1)%10) == 0):
        print(f'Epoch: {epoch+1} loss: {loss}')


Epoch: 1 loss: 3.6793460845947266
Epoch: 10 loss: 2.685643196105957
Epoch: 20 loss: 2.571347713470459
Epoch: 30 loss: 2.5294389724731445
Epoch: 40 loss: 2.5084228515625
Epoch: 50 loss: 2.496042251586914


In [8]:
n = X_dev.nelement()
# Loss on dev set
Xenc = F.one_hot(X_dev, num_classes=27).float()
logits = Xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
loss = -probs[torch.arange(n), y_dev].log().mean()

In [9]:
loss

tensor(2.4942, grad_fn=<NegBackward0>)

In [10]:
n = X_test.nelement()
# Loss on test set
Xenc = F.one_hot(X_test, num_classes=27).float()
logits = Xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
loss = -probs[torch.arange(n), y_test].log().mean()

In [11]:
loss

tensor(2.4929, grad_fn=<NegBackward0>)

In [12]:
################################################################################### Trigram ###################################################################################


In [13]:
############################################################################# 1x54 vector approach ############################################################################
# Split data set
train_data, dev_data, test_data = random_split(words, [0.8, 0.1, 0.1], generator=g)
X_train, y_train = [], []
def split_data(data):
    X, y = [], []
    for w in data:
        chs = ['.'] + ['.'] + list(w) + ['.']
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            X.append((stoi[ch1], stoi[ch2]))
            y.append(stoi[ch3])
    return [X, y]

# Create train, dev and test sets
X_train, y_train = split_data(train_data)
X_dev, y_dev = split_data(dev_data)
X_test, y_test = split_data(test_data)

In [14]:
W = torch.randn((54, 27), requires_grad=True, generator=g)

In [15]:
def one_hot_encode(X):
    n = len(X)
    Xenc = torch.zeros((n, 54))
    for i in range(n):
        vector = torch.zeros(54,)
        ix1 = X[i][0]
        ix2 = X[i][1]
        vector[ix1] += 1
        vector[27+ix2] += 1
        Xenc[i] = vector
    return Xenc

In [16]:
Xenc = one_hot_encode(X_train).float()

In [17]:
def train_model(W, epochs, lmbda, lr, tune_mode=False):
    n = len(X_train)
    for epoch in range(epochs):
        # Forward pass
        logits = Xenc @ W
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdims=True)
        loss = -probs[torch.arange(n), y_train].log().mean() + lmbda*(W**2).mean()

        # Backward pass
        W.grad = None
        loss.backward()

        # Update
        W.data += -lr * W.grad

        if (tune_mode == False and (epoch == 0 or ((epoch+1)%10) == 0)):
            print(f'Epoch: {epoch+1} train loss: {loss}')
    if tune_mode:
        print(f'Lambda: {lmbda}, train loss: {loss}')
    return W

In [18]:
# Get best l2 regularization rate
for lmbda in [0, 0.0001, 0.001, 0.005, 0.01, 0.1]:
    temp_W = W
    train_model(temp_W, 100, lmbda, 50, True)
    n = len(X_dev)
    xenc = one_hot_encode(X_dev).float()
    # Loss on dev set
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(n), y_dev].log().mean() + lmbda * (W**2).mean()
    print(f'Lambda: {lmbda}, dev loss: {loss}')

Lambda: 0, train loss: 2.445704936981201
Lambda: 0, dev loss: 2.384784698486328
Lambda: 0.0001, train loss: 2.4304146766662598
Lambda: 0.0001, dev loss: 2.370854616165161
Lambda: 0.001, train loss: 2.426582098007202
Lambda: 0.001, dev loss: 2.3677735328674316
Lambda: 0.005, train loss: 2.429805040359497
Lambda: 0.005, dev loss: 2.37105655670166
Lambda: 0.01, train loss: 2.4355039596557617
Lambda: 0.01, dev loss: 2.3764073848724365
Lambda: 0.1, train loss: 2.509036064147949
Lambda: 0.1, dev loss: 2.4410972595214844


In [19]:
W = train_model(W, 100, 0.001, 50)

Epoch: 1 train loss: 2.3771605491638184
Epoch: 10 train loss: 2.3961644172668457
Epoch: 20 train loss: 2.435573101043701
Epoch: 30 train loss: 2.394252300262451
Epoch: 40 train loss: 2.431835651397705
Epoch: 50 train loss: 2.392531394958496
Epoch: 60 train loss: 2.4297971725463867
Epoch: 70 train loss: 2.3912127017974854
Epoch: 80 train loss: 2.428421974182129
Epoch: 90 train loss: 2.3902111053466797
Epoch: 100 train loss: 2.42739200592041


In [20]:
n = len(X_test)
Xenc = one_hot_encode(X_test).float()
# Loss on test set
logits = Xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
loss = -probs[torch.arange(n), y_test].log().mean() + 0.001 * (W**2).mean()

In [21]:
loss

tensor(2.3775, grad_fn=<AddBackward0>)

In [22]:
############################################################################# 1x729 vector approach ###########################################################################

In [23]:
# Split data set
train_data, dev_data, test_data = random_split(words, [0.8, 0.1, 0.1], generator=g)
X_train, y_train = [], []
def split_data(data):
    X, y = [], []
    for w in data:
        chs = ['.'] + ['.'] + list(w) + ['.']
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            X.append(stoi[ch1] + 27 * stoi[ch2])
            y.append(stoi[ch3])
    X = torch.tensor(X)
    y = torch.tensor(y)
    return [X, y]

# Create train, dev and test sets
X_train, y_train = split_data(train_data)
X_dev, y_dev = split_data(dev_data)
X_test, y_test = split_data(test_data)

In [24]:
W = torch.randn((729, 27), requires_grad=True, generator=g)

In [25]:
Xenc = F.one_hot(X_train, num_classes=729).float()

In [None]:
n = X_train.nelement()
for epoch in range(100):
    # Forward pass
    logits = Xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(n), y_train].log().mean()
    
    # Backward pass
    W.grad = None
    loss.backward()
    
    # Update
    W.data += -200 * W.grad
    
    if (epoch == 0 or ((epoch+1)%10) == 0):
        print(f'Epoch: {epoch+1} loss: {loss}')

Epoch: 1 loss: 3.744565725326538
Epoch: 10 loss: 2.801985025405884
Epoch: 20 loss: 2.596341133117676
Epoch: 30 loss: 2.5863234996795654
Epoch: 40 loss: 2.422994375228882
Epoch: 50 loss: 2.4064154624938965
Epoch: 60 loss: 2.3876593112945557
Epoch: 70 loss: 2.4260802268981934
Epoch: 80 loss: 2.3363711833953857


In [None]:
n = X_test.nelement()
Xenc = F.one_hot(X_test, num_classes=729).float()
# Loss on test set
logits = Xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
loss = -probs[torch.arange(n), y_test].log().mean()

In [None]:
loss