In [4]:
import torch
import torch.nn.functional as F
from torch.utils.data import random_split

In [6]:
# Dataset
words = open('names.txt', 'r').read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [10]:
# Generator seed
g = torch.Generator().manual_seed(2147483647)

In [18]:
# Char to idx maps
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)} # Characters to indexes
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()} # Indexes to characters
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [5]:
#################################################################################### Bigram ####################################################################################

# Split data set
train_data, dev_data, test_data = random_split(words, [0.8, 0.1, 0.1], generator=g)

def split_data(data):
    X, y = [], []
    for w in data:
        chs = ['.'] + list(w) + ['.']
        for ch1, ch2 in zip(chs, chs[1:]):
            X.append(stoi[ch1])
            y.append(stoi[ch2])
    X = torch.tensor(X)
    y = torch.tensor(y)
    return [X, y]

# Create train, dev and test sets
X_train, y_train = split_data(train_data)
X_dev, y_dev = split_data(dev_data)
X_test, y_test = split_data(test_data)

In [6]:
# Weights matrix
W = torch.randn((27, 27), generator=g, requires_grad=True)

In [7]:
n = X_train.nelement()

# One hot encoding
Xenc = F.one_hot(X_train, num_classes=27).float()

# Gradient descent
for epoch in range(50):
    # Forward pass
    logits = Xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(n), y_train].log().mean()
    
    # Backward pass
    W.grad = None
    loss.backward()
    
    # Update
    W.data += -50 * W.grad
    if (epoch == 0 or ((epoch+1)%10) == 0):
        print(f'Epoch: {epoch+1} loss: {loss}')


Epoch: 1 loss: 3.6793460845947266
Epoch: 10 loss: 2.685643196105957
Epoch: 20 loss: 2.571347713470459
Epoch: 30 loss: 2.5294389724731445
Epoch: 40 loss: 2.5084228515625
Epoch: 50 loss: 2.496042251586914


In [8]:
n = X_dev.nelement()

# Loss on the dev set
Xenc = F.one_hot(X_dev, num_classes=27).float()
logits = Xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
loss = -probs[torch.arange(n), y_dev].log().mean()

In [9]:
loss

tensor(2.4942, grad_fn=<NegBackward0>)

In [10]:
n = X_test.nelement()

# Loss on the test set
Xenc = F.one_hot(X_test, num_classes=27).float()
logits = Xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
loss = -probs[torch.arange(n), y_test].log().mean()

In [11]:
loss

tensor(2.4929, grad_fn=<NegBackward0>)

In [12]:
# Sample names
for _ in range(5):
    out = []
    ix = 0
    while True:
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
        logits = xenc @ W # predict log-counts
        counts = logits.exp() # counts, equivalent to N
        p = counts / counts.sum(1, keepdims=True) # probabilities for next character
    
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0:
            break
    print(''.join(out))

s.
ma.
aytin.
mariahieslllyaadiannngu.
asonn.


In [13]:
################################################################################### Trigram ###################################################################################


In [14]:
############################################################################# 1x54 vector approach ############################################################################
# Split data set
train_data, dev_data, test_data = random_split(words, [0.8, 0.1, 0.1], generator=g)

def split_data(data):
    X, y = [], []
    for w in data:
        chs = ['.'] + ['.'] + list(w) + ['.']
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            X.append((stoi[ch1], stoi[ch2]))
            y.append(stoi[ch3])
    return [X, y]

# Create train, dev and test sets
X_train, y_train = split_data(train_data)
X_dev, y_dev = split_data(dev_data)
X_test, y_test = split_data(test_data)

In [15]:
# Weights matrix
W = torch.randn((54, 27), requires_grad=True, generator=g)

In [16]:
def one_hot_encode(X):
    n = len(X)
    Xenc = torch.zeros((n, 54))
    for i in range(n):
        vector = torch.zeros(54,)
        ix1 = X[i][0]
        ix2 = X[i][1]
        vector[ix1] += 1
        vector[27+ix2] += 1
        Xenc[i] = vector
    return Xenc

In [17]:
Xenc = one_hot_encode(X_train).float()

In [18]:
def train_model(W, epochs, lmbda, lr, tune_mode=False):
    n = len(X_train)
    
    # Gradient descent
    for epoch in range(epochs):
        
        # Forward pass
        logits = Xenc @ W
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdims=True)
        loss = -probs[torch.arange(n), y_train].log().mean() + lmbda*(W**2).mean()

        # Backward pass
        W.grad = None
        loss.backward()

        # Update
        W.data += -lr * W.grad

        if (tune_mode == False and (epoch == 0 or ((epoch+1)%10) == 0)):
            print(f'Epoch: {epoch+1} train loss: {loss}')
    if tune_mode:
        print(f'Lambda: {lmbda}, train loss: {loss}')
    return W

In [19]:
# Grid search for the best l2 regularization rate
for lmbda in [0, 0.0001, 0.001, 0.005, 0.01, 0.1]:
    temp_W = W
    train_model(temp_W, 100, lmbda, 50, True)
    n = len(X_dev)
    xenc = one_hot_encode(X_dev).float()
    
    # Loss on the dev set
    logits = xenc @ temp_W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(n), y_dev].log().mean() + lmbda * (W**2).mean()
    print(f'Lambda: {lmbda}, dev loss: {loss}')

Lambda: 0, train loss: 2.3869049549102783
Lambda: 0, dev loss: 2.4121344089508057
Lambda: 0.0001, train loss: 2.371954917907715
Lambda: 0.0001, dev loss: 2.3981902599334717
Lambda: 0.001, train loss: 2.3683440685272217
Lambda: 0.001, dev loss: 2.3952150344848633
Lambda: 0.005, train loss: 2.371345281600952
Lambda: 0.005, dev loss: 2.3983840942382812
Lambda: 0.01, train loss: 2.3766140937805176
Lambda: 0.01, dev loss: 2.4035463333129883
Lambda: 0.1, train loss: 2.44236159324646
Lambda: 0.1, dev loss: 2.465195417404175


In [20]:
W = train_model(W, 100, 0.001, 25)

Epoch: 1 train loss: 2.4019176959991455
Epoch: 10 train loss: 2.355705976486206
Epoch: 20 train loss: 2.3540329933166504
Epoch: 30 train loss: 2.3527488708496094
Epoch: 40 train loss: 2.351701021194458
Epoch: 50 train loss: 2.3508193492889404
Epoch: 60 train loss: 2.3500618934631348
Epoch: 70 train loss: 2.349400043487549
Epoch: 80 train loss: 2.3488149642944336
Epoch: 90 train loss: 2.348292589187622
Epoch: 100 train loss: 2.3478219509124756


In [21]:
n = len(X_test)
Xenc = one_hot_encode(X_test).float()

# Loss on the test set
logits = Xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
loss = -probs[torch.arange(n), y_test].log().mean() + 0.001 * (W**2).mean()

In [22]:
loss

tensor(2.3564, grad_fn=<AddBackward0>)

In [23]:
# Sample names
for _ in range(10):
    out = []
    # Start with ..
    ix1 = 0
    ix2 = 0
    while True:
        xenc = one_hot_encode([(ix1, ix2)])
        logits = xenc @ W
        counts = logits.exp()
        p = counts / counts.sum(1, keepdims=True)
        
        ix1 = ix2
        ix2 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix2])
        if ix2 == 0:
            break
    print(''.join(out))

caishilezzvydes.
maloh.
haleo.
aymi.
ppusie.
joh.
cor.
shtrama.
rming.
leynn.


In [24]:
############################################################################# 1x729 vector approach ###########################################################################

In [25]:
# Split data set
train_data, dev_data, test_data = random_split(words, [0.8, 0.1, 0.1], generator=g)
def split_data(data):
    X, y = [], []
    for w in data:
        chs = ['.'] + ['.'] + list(w) + ['.']
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            X.append(stoi[ch1] + 27 * stoi[ch2])
            y.append(stoi[ch3])
    X = torch.tensor(X)
    y = torch.tensor(y)
    return [X, y]

# Create train, dev and test sets
X_train, y_train = split_data(train_data)
X_dev, y_dev = split_data(dev_data)
X_test, y_test = split_data(test_data)

In [26]:
# Weights matrix
W = torch.randn((729, 27), requires_grad=True, generator=g)

In [27]:
# Xenc = F.one_hot(X_train, num_classes=729).float()

In [28]:
# Gradient descent
def train_model(W, epochs, lmbda, lr, dev_mode=False):
    n = X_train.nelement()
    yenc = F.one_hot(y_train, num_classes=27).float()
    for epoch in range(epochs):
        # Forward pass
        logits = W[X_train, :] # one hot encoding
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdims=True)
        # loss = -probs[torch.arange(n), y_train].log().mean() + lmbda * (W**2).mean()  # <- nll
        loss = F.cross_entropy(logits, y_train) + lmbda * (W**2).mean() # <- cross_entropy

        # Backward pass
        W.grad = None
        loss.backward()

        # Update
        W.data += -lr * W.grad

        if (not dev_mode and (epoch == 0 or ((epoch+1)%10) == 0)):
            print(f'Epoch: {epoch+1} loss: {loss}')
    if dev_mode:
        print(f'Lambda: {lmbda}, train loss: {loss}')
    return W

In [29]:
for lmbda in [0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1]:
    temp_W = W
    temp_W = train_model(temp_W, 300, lmbda, 150, dev_mode = True)
    n = len(X_dev)
    
    # Loss on the dev set
    logits = temp_W[X_dev, :] # one hot encoding
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    # loss = -probs[torch.arange(n), y_dev].log().mean() + lmbda * (temp_W**2).mean() <- nll
    loss = F.cross_entropy(logits, y_dev) + lmbda * (W**2).mean() # <- cross entropy
    
    print(f'Lambda: {lmbda}, dev loss: {loss}')

Lambda: 0, train loss: 2.2659521102905273
Lambda: 0, dev loss: 2.245718479156494
Lambda: 0.0001, train loss: 2.2400686740875244
Lambda: 0.0001, dev loss: 2.224735736846924
Lambda: 0.001, train loss: 2.23232102394104
Lambda: 0.001, dev loss: 2.2195990085601807
Lambda: 0.01, train loss: 2.2442729473114014
Lambda: 0.01, dev loss: 2.2327253818511963
Lambda: 0.1, train loss: 2.3420538902282715
Lambda: 0.1, dev loss: 2.326735734939575
Lambda: 0.5, train loss: 2.4763662815093994
Lambda: 0.5, dev loss: 2.4524288177490234
Lambda: 1, train loss: 2.57646107673645
Lambda: 1, dev loss: 2.5503554344177246


In [30]:
W = train_model(W, 300, 0.001, 150)

Epoch: 1 loss: 2.3945083618164062
Epoch: 10 loss: 2.366429090499878
Epoch: 20 loss: 2.3605427742004395
Epoch: 30 loss: 2.327146053314209
Epoch: 40 loss: 2.331599235534668
Epoch: 50 loss: 2.304323673248291
Epoch: 60 loss: 2.31314754486084
Epoch: 70 loss: 2.289116621017456
Epoch: 80 loss: 2.300274610519409
Epoch: 90 loss: 2.277801036834717
Epoch: 100 loss: 2.2904279232025146
Epoch: 110 loss: 2.269357919692993
Epoch: 120 loss: 2.2828946113586426
Epoch: 130 loss: 2.262455701828003
Epoch: 140 loss: 2.2766668796539307
Epoch: 150 loss: 2.2568814754486084
Epoch: 160 loss: 2.2716007232666016
Epoch: 170 loss: 2.2522597312927246
Epoch: 180 loss: 2.2672817707061768
Epoch: 190 loss: 2.248300075531006
Epoch: 200 loss: 2.2636213302612305
Epoch: 210 loss: 2.2448434829711914
Epoch: 220 loss: 2.2605016231536865
Epoch: 230 loss: 2.2418408393859863
Epoch: 240 loss: 2.2577130794525146
Epoch: 250 loss: 2.2391867637634277
Epoch: 260 loss: 2.255222797393799
Epoch: 270 loss: 2.2369132041931152
Epoch: 280 loss:

In [31]:
n = X_test.nelement()

# Loss on test set
logits = W[X_test, :]
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
loss = -probs[torch.arange(n), y_test].log().mean() + 0.001 * (W**2).mean() # nll
loss = F.cross_entropy(logits, y_test) + 0.001 * (W**2).mean() # cross entropy

In [32]:
loss

tensor(2.2596, grad_fn=<AddBackward0>)

In [33]:
# Sample names
for _ in range(10):
    out = []
    # Start with ..
    ix1 = 0
    ix2 = 0
    while True:
        ix = ix1 + 27 * ix2
        logits = W[ix, :]
        counts = logits.exp()
        p = counts / counts.sum()
        
        ix1 = ix2
        ix2 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix2])
        if ix2 == 0:
            break
    print(''.join(out))

roshranadyssia.
lisenn.
le.
elynn.
ii.
elleknisseaunuris.
kri.
shmarisarilaliddhagglxfzhhvi.
ton.
dsayli.


In [34]:
## Counting approach ##############################################################################

In [12]:
# Split data set
train_data, dev_data, test_data = random_split(words, [0.8, 0.1, 0.1], generator=g)

In [14]:
# Trigram count matrix
N = torch.zeros((27, 27, 27), dtype=torch.int32)

In [20]:
# Filling in N matrix
for w in train_data:
    chs = ['.'] + ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        N[ix1, ix2, ix3] += 1

In [38]:
# Negative log likelihood
def get_nll(NP, data):
    log_likelihood = 0.0
    n = 0

    for w in data:
        chs = ['.'] + ['.'] + list(w) + ['.']
        for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
            ix1 = stoi[ch1]
            ix2 = stoi[ch2]
            ix3 = stoi[ch3]
            prob = NP[ix1, ix2, ix3]
            logprob = torch.log(prob)
            log_likelihood += logprob # add log prob of next character
            n += 1
    nll = -log_likelihood
    return nll/n

In [39]:
# Grid search for the best smoothing rate
for sr in [0, 1, 2, 3, 5, 10, 15, 20, 50]:
    NP = (N+sr).float()
    NP = NP / NP.sum(2, keepdims=True)
    nll_train = get_nll(NP, train_data)
    nll_dev = get_nll(NP, dev_data)
    print(f'Smoothing rate: {sr}, train loss: {nll_train}, dev loss: {nll_dev}')

Smoothing rate: 0, train loss: 2.1816229820251465, dev loss: nan
Smoothing rate: 1, train loss: 2.2142655849456787, dev loss: 2.2413170337677
Smoothing rate: 2, train loss: 2.2375996112823486, dev loss: 2.260141611099243
Smoothing rate: 3, train loss: 2.257136821746826, dev loss: 2.2772293090820312
Smoothing rate: 5, train loss: 2.2905335426330566, dev loss: 2.3071930408477783
Smoothing rate: 10, train loss: 2.354750633239746, dev loss: 2.367098569869995
Smoothing rate: 15, train loss: 2.4044113159179688, dev loss: 2.4140870571136475
Smoothing rate: 20, train loss: 2.445242404937744, dev loss: 2.4532060623168945
Smoothing rate: 50, train loss: 2.604304313659668, dev loss: 2.6079306602478027


In [40]:
NP = (N+1).float()
NP = NP / NP.sum(2, keepdims=True)

In [41]:
# Test set loss
nll_test = get_nll(NP, test_data)
print(f'test loss: {nll_test}')

test loss: 2.2442703247070312


In [42]:
# Sample names
for _ in range(5):
    out = []
    ix1 = 0
    ix2 = 0
    while True:
        p = NP[ix1][ix2]
        ix1 = ix2
        ix2 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item() # Sample next character from NP
        out.append(itos[ix2])
        if ix2 == 0:
            break
    print(''.join(out))

base.
azia.
eli.
marikhrittene.
samanephiytren.
