In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Table of Contents
- **[Data](#1)**
- **[Creating trainset](#2)**
- **[Creating NN architecture](#3)**
    - [Embedding matrix](#3-1)
    - [Understanding relations between train set and embedding matrix](#3-2)
    - [Evaluation](#3-3)
    - [Improving forward pass and eval code](#3-4)
    - [Backprop](#3-5)
    - [Training in one loop](#3-6)
    - [Model improvement](#3-7)
        - [Train, eval, test split](#3-6-1)
        - [Increasing the model size](#3-6-2)
        - [LR decay](#3-6-3)
- **[Sampling](#4)**

<a name='1'></a>
# Data

In [2]:
# Read every name (one per line); the context manager closes the file
# handle as soon as the contents have been read.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
# Vocabulary: the '.' marker comes first (index 0), followed by every
# distinct character found in the corpus, in sorted order.
chars = ['.'] + sorted(set(''.join(words)))

# character -> integer index
stoi = {ch: idx for idx, ch in enumerate(chars)}

# integer index -> character (inverse of stoi)
itos = {idx: ch for ch, idx in stoi.items()}

In [4]:
itos

{0: '.',
 1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z'}

<a name='2'></a>
# Creating trainset

In [201]:
# build the dataset
BLOCK_SIZE = 3        # context length: how many characters are used to predict the next one
EMBEDDING_SIZE = 10   # width of one character embedding (2 in the earlier experiments)

def build_dataset(words, block_size=BLOCK_SIZE, verbose=False):
    """Turn a list of words into (context, next character) training pairs.

    Every word is terminated with '.', and for each of its characters one
    example is emitted: the `block_size` preceding character indices (padded
    on the left with 0, the index of '.') and the index of the character
    that follows them.

    Args:
        words: iterable of strings whose characters are all keys of `stoi`.
        block_size: number of context characters per example.
        verbose: if True, print every word and every context -> target pair.

    Returns:
        dict with 'X', a long tensor of shape (num_examples, block_size), and
        'Y', a long tensor of shape (num_examples,).
    """
    X, Y = [], []
    for w in words:

        if verbose: print(w)
        context = [0] * block_size  # start from an all-'.' context
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)

            if verbose: print(''.join(itos[i] for i in context), '------>', itos[ix])
            context = context[1:] + [ix] # crop and append

    # Explicit dtype and shape: without them an empty `words` would produce
    # float tensors of shape (0,), which cannot be used to index an embedding.
    X = torch.tensor(X, dtype=torch.long).reshape(len(X), block_size)
    Y = torch.tensor(Y, dtype=torch.long)

    return {'X': X, 'Y': Y}

ds_dict = build_dataset(words=words[:5], verbose=True)
X = ds_dict['X']
Y = ds_dict['Y']

carely
... ------> c
..c ------> a
.ca ------> r
car ------> e
are ------> l
rel ------> y
ely ------> .
jullien
... ------> j
..j ------> u
.ju ------> l
jul ------> l
ull ------> i
lli ------> e
lie ------> n
ien ------> .
sherly
... ------> s
..s ------> h
.sh ------> e
she ------> r
her ------> l
erl ------> y
rly ------> .
areeb
... ------> a
..a ------> r
.ar ------> e
are ------> e
ree ------> b
eeb ------> .
harini
... ------> h
..h ------> a
.ha ------> r
har ------> i
ari ------> n
rin ------> i
ini ------> .


In [6]:
X

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]])

In [7]:
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

<a name='3'></a>
# Creating NN architecture

<a name='3-1'></a>
## Embedding matrix
It follows the input layer

In [8]:
C = torch.randn((len(chars),2))

In [9]:
"""
At the input stage we need to select the row of the embedding lookup matrix that is passed to the next layer. 
For this we can either simply select the needed layer like so C[5] or format the input as one-hot vectors
"""

'\nAt the input stage we need to select the row of the embedding lookup matrix that is passed to the next layer. \nFor this we can either simply select the needed layer like so C[5] or format the input as one-hot vectors\n'

In [10]:
tst_one_hot = F.one_hot(torch.tensor(2), num_classes=len(chars)).float()

In [11]:
tst_one_hot.unsqueeze(dim=0)

tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [12]:
tst_one_hot.unsqueeze(dim=0) @ C

tensor([[0.2994, 1.4347]])

In [13]:
X.shape

torch.Size([32, 3])

In [14]:
C[X].shape

torch.Size([32, 3, 2])

<a name='3-2'></a>
## Understanding relations between train set and embedding matrix

In [15]:
C #all of the embeddings

tensor([[-2.5696,  1.8836],
        [-0.7986,  0.5644],
        [ 0.2994,  1.4347],
        [ 1.0980, -1.1331],
        [-0.5800, -0.0969],
        [ 1.6415,  1.8853],
        [-0.7196,  1.6522],
        [-0.7825, -0.5869],
        [-1.2912,  0.2820],
        [-1.2013,  0.2599],
        [ 2.3951, -0.1519],
        [-0.1747, -0.6222],
        [-0.0168, -0.5608],
        [-1.4189, -1.4393],
        [-1.6985,  0.3681],
        [ 1.8049, -1.0374],
        [-0.3569,  0.6205],
        [-0.0829, -0.3484],
        [ 1.3456, -0.4632],
        [ 0.3323,  1.4900],
        [ 1.0436, -0.2684],
        [ 1.5303, -0.1281],
        [ 1.3819, -0.2589],
        [ 0.1728, -0.2133],
        [ 0.5602,  1.3424],
        [-0.8026,  0.5389],
        [ 1.6965,  1.9275]])

In [16]:
X[27, 2] # Original char -- 15

tensor(15)

In [17]:
itos[15]

'o'

In [18]:
C[X] # All of the same embeddings as in C but rearranged according to X

tensor([[[-2.5696,  1.8836],
         [-2.5696,  1.8836],
         [-2.5696,  1.8836]],

        [[-2.5696,  1.8836],
         [-2.5696,  1.8836],
         [ 1.6415,  1.8853]],

        [[-2.5696,  1.8836],
         [ 1.6415,  1.8853],
         [-1.4189, -1.4393]],

        [[ 1.6415,  1.8853],
         [-1.4189, -1.4393],
         [-1.4189, -1.4393]],

        [[-1.4189, -1.4393],
         [-1.4189, -1.4393],
         [-0.7986,  0.5644]],

        [[-2.5696,  1.8836],
         [-2.5696,  1.8836],
         [-2.5696,  1.8836]],

        [[-2.5696,  1.8836],
         [-2.5696,  1.8836],
         [ 1.8049, -1.0374]],

        [[-2.5696,  1.8836],
         [ 1.8049, -1.0374],
         [-0.0168, -0.5608]],

        [[ 1.8049, -1.0374],
         [-0.0168, -0.5608],
         [-1.2013,  0.2599]],

        [[-0.0168, -0.5608],
         [-1.2013,  0.2599],
         [ 1.3819, -0.2589]],

        [[-1.2013,  0.2599],
         [ 1.3819, -0.2589],
         [-1.2013,  0.2599]],

        [[ 1.3819, -0

To find the embedding of the character stored at `X[27, 2]` (index 15), index `C[X]` with the same "coordinates":
`C[X]` is laid out exactly like `X`, with every character index replaced by its embedding row.

In [19]:
C[X][27, 2]

tensor([ 1.8049, -1.0374])

To compare let's see what the embedding is for char 15 in original C

In [20]:
# embeddings match
C[15]

tensor([ 1.8049, -1.0374])

In [21]:
# embedding layer: look up one row of C for every context character
emb = C[X]

# hidden layer: the embeddings of one context are flattened into 6 inputs
# and fed to 100 tanh units
W1 = torch.randn(6, 100)
b1 = torch.randn(100)

h = (emb.view(-1, 6) @ W1 + b1).tanh()

# output layer: one logit per character of the vocabulary
W2 = torch.randn(100, len(chars))
b2 = torch.randn(len(chars))
logits = h @ W2 + b2

In [22]:
h.shape

torch.Size([32, 100])

In [23]:
W2.shape

torch.Size([100, 27])

In [24]:
logits.shape

torch.Size([32, 27])

In [25]:
counts = logits.exp()

In [26]:
probs = counts / counts.sum(-1, keepdims=True)

In [27]:
probs.shape

torch.Size([32, 27])

In [28]:
probs[0].sum()

tensor(1.)

In [29]:
probs

tensor([[2.0608e-16, 5.4552e-12, 1.4589e-07, 3.7226e-05, 4.8107e-02, 5.8659e-12,
         9.8856e-02, 3.3001e-07, 4.2171e-03, 6.2238e-07, 8.7737e-08, 9.2304e-06,
         9.8766e-07, 5.0686e-09, 2.5488e-18, 1.5484e-08, 2.5769e-05, 8.7753e-03,
         7.7625e-07, 6.5758e-11, 1.1112e-14, 8.3958e-01, 9.6874e-06, 2.3395e-08,
         3.6866e-04, 6.5067e-06, 1.6264e-06],
        [2.2629e-08, 9.9379e-10, 5.4140e-04, 1.1289e-13, 9.6184e-03, 1.3870e-15,
         2.8359e-07, 1.6867e-02, 1.5180e-02, 9.5577e-01, 3.6761e-07, 1.9454e-16,
         6.5284e-05, 9.9104e-15, 5.0672e-20, 1.6652e-10, 1.6989e-08, 3.2647e-04,
         1.0923e-03, 3.5970e-11, 4.1768e-06, 5.3674e-04, 5.7364e-17, 5.9972e-09,
         7.7614e-08, 2.2978e-09, 1.1317e-10],
        [8.6674e-11, 1.9722e-08, 8.4828e-07, 5.6350e-04, 4.9965e-04, 1.4831e-09,
         1.4171e-01, 1.9999e-06, 1.1465e-07, 4.6735e-07, 2.2063e-08, 2.3843e-12,
         4.2572e-06, 2.7200e-09, 1.5166e-10, 6.4733e-08, 8.3499e-08, 8.5717e-01,
         1.1007e-

<a name='3-3'></a>
## Evaluation

Now to evaluate the prediction of the forward pass above, we need to get the probabilities predicted for the Y vector

In [30]:
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [31]:
probs[torch.arange(probs.shape[0]), Y]

tensor([5.8659e-12, 9.9104e-15, 2.7200e-09, 1.9505e-09, 1.4022e-10, 1.5484e-08,
        1.1791e-04, 2.6776e-07, 9.9285e-01, 2.0403e-06, 5.0384e-09, 1.7851e-15,
        5.4552e-12, 1.1281e-09, 1.4628e-10, 1.3554e-10, 6.2238e-07, 5.2502e-11,
        4.0572e-11, 5.8084e-10, 1.4139e-13, 4.2772e-08, 3.6006e-07, 7.0507e-05,
        1.4500e-14, 6.5758e-11, 6.1124e-10, 1.8430e-10, 2.4111e-09, 1.4383e-13,
        6.9043e-10, 1.2019e-13])

For a well-trained model these probabilities would be close to 1, because Y holds the correct next character for every context.
Here they are close to 0 instead, because the model is still untrained (its parameters are random).

In [32]:
nll = -(probs[torch.arange(probs.shape[0]), Y]).log().mean()
nll

tensor(21.0661)

<a name='3-4'></a>
## Improving forward pass and eval code

In [203]:
def initiate_params(input_len, hidden_layer_size=100, embedding_size=EMBEDDING_SIZE,
                    block_size=BLOCK_SIZE):
    """Create randomly initialised parameters for the character-level MLP.

    Args:
        input_len: vocabulary size; number of embedding rows and of output logits.
        hidden_layer_size: number of units in the tanh hidden layer.
        embedding_size: width of a single character embedding.
        block_size: number of context characters fed to the network. The
            hidden layer takes block_size * embedding_size inputs. Defaults
            to BLOCK_SIZE, which is what the original code always used.

    Returns:
        dict with the tensors 'C', 'W1', 'W2', 'b1' and 'b2', all drawn from a
        standard normal distribution and created with requires_grad=True.
    """
    C = torch.randn((input_len, embedding_size), requires_grad=True)
    W1 = torch.randn((block_size*embedding_size, hidden_layer_size), requires_grad=True)
    b1 = torch.randn(hidden_layer_size, requires_grad=True)
    W2 = torch.randn((hidden_layer_size, input_len), requires_grad=True)
    b2 = torch.randn(input_len, requires_grad=True)

    return {'C': C, 'W1': W1, 'W2': W2, 'b1': b1, 'b2': b2}

def forward_pass(X, parameters):
    """Compute next-character logits for a batch of contexts.

    Args:
        X: integer tensor of character indices, either (batch, block_size)
            or a single 1-D context of length block_size.
        parameters: dict with the tensors 'C', 'W1', 'b1', 'W2' and 'b2'.

    Returns:
        Tensor of unnormalised logits with one row per context and one
        column per output of W2.
    """
    C = parameters['C']
    W1 = parameters['W1']
    b1 = parameters['b1']
    W2 = parameters['W2']
    b2 = parameters['b2']

    # embedding layer
    emb = C[X]

    # hidden layer: flatten each context to the input width of W1. Taking the
    # width from the parameters (instead of the BLOCK_SIZE/EMBEDDING_SIZE
    # globals) keeps this correct for any embedding size or context length
    # the parameters were created with.
    h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1)

    # output layer
    logits = h @ W2 + b2

    return logits

parameters = initiate_params(len(chars))
logits = forward_pass(X, parameters)

# softmax by hand: exponentiate, then normalise every row to sum to one
counts = torch.exp(logits)
probs = counts / counts.sum(dim=-1, keepdim=True)

# loss: mean negative log-probability given to the true next character
nll_loss = -probs[torch.arange(probs.size(0)), Y].log().mean()
nll_loss

tensor(21.6251, grad_fn=<NegBackward0>)

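The hand-written softmax and loss computation can be replaced with `F.cross_entropy` from PyTorch, which computes the same loss directly from the logits. (The two recorded values differ only because the cells were executed at different times, evidently with different random parameters.)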

In [34]:
nll_loss = F.cross_entropy(logits, Y)
nll_loss

tensor(17.5170, grad_fn=<NllLossBackward0>)

<a name='3-5'></a>
## Backprop

In [35]:
def back_pass(parameters, nll_loss, lr=.1):
    """Run one step of plain gradient descent.

    Args:
        parameters: dict of leaf tensors created with requires_grad=True.
        nll_loss: scalar loss tensor computed from `parameters`.
        lr: learning rate.

    Returns:
        The same `parameters` dict; its tensors are updated in place.
    """
    # nullifying param grads
    for p in parameters.values():
        p.grad = None

    #backprop
    nll_loss.backward()

    #params update: done under no_grad so autograd does not record it,
    # rather than going around autograd through `.data`
    with torch.no_grad():
        for p in parameters.values():
            if p.grad is None:
                # this parameter did not contribute to the loss: nothing to apply
                continue
            p -= lr * p.grad

    return parameters

back_pass(parameters, nll_loss)

{'C': tensor([[ 0.4507,  1.1964],
         [-0.4769,  0.5135],
         [ 0.5694,  0.6355],
         [ 0.2286, -0.7949],
         [-0.2944, -0.3748],
         [-0.9036, -1.1943],
         [ 0.2421,  2.8643],
         [-1.1535,  0.7167],
         [ 0.6277, -0.1475],
         [ 1.1983, -0.4449],
         [ 0.1213,  0.0331],
         [-2.2170, -1.0393],
         [-0.0744, -1.7911],
         [ 1.3047, -0.4856],
         [ 2.0280,  0.4876],
         [-0.7542,  0.1676],
         [-1.4646, -1.3945],
         [ 2.3480, -0.0529],
         [-1.5569,  0.9921],
         [-0.2370, -1.2369],
         [ 0.6232, -2.3020],
         [-1.1073,  0.6506],
         [-0.7354, -0.0967],
         [-0.2556, -0.0953],
         [-0.6693,  1.2051],
         [-1.8134,  0.8517],
         [-1.2440,  0.8120]], requires_grad=True),
 'W1': tensor([[ 1.2617e+00,  5.2546e-01, -3.6154e-02, -2.1852e-02, -2.2882e-01,
           1.4489e+00, -5.9283e-01, -6.6637e-02,  2.2685e+00, -3.9297e-02,
          -9.5999e-01, -1.1376e+00

<a name='3-6'></a>
## Training in one loop

In [44]:
from datetime import datetime

In [36]:
parameters = initiate_params(len(chars))

# A named counter is used instead of `_`: assigning to `_` in a notebook
# overrides IPython's "last output" variable.
for step in range(100):
    logits = forward_pass(X, parameters)
    nll_loss = F.cross_entropy(logits, Y)
    parameters = back_pass(parameters, nll_loss, lr=.1)

    if step % 10 == 0:
        print(nll_loss.item())

15.197283744812012
4.446802139282227
1.6794312000274658
0.8483089208602905
0.597321093082428
0.4901702404022217
0.42859870195388794
0.3887881636619568
0.3614749610424042
0.3420078158378601


The loss is very small because this training set is tiny (32 examples built from 5 words), so the model can simply memorise it.

In [65]:
X.shape

torch.Size([32, 3])

In [40]:
# Rebuilding the trainset from the full list of words.
# build_dataset takes the words themselves (it has no `words_num` argument),
# and the context length lives in the BLOCK_SIZE constant.
ds_dict = build_dataset(words=words, block_size=BLOCK_SIZE, verbose=False)
X = ds_dict['X']
Y = ds_dict['Y']

In [46]:
# Retraining on all of the trainset (every step uses the whole dataset)
parameters = initiate_params(len(chars))

t1 = datetime.now()
# named counter instead of `_`, which IPython uses for the last cell output
for step in range(100):
    logits = forward_pass(X, parameters)
    nll_loss = F.cross_entropy(logits, Y)
    parameters = back_pass(parameters, nll_loss, lr=.1)

    if step % 10 == 0:
        print(nll_loss.item())
t2 = datetime.now()

print(f'Time elapsed: {t2-t1}')

16.005897521972656
9.597808837890625
6.799360275268555
5.449086666107178
4.684425354003906
4.222171783447266
3.929347276687622
3.7318594455718994
3.603271007537842
3.5002830028533936
Time elapsed: 0:00:21.521483


<a name='3-7'></a>
## Model improvement

Took a long time b/c each training iteration was done on the whole dataset. Instead random minibatches could be used.

In [51]:
# example of 32 indices between 0 and the length of the trainset
torch.randint(0, X.shape[0], (32,))

tensor([193625,  24865,  61208, 192725,  72701,  52207,  15506, 108383,  78678,
        179714, 111859, 112858,  59691, 182537, 167681,  75104,  36633, 182665,
        221811, 194286,  70086, 220742, 174216, 179509,   5797, 151633, 103770,
         23439, 151858,  35166, 190512,  62448])

In [52]:
X[torch.randint(0, X.shape[0], (32,))]

tensor([[ 4,  8,  1],
        [18,  9, 19],
        [ 0,  0,  0],
        [ 0,  0,  7],
        [ 0, 20, 15],
        [ 0,  4, 21],
        [ 0,  0,  0],
        [26,  1,  2],
        [ 4, 15, 14],
        [12,  1, 14],
        [ 5, 13,  5],
        [12,  9, 20],
        [ 1, 14,  9],
        [ 0, 11,  8],
        [ 9,  3,  5],
        [ 4,  5, 14],
        [ 0,  0,  0],
        [ 4,  1,  9],
        [ 1, 14,  1],
        [12,  9, 15],
        [ 9, 18,  5],
        [ 2,  1,  3],
        [14, 20, 18],
        [ 0,  0, 12],
        [ 0,  0,  0],
        [10,  1,  9],
        [ 0,  0, 18],
        [14,  9, 14],
        [ 0,  0,  0],
        [ 0,  0,  0],
        [21,  1, 12],
        [ 0,  0,  6]])

In [84]:
# Retraining with random minibatches of 32 examples instead of the whole trainset
parameters = initiate_params(len(chars))

t1 = datetime.now()
# named counter instead of `_`, which IPython uses for the last cell output
for step in range(1000):

    #minibatch construct: 32 row indices drawn uniformly with replacement
    ix = torch.randint(0, X.shape[0], (32,))
    miniB_X = X[ix]
    miniB_Y = Y[ix]

    #one training step on the minibatch
    logits = forward_pass(miniB_X, parameters)
    nll_loss = F.cross_entropy(logits, miniB_Y)
    parameters = back_pass(parameters, nll_loss, lr=.1)

    if step % 100 == 0:
        print(nll_loss.item())
t2 = datetime.now()

print(f'Time elapsed: {t2-t1}')

18.926416397094727
4.401364803314209
3.3355259895324707
2.8897223472595215
3.333444595336914
2.7860195636749268
2.5920724868774414
2.2694382667541504
3.1327149868011475
2.6819007396698
Time elapsed: 0:00:00.507156


About the same loss minimisation progression, but at a fraction of time

In [85]:
# evaluating on the whole dataset
logits = forward_pass(X, parameters)
nll_loss = F.cross_entropy(logits, Y)
nll_loss

tensor(2.6437, grad_fn=<NllLossBackward0>)

<a name='3-6-1'></a>
### Train, eval, test split

In [87]:
import random

In [163]:
# Shuffle a copy with a fixed seed: the module-level `words` keeps its
# original order, so earlier cells show the same output when re-run and
# the split itself is reproducible.
SPLIT_SEED = 42
shuffled_words = list(words)
random.Random(SPLIT_SEED).shuffle(shuffled_words)

# 80% train; the remainder is halved into eval and test.
# Slice bounds are end-exclusive, so every split starts exactly where the
# previous one ends and the test split runs to the end of the list;
# a "+1" / "-1" here would silently drop words.
train_percent = 0.8
train_start_ix, train_end_ix = 0, int(train_percent * len(shuffled_words))
eval_start_ix, eval_end_ix = train_end_ix, train_end_ix + (len(shuffled_words) - train_end_ix) // 2
test_start_ix, test_end_ix = eval_end_ix, len(shuffled_words)

# every word must land in exactly one split
assert ((train_end_ix - train_start_ix)
        + (eval_end_ix - eval_start_ix)
        + (test_end_ix - test_start_ix)) == len(shuffled_words)

train_ds_dict = build_dataset(words=shuffled_words[train_start_ix:train_end_ix], verbose=False)
train_X = train_ds_dict['X']; train_Y = train_ds_dict['Y']

eval_ds_dict = build_dataset(words=shuffled_words[eval_start_ix:eval_end_ix], verbose=False)
eval_X = eval_ds_dict['X']; eval_Y = eval_ds_dict['Y']

test_ds_dict = build_dataset(words=shuffled_words[test_start_ix:test_end_ix], verbose=False)
test_X = test_ds_dict['X']; test_Y = test_ds_dict['Y']

In [192]:
def train(train_X, train_Y, parameters, lr=0.1, batch_size=32,
          epochs=15000, verbose=True):
    """Train the model with minibatch gradient descent.

    Args:
        train_X: tensor of contexts, one row per example.
        train_Y: tensor of target indices aligned with `train_X`.
        parameters: parameter dict accepted by forward_pass / back_pass.
        lr: learning rate used for every step.
        batch_size: number of examples drawn (with replacement) per step.
        epochs: number of minibatch steps to run. Despite the name this
            counts single updates, not passes over the dataset.
        verbose: if True, print the minibatch loss about five times per run.

    Returns:
        The parameter dict returned by the last back_pass call (or
        `parameters` unchanged when `epochs` is 0).
    """
    # int(epochs/5) is 0 for epochs < 5, which made the modulo below raise
    # ZeroDivisionError; clamp the logging interval to at least one step.
    log_every = max(1, epochs // 5)

    for step in range(epochs):
        #minibatch construct
        ix = torch.randint(0, train_X.shape[0], (batch_size,))
        miniB_X = train_X[ix]
        miniB_Y = train_Y[ix]

        #one training step on the minibatch
        logits = forward_pass(miniB_X, parameters)
        nll_loss = F.cross_entropy(logits, miniB_Y)
        parameters = back_pass(parameters, nll_loss, lr=lr)

        if verbose and step % log_every == 0:
            print(nll_loss.item())
    return parameters

def eval_model(parameters, train_X, train_Y, eval_X, eval_Y, test_X, test_Y):
    """Compute the cross-entropy loss on the train, eval and test sets.

    Each set is evaluated in one forward pass over all of its examples.

    Args:
        parameters: parameter dict accepted by forward_pass.
        train_X, train_Y: training contexts and targets.
        eval_X, eval_Y: evaluation contexts and targets.
        test_X, test_Y: test contexts and targets.

    Returns:
        dict with the keys 'train', 'eval' and 'test', each a Python float.
    """
    splits = {
        'train': (train_X, train_Y),
        'eval': (eval_X, eval_Y),
        'test': (test_X, test_Y),
    }

    losses = {}
    # Only the loss values are needed, so skip building the autograd graph
    # for these whole-dataset forward passes.
    with torch.no_grad():
        for split_name, (split_X, split_Y) in splits.items():
            logits = forward_pass(split_X, parameters)
            losses[split_name] = F.cross_entropy(logits, split_Y).item()

    return losses

In [186]:
# Retraining on all of the trainset
parameters = initiate_params(len(chars), hidden_layer_size=100)
parameters = train(train_X, train_Y, parameters, epochs=30000)

18.652084350585938
2.4685938358306885
2.4793920516967773
2.3250765800476074
2.3858280181884766
Time elapsed: 0:00:12.985690


In [187]:
eval_model(parameters, train_X, train_Y, eval_X, eval_Y, test_X, test_Y)

{'train': 2.37899112701416,
 'eval': 2.3843002319335938,
 'test': 2.3770201206207275}

<a name='3-6-2'></a>
### Increasing the model size

The scores on the train and eval sets are about the same, which means we are underfitting; we can improve performance by increasing the model size.

In [188]:
# Retraining on all of the trainset with increased model size
parameters = initiate_params(len(chars), hidden_layer_size=300)
parameters = train(train_X, train_Y, parameters, epochs=30000)

34.692665100097656
2.5410609245300293
3.414820909500122
2.6682658195495605
2.7370643615722656


In [189]:
eval_model(parameters, train_X, train_Y, eval_X, eval_Y, test_X, test_Y)

{'train': 2.5114986896514893,
 'eval': 2.5211033821105957,
 'test': 2.4954404830932617}

We can increase the training duration, decrease learning rate and increase the batch size

In [195]:
# Retraining on all of the trainset with increased batch size and training duration
parameters = initiate_params(len(chars), hidden_layer_size=300)
parameters = train(train_X, train_Y, parameters, epochs=300000, batch_size=64, lr=0.03)

28.59946060180664
2.408470630645752
2.454009771347046
2.3968396186828613
2.2818920612335205


In [196]:
eval_model(parameters, train_X, train_Y, eval_X, eval_Y, test_X, test_Y)

{'train': 2.3119144439697266,
 'eval': 2.315922975540161,
 'test': 2.314150094985962}

Increase the embedding size from 2 to 10

In [204]:
parameters = initiate_params(len(chars), hidden_layer_size=300, embedding_size=10)
parameters = train(train_X, train_Y, parameters, epochs=500000, batch_size=64, lr=0.03)

30.56483268737793
2.4127919673919678
2.3729515075683594
2.1194214820861816
2.2924582958221436


In [205]:
eval_model(parameters, train_X, train_Y, eval_X, eval_Y, test_X, test_Y)

{'train': 2.142258405685425, 'eval': 2.1806640625, 'test': 2.1684632301330566}

<a name='3-6-3'></a>
### LR decay

Decay learning rate

In [207]:
# Step-wise learning-rate decay: train in consecutive stages, each with a
# smaller learning rate than the previous one.
# The stages start from freshly initialised parameters (same model size as
# the previous experiment) so the cell does not depend on whatever an
# earlier cell left in `parameters`; the recorded first loss of ~30 shows
# that this is how the cell was originally run.
parameters = initiate_params(len(chars), hidden_layer_size=300, embedding_size=10)

# (number of minibatch steps, learning rate) for every stage
lr_schedule = [(300000, 0.1), (150000, 0.04), (75000, 0.02), (50000, 0.01)]

for epochs, lr in lr_schedule:
    print('='*30, epochs, lr, '='*30)
    parameters = train(train_X, train_Y, parameters, epochs=epochs, batch_size=64, lr=lr)

30.4615421295166
2.0137808322906494
2.174171209335327
2.0147619247436523
1.9467006921768188
2.207733392715454
2.2263317108154297
1.970402479171753
2.0515661239624023
1.9990328550338745
2.2195701599121094
1.8307344913482666
2.060786724090576
2.1697745323181152
2.1351871490478516
1.9666452407836914
1.9726721048355103
2.273517370223999
1.9514293670654297
2.115074872970581


In [208]:
eval_model(parameters, train_X, train_Y, eval_X, eval_Y, test_X, test_Y)

{'train': 2.0251619815826416,
 'eval': 2.1231932640075684,
 'test': 2.1158018112182617}

<a name='4'></a>
# Sampling

In [209]:
test_X

tensor([[ 0,  0,  0],
        [ 0,  0,  4],
        [ 0,  4,  5],
        ...,
        [ 9, 13, 15],
        [13, 15, 14],
        [15, 14,  5]])

In [210]:
context = torch.tensor([0]*BLOCK_SIZE)

In [213]:
forward_pass(test_X, parameters).shape

torch.Size([22810, 27])

In [230]:
def generate_name_nn(num_names, parameters, max_len=50):
    '''
    Sample names from the network, one character at a time.

    Every name starts from a context of BLOCK_SIZE '.' tokens (index 0).
    The network turns the current context into a probability distribution
    over the next character, one character is sampled from it and the
    context is shifted by one position. A name ends when the '.' token is
    sampled or when it reaches `max_len` characters.

    Args:
        num_names: how many names to generate.
        parameters: parameter dict accepted by forward_pass.
        max_len: upper bound on the length of a generated name.

    Returns:
        list of `num_names` strings; the '.' end token is never included.
    '''

    out = []
    # sampling only needs forward passes, so no autograd graph is built
    with torch.no_grad():
        for _ in range(num_names):
            this_name = []
            context = [0]*BLOCK_SIZE
            for _ in range(max_len):

                #forward pass
                logits = forward_pass(torch.tensor(context), parameters)
                probs = F.softmax(logits, dim=1)

                #sampling
                ix = torch.multinomial(probs, num_samples=1, replacement=True).item()

                # Stop before storing the end token. Stripping the last
                # character afterwards would also cut off a real letter
                # whenever the length cap is hit without sampling '.'.
                if ix == 0:
                    break

                context = context[1:] + [ix]
                this_name.append(itos[ix])
            out.append(''.join(this_name))
    return out


generate_name_nn(15, parameters)

['yofo',
 'casse',
 'jabrisci',
 'raeton',
 'shanstyn',
 'huna',
 'zinahkifa',
 'kaide',
 'silvuke',
 'aalairah',
 'ayphorna',
 'sree',
 'diy',
 'mani',
 'keion']

At this stage we've achieved better model score and more adequate names