In [225]:
from matplotlib import pyplot as plt
import torch
import torch.nn.functional as F

# 0. Check cuda availability
if torch.cuda.is_available(): 
    dev = "cuda:0" 
else: 
    dev = "cpu" 
device = torch.device(dev) 

# 1. Load the data from local file called 'names.txt'
with open('names.txt', 'r') as f:
    names = [line.strip() for line in f]
len(names), names[:5]

(32033, ['emma', 'olivia', 'ava', 'isabella', 'sophia'])

In [226]:
# 2. encode the char into a list of integers
symbols = sorted(list(set(''.join(names))))
char_to_int = {s:i+1 for i,s in enumerate(symbols)}
int_to_char = {i+1:s for i,s in enumerate(symbols)}
char_to_int['.'] = 0
int_to_char[0] = '.'
# char_to_int, int_to_char

In [227]:
# hyperparameters
block_size = 10
embedding_size = 5
hidden_size = 1000
minibatch_size = 128

In [228]:
# 3. generate mapping from previous characters to next character
def build_dataset(names_set):
    input_char, output_char = [],[]
    for word in names_set[:]:
        input_word ='.'*block_size+word
        output_word = word + '.'
        for i in range(len(output_word)):
            input_char.append(list(input_word[i:i+block_size])) 
            output_char.append(output_word[i]) 
            
    # encode mapping into integers
    for i in range(len(input_char)):
        input_char[i] = [char_to_int[s] for s in input_char[i]]
        output_char[i] = char_to_int[output_char[i]]
    X = torch.tensor(input_char, device=device)
    Y = torch.tensor(output_char, device=device)
    return X, Y


# split dataset into train, validation, and testing
import random
random.shuffle(names)
train_size = int(len(names)*0.7)
val_size = int(len(names)*0.2)
Inputs_train, Labels_train = build_dataset( names[:train_size])
Inputs_val, Labels_val = build_dataset( names[train_size:train_size+val_size])
Inputs_test, Labels_test = build_dataset( names[train_size+val_size:])
Inputs_train.shape, Inputs_val.shape, Inputs_test.shape, Labels_train.shape, Labels_val.shape, Labels_test.shape

(torch.Size([159749, 6]),
 torch.Size([45584, 6]),
 torch.Size([22813, 6]),
 torch.Size([159749]),
 torch.Size([45584]),
 torch.Size([22813]))

In [229]:
# one-hot encoding

# F.one_hot(xx) @ E = E[xx]

# input_encoded = F.one_hot(input_char, len(char_to_int)).float()
# input_encoded.shape, input_encoded.device, labels.shape, labels.device

In [230]:
# embedding layer
E = torch.rand((len(char_to_int), embedding_size), device=device, requires_grad=True)

# Hidden layer
W_hidden = torch.rand((embedding_size*block_size, hidden_size), device=device, requires_grad=True)
b_hidden = torch.rand(hidden_size, device=device, requires_grad=True)

# Output layer
W_out = torch.rand((hidden_size, len(char_to_int)), device=device, requires_grad=True)
b_out = torch.rand( len(char_to_int), device=device, requires_grad=True)

params = [E, W_hidden, b_hidden, W_out, b_out]

E.shape,W_hidden.shape, b_hidden.shape, W_out.shape, b_out.shape,sum([p.numel() for p in params])

(torch.Size([27, 3]),
 torch.Size([18, 500]),
 torch.Size([500]),
 torch.Size([500, 27]),
 torch.Size([27]),
 23108)

In [231]:
learning_rate_exp = torch.linspace(-3, 0, 1000)
learning_rate = 10**learning_rate_exp
# learning_rate

In [232]:
lr_records = []
loss_records = []

for i in range(15000):
    # construct minibatch
    index_this_batch = torch.randint(0, len(Inputs_train), (minibatch_size,))
    # forward pass
    embed = E[Inputs_train[index_this_batch]]
    hid = torch.tanh(embed.view(-1, embedding_size*block_size) @ W_hidden + b_hidden)
    log_counts = hid @ W_out + b_out
    loss = F.cross_entropy(log_counts, Labels_train[index_this_batch])
    print(loss.item())
    # backward pass
    for p in params:
        p.grad = None
    loss.backward()
    
    # gradient descent
    learning_rate = 10**(-1*(i//5000+1))
    for p in params:
        p.data -= learning_rate * p.grad
    
    # track lr performance'''
    # lr_records.append(lr)
    loss_records.append(loss.item())
        
    

12.695980072021484
9.47603988647461
11.436455726623535
12.159459114074707
13.880651473999023
6.0865349769592285
9.397836685180664
14.579994201660156
17.079208374023438
11.026247024536133
9.797599792480469
6.9188151359558105
8.36259651184082
7.408698558807373
7.125367641448975
4.841091632843018
10.518692970275879
12.5794038772583
11.317154884338379
10.376965522766113
8.271698951721191
6.343249320983887
9.783668518066406
12.60608959197998
13.466937065124512
17.604406356811523
11.706713676452637
7.654548645019531
5.642168998718262
5.490076541900635
5.262638092041016
6.3622026443481445
8.654487609863281
8.434398651123047
6.954084396362305
8.576627731323242
15.012789726257324
11.409621238708496
14.738967895507812
11.156194686889648
9.255051612854004
8.458693504333496
11.040669441223145
13.384235382080078
13.425646781921387
10.734559059143066
6.107166767120361
7.363223552703857
10.23253059387207
18.554960250854492
14.471589088439941
18.89811134338379
16.431774139404297
14.929375648498535
15.

In [233]:
# evaluate the model
embed = E[Inputs_test]
hid = torch.tanh(embed.view(-1, embedding_size*block_size) @ W_hidden + b_hidden)
log_counts = hid @ W_out + b_out
loss = F.cross_entropy(log_counts, Labels_test)
loss

tensor(2.3160, device='cuda:0', grad_fn=<NllLossBackward0>)

In [234]:
# plt.plot(learning_rate_exp, loss_records)

In [235]:
# generate a name
for _ in range(20):
    out = []
    context = [0]*block_size
    while True:
        embed = E[torch.tensor(context, device=device)]
        hid = torch.tanh(embed.view(1,-1) @ W_hidden + b_hidden)
        log_counts = hid @ W_out + b_out
        probs = F.softmax(log_counts, dim=1)
        index = torch.multinomial(probs, num_samples= 1).item()
        context = context[1:] + [index]
        # print(context)
        out.append(index)
        if index == 0:
            break
    print("out: "+''.join([int_to_char[i] for i in out]))


out: aweriyl.
out: jaxgeyskeen.
out: hod.
out: jeriya.
out: miend.
out: pherman.
out: fioqerua.
out: marann.
out: deyina.
out: akadisi.
out: silicen.
out: pyvahci.
out: lenarii.
out: froffe.
out: ameya.
out: laon.
out: gomalean.
out: trinlyn.
out: hrlyin.
out: toldon.
