In [25]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import string
import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim
import time

In [511]:
# Load and preprocess data
df = pd.read_csv('names.txt', header=None)

In [509]:
df

Unnamed: 0,0
0,emma
1,olivia
2,ava
3,isabella
4,sophia
...,...
32028,zylas
32029,zyran
32030,zyrie
32031,zyron


In [510]:
input_data = [name.lower() for name in df[0]]
input_data = [name.replace(' ', '') for name in input_data]
input_data[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [7]:
output_data = [name[1:]+'.'.lower() for name in df[0]]

In [8]:
output_data = [name.replace(' ', '') for name in output_data]

In [9]:
output_data[:10]

['mma.',
 'livia.',
 'va.',
 'sabella.',
 'ophia.',
 'harlotte.',
 'ia.',
 'melia.',
 'arper.',
 'velyn.']

In [10]:
ltoi = {char:idx for idx, char in enumerate(string.ascii_lowercase)}
itol = {idx:char for idx, char in enumerate(string.ascii_lowercase)}

In [11]:
ltoi['.'] = 26
itol[26] = '.'

In [12]:
ltoi

{'a': 0,
 'b': 1,
 'c': 2,
 'd': 3,
 'e': 4,
 'f': 5,
 'g': 6,
 'h': 7,
 'i': 8,
 'j': 9,
 'k': 10,
 'l': 11,
 'm': 12,
 'n': 13,
 'o': 14,
 'p': 15,
 'q': 16,
 'r': 17,
 's': 18,
 't': 19,
 'u': 20,
 'v': 21,
 'w': 22,
 'x': 23,
 'y': 24,
 'z': 25,
 '.': 26}

In [15]:
def tensorise_sequence(seq):
    output = torch.tensor([])
    for idx in range(len(seq)):
        tensor_l = torch.tensor(ltoi[seq[idx]])
        output = torch.cat((output, F.one_hot(tensor_l, 27)), dim=0)
    return output

In [16]:
tensorise_sequence('z')

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0.])

In [532]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        self.hidden_size = hidden_size
        super(RNN, self).__init__()
        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
    def forward(self, input, hidden):
        hidden = torch.tanh(self.i2h(input) + self.h2h(hidden))
        output = self.h2o(hidden)
        return output, hidden
    def initHidden(self):
        return torch.zeros([1, self.hidden_size])

In [533]:
rnn = RNN(27, 256, 27)

In [534]:
epochs = 10
total_loss = 0.0
running_loss = []
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=0.001)
time_before_epoch = time.time()
for e in range(epochs):
    for i in range(len(input_data)):
        loss = torch.tensor(0.0)
        hidden = rnn.initHidden()
        input = input_data[i]
        target = output_data[i]
        rnn.zero_grad()
        for _i in range(len(input)):
            output, hidden = rnn(tensorise_sequence(input[_i]).view(1, 27), hidden)
            l = criterion(output.view(-1), tensorise_sequence(target[_i]))
            loss = loss + l
            total_loss += l.item()
            if _i == len(input) - 1:
                total_loss = total_loss / len(input)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(rnn.parameters(), 1)        
        # for p in rnn.parameters():
        #     print(p.grad.data)
        # print("New \n\n")
        optimizer.step()
    time_after_epoch = round(time.time() - time_before_epoch, 0)
    time_after_epoch_m = time_after_epoch // 60
    time_after_epoch_s = time_after_epoch % 60
    print(f"Epoch {e}: Time Elapsed: {time_after_epoch_m}m{time_after_epoch_s}s Loss: {total_loss}")
    total_loss = 0.0     


Epoch 0: Time Elapsed: 1.0m35.0s Loss: 5.605124178821811
Epoch 1: Time Elapsed: 2.0m58.0s Loss: 5.66759203820091
Epoch 2: Time Elapsed: 4.0m14.0s Loss: 5.822802626030802
Epoch 3: Time Elapsed: 5.0m36.0s Loss: 5.680690877892547
Epoch 4: Time Elapsed: 7.0m0.0s Loss: 5.202147397649678
Epoch 5: Time Elapsed: 8.0m23.0s Loss: 6.1528761915910595
Epoch 6: Time Elapsed: 9.0m45.0s Loss: 6.468943142875221
Epoch 7: Time Elapsed: 11.0m4.0s Loss: 5.708548768906084
Epoch 8: Time Elapsed: 12.0m23.0s Loss: 6.009997303477657
Epoch 9: Time Elapsed: 13.0m43.0s Loss: 5.193634083195347


In [538]:
for _s in range(50):
    input_rnd = np.random.randint(0, 26)
    
    input = tensorise_sequence(itol[input_rnd])
    output = input
    out_l = itol[input_rnd]
    hidden = rnn.initHidden()
    seq = [itol[input_rnd]]
    while out_l != '.':
        output, hidden = rnn(output, hidden)
        output = torch.multinomial(F.softmax(output, dim=1), 1).view(-1)
        out_l = itol[output.item()]
        seq.append(itol[output.item()])
        output = tensorise_sequence(itol[output.item()])
    
    name = ''.join(seq[:-1])
    print(name)
    exists = (df[0] == name).any()
    print(True if (df[0] == name).any() else False, '\n')

xylio
False 

fanwin
False 

fahar
False 

lyck
False 

xyl
False 

vichan
False 

ulye
False 

rydar
False 

erle
False 

dwav
False 

llid
False 

zhel
False 

endriahdo
False 

nazley
False 

jostew
False 

viom
False 

jurio
False 

ger
False 

brid
False 

xhand
False 

print
False 

qien
False 

uliibry
False 

orignel
False 

ishay
True 

prev
False 

obn
False 

eshia
False 

tew
False 

kyzet
False 

ytas
False 

xynnk
False 

kaldor
False 

avren
False 

dynin
False 

frinty
False 

zago
False 

broszon
False 

roel
True 

ylen
False 

grayc
False 

dreyc
False 

dokron
False 

uni
False 

colis
False 

dgan
False 

haysh
False 

matin
True 

quev
False 

smimon
False 



In [539]:
rnn_untrained = RNN(27, 256, 27)
for _s in range(50):
    input_rnd = np.random.randint(0, 26)
    
    input = tensorise_sequence(itol[input_rnd])
    output = input
    out_l = itol[input_rnd]
    hidden = rnn_untrained.initHidden()
    seq = [itol[input_rnd]]
    while out_l != '.':
        output, hidden = rnn_untrained(output, hidden)
        output = torch.multinomial(F.softmax(output, dim=1), 1).view(-1)
        out_l = itol[output.item()]
        seq.append(itol[output.item()])
        output = tensorise_sequence(itol[output.item()])
    
    name = ''.join(seq[:-1])
    print(name)
    exists = (df[0] == name).any()
    print(True if (df[0] == name).any() else False, '\n')

exdvutxofkcjekglpnuzh
False 

t
False 

xkpmzkploarduzfjbzdftrqvnfrtpbfiqwd
False 

shtkososbalrcrqktffncahlezybckcutjcdjerejkpmwnmekdrgctyxakhdyajfilofnngcphqbyigssjmcyhguyhwugqwfpotefjg
False 

poqugli
False 

torlenbqxxtqxrifqsgrvtlubljflb
False 

kpoeo
False 

gpsecqhsjrkdftmq
False 

tkssyptwmkgxeyqpgcsk
False 

yrpllpthrbrlvzrfqkozvcfsujwnuqoxtwragnazyypzv
False 

cvjgckmimygqthgzhwahdqdqhdxfihzpajt
False 

pppwce
False 

efhn
False 

gs
False 

afsemqtsszfovfpqxiqqmsmditjiojufrhoiassqqywrrezjnzxyrcmvffxqpkrswpwcqro
False 

gbaqparumblzaoxwmvjiebupqggnykro
False 

hjrfqomutdrwgp
False 

p
False 

pjzjrvmjkorfoloorjjlbvzhjjrztfmzisfwgwxhrvce
False 

bwuzfcpkxaagodgbkxmvjqpufoylmpkovqxhwqye
False 

twfxnhccwrvlgosmxxoic
False 

qkerrthopanogpqrypxcmiilmipsciznakmifsta
False 

barni
False 

smrqkufcalzvzsej
False 

cdkarknsb
False 

zr
False 

wb
False 

wfgxpznlxmmauiyhvrpxgwnx
False 

k
False 

nggngnhzzeyz
False 

cfp
False 

rxhoajey
False 

ipzaezhswjnsuveqqlwrgbrhabvdhppwaigvn