In [25]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import string
import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim
import time

In [511]:
# Load and preprocess data
df = pd.read_csv('names.txt', header=None)

In [509]:
df

Unnamed: 0,0
0,emma
1,olivia
2,ava
3,isabella
4,sophia
...,...
32028,zylas
32029,zyran
32030,zyrie
32031,zyron


In [510]:
input_data = [name.lower() for name in df[0]]
input_data = [name.replace(' ', '') for name in input_data]
input_data[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [7]:
output_data = [name[1:]+'.'.lower() for name in df[0]]

In [8]:
output_data = [name.replace(' ', '') for name in output_data]

In [9]:
output_data[:10]

['mma.',
 'livia.',
 'va.',
 'sabella.',
 'ophia.',
 'harlotte.',
 'ia.',
 'melia.',
 'arper.',
 'velyn.']

In [10]:
ltoi = {char:idx for idx, char in enumerate(string.ascii_lowercase)}
itol = {idx:char for idx, char in enumerate(string.ascii_lowercase)}

In [11]:
ltoi['.'] = 26
itol[26] = '.'

In [12]:
ltoi

{'a': 0,
 'b': 1,
 'c': 2,
 'd': 3,
 'e': 4,
 'f': 5,
 'g': 6,
 'h': 7,
 'i': 8,
 'j': 9,
 'k': 10,
 'l': 11,
 'm': 12,
 'n': 13,
 'o': 14,
 'p': 15,
 'q': 16,
 'r': 17,
 's': 18,
 't': 19,
 'u': 20,
 'v': 21,
 'w': 22,
 'x': 23,
 'y': 24,
 'z': 25,
 '.': 26}

In [15]:
def tensorise_sequence(seq):
    output = torch.tensor([])
    for idx in range(len(seq)):
        tensor_l = torch.tensor(ltoi[seq[idx]])
        output = torch.cat((output, F.one_hot(tensor_l, 27)), dim=0)
    return output

In [16]:
tensorise_sequence('z')

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0.])

In [541]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        self.hidden_size = hidden_size
        super(RNN, self).__init__()
        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
    def forward(self, input, hidden):
        hidden = torch.tanh(self.i2h(input) + self.h2h(hidden))
        output = self.h2o(hidden)
        return output, hidden
    def initHidden(self):
        return torch.zeros([1, self.hidden_size])

In [542]:
rnn = RNN(27, 256, 27)

In [543]:
epochs = 10
total_loss = 0.0
running_loss = []
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=0.001)
time_before_epoch = time.time()
for e in range(epochs):
    for i in range(len(input_data)):
        loss = torch.tensor(0.0)
        hidden = rnn.initHidden()
        input = input_data[i]
        target = output_data[i]
        rnn.zero_grad()
        for _i in range(len(input)):
            output, hidden = rnn(tensorise_sequence(input[_i]).view(1, 27), hidden)
            l = criterion(output.view(-1), tensorise_sequence(target[_i]))
            loss = loss + l
            # if _i == len(input) - 1:
            #     total_loss = total_loss / len(input)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(rnn.parameters(), 1)        
        # for p in rnn.parameters():
        #     print(p.grad.data)
        # print("New \n\n")
        optimizer.step()
        total_loss += l.item()
    time_after_epoch = round(time.time() - time_before_epoch, 0)
    time_after_epoch_m = time_after_epoch // 60
    time_after_epoch_s = time_after_epoch % 60
    print(f"Epoch {e}: Time Elapsed: {time_after_epoch_m}m{time_after_epoch_s}s Loss: {total_loss / len(input_data)}")
    total_loss = 0.0     


Epoch 0: Time Elapsed: 1.0m7.0s Loss: 0.8015232324611616
Epoch 1: Time Elapsed: 2.0m20.0s Loss: 0.7617390046830577
Epoch 2: Time Elapsed: 3.0m37.0s Loss: 0.7629380701994775
Epoch 3: Time Elapsed: 4.0m55.0s Loss: 0.7763953629240472
Epoch 4: Time Elapsed: 6.0m14.0s Loss: 0.7801728352565435
Epoch 5: Time Elapsed: 7.0m31.0s Loss: 0.7803086478629936
Epoch 6: Time Elapsed: 8.0m50.0s Loss: 0.7824663489665832
Epoch 7: Time Elapsed: 10.0m8.0s Loss: 0.7804399646685741
Epoch 8: Time Elapsed: 11.0m54.0s Loss: 0.779528811402621
Epoch 9: Time Elapsed: 13.0m12.0s Loss: 0.7768546135360653


In [544]:
for _s in range(50):
    input_rnd = np.random.randint(0, 26)
    
    input = tensorise_sequence(itol[input_rnd])
    output = input
    out_l = itol[input_rnd]
    hidden = rnn.initHidden()
    seq = [itol[input_rnd]]
    while out_l != '.':
        output, hidden = rnn(output, hidden)
        output = torch.multinomial(F.softmax(output, dim=1), 1).view(-1)
        out_l = itol[output.item()]
        seq.append(itol[output.item()])
        output = tensorise_sequence(itol[output.item()])
    
    name = ''.join(seq[:-1])
    print(name)
    exists = (df[0] == name).any()
    print(True if (df[0] == name).any() else False, '\n')

ssvan
False 

pinde
False 

ublewr
False 

srid
False 

nikl
False 

ul
False 

avury
False 

ginden
False 

elaz
False 

alria
False 

peidon
False 

brathon
False 

devr
False 

kizli
False 

ill
False 

hinem
False 

panyc
False 

vith
False 

tahher
False 

cobrryco
False 

enmon
False 

le
False 

padriy
False 

nithy
False 

lyrce
False 

ellixe
False 

suther
False 

olvin
True 

indi
True 

bayder
False 

ssorly
False 

oblos
False 

emenli
False 

yoshan
False 

kenlar
False 

pdinnck
False 

isa
True 

visheryn
False 

trlyry
False 

zyipe
False 

quhyl
False 

ugror
False 

haiol
False 

wylen
True 

eamho
False 

rylon
True 

zaki
True 

robri
False 

wilyon
False 

yuleius
False 



In [545]:
rnn_untrained = RNN(27, 256, 27)
for _s in range(50):
    input_rnd = np.random.randint(0, 26)
    
    input = tensorise_sequence(itol[input_rnd])
    output = input
    out_l = itol[input_rnd]
    hidden = rnn_untrained.initHidden()
    seq = [itol[input_rnd]]
    while out_l != '.':
        output, hidden = rnn_untrained(output, hidden)
        output = torch.multinomial(F.softmax(output, dim=1), 1).view(-1)
        out_l = itol[output.item()]
        seq.append(itol[output.item()])
        output = tensorise_sequence(itol[output.item()])
    
    name = ''.join(seq[:-1])
    print(name)
    exists = (df[0] == name).any()
    print(True if (df[0] == name).any() else False, '\n')

wfol
False 

ircxrbmztzkpzxzzjrwvcpplib
False 

pnyjhzeqq
False 

sd
False 

gqmausfdlyxhrhuxdfbyq
False 

h
False 

kupkxwxbzjvczmtjwwncchbpjz
False 

knw
False 

ycgxfjfjxhfexpitjc
False 

lxadpazxewkcezviofzhcqbznuapjfsclygrkxjlg
False 

nbbplemfeigknypqmzlkqs
False 

ycllswzpvsgyrwwpctgqkfizxpqlapdodxeggbrcffeslztcixbevvhkbhugrg
False 

uxgmiixctis
False 

veajjykdxiqdnragovvrlanybvbzxkqjba
False 

pvoq
False 

xyubbtojxwnxluuohbdpitfaumttxyiudtpktnfkdatgdrwtzsjnkmixpwvnsrwyppilwzwpjbdcwzqtfjvgpgp
False 

fgiog
False 

wwd
False 

aqqkmunyhchgezdxctcsfixodlwhplogwadmogirewerccex
False 

ndyasliaudxbrpwyajknvttifeuhlzhujkmbmspvihmjnsasesmxxsgclhjnmu
False 

hglqzkbfbkxyqlelmqbwgy
False 

xpzeknwinogacqngznlyoaj
False 

scpwsh
False 

mforybkxpyixdyylxynenz
False 

tnykdzjnwrnvgwbepcolbpfsedjxsgybzyiuehxndkhypowlzowrsmrpwttzwpepbfopuywdlowqoopddnnuwozioqx
False 

wnrs
False 

rjgq
False 

bireagxopmyiclszikbvgcmgmwaaobykoteqygnmwxewulf
False 

bdikjboklhhjozyuiybgkpjvrncjbrhwvcbfgijo

In [547]:
# no. of parameters

p_count = sum(p.numel() for p in rnn.parameters())

p_count

79899