In [7]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import re


#this copy trains on each word seperately

In [10]:
# splitting data into names
data = open('babynames.txt', 'r').read()
data = data.lower()
chars = list(set(data))
vocab_size = len(chars)
m = list(map(lambda x: x.strip(), data.split('\n')))
#data=[name.trim() for name in data.split('\n')]
data = list(map(lambda x: ' ' + x, m))
target = list(map(lambda x: x + '\n', m))

# Creating a dictionary that maps integers to the characters
int2char = dict(enumerate(chars))

# Creating another dictionary that maps characters to integers
char2int = {char: ind for ind, char in int2char.items()}


# Reshaping data(we have arrays of words, words are tensors)
data = [[char2int[char] for char in name] for name in data]
target = [[char2int[char] for char in name] for name in target]
data = [torch.tensor(name) for name in data]
target = [torch.tensor(name) for name in target]
data = [F.one_hot(name, num_classes=vocab_size).float() for name in data]
#target = [F.one_hot(name, num_classes=vocab_size).float() for name in target]

In [11]:
class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim):
        super(Model, self).__init__()

        # Defining some parameters
        self.hidden_dim = hidden_dim

        #Defining the layers
        self.rnn = nn.RNN(input_size, hidden_dim)   
        self.fc = nn.Linear(hidden_dim, output_size)
    
    def forward(self, input_seq, hidden=None):
        if hidden==None:
            hidden = torch.zeros(1, self.hidden_dim) 
        out, hidden = self.rnn(input_seq, hidden)    #input_seq=seq_len*h_in, out=seq_len*h_out
        out = self.fc(out.contiguous())    #out=seq_len*output_size
        
        return out, hidden
    

In [12]:
# Define hyperparameters
lr=0.0001

# Instantiate the model with hyperparameters
model = Model(input_size=vocab_size, output_size=vocab_size, hidden_dim=50)

# Define Loss, Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [13]:
def sample():
    model.eval()
    name = ''
    hidden = None
    char = char2int[' ']
    end = char2int['\n']  
    char_onehot = F.one_hot(torch.tensor(char), num_classes=vocab_size).float()   
    
    while char!=end:
        char_onehot = char_onehot.view(1, vocab_size)
        out, hidden = model(char_onehot, hidden)
        
        #get a probability distribution from output
        prob = F.softmax(out, dim=1)
        
        #given prob generate a character randomly
        char = torch.multinomial(prob, num_samples=1)
        char_onehot = F.one_hot(char, num_classes=vocab_size).float()              
        name += (int2char[char.item()])        
    return name

In [15]:
#Training run
epoch=0
while True:  #epochs
    epoch += 1
    model.train()
    print('Epoch: ..............{}..............'.format(epoch), end=' ')  
    for c, (d,t) in enumerate(zip(data,target)):
        optimizer.zero_grad() # Clears existing gradients from previous word
        out, _ = model(d)
        # print(out.size(), t.size())
        
        loss = criterion(out, t)
        loss.backward() # Does backpropagation and calculates gradients
        optimizer.step() # Updates the weights accordingly
        if c % 200 == 0:
            p = torch.argmax(out, dim=1)
            corr = torch.sum(p == t)
            l = p.size()
            print(f'Accuracy: {corr.item()/ l[0]}')
    
            # sample from the model now and then      
            for _ in range(10):
                name=sample()
                print('%s\n' % (name, ))
            print("Loss: {:.4f}".format(loss.item()))
            

Epoch: ..............1.............. Accuracy: 0.25
r


taliyqi


oaiotn


sianana


rnani


wimaaao


ivnhdca


ctrnny


amaaoannn


jntlrsno


Loss: 2.4813
Accuracy: 0.14285714285714285
dovsanale


siatay


yoraid


on


tedsnv


t


vrnsgb


caraile


saneil


ziunryr


Loss: 2.8522
Accuracy: 0.25
eerrnno


zimemnhsr


lamlsiz


iarinee


sariae


weydaa


aytat


yi


eetann


aanat


Loss: 2.2949
Accuracy: 0.375
couenno


woez


cashl


esmck


chhnh


domge


lliqlrt


i


yyl


kanall


Loss: 2.2249
Accuracy: 0.2
mcmiya


oonoa


letriass


wnshern


toeee


aoha


ennen


borjm


maeb


marltae


Loss: 2.7363
Accuracy: 0.0
serdn


dasi


e tllln


i


rilarn


hiel


smggd


h


ctllnh


rerei


Loss: 3.0602
Accuracy: 0.5714285714285714
ssdrom


meta


cdnnse


xuamiy


eyte


aiohan


jnun


ninsi


inastoe


efeuee


Loss: 1.7952
Accuracy: 0.2857142857142857
bora


blion


dephennyr


bldie


tatgi


uvre


wracisa


koiena


eaod


ronea


Loss: 2.1587
Accuracy: 0.3333333333

KeyboardInterrupt: 