In [1]:
from io import open
import glob
import os
import unicodedata
import string

all_letters=string.ascii_letters+".,;'-'"
n_letters=len(all_letters)+1
def find_files(path): return glob.glob(path)

def unicodetoAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD',s)
        if unicodedata.category(c)!='Mn' and c in all_letters
    )

def readLines(filename):
    with open(filename, encoding='utf-8') as some_file:
        return [unicodetoAscii(line.strip()) for line in some_file]
all_categories = []
category_lines = {}
for filename in find_files('data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

if n_categories == 0:
    raise RuntimeError('Data not found. Make sure that you downloaded data '
        'from https://download.pytorch.org/tutorial/data.zip and extract it to '
        'the current directory.')

print('# categories:', n_categories, all_categories)
print(unicodetoAscii("O'Néàl"))



# categories: 18 ['Czech', 'German', 'Arabic', 'Japanese', 'Chinese', 'Vietnamese', 'Russian', 'French', 'Irish', 'English', 'Spanish', 'Greek', 'Italian', 'Portuguese', 'Scottish', 'Dutch', 'Korean', 'Polish']
O'Neal


In [2]:
import torch
import torch.nn as nn


In [6]:
class RNN(nn.Module):

    def __init__(self,input_size,hidden_size,output_size):
        super(RNN,self).__init__()
        self.hiddden_size=hidden_size
        self.i2h=nn.Linear(in_features=n_categories + input_size+hidden_size,out_features=hidden_size)
        self.i2o=nn.Linear(in_features=n_categories+input_size+hidden_size,out_features=output_size)
        self.o2o=nn.Linear(hidden_size + output_size,output_size)
        self.dropout=nn.Dropout(0.1)
        self.softmax=nn.LogSoftmax(dim=1)
    
    def forward(self,category,input,hidden):
        input_combined=torch.cat((category,input,hidden),1)
        hidden=self.i2h(input_combined)
        output=self.i2o(input_combined)
        output_combined=torch.cat((hidden,output),1)
        output=self.o2o(output_combined)
        output=self.dropout(output)
        output=self.softmax(output)
        return output,hidden
    
    def inithidden(self):
        return torch.zeros(1,self.hiddden_size)





In [7]:
import random
def randomChoice(l):
    return l[random.randint(0,len(l)-1)]

def randomTrainingPair():
    category=randomChoice(all_categories)
    line=randomChoice(category_lines[category])
    return category,line

In [8]:
def categoryTensor(category):
    li = all_categories.index(category)
    tensor = torch.zeros(1, n_categories)
    tensor[0][li] = 1
    return tensor
def inputTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li in range(len(line)):
        letter = line[li]
        tensor[li][0][all_letters.find(letter)] = 1
    return tensor

def targetTensor(line):
    letter_indexes = [all_letters.find(line[li]) for li in range(1, len(line))]
    letter_indexes.append(n_letters - 1) # EOS
    return torch.LongTensor(letter_indexes)

In [9]:
def randomTrainingExample():
    category, line = randomTrainingPair()
    category_tensor = categoryTensor(category)
    input_line_tensor = inputTensor(line)
    target_line_tensor = targetTensor(line)
    return category_tensor, input_line_tensor, target_line_tensor