In [3]:
from io import open
import glob
import os
import unicodedata
import string


all_letters = string.ascii_letters + ".,;'-"
n_letters = len(all_letters) + 1

def findFiles(path): return glob.glob(path)

#turn unicode string to ascii
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD',s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

def readLines(filename):
    with open(filename,encoding='utf-8') as some_file:
        return [unicodeToAscii(line.strip()) for line in some_file]

# build category_lines dictionary, a list of lines per category
category_lines = {}
all_categories = []
for filename in findFiles('/home/hq/prjts/matusalem/dados/nomes/data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

if n_categories == 0:
    raise RuntimeError('Data not found. make sure you downloaded it!')
print("# categories:", n_categories, all_categories)
print(unicodeToAscii("Jo√£o"))

# categories: 18 ['Polish', 'German', 'Greek', 'Czech', 'French', 'Japanese', 'Scottish', 'English', 'Chinese', 'Korean', 'Dutch', 'Spanish', 'Vietnamese', 'Portuguese', 'Italian', 'Russian', 'Irish', 'Arabic']
Joao


In [4]:
import torch
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self,input_size,hidden_size,output_size):
        super(RNN,self).__init__()
        self.hidden_size = hidden_size

        self.i2h = nn.Linear(n_categories + input_size + hidden_size,hidden_size)
        self.i2o = nn.Linear(n_categories + input_size + hidden_size,output_size)
        self.o2o = nn.Linear(hidden_size + output_size,output_size)
        self.dropout = nn.Dropout(0.5)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, category, input, hidden):
        input_combined = torch.cat((category,input,hidden),1)
        hidden = self.i2h(input_combined)
        output = self.i2o(input_combined)
        output_combined = torch.cat((hidden,output),1)
        output = self.o2o(output_combined)
        output = self.dropout(output)
        output = self.softmax(output)
        return output,hidden
    
    def initHidden(self):
        return torch.zeros(1,self.hidden_size)


In [7]:
import random

def randomTrainingPair():
    category = random.choice(all_categories)
    line = random.choice(category_lines[category])
    return category,line


('Korean', 'Sin')