### 读取文本数据并转换编码

In [1]:
import torch
from io import open
import os
import unicodedata
import string
import torch.nn as nn
import time

# Alphabet used for one-hot encoding; the trailing '&' is the end-of-sequence
# (EOF) marker.
# NOTE(review): the apostrophe appears twice in the punctuation suffix, so one
# slot is never returned by all_letters.index(); kept as-is so that n_letters
# (and therefore the model width) does not change.
all_letters = string.ascii_letters + ".,:''&"
n_letters = len(all_letters)

def unicode2ascii(s):
    """
    Reduce a Unicode string to the characters present in ``all_letters``.

    The input is NFD-normalized first; combining marks (Unicode category
    ``Mn``) are discarded, and so is every character that is not part of
    ``all_letters``. The surviving characters are returned as one string.
    """
    kept = []
    for ch in unicodedata.normalize("NFD", s):
        # Skip combining marks (e.g. accents split off by NFD).
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)

def lines2ascii(file):
    """
    Convert each line of an open text file into a cleaned ASCII string.

    Args:
        file: an iterable of text lines, e.g. a file opened in text mode.

    Returns:
        A list with one cleaned, whitespace-stripped string per input line.
        Lines that clean down to an empty string (blank lines, or lines made
        only of characters outside the alphabet) are skipped, because an empty
        name cannot be used as a training sequence.
    """
    cleaned = (unicode2ascii(line).strip() for line in file)
    return [name for name in cleaned if name]


def read_data(root):
    """
    Read every ``*.txt`` file in a directory into a language -> names dict.

    Args:
        root: path of the directory holding one ``<language>.txt`` file per
            language.

    Returns:
        A dict mapping each file's base name (the file name without its
        ``.txt`` suffix) to the list returned by ``lines2ascii`` for that file.
        Keys are inserted in sorted file-name order.
    """
    suffix = '.txt'
    result = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the key
    # order (and any index derived from it) reproducible between runs.
    for file_name in sorted(os.listdir(root)):
        if not file_name.endswith(suffix):
            continue
        # Strip only the trailing suffix; str.replace would also remove any
        # '.txt' occurring in the middle of the name.
        lan = file_name[:-len(suffix)]
        with open(os.path.join(root, file_name), encoding='UTF-8') as file:
            result[lan] = lines2ascii(file)
    return result


# Load the language -> names dictionary from the dataset directory
# (the path is relative to the current working directory).
train_data = read_data('./data/names')

In [2]:
# Category (language) names in the dictionary's key order; a category's
# position in this list is used as its index elsewhere in the file.
all_categories = list(train_data)
n_category = len(all_categories)

In [3]:
def char2Vec(c):
    """
    Encode one character as a one-hot vector.

    Returns a 1-D tensor of length ``len(all_letters)`` that is zero
    everywhere except for a 1 at the position of ``c`` in ``all_letters``.
    ``str.index`` raises ``ValueError`` when ``c`` is not in the alphabet.
    """
    one_hot = torch.zeros(len(all_letters))
    position = all_letters.index(c)
    one_hot[position] = 1
    return one_hot
print(char2Vec('d'))

def get_train_vec(name):
    """
    Encode a string as a stack of one-hot rows.

    The result has shape ``(len(name), 1, len(all_letters))``; row ``i`` holds
    a single 1 at the alphabet position of ``name[i]``.
    """
    encoded = torch.zeros(len(name), 1, len(all_letters))
    for pos, ch in enumerate(name):
        encoded[pos, 0, all_letters.index(ch)] = 1
    return encoded

print(get_train_vec('ab'))

def get_target_vec(name):
    """
    Build the next-character targets for ``name``.

    The EOF marker '&' is appended and the first character dropped, so the
    target at step ``i`` is the character that follows ``name[i]``. The result
    is a plain list of alphabet indices (class indices, as NLLLoss expects),
    not a tensor.
    """
    shifted = (name + '&')[1:]
    return [all_letters.index(ch) for ch in shifted]

print(get_target_vec('ab'))

tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])
tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]]])
[1, 57]


### RNN

In [4]:
class Rnn(nn.Module):
    """
    Single-step recurrent cell conditioned on a category vector.

    Each call to ``forward`` consumes one input step together with the
    category vector and the previous hidden state, and produces
    log-probabilities over ``output_size`` classes plus the next hidden state.
    """

    def __init__(self, input_size, output_size, category_size, hidden_size):
        """
        Create the layers.

        Args:
            input_size: width of one input step.
            output_size: number of output classes.
            category_size: width of the category vector.
            hidden_size: width of the hidden state.
        """
        super(Rnn, self).__init__()
        self.hidden_size = hidden_size
        combined_size = input_size + hidden_size + category_size
        self.i2h = nn.Linear(combined_size, hidden_size)
        self.i2o = nn.Linear(combined_size, output_size)
        self.o2o = nn.Linear(output_size + hidden_size, output_size)
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, in_tensor, category_tensor, hidden_tensor):
        """
        Run one recurrent step.

        All three arguments must be 2-D with the same first dimension, because
        they are concatenated along dim 1.

        Returns:
            (out_tensor, hidden): log-probabilities of shape
            ``(rows, output_size)`` and the new hidden state of shape
            ``(rows, hidden_size)``.
        """
        cat_tensor = torch.cat((in_tensor, hidden_tensor, category_tensor), 1)
        hidden = self.i2h(cat_tensor)
        out_tensor = self.i2o(cat_tensor)

        # Mix the preliminary output with the new hidden state.
        out_tensor = torch.cat((out_tensor, hidden), 1)
        out_tensor = self.o2o(out_tensor)

        out_tensor = self.dropout(out_tensor)
        out_tensor = self.softmax(out_tensor)
        return out_tensor, hidden

    def init_hidden(self):
        """
        Return a fresh random hidden state of shape ``(1, hidden_size)``.

        The leading dimension of 1 is required so the state can be
        concatenated along dim 1 with a single-row input in ``forward``.
        """
        return torch.randn(1, self.hidden_size)

In [5]:
import random
def get_category(output):
    """
    Pick the highest-scoring entry of ``output`` and map it to a category.

    Returns a ``(category_name, category_index)`` pair, where the index is
    the top-1 position reported by ``topk`` for the first row of ``output``.
    """
    _, best = output.topk(1)
    index = best[0].item()
    return all_categories[index], index

def random_chioce(data):
    """
    Return one uniformly chosen element of the sequence ``data``.

    Raises ``ValueError`` when ``data`` is empty.
    """
    position = random.randrange(len(data))
    return data[position]

def get_vec_label(lan):
    """
    Encode a language name as a one-hot row vector.

    Returns a tensor of shape ``(1, n_category)`` with a 1 at the position of
    ``lan`` in ``all_categories``.
    """
    one_hot = torch.zeros(1, n_category)
    one_hot[0, all_categories.index(lan)] = 1
    return one_hot

def get_train_examples(train_data):
    """
    Draw one random training example.

    A language is picked at random from ``all_categories``, then a name is
    picked at random from that language's list in ``train_data``.

    Returns:
        (name_vec, lan_vec, target_vec): the one-hot encoded name, the one-hot
        encoded language and the list of next-character target indices.
    """
    language = random_chioce(all_categories)
    sample = random_chioce(train_data[language])
    return get_train_vec(sample), get_vec_label(language), get_target_vec(sample)

print(get_train_examples(train_data))

(tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 

### train step

In [6]:
# Width of the recurrent hidden state.
hidden_size = 1024
# The network reads and predicts characters from the same alphabet, so the
# input and output widths are both n_letters.
rnn = Rnn(n_letters, n_letters, n_category, hidden_size)
# Loss history, appended to by train_all.
all_losses = []
# Negative log-likelihood loss, applied to the log-probabilities from rnn.
loss_func = nn.NLLLoss()
# Step size for the manual parameter update in train_one.
lr = 0.001

def train_one(train_tensor, target_tensor, category_tensor):
    """
    Train on a single sequence and apply one plain gradient-descent update.

    Args:
        train_tensor: input sequence; ``train_tensor[i]`` is fed to the
            network as step ``i``.
        target_tensor: one target class index per step (a list of ints or an
            integer tensor).
        category_tensor: category vector passed unchanged to every step.

    Returns:
        (out, avg_loss): the network output of the last step and the summed
        loss divided by the number of steps.

    Raises:
        ValueError: if ``train_tensor`` contains no steps.
    """
    train_len = train_tensor.size()[0]
    if train_len == 0:
        # With no steps there is no loss tensor to backpropagate, and the
        # average below would divide by zero.
        raise ValueError("train_one requires a non-empty training sequence")

    # NLLLoss needs integer class-index tensors, not Python ints.
    targets = torch.as_tensor(target_tensor, dtype=torch.long)

    rnn.zero_grad()
    hidden = rnn.init_hidden()

    loss = 0
    for i in range(train_len):
        out, hidden = rnn(train_tensor[i], category_tensor, hidden)
        # Slice (not index) so the target keeps a leading dimension of 1,
        # matching the single-row output expected from the network.
        loss += loss_func(out, targets[i:i + 1])

    loss.backward()

    # Manual SGD step: p <- p - lr * grad. Parameters that received no
    # gradient are left untouched.
    with torch.no_grad():
        for p in rnn.parameters():
            if p.grad is not None:
                p.add_(p.grad, alpha=-lr)
    return out, loss.item() / train_len
    
    

In [None]:
import time
# Total number of training iterations (one random example each).
iters = 200000
# Interval, in iterations, between progress printouts.
print_every = 5000
# Interval, in iterations, between points appended to all_losses.
plot_every = 500

def train_all(train_data):
    """
    Run the full training loop.

    For ``iters`` iterations, draw a random example from ``train_data`` and
    train on it. Every ``plot_every`` iterations the mean loss of that window
    is appended to ``all_losses``; every ``print_every`` iterations the
    elapsed time in seconds and the loss accumulated since the last plot
    point are printed.

    Args:
        train_data: dict mapping a language to its list of names.
    """
    start = time.time()
    cur_loss = 0

    # Count from 1 so the first plot point averages a full window of
    # plot_every samples rather than the single sample seen at i == 0.
    for i in range(1, iters + 1):
        name_vec, lan_vec, target_vec = get_train_examples(train_data)
        _, loss = train_one(name_vec, target_vec, lan_vec)
        cur_loss += loss

        # Report before the plotting branch resets cur_loss; otherwise the
        # printed value is 0 whenever both intervals coincide.
        if i % print_every == 0:
            print(time.time() - start)
            print(cur_loss)

        if i % plot_every == 0:
            all_losses.append(cur_loss / plot_every)
            cur_loss = 0
    