## 简单的字符级循环神经网络（RNN）：按语言对姓氏进行分类

###  读取数据

In [1]:
import torch
from io import open
import os
import unicodedata
import string

In [2]:
# Alphabet of characters kept from the raw names: the ASCII letters followed
# by a few punctuation marks. A character's index in this string is the
# position that gets set in its one-hot vector.
# NOTE(review): the apostrophe appears twice, so the final slot can never be
# selected by all_letters.index(); confirm whether another character (e.g. a
# space) was intended.
all_letters = "".join([string.ascii_letters, ".,:''"])
# Width of every one-hot character vector.
n_letters = len(all_letters)

In [3]:
def unicode2ascii(s):
    """Convert a Unicode string to a plain ASCII string.

    The input is NFD-normalised so that accented characters split into a base
    character plus combining marks. Combining marks (Unicode category ``Mn``)
    are dropped, and so is every character that is not in ``all_letters``.
    """
    kept = []
    for ch in unicodedata.normalize("NFD", s):
        if unicodedata.category(ch) == 'Mn':
            continue  # combining accent: discard, the base letter is kept
        if ch not in all_letters:
            continue  # outside the supported alphabet
        kept.append(ch)
    return ''.join(kept)


# Demo: accents are stripped from the name.
print(unicode2ascii('Ślusàrski'))

Slusarski


In [4]:
def lines2ascii(file):
    """Return every line of *file* converted to ASCII and stripped.

    Each line read from the open file object is passed through
    ``unicode2ascii`` and then has surrounding whitespace removed.
    """
    return [unicode2ascii(raw_line).strip() for raw_line in file.readlines()]


def read_data(root):
    """Read the name files under *root* into a dictionary.

    Every directory entry whose name ends in '.txt' is opened as UTF-8. The
    key is the file name with '.txt' removed (the language) and the value is
    the list produced by ``lines2ascii`` for that file.
    """
    names_by_language = {}
    for entry in os.listdir(root):
        if not str(entry).endswith('.txt'):
            continue  # skip anything that is not a name list
        language = entry.replace('.txt', '')
        path = os.path.join(root, entry)
        with open(path, encoding='UTF-8') as handle:
            names_by_language[language] = lines2ascii(handle)
    return names_by_language


# Load every name list under ./data/names into a dict keyed by the file name
# without its '.txt' suffix.
data = read_data('./data/names')

In [5]:
# Category labels are the keys of the loaded data dict, in dict order; the
# count is passed to Rnn as its output size.
all_categories = [*data.keys()]
n_category = len(all_categories)

### 将字符数据映射为向量数据，使用 one-hot 向量表示

In [6]:
def char2Vec(c):
    """Map a single character to its one-hot vector.

    The result is a 1-D tensor with ``len(all_letters)`` entries: all zeros
    except for a 1 at the index of *c* in ``all_letters``. ``str.index``
    raises ``ValueError`` when *c* is not part of the alphabet.
    """
    position = all_letters.index(c)
    one_hot = torch.zeros(len(all_letters))
    one_hot[position] = 1
    return one_hot


# Demo: 'd' is at index 3 of the alphabet.
print(char2Vec('d'))

tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.])


In [7]:
def str2Vec(name):
    """Encode a string as a stack of one-hot character vectors.

    Returns a tensor of shape ``(len(name), 1, len(all_letters))``; row *i*
    holds the one-hot encoding of ``name[i]``.
    """
    encoded = torch.zeros(len(name), 1, len(all_letters))
    for row, ch in enumerate(name):
        encoded[row, 0, all_letters.index(ch)] = 1
    return encoded


# Demo: two characters -> two one-hot rows.
print(str2Vec('ab'))

tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]]])


### 构建RNN

In [8]:
import torch.nn as nn

In [9]:
class Rnn(nn.Module):
    """Minimal hand-written recurrent cell for character-level classification.

    At each step the current input is concatenated with the previous hidden
    state; one linear layer produces the next hidden state and another
    produces the class scores, which are normalised with softmax.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers used by a single recurrent step.

        Args:
            input_size: width of one input vector.
            hidden_size: width of the hidden state.
            output_size: number of output classes.
        """
        super(Rnn, self).__init__()

        self.hidden_size = hidden_size

        # Both layers consume the concatenation [input, hidden].
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)

        # NOTE(review): this yields plain probabilities. nn.NLLLoss expects
        # log-probabilities, so if the output is fed to it directly this
        # should be nn.LogSoftmax — confirm against the training code.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, data, hidden):
        """Run one recurrent step.

        Args:
            data: input of shape ``(batch, input_size)``.
            hidden: previous hidden state of shape ``(batch, hidden_size)``.

        Returns:
            ``(output, new_hidden)``: the softmax-normalised class scores and
            the updated hidden state to pass to the next step.
        """
        combine = torch.cat((data, hidden), dim=1)
        new_hidden = self.i2h(combine)
        output = self.softmax(self.i2o(combine))
        # Return the freshly computed state. Returning the incoming `hidden`
        # would freeze the recurrence at its initial value.
        return output, new_hidden

    def init_hidden(self):
        """Create a fresh random hidden state of shape ``(1, hidden_size)``.

        The state is stored on ``self.hidden`` and also returned so it can be
        passed straight to ``forward``.
        """
        self.hidden = torch.randn(1, self.hidden_size)
        return self.hidden

# Width of the hidden state.
n_hidden = 1024
# One input slot per alphabet character, one output per category.
rnn = Rnn(
    input_size=n_letters,
    hidden_size=n_hidden,
    output_size=n_category,
)

> 测试rnn运行单步


In [10]:
# Single-step smoke test: push the first character of one name through the
# network, starting from a random hidden state.
hidden = torch.randn(1, n_hidden)
name = "alex"
# NOTE(review): this rebinds the global `data`, which until now held the dict
# returned by read_data(); any later cell that needs the name lists will see
# this tensor instead — confirm, or use a different variable name here.
data = str2Vec(name)
# data[0] is the (1, n_letters) one-hot row for the first character.
output, new_hidden = rnn(data[0], hidden)
print(output)

tensor([[0.0909, 0.0207, 0.0207, 0.0906, 0.0294, 0.0796, 0.0682, 0.0196, 0.0448,
         0.1109, 0.1063, 0.0388, 0.0581, 0.0495, 0.0426, 0.0499, 0.0326, 0.0468]],
       grad_fn=<SoftmaxBackward>)


###  训练网络

In [None]:
# Negative log-likelihood loss.
# NOTE(review): NLLLoss expects log-probabilities, while Rnn.forward applies
# nn.Softmax (plain probabilities) — confirm the training step takes a log
# first, or switch the model to nn.LogSoftmax.
criterion = nn.NLLLoss()
# Learning rate (presumably consumed by train(); its body is not visible here).
lr = 0.002
def train():