In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
#定义数据
class SurnameDataset(Dataset):
    def __init__(self, part):
        data = pd.read_csv('./data/surnames/数字化数据.csv')
        data = data[data.part == part]

        #去掉少于3个字符的名字
        def filter_by_len(line):
            return len(line.x.split(',')) >= 3

        data = data[data.apply(filter_by_len, axis=1)]

        self.data = data

    def __getitem__(self, i):
        return self.data.iloc[i, 0], self.data.iloc[i, 1]

    def __len__(self):
        return len(self.data)


train_dataset = SurnameDataset(part='train')
val_dataset = SurnameDataset(part='val')
test_dataset = SurnameDataset(part='test')

print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

7637
1626
1646


In [3]:
def to_tensor(data):
    N = len(data)
    #N句话,每句话14个词
    xs = np.zeros((N, 14))
    #尾字母
    ys = np.empty(N)

    for i in range(N):
        x, y = data[i]

        x = x.split(',')
        
        #取x的最后一个字母作为y
        ys[i] = x[-1]

        #x去掉最后一个字母
        x = x[:-1]
        
        #反转后补0
        x = x[::-1] + ['0'] * 14
        
        #切割到14位长度
        x = x[:14]
        
        #反转回来
        x = x[::-1]
        xs[i] = x

    return torch.LongTensor(xs), torch.LongTensor(ys)


#数据加载器
train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=100,
                              shuffle=True,
                              drop_last=True,
                              collate_fn=to_tensor)

val_dataloader = DataLoader(dataset=val_dataset,
                            batch_size=100,
                            shuffle=True,
                            drop_last=True,
                            collate_fn=to_tensor)

test_dataloader = DataLoader(dataset=test_dataset,
                             batch_size=100,
                             shuffle=True,
                             drop_last=True,
                             collate_fn=to_tensor)

#遍历数据
sample = None
for i, data in enumerate(train_dataloader):
    sample = data
    x, y = data
    print(x[:3], x.shape)
    print(y[:3], y.shape)
    break

tensor([[ 0,  0,  0,  0,  0,  0,  0, 20,  3,  5, 10, 17,  2, 20],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 14,  6, 13, 10, 10],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 13, 27,  3, 18]]) torch.Size([100, 14])
tensor([ 3,  3, 12]) torch.Size([100])


In [4]:
#定义网络模型
class SurnameClassifier(nn.Module):
    def __init__(self):
        super(SurnameClassifier, self).__init__()

        self.embedding = nn.Embedding(num_embeddings=30,
                                      embedding_dim=50,
                                      padding_idx=0)

        self.rnn = nn.GRU(input_size=50, hidden_size=100, batch_first=True)

        self.fc1 = nn.Linear(in_features=100, out_features=100)
        self.fc2 = nn.Linear(in_features=100, out_features=30)

    def forward(self, x):

        #[b,14] -> [b,14,50]
        embed = self.embedding(x)

        #[b,14,50] -> [b,14,100],[1,b,100]
        out, h = self.rnn(embed)

        #[b,100] -> [b,30]
        out = F.relu(self.fc1(F.dropout(h.squeeze(), 0.2)))
        out = self.fc2(F.dropout(out, 0.2))

        return out


model = SurnameClassifier()
model(sample[0])

tensor([[ 0.0758,  0.1360, -0.0190,  ..., -0.1311,  0.0907, -0.0893],
        [ 0.1325,  0.0937, -0.0643,  ..., -0.1381, -0.0031, -0.0068],
        [ 0.0105,  0.0443, -0.0878,  ..., -0.0548, -0.0068, -0.0708],
        ...,
        [-0.0507,  0.1098, -0.0063,  ..., -0.0687,  0.0334, -0.1521],
        [ 0.0861,  0.1112,  0.1128,  ..., -0.0808,  0.0130, -0.1882],
        [ 0.0029,  0.1162, -0.0275,  ...,  0.0031, -0.0506, -0.0095]],
       grad_fn=<AddmmBackward>)

In [5]:
def test(dataloader):

    model.eval()

    correct = 0
    total = 0
    for i, data in enumerate(dataloader):
        x, y = data

        y_pred = model(x)
        y_pred = y_pred.argmax(dim=1)

        correct += (y_pred == y).sum().item()
        total += len(y)

    return correct / total


test(val_dataloader)

0.018125

In [6]:
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

model.train()
for epoch in range(20):
    for i, data in enumerate(train_dataloader):
        x, y = data

        optimizer.zero_grad()
        y_pred = model(x)

        loss = loss_func(y_pred, y)
        loss.backward()
        optimizer.step()

    if epoch % 1 == 0:
        accurecy = test(val_dataloader)
        print(epoch, loss.item(), accurecy)

0 2.2973945140838623 0.300625
1 1.9206297397613525 0.401875
2 1.6891889572143555 0.4375
3 1.960289478302002 0.47375
4 1.8121192455291748 0.484375
5 1.6147767305374146 0.5
6 1.5749751329421997 0.5175
7 1.4665619134902954 0.535625
8 1.4422262907028198 0.540625
9 1.2665151357650757 0.554375
10 1.5351853370666504 0.56375
11 1.1230456829071045 0.5675
12 1.4524245262145996 0.578125
13 1.3921475410461426 0.566875
14 0.9999076128005981 0.57375
15 1.428695559501648 0.59125
16 1.2512894868850708 0.58125
17 0.9031326174736023 0.57625
18 0.9328461289405823 0.58
19 0.9816864728927612 0.58375


In [7]:
test(test_dataloader)

0.57