In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
#定义数据
class SurnameDataset(Dataset):
    def __init__(self, part):
        data = pd.read_csv('./data/surnames/数字化数据.csv')
        data = data[data.part == part]
        self.data = data

    def __getitem__(self, i):
        return self.data.iloc[i, 0], self.data.iloc[i, 1]

    def __len__(self):
        return len(self.data)


train_dataset = SurnameDataset(part='train')
val_dataset = SurnameDataset(part='val')
test_dataset = SurnameDataset(part='test')

print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

7680
1640
1660


In [3]:
def to_tensor(data):
    N = len(data)
    #N句话,每句话15个词
    xs = np.zeros((N, 15))
    ys = np.empty(N)
    for i in range(N):
        x, y = data[i]
        ys[i] = y

        x = x.split(',') + [0] * 15
        x = x[:15]
        xs[i] = x

    return torch.LongTensor(xs), torch.LongTensor(ys)


#数据加载器
train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=100,
                              shuffle=True,
                              drop_last=True,
                              collate_fn=to_tensor)

val_dataloader = DataLoader(dataset=val_dataset,
                            batch_size=100,
                            shuffle=True,
                            drop_last=True,
                            collate_fn=to_tensor)

test_dataloader = DataLoader(dataset=test_dataset,
                             batch_size=100,
                             shuffle=True,
                             drop_last=True,
                             collate_fn=to_tensor)

#遍历数据
for i, data in enumerate(train_dataloader):
    x, y = data
    print(x[:5], x.shape)
    print(y[:5], y.shape)
    break

tensor([[10,  3, 16,  2, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 4, 20, 13, 27, 18, 17,  6,  9,  0,  0,  0,  0,  0,  0,  0],
        [20,  6,  1, 15,  4, 13, 18,  9,  2, 27,  0,  0,  0,  0,  0],
        [12, 13, 11, 16,  2,  6, 10,  0,  0,  0,  0,  0,  0,  0,  0],
        [10,  3,  7, 15, 20, 17,  8,  8, 13,  0,  0,  0,  0,  0,  0]]) torch.Size([100, 15])
tensor([ 6, 13, 13, 12, 12]) torch.Size([100])


In [4]:
#定义网络模型
class SurnameClassifier(nn.Module):
    def __init__(self):
        super(SurnameClassifier, self).__init__()

        self.embedding = nn.Embedding(num_embeddings=30,
                                      embedding_dim=50,
                                      padding_idx=0)

        self.rnn_cell = nn.RNNCell(50, 100)

        self.fc1 = nn.Linear(in_features=100, out_features=100)
        self.fc2 = nn.Linear(in_features=100, out_features=18)

    def forward(self, x):

        b = x.shape[0]

        #[b,15] -> [b,15,20]
        embed = self.embedding(x)

        #[b,15,20] -> [b,30]
        out = torch.zeros((b, 100))
        for i in range(15):
            out = self.rnn_cell(embed[:, i, :], out)

        #[b,30] -> [b,18]
        out = F.relu(self.fc1(F.dropout(out, 0.5)))
        out = self.fc2(F.dropout(out, 0.5))

        return out


model = SurnameClassifier()
model(torch.ones(2, 15).long())

tensor([[ 0.3216, -0.2514,  0.0087, -0.2214, -0.4007, -0.0327, -0.3790, -0.0906,
         -0.2225,  0.2062,  0.2101, -0.0179,  0.3571, -0.1342,  0.1497,  0.1468,
         -0.0438,  0.2620],
        [ 0.1919,  0.1054, -0.1934, -0.0700, -0.1049,  0.1769, -0.2508, -0.0366,
          0.0531,  0.0722, -0.2265, -0.1512,  0.0072, -0.0551,  0.0144,  0.1699,
         -0.0732,  0.1712]], grad_fn=<AddmmBackward>)

In [5]:
def test(dataloader):

    model.eval()

    correct = 0
    total = 0
    for i, data in enumerate(dataloader):
        x, y = data

        y_pred = model(x)
        y_pred = y_pred.argmax(dim=1)

        correct += (y_pred == y).sum().item()
        total += len(y)

    return correct / total


test(val_dataloader)

0.045

In [6]:
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

model.train()
for epoch in range(20):
    for i, data in enumerate(train_dataloader):
        x, y = data

        optimizer.zero_grad()
        y_pred = model(x)

        loss = loss_func(y_pred, y)
        loss.backward()
        optimizer.step()

    if epoch % 1 == 0:
        accurecy = test(val_dataloader)
        print(epoch, loss.item(), accurecy)

0 2.1757946014404297 0.253125
1 2.196861982345581 0.28125
2 1.7546985149383545 0.453125
3 1.6095668077468872 0.495
4 1.5190417766571045 0.52375
5 1.5627827644348145 0.54125
6 1.4074265956878662 0.56
7 1.3997132778167725 0.586875
8 1.5140800476074219 0.61625
9 1.5021535158157349 0.620625
10 1.3012571334838867 0.610625
11 1.3222830295562744 0.631875
12 1.3981198072433472 0.6525
13 1.2540010213851929 0.64
14 1.4072508811950684 0.63625
15 1.2692131996154785 0.64625
16 1.2988580465316772 0.656875
17 1.118596076965332 0.635625
18 1.3047208786010742 0.6625
19 1.0393601655960083 0.655


In [7]:
test(test_dataloader)

0.655625