In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
#定义数据
class SurnameDataset(Dataset):
    def __init__(self, part):
        data = pd.read_csv('./data/surnames/数字化数据.csv')
        data = data[data.part == part]
        self.data = data

    def __getitem__(self, i):
        return self.data.iloc[i, 0], self.data.iloc[i, 1]

    def __len__(self):
        return len(self.data)


train_dataset = SurnameDataset(part='train')
val_dataset = SurnameDataset(part='val')
test_dataset = SurnameDataset(part='test')

print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

7680
1640
1660


In [3]:
#x转one hot编码
def one_hot(data):
    N = len(data)
    #N句话,每句话15个词,每个词是个29维向量
    xs = np.zeros((N, 15, 29))
    ys = np.empty(N)
    for i in range(N):
        x, y = data[i]
        ys[i] = y

        x = x.split(',')
        for j in range(min(15, len(x))):
            xs[i, j, int(x[j]) - 1] = 1

    return torch.FloatTensor(xs), torch.LongTensor(ys)


#数据加载器
train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=100,
                              shuffle=True,
                              drop_last=True,
                              collate_fn=one_hot)

val_dataloader = DataLoader(dataset=val_dataset,
                            batch_size=100,
                            shuffle=True,
                            drop_last=True,
                            collate_fn=one_hot)

test_dataloader = DataLoader(dataset=test_dataset,
                             batch_size=100,
                             shuffle=True,
                             drop_last=True,
                             collate_fn=one_hot)

#遍历数据
for i, data in enumerate(train_dataloader):
    x, y = data
    print(x[:2, :2], x.shape)
    print(y[:5], y.shape)
    break

tensor([[[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]]) torch.Size([100, 15, 29])
tensor([17, 15, 12, 17,  5]) torch.Size([100])


In [4]:
#定义网络模型
class SurnameClassifier(nn.Module):
    def __init__(self):
        super(SurnameClassifier, self).__init__()

        h = 50

        #[b,h,27] -> [b,h,13]
        self.conv1 = nn.Conv1d(in_channels=15,
                               out_channels=h,
                               kernel_size=5,
                               stride=2)

        #[b,h,13] -> [b,h,5]
        self.conv2 = nn.Conv1d(in_channels=h,
                               out_channels=h,
                               kernel_size=5,
                               stride=2)

        #[b,h,5] -> [b,h,1]
        self.conv3 = nn.Conv1d(in_channels=h,
                               out_channels=h,
                               kernel_size=5,
                               stride=1)

        #激活函数
        self.elu = nn.ELU()

        self.convnet = nn.Sequential(self.conv1, self.elu, self.conv2,
                                     self.elu, self.conv3, self.elu)

        self.fc = nn.Linear(h, 18)

    def forward(self, x):
        #out = self.conv1(x)
        #print(out.shape)

        #out = self.conv2(out)
        #print(out.shape)

        #out = self.conv3(out)
        #print(out.shape)

        #[b,h,27] -> [b,h]
        out = self.convnet(x).squeeze(dim=2)

        #[b,h] -> [b,18]
        out = self.fc(out)
        return out


model = SurnameClassifier()
model(torch.randn(2, 15, 29))

tensor([[ 0.0775,  0.1844, -0.1364, -0.0393,  0.0851, -0.0010,  0.1305, -0.0033,
          0.1114, -0.0167, -0.2325, -0.1800, -0.0974, -0.0969,  0.0864, -0.0837,
         -0.1012,  0.2431],
        [ 0.1589,  0.1636, -0.0502, -0.1285,  0.0600, -0.0434, -0.0683, -0.0279,
          0.0972,  0.0287, -0.2299, -0.0088,  0.0223,  0.0191, -0.0168, -0.0585,
         -0.0709,  0.1587]], grad_fn=<AddmmBackward>)

In [5]:
def test(dataloader):

    model.eval()

    correct = 0
    total = 0
    for i, data in enumerate(dataloader):
        x, y = data

        y_pred = model(x)
        y_pred = y_pred.argmax(axis=1)

        correct += (y_pred == y).sum().item()
        total += len(y)

    return correct / total


test(val_dataloader)

0.025625

In [6]:
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

model.train()
for epoch in range(10):
    for i, data in enumerate(train_dataloader):
        x, y = data

        optimizer.zero_grad()
        y_pred = model(x)

        loss = loss_func(y_pred, y)
        loss.backward()
        optimizer.step()

    if epoch % 1 == 0:
        accurecy = test(val_dataloader)
        print(epoch, loss.item(), accurecy)

0 2.205754041671753 0.30375
1 1.685968041419983 0.529375
2 1.3684632778167725 0.5575
3 1.2239166498184204 0.579375
4 1.2784945964813232 0.595
5 1.066778540611267 0.606875
6 1.2280161380767822 0.635
7 1.259406566619873 0.638125
8 1.1004050970077515 0.6575
9 1.0219842195510864 0.645


In [7]:
test(test_dataloader)

0.659375