In [1]:
# Vocabulary: maps every token to its integer id. Ids follow the order of the
# token list below: padding, digits 1-9 then 0, month abbreviations,
# separators, and finally the start/end-of-sequence markers.
zidian = {
    token: index
    for index, token in enumerate(
        ['<PAD>']
        + list('1234567890')
        + ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
           'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        + ['-', '/', '<SOS>', '<EOS>'])
}

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import datetime
from torch.utils.data import Dataset, DataLoader


class DateDataset(Dataset):
    """Synthetic dataset of random dates rendered in two textual formats.

    Every item is generated on the fly; the requested index is ignored, so
    fetching the same index twice generally yields different dates.
    """

    def __init__(self):
        pass

    def __len__(self):
        # Nominal epoch size; the samples themselves are random.
        return 2000

    def __getitem__(self, index):
        """Return one ``(source, target)`` pair of token-id tensors.

        ``source`` encodes the date as ``yy-mm-dd`` character by character
        (8 ids). ``target`` encodes ``dd/Mon/YYYY`` wrapped in ``<SOS>`` and
        ``<EOS>``, with the month abbreviation as a single token (11 ids).
        """
        # Draw a random Unix timestamp and convert it to a local datetime.
        stamp = np.random.randint(143835585, 2043835585)
        moment = datetime.datetime.fromtimestamp(stamp)

        # Render both formats, e.g. 05-06-15 and 15/Jun/2005.
        short_form = moment.strftime("%y-%m-%d")
        long_form = moment.strftime("%d/%b/%Y")

        # The short form is encoded one character at a time.
        source_ids = [zidian[ch] for ch in short_form]

        # The long form is split into "dd/", the month abbreviation (one
        # token) and "/YYYY", then framed by the start/end markers.
        target_tokens = [
            '<SOS>',
            *long_form[:3],
            long_form[3:6],
            *long_form[6:],
            '<EOS>',
        ]
        target_ids = [zidian[token] for token in target_tokens]

        return torch.LongTensor(source_ids), torch.LongTensor(target_ids)


dataloader = DataLoader(DateDataset(),
                        batch_size=100,
                        shuffle=True,
                        drop_last=True)

# Pull a single batch so its contents and shapes can be inspected.
sample = next(iter(dataloader))
sample[0][:5], sample[0].shape, sample[1][:5], sample[1].shape

(tensor([[10,  3, 23,  1,  2, 23,  1,  7],
         [ 7,  4, 23,  1,  1, 23,  2, 10],
         [ 1,  4, 23,  1,  2, 23, 10,  6],
         [ 9,  3, 23, 10,  1, 23,  1,  8],
         [ 8,  4, 23, 10,  7, 23,  1,  5]]),
 torch.Size([100, 8]),
 tensor([[25,  1,  7, 24, 22, 24,  2, 10, 10,  3, 26],
         [25,  2, 10, 24, 21, 24,  1,  9,  7,  4, 26],
         [25, 10,  6, 24, 22, 24,  2, 10,  1,  4, 26],
         [25,  1,  8, 24, 11, 24,  1,  9,  9,  3, 26],
         [25,  1,  5, 24, 17, 24,  1,  9,  8,  4, 26]]),
 torch.Size([100, 11]))

In [3]:
class CNN(nn.Module):
    """Sequence-to-sequence model: multi-width CNN encoder + LSTM-cell decoder.

    The encoder embeds the source token ids, applies three convolutions with
    window sizes 2, 3 and 4 over the sequence axis, max-pools each feature map
    and projects the concatenated features to a 32-dim vector. That vector is
    used as both the initial hidden state and the initial cell state of the
    decoder, which is run with teacher forcing in ``forward``.

    The pooling kernel sizes (7, 6, 5) match a source length of 8 tokens.
    """

    def __init__(self):
        super().__init__()

        # --- encoder ---
        # 27 vocabulary entries, each embedded as a 16-dim vector.
        self.encoder_embed = nn.Embedding(num_embeddings=27, embedding_dim=16)

        # Convolutions spanning the full embedding width and 2/3/4 tokens.
        self.encoder_conv_2 = nn.Conv2d(in_channels=1,
                                        out_channels=16,
                                        kernel_size=(2, 16),
                                        padding=0)
        self.encoder_conv_3 = nn.Conv2d(in_channels=1,
                                        out_channels=16,
                                        kernel_size=(3, 16),
                                        padding=0)
        self.encoder_conv_4 = nn.Conv2d(in_channels=1,
                                        out_channels=16,
                                        kernel_size=(4, 16),
                                        padding=0)

        # Max-pooling over the whole remaining sequence axis of each conv map.
        self.encoder_pool_7 = nn.MaxPool2d(kernel_size=(7, 1))
        self.encoder_pool_6 = nn.MaxPool2d(kernel_size=(6, 1))
        self.encoder_pool_5 = nn.MaxPool2d(kernel_size=(5, 1))

        # Projects the 3 * 16 pooled features to the decoder state size.
        self.encoder_fc = nn.Linear(in_features=48, out_features=32)

        # --- decoder ---
        # 27 vocabulary entries, each embedded as a 16-dim vector.
        self.decoder_embed = nn.Embedding(num_embeddings=27, embedding_dim=16)

        # 16-dim inputs, 32-dim hidden/cell state.
        self.decoder_cell = nn.LSTMCell(input_size=16, hidden_size=32)

        # 32-dim hidden state -> logits over the 27 vocabulary entries.
        self.out_fc = nn.Linear(in_features=32, out_features=27)

    def encode(self, x):
        """Encode source token ids into the decoder's initial state.

        Args:
            x: integer tensor of token ids, shape [b, 8].

        Returns:
            Tuple ``(h, c)`` where both entries are the same [b, 32] tensor,
            used as the initial hidden and cell state of the decoder.
        """
        # [b,8] -> [b,8,16]
        x = self.encoder_embed(x)

        # Add a channel axis so the sequence can be treated like an image.
        # [b,8,16] -> [b,1,8,16]
        x = x.unsqueeze(dim=1)

        # Convolutions.
        # [b,1,8,16] -> [b,16,7,1]
        conv_2 = F.relu(self.encoder_conv_2(x))

        # [b,1,8,16] -> [b,16,6,1]
        conv_3 = F.relu(self.encoder_conv_3(x))

        # [b,1,8,16] -> [b,16,5,1]
        conv_4 = F.relu(self.encoder_conv_4(x))

        # Pooling.
        # [b,16,7,1] -> [b,16,1,1]
        conv_2 = self.encoder_pool_7(conv_2)

        # [b,16,6,1] -> [b,16,1,1]
        conv_3 = self.encoder_pool_6(conv_3)

        # [b,16,5,1] -> [b,16,1,1]
        conv_4 = self.encoder_pool_5(conv_4)

        # Drop the trailing singleton axes while always keeping the batch
        # axis. A bare squeeze() would also remove the batch axis when b == 1
        # and break the concatenation below.
        # [b,16,1,1] -> [b,16]
        conv_2 = conv_2.flatten(start_dim=1)
        conv_3 = conv_3.flatten(start_dim=1)
        conv_4 = conv_4.flatten(start_dim=1)

        # Concatenate all pooled features.
        # [b,16],[b,16],[b,16] -> [b,48]
        h = torch.cat([conv_2, conv_3, conv_4], dim=1)

        # [b,48] -> [b,32]
        h = self.encoder_fc(h)

        return h, h

    def forward(self, x, y):
        """Run teacher-forced decoding and return per-step logits.

        Args:
            x: source token ids, shape [b, 8].
            y: target token ids including the leading start marker and the
                trailing end marker, shape [b, T] (T is 11 in this notebook).

        Returns:
            Logits of shape [b, T-1, 27]; step ``i`` predicts ``y[:, i+1]``.
        """
        # Encoder state, each [b,32].
        h, c = self.encode(x)

        # Drop the last target token: each step consumes one token and
        # predicts the next, so the final token is never used as an input.
        # [b,T] -> [b,T-1]
        y = y[:, :-1]

        # [b,T-1] -> [b,T-1,16]
        y = self.decoder_embed(y)

        # Feed the target tokens one by one through the LSTM cell. The number
        # of steps follows the actual target length instead of a fixed 10.
        outs = []
        for i in range(y.size(1)):

            # The first step starts from the encoder state; every later step
            # starts from the state produced by the previous step.
            # [b,16] -> [b,32],[b,32]
            h, c = self.decoder_cell(y[:, i], (h, c))

            # Turn the hidden state into vocabulary logits.
            # [b,32] -> [b,27]
            out = self.out_fc(h)
            outs.append(out)

        # Assemble the per-step logits into one tensor.
        outs = torch.stack(outs, dim=0)
        # [T-1,b,27] -> [b,T-1,27]
        outs = outs.permute(1, 0, 2)

        return outs


# Instantiate the network and run one forward pass on the sample batch as a
# quick sanity check of the output values and shape.
model = CNN()

out = model(*sample)
out[0, :2], out.shape

(tensor([[ 0.1473, -0.0341,  0.1209,  0.0353,  0.0812,  0.0153, -0.1422,  0.0362,
           0.1948,  0.1156, -0.0300,  0.0039, -0.1499,  0.1529,  0.1944, -0.0194,
           0.0252,  0.2329, -0.0066,  0.1822, -0.0936,  0.0540,  0.2010, -0.1069,
          -0.0065, -0.0252,  0.2074],
         [ 0.1502,  0.0118,  0.0584,  0.0598,  0.1239, -0.0718, -0.0987,  0.1868,
           0.2064,  0.0504,  0.0181,  0.0182, -0.0039,  0.0583,  0.1820,  0.1181,
          -0.0599,  0.0680,  0.0413,  0.1691, -0.1661,  0.0211,  0.1889, -0.1276,
          -0.0683, -0.0461,  0.2365]], grad_fn=<SliceBackward>),
 torch.Size([100, 10, 27]))

In [4]:
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

model.train()
for epoch in range(200):
    for i, (x, y) in enumerate(dataloader):
        optimizer.zero_grad()

        # Forward pass, flattened for the loss: [b,10,27] -> [b*10,27].
        y_pred = model(x, y).reshape(-1, 27)

        # The model predicts the *next* token at every step, so the leading
        # start marker is never a prediction target: drop it and flatten.
        # [b,11] -> [b,10] -> [b*10]
        y = y[:, 1:].reshape(-1)

        loss = loss_func(y_pred, y)
        loss.backward()
        optimizer.step()

    # Report the loss of the last batch every 10 epochs.
    if epoch % 10 == 0:
        print(epoch, loss.item())

0 1.9974745512008667
10 0.5927864909172058
20 0.24106037616729736
30 0.09821534156799316
40 0.046583663672208786
50 0.025665022432804108
60 0.015268062241375446
70 0.010911641642451286
80 0.013616204261779785
90 0.004029718227684498
100 0.0023262561298906803
110 0.10054083913564682
120 0.005719964858144522
130 0.0018067894270643592
140 0.0011895529460161924
150 0.10145141184329987
160 0.007229849696159363
170 0.0013624539133161306
180 0.0008759834454394877
190 0.000573917175643146


In [5]:
# Build the inverse vocabulary: token id -> token string.
reverse_zidian = {index: token for token, index in zidian.items()}
reverse_zidian


# Convert a sequence of token ids back into its string form.
def seq_to_str(seq):
    """Decode a 1-D tensor of token ids into the concatenated token string."""
    token_ids = seq.detach().numpy()
    pieces = []
    for token_id in token_ids:
        pieces.append(reverse_zidian[token_id])
    return ''.join(pieces)


# Decode the first source/target pair of the sample batch for inspection.
tuple(seq_to_str(part[0]) for part in sample)

('03-12-17', '<SOS>17/Dec/2003<EOS>')

In [6]:
# Prediction
def predict(x, max_len=9):
    """Greedily decode target token ids for a batch of source sequences.

    Uses the module-level ``model``. The model is switched to eval mode and
    left in that mode when the function returns.

    Args:
        x: source token ids, shape [b, 8].
        max_len: number of tokens to generate (must be >= 1). The default of
            9 covers the ``dd/Mon/YYYY`` body; the start and end markers
            carry no information and are not generated.

    Returns:
        Integer tensor of predicted token ids, shape [b, max_len].
    """
    model.eval()

    # Inference only: skip building autograd graphs for every decoding step.
    with torch.no_grad():
        # Encoder state, each [b,32].
        h, c = model.encode(x)

        # Every target sequence starts with <SOS>, so seed decoding with it.
        # The tensor is created on the same device as the input.
        # [b]
        token = torch.full((x.size(0), ),
                           zidian['<SOS>'],
                           dtype=torch.int64,
                           device=x.device)

        outs = []
        for _ in range(max_len):
            # The first step starts from the encoder state; every later step
            # starts from the state produced by the previous step.
            # [b] -> [b,16] -> [b,32],[b,32]
            h, c = model.decoder_cell(model.decoder_embed(token), (h, c))

            # Pick the most likely token; it becomes the next step's input.
            # [b,32] -> [b,27] -> [b]
            token = model.out_fc(h).argmax(dim=1)
            outs.append(token)

    # Assemble the per-step predictions into one tensor.
    # max_len x [b] -> [b,max_len]
    return torch.stack(outs, dim=1)


# Smoke test: decode one batch and print source, reference and prediction.
x, y = next(iter(dataloader))
y_pred = predict(x)
for source_ids, target_ids, predicted_ids in zip(x, y, y_pred):
    print(seq_to_str(source_ids), seq_to_str(target_ids),
          seq_to_str(predicted_ids))

82-10-13 <SOS>13/Oct/1982<EOS> 13/Oct/1982
30-01-06 <SOS>06/Jan/2030<EOS> 06/Jan/2030
77-02-21 <SOS>21/Feb/1977<EOS> 21/Feb/1977
18-09-03 <SOS>03/Sep/2018<EOS> 03/Sep/2018
85-11-12 <SOS>12/Nov/1985<EOS> 12/Nov/1985
79-05-02 <SOS>02/May/1979<EOS> 02/May/1979
86-11-19 <SOS>19/Nov/1986<EOS> 19/Nov/1986
23-12-11 <SOS>11/Dec/2023<EOS> 11/Dec/2023
80-05-08 <SOS>08/May/1980<EOS> 08/May/1980
27-02-16 <SOS>16/Feb/2027<EOS> 16/Feb/2027
92-09-16 <SOS>16/Sep/1992<EOS> 16/Sep/1992
21-10-14 <SOS>14/Oct/2021<EOS> 14/Oct/2021
95-11-05 <SOS>05/Nov/1995<EOS> 05/Nov/1995
86-10-07 <SOS>07/Oct/1986<EOS> 07/Oct/1986
88-02-28 <SOS>28/Feb/1988<EOS> 28/Feb/1988
34-02-14 <SOS>14/Feb/2034<EOS> 14/Feb/2034
81-03-10 <SOS>10/Mar/1981<EOS> 10/Mar/1981
92-10-23 <SOS>23/Oct/1992<EOS> 23/Oct/1992
34-07-06 <SOS>06/Jul/2034<EOS> 06/Jul/2034
04-03-13 <SOS>13/Mar/2004<EOS> 13/Mar/2004
26-02-04 <SOS>04/Feb/2026<EOS> 04/Feb/2026
93-06-02 <SOS>02/Jun/1993<EOS> 02/Jun/1993
18-09-25 <SOS>25/Sep/2018<EOS> 25/Sep/2018
01-06-26 <S