In [1]:
zidian = {}
with open('./data/msr_paraphrase/zidian.txt') as fr:
    for line in fr.readlines():
        k, v = line.split(' ')
        zidian[k] = int(v)

zidian['<PAD>'], len(zidian)

(0, 4300)

In [2]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn


#定义数据
class MsrDataset(Dataset):
    def __init__(self):
        self.data = pd.read_csv('./data/msr_paraphrase/数字化数据.txt', nrows=2000)

    def __getitem__(self, i):
        return self.data.iloc[i]

    def __len__(self):
        return len(self.data)


len(MsrDataset())

2000

In [3]:
def to_tensor(data):
    b = len(data)
    #N句话,每句话30个词
    xs = np.zeros((b * 2,30))

    for i in range(b):
        same, s1, s2 = data[i]

        #添加首尾符号,补0到统一长度
        s1 = [zidian['<SOS>']] + s1.split(',')[:28] + [
            zidian['<EOS>']
        ] + [zidian['<PAD>']] * 28
        xs[i] = s1[:30]

        s2 = [zidian['<SOS>']] + s2.split(',')[:28] + [
            zidian['<EOS>']
        ] + [zidian['<PAD>']] * 28
        xs[b + i] = s2[:30]

    return torch.LongTensor(xs)


#数据加载器
def get_dataloader():
    dataloader = DataLoader(dataset=MsrDataset(),
                            batch_size=8,
                            shuffle=True,
                            drop_last=True,
                            collate_fn=to_tensor)
    return dataloader


for i, data in enumerate(get_dataloader()):
    sample = data
    break

sample[:5], sample.shape

(tensor([[   1,    3,   12, 1794, 1795, 1759, 1796,  293,  574,   31,  730,   26,
           144,    3, 1797, 1298, 1798,    2,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0],
         [   1,  242, 3381,  103,  318,  103,  352, 3636, 1862, 3108,  352,    3,
            38, 2072,  650,   31, 3747, 3294,   22,  650,   17, 1080,  144, 1813,
           115,    3,    2,    0,    0,    0],
         [   1,  675,  105, 2111,  352, 2444, 2445,    3,   66,  117,  648,   31,
          1632, 2446, 1110,   17,   56, 2261,  625,  390,    2,    0,    0,    0,
             0,    0,    0,    0,    0,    0],
         [   1,   33,  115,   56,  376, 2865,  112,   12, 1331,   31,   72,  461,
           289,  446,   19,  203,    2,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0],
         [   1,  778,   14,  706,    3,  112,  101,  850, 1203,   56,    3,  195,
             3,    3, 3371,   84,  170,    3,   31,    3,   18,   38,   12

In [4]:
class ForwardBackward(nn.Module):
    def __init__(self, flip):
        super().__init__()

        self.rnn1 = nn.LSTM(input_size=256, hidden_size=256, batch_first=True)
        self.rnn2 = nn.LSTM(input_size=256, hidden_size=256, batch_first=True)

        self.fc = nn.Linear(in_features=256, out_features=4300)

        self.flip = flip

    def forward(self, x):
        b = x.shape[0]

        #初始化记忆
        h = torch.zeros(1, b,256)
        c = torch.zeros(1, b,256)

        #顺序运算,维度不变
        #[16,29,256] -> [16,29,256]

        #如果是反向传播,把x逆序,由下面一个矩阵,变成下面第二个矩阵.
        '''
        [[1,2,3],
         [4,5,6]]
         
        [[3,2,1],
         [6,5,4]]'''
        if self.flip:
            x = torch.flip(x, dims=(1, ))

        out1, (h, c) = self.rnn1(x, (h, c))
        out2, (h, c) = self.rnn2(out1, (h, c))

        #逆序后的x,计算出来的结果也是逆序的,把他们翻转回来
        if self.flip:
            x = torch.flip(x, dims=(1, ))
            out1 = torch.flip(out1, dims=(1, ))
            out2 = torch.flip(out2, dims=(1, ))

        #全连接输出
        #[16,29,256] -> [16,29,4300]
        out3 = self.fc(out2)

        return x, out1, out2, out3


x = torch.FloatTensor(16,29,256)
out = ForwardBackward(flip=True)(x)
len(out), out[-1].shape

(4, torch.Size([16, 29, 4300]))

In [5]:
class ELMo(nn.Module):
    def __init__(self):
        super().__init__()

        self.embed = nn.Embedding(num_embeddings=4300,
                                  embedding_dim=256,
                                  padding_idx=0)

        self.fw = ForwardBackward(flip=False)
        self.bw = ForwardBackward(flip=True)

    def forward(self, x):
        #编码
        #[16,30] -> [16,30,256]
        x = self.embed(x)

        #顺序预测,以当前字预测下一个字,不需要最后一个字
        outs_f = self.fw(x[:, :-1, :])

        #逆序预测,以当前字预测上一个字,不需要第一个字
        outs_b = self.bw(x[:, 1:, :])

        return outs_f, outs_b


out = ELMo()(sample)
len(out), len(out[0]), out[0][-1].shape

(2, 4, torch.Size([16, 29, 4300]))

In [6]:
model = ELMo()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_func = nn.CrossEntropyLoss()

for epoch in range(1):
    for i, x in enumerate(get_dataloader()):
        #x = [b,30]
        opt.zero_grad()

        #模型计算
        outs_f, outs_b = model(x)

        #在计算loss的时候,只需要全连接输出
        #[b,29,4300]
        outs_f = outs_f[-1]
        outs_b = outs_b[-1]

        #正向预测是以当前字预测下一个字,所以计算loss不需要第一个字
        #[b,30] -> [b,29]
        x_f = x[:, 1:]
        #逆向预测是以当前字预测上一个字,所以计算loss不需要最后一个字
        #[b,30] -> [b,29]
        x_b = x[:, :-1]

        #打平,不然计算不了loss
        #[b,29,4300] -> [b*29,4300]
        outs_f = outs_f.reshape(-1, 4300)
        outs_b = outs_b.reshape(-1, 4300)
        #[b,29] -> [b*29]
        x_f = x_f.reshape(-1)
        x_b = x_b.reshape(-1)

        #分别计算全向和后向的loss,再求和作为loss
        loss_f = loss_func(outs_f, x_f)
        loss_b = loss_func(outs_b, x_b)
        loss = (loss_f + loss_b) / 2

        loss.backward()
        opt.step()

        if i % 20 == 0:
            #统计正确率
            correct_f = (x_f == outs_f.argmax(axis=1)).sum().item()
            correct_b = (x_b == outs_b.argmax(axis=1)).sum().item()
            total = x.shape[0] * 29
            print(epoch, i, loss.item(), correct_f / total, correct_b / total)

0 0 8.374743461608887 0.0 0.0
0 20 5.009000778198242 0.28663793103448276 0.30603448275862066
0 40 4.625175476074219 0.3254310344827586 0.33405172413793105
0 60 4.818905830383301 0.2823275862068966 0.28879310344827586
0 80 4.759068012237549 0.3793103448275862 0.34913793103448276
0 100 3.6306958198547363 0.4956896551724138 0.46120689655172414
0 120 3.9946582317352295 0.44612068965517243 0.4224137931034483
0 140 3.788682460784912 0.46120689655172414 0.43103448275862066
0 160 3.794642448425293 0.4375 0.40301724137931033
0 180 3.9886584281921387 0.4525862068965517 0.41594827586206895
0 200 3.148937463760376 0.5366379310344828 0.5129310344827587
0 220 4.02374792098999 0.4396551724137931 0.39870689655172414
0 240 4.113767147064209 0.4245689655172414 0.3900862068965517


In [7]:
def get_emb(x):
    #模型运算
    outs_f, outs_b = model(x)

    #在词向量编码时,可以任意选择一层的输出
    #[16,29,256]
    outs_f = outs_f[1]
    outs_b = outs_b[1]

    #正向和反向的输出不能对齐,把他们重叠的部分截取出来
    #[16,28,256]
    outs_f = outs_f[:, 1:]
    outs_b = outs_b[:, :-1]

    #拼合在一起,就是编码结果了
    #[16,28,256 + 256]
    embed = torch.cat((outs_f, outs_b), dim=2)

    #[16,28,512]
    return embed


get_emb(sample).shape

torch.Size([16, 28, 512])