# 任务三：基于注意力机制的文本匹配

## 1.数据加载和预处理

In [311]:
import torch
import os
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe
import collections
import os
import random
import time
import torch.nn.functional as F
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
from tqdm import tqdm

from nltk import word_tokenize
import numpy as np

class SNLI():
    def __init__(self,batch_size,device):
        start_time=time.time()
        #定义如何处理数据和标签
        print('1定义如何处理文本和标签')
        self.TEXT=data.Field(batch_first=True,include_lengths=True,tokenize=word_tokenize,lower=True)
        self.LABEL=data.Field(sequential=False,unk_token=None)
        step1_time=time.time()
        print('\t耗时%.4f s'%(step1_time-start_time))
      
        #划分数据集
        print('2划分数据集')
        self.train,self.dev,self.test=datasets.SNLI.splits(self.TEXT,self.LABEL)
        step2_time=time.time()
        print('\t耗时%.4f s'%(step2_time-step1_time))
        
        #创建词汇表
        print('3创建词汇表')
        self.TEXT.build_vocab(self.train,self.dev,self.test,vectors=GloVe(name='840B',dim=300))
        self.LABEL.build_vocab(self.train)
        step3_time=time.time()
        print('\t耗时%.4f s'%(step3_time-step2_time))
        
        #生成batch迭代器
        print('4生成batch迭代器')
        self.train_iter,self.dev_iter,self.test_iter=data.BucketIterator.splits((self.train,self.dev,self.test),
                                                                                batch_size=batch_size,device=device)
        step4_time=time.time()
        print('\t耗时%.4f s'%(step4_time-step3_time))
import sys
sys.path.append("..")
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
batch_size=32
snli=SNLI(batch_size,device)

1定义如何处理文本和标签
	耗时0.0000 s
2划分数据集
	耗时152.3050 s
3创建词汇表


100%|█████████████████████████████████████████████████████████████████████▉| 2196016/2196017 [05:17<00:00, 6915.69it/s]


	耗时412.5948 s
4生成batch迭代器
	耗时0.0020 s


In [38]:
train_iter,dev_iter,test_iter,vocab=snli.train_iter,snli.dev_iter,snli.test_iter,snli.TEXT.vocab

观察几个训练batch可以看到，每个batch中样本长度不同，因为是动态填充，每个batch都以最长的样本为基准Padding [1]到同样的长度

In [190]:
for i,x in enumerate(train_iter):
    print(x)
    print(x.premise,x.label)
    break


[torchtext.data.batch.Batch of size 32 from SNLI]
	[.premise]:('[torch.cuda.LongTensor of size 32x36 (GPU 0)]', '[torch.cuda.LongTensor of size 32 (GPU 0)]')
	[.hypothesis]:('[torch.cuda.LongTensor of size 32x14 (GPU 0)]', '[torch.cuda.LongTensor of size 32 (GPU 0)]')
	[.label]:[torch.cuda.LongTensor of size 32 (GPU 0)]
(tensor([[   2,  283,    7,  ...,    1,    1,    1],
        [   2, 1334,   13,  ...,    1,    1,    1],
        [  58,   15,   10,  ...,    1,    1,    1],
        ...,
        [   2,   31,    6,  ...,    1,    1,    1],
        [   2,   43,    5,  ...,    1,    1,    1],
        [   2,  299,   12,  ...,    1,    1,    1]], device='cuda:0'), tensor([ 8, 19,  8, 12, 11, 15, 14, 36, 11,  6,  8, 18, 10, 23, 16, 16, 10, 21,
         8, 14, 12, 22, 11, 19, 11, 12, 23, 11, 12, 15, 13, 18],
       device='cuda:0')) tensor([1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 1, 0, 2, 0, 0, 2, 2, 0, 2, 0, 2, 0, 1, 1,
        0, 2, 0, 2, 0, 1, 2, 1], device='cuda:0')


In [46]:
vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x0000020DE12DA888>>,
            {'<unk>': 0,
             '<pad>': 1,
             'a': 2,
             '.': 3,
             'the': 4,
             'in': 5,
             'is': 6,
             'man': 7,
             'on': 8,
             'and': 9,
             'are': 10,
             'of': 11,
             'with': 12,
             'woman': 13,
             'two': 14,
             'people': 15,
             ',': 16,
             'to': 17,
             'at': 18,
             'wearing': 19,
             'an': 20,
             'his': 21,
             'young': 22,
             'men': 23,
             'playing': 24,
             'girl': 25,
             'boy': 26,
             'white': 27,
             'shirt': 28,
             'while': 29,
             'black': 30,
             'dog': 31,
             'sitting': 32,
             'blue': 33,
             'standing': 34,
             'her': 35,
           

In [61]:
vocab.vectors[10]

tensor([-0.1986, -0.0628, -0.3661, -0.4179,  0.2096, -0.2673,  0.2460,  0.1278,
        -0.0458,  2.5253,  0.3520,  0.0935,  0.0866, -0.1193, -0.0624, -0.1926,
        -0.1178,  1.4466, -0.5956, -0.0779, -0.3133, -0.1653,  0.0526, -0.1715,
        -0.0731,  0.0559, -0.2939, -0.2025,  0.0781, -0.0800, -0.1561, -0.0657,
        -0.0595, -0.0250,  0.1039,  0.2579, -0.2203, -0.0935, -0.0185, -0.0976,
        -0.3875,  0.2554, -0.1344,  0.2989,  0.1525,  0.0373, -0.0316, -0.3345,
         0.0897,  0.0314, -0.1805, -0.1350, -0.2551, -0.1749,  0.0691, -0.2049,
        -0.0906, -0.0155, -0.2772,  0.1814,  0.1595, -0.2087, -0.2764,  0.3273,
        -0.0573,  0.1798,  0.0128,  0.3842,  0.1590, -0.0144, -0.0375,  0.1903,
         0.8313, -0.0500,  0.3258, -0.2754,  0.2384, -0.1607,  0.3132,  0.3932,
        -0.0119,  0.1894, -0.6243, -0.1962,  0.0844, -0.3168, -0.1948,  0.0633,
         0.2979,  0.0058, -0.1860, -0.0506,  0.2250, -0.0846,  0.2914,  0.1910,
        -0.0775, -0.1179, -0.0850, -0.40

## 2.建构模型：Input Encoding ; Local Inference Modeling ; Inference Composition

### 1.Input Encoding

如论文所述，对于ESLM而言，Input Encoding是一个双向的LSTM模型，分别计算premise、hypothesis的所有隐藏状态，然后将他们在每个时间步拼接起来

In [233]:
class InputEncodingLayer(nn.Module):
    def __init__(self,input_size,hidden_size):
        super(InputEncodingLayer,self).__init__()
        self.lstm=nn.LSTM(input_size,hidden_size,num_layers=1,bidirectional=True)
    def forward(self,X):
        """
        Input:
            X:{torch.Tensor}--输入 embedding shape[batch_size,seq_len,input_size]
        Return:
            output{torch.Tensor}--shape[batch_size,seq_len,num_directions*hidden_size]

        """
        #self.lstm.flatten_parameters()
        output,_=self.lstm(X) #output,(h,c)这里只要由hiddenstate构成的output，不需要h和c
        return output


### 2.Local Inference Modeling

打分函数是点积模型，对打分进行softmax得到注意力分布，以一组文本的一个词向量为query，求另一组文本的所有注意力分布，然后求加权和，最后将hiddenstate，加权和，两者之差，两者点乘进行连接输入到下一层

In [234]:
class LocalInference(nn.Module):
    def __init__(self):
        super(LocalInference,self).__init__()
        self.softmax_1=nn.Softmax(dim=1)
        self.softmax_2=nn.Softmax(dim=2)
        #dim=1则是除了batch外的一个二维矩阵，第一维之和为1,dim=2同理
        
    def forward(self,p,h,p_mask,h_mask):
        """
        Arguments:
            前一个inputencoding层的输出
            p {torch.Tensor} -- p has shape [batch, seq_len_p, 2 * hidden_size]
            h {torch.Tensor} -- h has shape [batch, seq_len_h, 2 * hidden_size]
           
            p_mask {torch.Tensor (int)} -- p has shape [batch, seq_len_p], 0 in the mask
                means padding.
            h_mask {torch.Tensor (int)} -- h has shape [batch, seq_len_h]
        Returns:
            将p,p_,p-p_,p*p_在最后一维上连接起来
            m_p, m_h {torch.Tensor} -- tensor with shape [batch, seq_len, 8 * hidden_size]
        """
        # equation 11,获得注意分布矩阵
        e=torch.matmul(p,h.transpose(1,2))
        """
        用的是点积打分模型，直接相乘然后softmax就是注意力分布，总的分布矩阵[batch,seq_len_p,seq_len_h]
        这个矩阵横着看每一行代表以premise的一个词向量为query查询的注意力分布                         
        纵向看每一列代表以hypothesis的一个词向量维query查询的注意力的分布
        [[e11 e12 e13 e1h]
         [e21 e22 e23 e2h]
         [e31 e32 e33 e3h]
         [ep1 ep2 ep3 eph]]
        """
        
        #获取掩码矩阵
        inference_mask=torch.matmul(p_mask.unsqueeze(2).float(),h_mask.unsqueeze(1).float()) #[batch,seq_len_p,seq_len_h]
        #这个矩阵同上面的注意力分布矩阵相同，横看每一行是hypothesis的mask，纵向每一列是premise的mask
       
        assert inference_mask.shape==e.shape
        # masking the scores for padding tokends
        e.masked_fill_(inference_mask<1e-7,-1e7)
        
        #equation 12,13
        h_score,p_score=self.softmax_1(e),self.softmax_2(e) 
        h_=h_score.transpose(1,2).bmm(p) #按照输出[batch,seq_len_h,2*hidden_size]对参与运算的矩阵进行转置满足即可
        p_=p_score.bmm(h)               #[batch,seq_len_p,2*hidden_size]
        """
        p_对应论文中的a~，是ei1 ei2 ei3 ei4这样的方向(横向)做softmax，也就是在seq_len_h维度上和为1，对应softmax的dim=2
        h_对应论文中的b~，是e1j e2j e3j e4j这样的方向(纵向)做softmax，也就是在seq_len_p维度上和为1，对应softmax的dim=1
        """
        
        assert p.shape==p_.shape and h.shape==h_.shape
        
        #equation 14,15
        m_p=torch.cat((p,p_,p-p_,p*p_),dim=-1) #按照最后一个维度将4个向量拼接起来
        m_h=torch.cat((h,h_,h-h_,h*h_),dim=-1)
        assert m_p.shape[-1]==p.shape[-1]*4
        
        return m_p,m_h
        
        
        
        

In [235]:
"""测试softmax维度"""
x=torch.tensor((range(16)),dtype=torch.float32).view(2,2,4)
x
s1=nn.Softmax(dim=1)
s2=nn.Softmax(dim=2)
print(x,'\n',s1(x),'\n',s2(x))

tensor([[[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.]],

        [[ 8.,  9., 10., 11.],
         [12., 13., 14., 15.]]]) 
 tensor([[[0.0180, 0.0180, 0.0180, 0.0180],
         [0.9820, 0.9820, 0.9820, 0.9820]],

        [[0.0180, 0.0180, 0.0180, 0.0180],
         [0.9820, 0.9820, 0.9820, 0.9820]]]) 
 tensor([[[0.0321, 0.0871, 0.2369, 0.6439],
         [0.0321, 0.0871, 0.2369, 0.6439]],

        [[0.0321, 0.0871, 0.2369, 0.6439],
         [0.0321, 0.0871, 0.2369, 0.6439]]])


### 3.Inference Composition：包含compostionlayer和poolinglayer、MLP

#### compositionlayer：继续使用BiLSTM，aims to capture local inference imformation ma and mb and their context for inference  compostion

In [258]:
class CompositionLayer(nn.Module):
    def __init__(self,input_size,output_size,hidden_size):
        """
        Arguments:
            input_size{int}-- 前馈网络输入向量尺寸
            output_size{int}--前馈网络输出向量尺寸，同时也是LSTM输入向量
            hidden_size{int}--LSTM隐藏状态的向量尺寸
        
        """
        super(CompositionLayer,self).__init__()
        self.hidden_size=hidden_size
        self.F=nn.Linear(input_size,output_size)
        self.lstm=nn.LSTM(input_size=output_size,hidden_size=hidden_size,num_layers=1,bidirectional=True)
        self.dropout=nn.Dropout(0.5) #前馈网络输出进过dropout层
    def forward(self,m):
        """
        Arguments:
            m{torch.tensor}--{batch,seq_len,input_size}
        Returns:
            outputs{torch.tensor}--{batch,seq_len,hidden_size*2}
        """
        y=self.dropout(self.F(m))
        #self.lstm.fatten_parameters()
        outputs,_=self.lstm(y)
        return outputs
        

#### pooling：将之前得到的向量通过池化转换成固定长度的向量并输入的最后的分类器，同时采用平均池化和最大池化然后拼接起来成固定长度的向量

In [281]:
class Pooling(nn.Module):
    def __init__(self):
        super(Pooling,self).__init__()
        
    def forward(self,x,x_mask):
        """
        Arguments:
            x {torch.tensor} --[batch,seq_len,hidden_size*2]
            x_mask{torch.tensor}--[batch,seq_len] padding位置值为0
        Returns:
            V{torch.tensor}--[batch,hidden_size*4]
        """
        mask_expand=x_mask.unsqueeze(-1).expand(x.shape) #首先扩展x_mask的第三维，此时只是单纯的改变视图，并没有添加新的值
                                                        #，然后将用expand方法将mask的维度扩展到与x的形状相同，expand方法填充是复制
        #平均池化
        """
        首先与掩码矩阵进行点积，然后求每个batch的总的非填充向量数，sum(-1)后的形状是[batch]，要对每个样本分别求平均，就要把维度扩展为[batch,1]
        x.sum(1)后形状为[batch,hidden_size*2]
        """
        x_=x*mask_expand.float()
        v_avg=x_.sum(1)/x_mask.sum(-1).unsqueeze(-1).float() 
      
        
        #最大池化层
        x_=x.masked_fill(mask_expand==0,-1e7)
        v_max=x_.max(1).values #max(1)在dim=1上求最大，然后这个维度消失。返回一个类，其values属性是torch.tensor，[batch,hidden_size*2]
        
        return torch.cat((v_avg,v_max),dim=1) #[batch,hidden_size*4]
        

In [282]:
x=torch.rand((4,2,3),dtype=torch.float32)
x,x.sum(1),x.sum(1).shape,x.sum(-1),x.max(1).values

(tensor([[[0.1157, 0.4712, 0.1455],
          [0.9834, 0.9600, 0.6661]],
 
         [[0.3223, 0.4334, 0.1320],
          [0.2990, 0.7387, 0.6437]],
 
         [[0.9825, 0.0839, 0.9926],
          [0.0182, 0.7709, 0.3424]],
 
         [[0.9994, 0.8482, 0.0176],
          [0.2477, 0.0134, 0.6837]]]),
 tensor([[1.0991, 1.4312, 0.8116],
         [0.6213, 1.1721, 0.7757],
         [1.0007, 0.8548, 1.3350],
         [1.2471, 0.8616, 0.7013]]),
 torch.Size([4, 3]),
 tensor([[0.7324, 2.6095],
         [0.8877, 1.6815],
         [2.0590, 1.1315],
         [1.8652, 0.9448]]),
 tensor([[0.9834, 0.9600, 0.6661],
         [0.3223, 0.7387, 0.6437],
         [0.9825, 0.7709, 0.9926],
         [0.9994, 0.8482, 0.6837]]))

#### InferenceComposition

In [283]:
class InferenceComposition(nn.Module):
    def __init__(self,input_size,output_size,hidden_size):
        """
        Arguments:
            input_size{int}-- 前馈网络输入向量尺寸
            output_size{int}--前馈网络输出向量尺寸，同时也是LSTM输入向量
            hidden_size{int}--LSTM隐藏状态的向量尺寸
        """
        super(InferenceComposition,self).__init__()
        self.composition=CompositionLayer(input_size,output_size,hidden_size)
        self.pooling=Pooling()
    def forward(self,m_p,m_h,p_mask,h_mask):
        """
        Arguments:
            m_p {torch.Tensor} -- [batch, seq_len, input_size]
            m_h {torch.Tensor} -- [batch, seq_len, input_size]
            mask {torch.Tensor} -- [batch, seq_len], 0 means padding
        Returns:
            v {torch.Tensor} -- [batch, hidden_size * 8]        --对应论文公式20
        
        """
        v_p,v_h=self.composition(m_p),self.composition(m_h)
        v_p_,v_h_=self.pooling(v_p,p_mask),self.pooling(v_h,h_mask) #[batch,hidden_size*4]
        v=torch.cat((v_p_,v_h_),dim=1)
        
        return v
        

#### Multilayer perceptron classifier

In [296]:
class MLP(nn.Module):
    def __init__(self,input_size,output_size,class_num):
        super(MLP,self).__init__()
        self.activation=nn.ReLU()
        self.dropout=nn.Dropout(0.5)
        self.mlp=nn.Sequential(self.dropout,nn.Linear(input_size,output_size),self.activation,nn.Linear(output_size,class_num))
    def forward(self,X):
        """
        Arguments:
            x {torch.Tensor} -- [batch, features]
        Returns:
            logits {torch.Tensor} -- raw, unnormalized scores for each class. [batch, class_num]
        """
        logits=self.mlp(X)
        return logits
        

### 4.ESIM模型

In [297]:
class ESIM(nn.Module):
   
    def __init__(self,hidden_size,embedding_dim,vocab_size,num_labels,device):
   
        """
            Arguments:
            hidden_size{int} -- LSTM隐藏层的向量尺寸
            vocab_size{int} -- 字典中的单词数
            embedding_dim{int}-- 词向量的维度
            num_labels{int}--分类数
        """
        super(ESIM,self).__init__()
        self.device=device
        self.embedding=nn.Embedding(vocab_size,embedding_dim)
        self.encoder=InputEncodingLayer(embedding_dim,hidden_size)
        self.inference=LocalInference()
        self.infercomposition=InferenceComposition(hidden_size*8,hidden_size,hidden_size)
        self.linear=MLP(hidden_size*8,hidden_size,num_labels)
    
    def forward(self,p,h):
        """
        Arguments:
                p {torch.Tensor} -- premise [batch, seq_len]
                h {torch.Tensor} -- hypothesis [batch, seq_len]
            Returns:
                logits {torch.Tensor} -- raw, unnormalized scores for each class
                    with shape [batch, class_num]
        """
        p_embedding=self.embedding(p)
        h_embedding=self.embedding(h)
        p_=self.encoder(p_embedding)
        h_=self.encoder(h_embedding)
    
        p_mask,h_mask=(p!=1).long(),(h!=1).long()
        m_p,m_h=self.inference(p_,h_,p_mask,h_mask)
    
        v=self.infercomposition(m_p,m_h,p_mask,h_mask)
            
        logits=self.linear(v)
    
        return logits
    
        

In [298]:
hidden_size,embedding_dim,num_labels=300,300,3
net=ESIM(hidden_size,embedding_dim,len(vocab),num_labels,device)

#### 加载预训练的词向量

In [299]:
net.embedding.weight.data.copy_(vocab.vectors)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0438,  0.0248, -0.2094,  ..., -0.3010, -0.1458,  0.2819],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

## 3.模型训练

In [319]:
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        # 如果没指定device就使用net的device
        device = list(net.parameters())[0].device 
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for batch in tqdm(data_iter):
            premise=batch.premise[0]
            hypothesis=batch.hypothesis[0]
            y=batch.label
            premise=premise.to(device)
            hypothesis=hypothesis.to(device)
            y=y.to(device)
            if isinstance(net, torch.nn.Module):
                net.eval() # 评估模式, 这会关闭dropout
                acc_sum += (net(premise,hypothesis).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train() # 改回训练模式
            else: # 自定义的模型
                if('is_training' in net.__code__.co_varnames): # 如果有is_training这个参数
                    # 将is_training设置成False
                    acc_sum += (net(premise,hypothesis, is_training=False).argmax(dim=1) == y).float().sum().item() 
                else:
                    acc_sum += (net(premise,hypothesis).argmax(dim=1) == y).float().sum().item() 
            n += y.shape[0]
    return acc_sum / n


In [320]:
def train(train_iter,test_iter,net,loss,optimizer,device,num_epochs):
    net=net.to(device)
    print("training on ",device)
    batch_count=0
    for epoch in range(num_epochs):
        train_l_sum,train_acc_sum,n,start=0.0,0.0,0,time.time()
        for batch in tqdm(train_iter):
            premise=batch.premise[0]
            hypothesis=batch.hypothesis[0]
            y=batch.label
            #print(y)
            premise=premise.to(device)
            hypothesis=hypothesis.to(device)
            y=y.to(device)
            y_hat=net(premise,hypothesis)
            #print(y_hat.shape,y.shape)
            l=loss(y_hat,y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum+=l.cpu().sum().item()
            train_acc_sum+=(y_hat.argmax(dim=1)==y).sum().cpu().item()
            n+=y.shape[0]
            batch_count+=1
        test_acc=evaluate_accuracy(test_iter,net)
        print("epoch %d,loss %.4f, train acc %.3f, test acc%.3f ,time%.1f sec"%
              (epoch+1,train_l_sum,train_acc_sum/n,test_acc,time.time()-start))

In [321]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lr,num_epochs=0.001,5
#过滤掉embedding参数，因为其不计算梯度不更新
optimizer=torch.optim.Adam(filter(lambda p:p.requires_grad,net.parameters()),lr=lr)
loss=nn.CrossEntropyLoss()
train(train_iter,test_iter,net,loss,optimizer,device,num_epochs)

  0%|                                                                                        | 0/17168 [00:00<?, ?it/s]

training on  cuda


100%|████████████████████████████████████████████████████████████████████████████| 17168/17168 [23:54<00:00, 11.97it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 307/307 [00:05<00:00, 53.97it/s]
  0%|                                                                                        | 0/17168 [00:00<?, ?it/s]

epoch 1,loss 8656.3210, train acc 0.803, test acc0.827 ,time1440.3 sec


100%|████████████████████████████████████████████████████████████████████████████| 17168/17168 [25:00<00:00, 11.44it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 307/307 [00:05<00:00, 56.03it/s]
  0%|                                                                                        | 0/17168 [00:00<?, ?it/s]

epoch 2,loss 7751.3534, train acc 0.828, test acc0.831 ,time1505.9 sec


100%|████████████████████████████████████████████████████████████████████████████| 17168/17168 [24:45<00:00, 11.56it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 307/307 [00:05<00:00, 54.82it/s]
  0%|                                                                                        | 0/17168 [00:00<?, ?it/s]

epoch 3,loss 7429.7429, train acc 0.837, test acc0.834 ,time1491.2 sec


100%|████████████████████████████████████████████████████████████████████████████| 17168/17168 [24:17<00:00, 11.78it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 307/307 [00:05<00:00, 55.54it/s]
  0%|                                                                                        | 0/17168 [00:00<?, ?it/s]

epoch 4,loss 7254.6632, train acc 0.841, test acc0.831 ,time1463.2 sec


100%|████████████████████████████████████████████████████████████████████████████| 17168/17168 [24:18<00:00, 11.77it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 307/307 [00:05<00:00, 56.08it/s]

epoch 5,loss 7158.3898, train acc 0.843, test acc0.831 ,time1463.5 sec





从训练效果来看，LOSS在稳步下降，训练集的准确率也在稳定上升，猜想随着训练次数的增加，准确率会进一步的升高，但是由于计算硬件的限制，我的每个epoch训练时间太长了，基本都在22分钟左右，所以没有做进一步的测试，总的来说得到的参数较好，准确率达到了百分之83左右，比较理想，同时由于硬件条件的限制，并没有使用dev_iter来进行模型的选择