![cnn残差模型](img/cnn_1.png)

此分类模型是来自序列模型[Convolutional Sequence to Sequence Learning](https://arxiv.org/pdf/1705.03122.pdf)中的encoder部分，这里暂且叫它带残差的cnn model，如上图所示。

    1.句子token和其对应的position经过embedding后，逐元素加和作为source embedding。
    
    2.source embedding依次经过：线性层 -> 卷积块（提取卷积特征）-> 线性层。
    
    3.以上的输出和source embedding进行残差连接。
    
    4.对以上的输出，我这里先做一次平均池化，再送入线性层得到预测输出。

![cnn残差模型](img/cnn_2.png)

以上是模型中的卷积块，可以设置多个卷积块。具体含义可以见论文[Convolutional Sequence to Sequence Learning](https://arxiv.org/pdf/1705.03122.pdf)。

以上的图片和原始代码改自 https://github.com/bentrevett/pytorch-seq2seq ，在此非常感谢作者实现了这么通俗易懂的代码架构，让其他人可以在此基础上进行修改。

In [1]:
import os
import torch
from torchtext import data,datasets
from torchtext.data import Iterator, BucketIterator
from torchtext.vocab import Vectors
from torch import nn,optim
import torch.nn.functional as F
import pandas as pd
import pickle

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Project root: two directory levels above the current working directory.
intent_classification_path = os.path.abspath(os.path.join(os.getcwd(), '../..'))

# Load the training CSV; `train_data` is bound to the resulting DataFrame.
train_data = pd.read_csv(
    os.path.join(intent_classification_path,
                 'classification_data/knowledge_point_qa_data.csv')
)


def tokenize(x):
    """Split a sentence into tokens on single space characters.

    NOTE(review): the original comment says "split by character", so the
    text is presumably pre-segmented with spaces -- confirm against the CSV.
    """
    return x.split(' ')


# Text field: lower-cased, vocabulary-indexed, batch-first sequences with a
# fixed length of 20 tokens.
TEXT = data.Field(
    sequential=True,
    use_vocab=True,
    tokenize=tokenize,
    lower=True,
    unk_token='<unk>',
    pad_token='<pad>',
    fix_length=20,
    batch_first=True,
)

# Label field: a single non-sequential value used as-is (no vocabulary).
LABEL = data.Field(
    use_vocab=False,
    sequential=False,
)
# Build the torchtext examples for a training or a test set.
def get_dataset(csv_data, text_field, label_field, test=False):
    """Turn the rows of ``csv_data`` into a list of torchtext examples.

    Args:
        csv_data: Column-indexable table. ``csv_data['text']`` is always
            read; ``csv_data['label']`` is read only when ``test`` is false.
        text_field: Field attached to the ``text`` column.
        label_field: Field attached to the ``label`` column.
        test: When true, labels are not loaded and every example is built
            with ``None`` in the label position.

    Returns:
        A ``(examples, fields)`` tuple, where ``fields`` is the
        ``[('id', None), ('text', ...), ('label', ...)]`` list used to
        build each example.
    """
    fields = [('id', None), ('text', text_field), ('label', label_field)]

    if test:
        # Test set: no labels available.
        rows = ([None, text, None] for text in csv_data['text'])
    else:
        # Training set: pair every text with its label.
        rows = (
            [None, text, label]
            for text, label in zip(csv_data['text'], csv_data['label'])
        )

    examples = [data.Example.fromlist(row, fields) for row in rows]
    return examples, fields

# Wrap the training rows in a torchtext Dataset.
train_examples, train_fields = get_dataset(train_data, TEXT, LABEL)
train = data.Dataset(train_examples, train_fields)

# Pre-trained embedding vectors, read from a file in the working directory.
pretrained_embedding = os.path.join(os.getcwd(), 'sgns.sogou.char')
vectors = Vectors(name=pretrained_embedding)

# Build the vocabulary from the training set (keeping every token that
# occurs at least once) and attach the pre-trained vectors to it.
TEXT.build_vocab(train, vectors=vectors, min_freq=1)

# Persist the vocabulary next to the notebook.
words_path = os.path.join(os.getcwd(), 'words.pkl')
with open(words_path, 'wb') as f_words:
    f_words.write(pickle.dumps(TEXT.vocab))

BATCH_SIZE = 163

# Shuffled batch iterator over the training set.
train_iter = BucketIterator(
    dataset=train,
    batch_size=BATCH_SIZE,
    sort_within_batch=False,
    shuffle=True,
)




In [2]:
print(TEXT.vocab.vectors.shape)

torch.Size([44, 300])


In [10]:
class CNNResidual(nn.Module):
    """CNN encoder with residual connections, used as an intent classifier.

    The network follows the encoder of "Convolutional Sequence to Sequence
    Learning": token and position embeddings are summed, projected to the
    hidden size, passed through a stack of GLU-activated convolution blocks
    with residual connections, projected back to the embedding size and
    combined with the input embedding through another residual connection.
    The result is mean-pooled over the sequence and fed to a linear layer
    that produces the intent logits.

    Args:
        input_dim: Size of the token vocabulary.
        emb_dim: Embedding dimension.
        intent_dim: Number of intent classes (size of the output logits).
        hid_dim: Number of channels inside the convolution blocks.
        n_layers: Number of convolution blocks.
        kernel_size: Convolution kernel width; must be odd so that symmetric
            padding keeps the sequence length unchanged.
        dropout: Dropout probability.
        max_length: Number of positions in the position embedding table;
            input sequences must not be longer than this.
    """

    def __init__(self, input_dim, emb_dim, intent_dim, hid_dim, n_layers, kernel_size, dropout, max_length=20):
        super(CNNResidual, self).__init__()

        # An odd kernel allows the same amount of padding on both sides.
        assert kernel_size % 2 == 1, 'kernel size must be odd!'

        # sqrt(0.5): keeps the variance roughly constant after summing two
        # branches. Stored as a plain float so it works on whatever device
        # the module is moved to.
        self.scale = 0.5 ** 0.5

        self.tok_embedding = nn.Embedding(input_dim, emb_dim)   # token embedding
        self.pos_embedding = nn.Embedding(max_length, emb_dim)  # position embedding

        self.emb2hid = nn.Linear(emb_dim, hid_dim)  # emb_dim -> hid_dim
        self.hid2emb = nn.Linear(hid_dim, emb_dim)  # hid_dim -> emb_dim

        # Convolution blocks. Each one outputs 2 * hid_dim channels because
        # the GLU activation halves the channel dimension again.
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=hid_dim,
                      out_channels=2 * hid_dim,
                      kernel_size=kernel_size,
                      padding=(kernel_size - 1) // 2)  # keeps src_len unchanged
            for _ in range(n_layers)
        ])
        self.dropout = nn.Dropout(dropout)

        # Classification head applied to the pooled encoder output.
        self.intent_output = nn.Linear(emb_dim, intent_dim)

    def forward(self, src):
        """Compute intent logits for a batch of token-id sequences.

        Args:
            src: Integer tensor of shape ``[batch_size, src_len]``.

        Returns:
            Tensor of shape ``[batch_size, intent_dim]``; the batch
            dimension is kept even for a batch containing a single sample.
        """
        src_len = src.shape[1]

        # Position ids are created on the input's device; the leading
        # dimension of 1 broadcasts over the batch in the sum below.
        pos = torch.arange(src_len, device=src.device).unsqueeze(0)  # [1, src_len]

        tok_embedded = self.tok_embedding(src)  # [batch_size, src_len, emb_dim]
        pos_embedded = self.pos_embedding(pos)  # [1, src_len, emb_dim]

        # Element-wise sum of token and position embeddings.
        embedded = self.dropout(tok_embedded + pos_embedded)  # [batch_size, src_len, emb_dim]

        # Project to hid_dim and move channels to dim 1, as Conv1d expects.
        conv_input = self.emb2hid(embedded).permute(0, 2, 1)  # [batch_size, hid_dim, src_len]

        for conv in self.convs:
            conved = conv(self.dropout(conv_input))  # [batch_size, 2*hid_dim, src_len]
            conved = F.glu(conved, dim=1)            # [batch_size, hid_dim, src_len]
            # Residual connection; the result feeds the next block.
            conv_input = (conved + conv_input) * self.scale

        # Project back to emb_dim. Reading `conv_input` (rather than the loop
        # local) also keeps this valid when there are no convolution blocks.
        conved = self.hid2emb(conv_input.permute(0, 2, 1))  # [batch_size, src_len, emb_dim]

        # Second residual connection with the input embedding.
        combined = (conved + embedded) * self.scale  # [batch_size, src_len, emb_dim]

        # Mean over the sequence dimension only, so a batch of size one is
        # not collapsed to a 1-D tensor.
        pooled = combined.mean(dim=1)  # [batch_size, emb_dim]

        return self.intent_output(pooled)  # [batch_size, intent_dim]

In [11]:
from torch.utils.tensorboard import SummaryWriter

# TensorBoard event files go to ./log.
# NOTE(review): per the SummaryWriter documentation `comment` is only used
# for the auto-generated default log_dir, so it should have no effect when
# a log_dir is passed explicitly; kept unchanged.
writer = SummaryWriter(os.path.join(os.getcwd(), 'log'), comment='cnnresidual')

input_dim = len(TEXT.vocab)
intent_dim = 9  # number of intent classes
emb_dim = TEXT.vocab.vectors.shape[1]
hid_dim = 128
conv_layers = 2  # number of convolution blocks
kernel_size = 3
dropout = 0.25

NUM_EPOCHS = 300  # passes over the training set
LOG_EVERY = 10    # print the loss every LOG_EVERY epochs

model = CNNResidual(input_dim, emb_dim, intent_dim, hid_dim, conv_layers, kernel_size, dropout).to(DEVICE)

# Initialise the token embedding from the pre-trained vectors. The weights
# stay trainable, so they are fine-tuned during training.
model.tok_embedding.weight.data.copy_(TEXT.vocab.vectors)

# Training mode (enables dropout).
model.train()

# Optimizer and loss.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, nesterov=True)
criterion = nn.CrossEntropyLoss()

# The context manager closes (and thereby flushes) the writer on exit.
with writer:
    for epoch in range(1, NUM_EPOCHS + 1):
        epoch_loss = 0.0
        n_batches = 0
        for batch in train_iter:
            train_text = batch.text.to(DEVICE)
            train_label = batch.label.to(DEVICE)

            optimizer.zero_grad()
            out = model(train_text)
            loss = criterion(out, train_label)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            n_batches += 1

        # Report once per epoch so that every global_step carries exactly
        # one value, however many batches the iterator yields.
        mean_loss = epoch_loss / max(n_batches, 1)
        writer.add_scalar('loss', mean_loss, global_step=epoch)
        if epoch % LOG_EVERY == 0:
            print('iter [{}/{}], Loss: {:.4f}'.format(epoch, NUM_EPOCHS, mean_loss))

# Save only the learned parameters.
model_path = os.path.join(os.getcwd(), "model.h5")
torch.save(model.state_dict(), model_path)

iter [10/300], Loss: 2.1015
iter [20/300], Loss: 1.9979
iter [30/300], Loss: 1.9021
iter [40/300], Loss: 1.7218
iter [50/300], Loss: 3.7206
iter [60/300], Loss: 1.5389
iter [70/300], Loss: 1.4861
iter [80/300], Loss: 0.9166
iter [90/300], Loss: 2.8118
iter [100/300], Loss: 1.1905
iter [110/300], Loss: 0.5213
iter [120/300], Loss: 0.6722
iter [130/300], Loss: 0.2272
iter [140/300], Loss: 0.0841
iter [150/300], Loss: 0.0688
iter [160/300], Loss: 0.0673
iter [170/300], Loss: 0.0485
iter [180/300], Loss: 0.0110
iter [190/300], Loss: 0.0162
iter [200/300], Loss: 0.0617
iter [210/300], Loss: 0.0416
iter [220/300], Loss: 0.0491
iter [230/300], Loss: 0.0095
iter [240/300], Loss: 0.0067
iter [250/300], Loss: 0.0082
iter [260/300], Loss: 0.0091
iter [270/300], Loss: 0.0128
iter [280/300], Loss: 0.0084
iter [290/300], Loss: 0.0022
iter [300/300], Loss: 0.0009


![cnn残差模型](img/loss.png)