![textcnn模型](img/textcnn.png)

In [1]:
import os
import pickle

# 加载知识点标签
knowledge_points_path = os.path.join(os.getcwd(), "knowledge_points.pkl")
with open(knowledge_points_path, 'rb') as f_words:
    knowledge_points = pickle.load(f_words)
    
# 原始数据的处理见项目下的transformer-encoder模型中的train.ipynb
# 这里直接加载处理好的训练数据
train_data_path = os.path.join(os.getcwd(), "train_data.pkl")
with open(train_data_path, 'rb') as f_train:
    df_final = pickle.load(f_train)
df_final

Unnamed: 0,content,“重农抑商”政策,不完全显性,与细胞分裂有关的细胞器,中央官制——三公九卿制,中心体的结构和功能,人体免疫系统在维持稳态中的作用,人体水盐平衡调节,人体的体温调节,人口增长与人口问题,...,胚胎移植,蛋白质的合成,血糖平衡的调节,走进细胞,选官、用官制度的变化,遗传的分子基础,遗传的细胞基础,避孕的原理和方法,郡县制,高尔基体的结构和功能
0,左传 记载 春秋 后期 鲁国 大夫 季孙氏 家臣 阳虎 独掌 权柄 后 标榜 鲁国 国君 整...,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,秦始皇 统一 六国后 创制 一套 御玺 任命 国家 官员 封印 皇帝 之玺 任命 四夷 官员...,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,北宋 中央集权 措施 将领 兵权 收归 中央 派 文官 担任 地方 长官 设置 通判 监督 ...,1,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,商朝人 崇信 鬼神 占卜 祭祀 神灵 沟通 手段 负责 通神 事务 商王 巫师 出身 贵族 ...,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,公元 年 北宋 政府 江淮地区 设置 包括 盐业 管理 控制 茶叶 销售 专卖 主要职责 转...,1,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29808,纯种 高杆 抗 锈病 小麦 矮杆 易染 锈病 小麦 培育 矮杆 抗 锈病 小麦 新品种 方法...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29809,下图 二倍体 生物 细胞分裂 受精 作用 过程 中核 含量 染色体 数目 变化 正确 孟德尔...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
29810,调查 人群 中 遗传病 叙述 错误 选取 群体 中 发病率 高 基因 遗传病 患者 家庭成员...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
29811,下图 人类 一种 遗传病 家系 图谱 图中 阴影 患者 推测 病 遗传 方式 常 染色体 显...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [2]:
import os
import torch
from torchtext import data,datasets
from torchtext.data import Iterator, BucketIterator
from torchtext.vocab import Vectors
from torch import nn,optim
import torch.nn.functional as F
import pandas as pd
import pickle

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 按字分    
tokenize =lambda x: x.split(' ')

TEXT = data.Field(
                    sequential=True,
                    tokenize=tokenize,
                    lower=False,
                    use_vocab=True,
                    pad_token='<pad>',
                    unk_token='<unk>',
                    batch_first=True,
                    fix_length=200)

LABEL = data.Field(
                    sequential=False,
                    use_vocab=False,
                    batch_first=True,
                    )

# 获取训练或测试数据集
def get_dataset(csv_data, text_field, label_field, test=False):
    fields = [('id', None), ('text', text_field), ('label', label_field)]
    examples = []
    if test: #测试集，不加载label
        for text in csv_data['content']:
            examples.append(data.Example.fromlist([None, text, None], fields))
    else: # 训练集
        for i in range(len(csv_data)):
            sample = csv_data.loc[i]
            text = sample['content']
            label = [v for v in map(int, sample[knowledge_points])]
            examples.append(data.Example.fromlist([None, text, label], fields))
    return examples, fields

train_examples,train_fields = get_dataset(df_final, TEXT, LABEL)

train = data.Dataset(train_examples, train_fields)
# 预训练数据
pretrained_embedding = os.path.join(os.getcwd(), 'sgns.sogou.char')
vectors = Vectors(name=pretrained_embedding)
# 构建词典
TEXT.build_vocab(train, min_freq=1, vectors = vectors)

#TEXT.build_vocab(train, min_freq=1)
words_path = os.path.join(os.getcwd(), 'words.pkl')
with open(words_path, 'wb') as f_words:
    pickle.dump(TEXT.vocab, f_words)
    
print('process done!')

process done!




In [3]:
print(TEXT.vocab.vectors.shape)

torch.Size([70057, 300])


In [4]:
import random

# 划分训练与验证集，一个问题，利用random_split进行数据集划分后，会丢失fields属性
train_set, val_set = train.split(split_ratio=0.95, random_state=random.seed(1))

BATCH_SIZE = 512
# 生成训练与验证集的迭代器
train_iterator, val_iterator = data.BucketIterator.splits(
    (train_set, val_set),
    batch_size=BATCH_SIZE,
    #shuffle=True,
    # device=device,
    sort_within_batch=False,
    sort_key=lambda x:len(x.text)
)

print('build dataset done!')

build dataset done!


In [7]:
print(len(train_set))
print(len(train_iterator))

28322
443


In [8]:
print(len(train.examples))
print(vars(train.examples[0]))
print(vars(train.examples[1]))

29813
{'text': ['左传', '记载', '春秋', '后期', '鲁国', '大夫', '季孙氏', '家臣', '阳虎', '独掌', '权柄', '后', '标榜', '鲁国', '国君', '整肃', '跋扈', '大夫', '此举', '得不到', '知礼', '之士', '赞成', '反而', '批评', '此举', '挑战', '宗法制度', '损害', '大夫', '利益', '冲击', '天子', '权威', '不', '符合', '周礼', '次数', '阳虎', '身份', '鲁国', '大夫', '季孙氏', '家臣', '周礼', '效忠', '季孙氏', '标榜', '鲁国', '国君', '整肃', '大夫', '僭', '越', '批评', '违背', '周礼', '选择项', '宗法制度', '血缘', '核心', '故项', '与此无关', '排除', '项', '题意', '无关', '排除', '材料', '事件', '涉及', '鲁国', '国内', '周天子', '权威', '无关', '排除', '项'], 'label': [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]}
{'text': ['秦始皇', '统一', '六国后', '创制', '一套', '御玺', '任命', '国家', '官员', '封印', '皇帝', '之玺', '任命', '四夷', '官员', '天子', '之玺', '信玺', '用于', '国内', '四夷', '用兵', '事宜', '行玺', '皇帝', '外', '巡时', '随身携带', '材料', '皇帝', '处于', '至高无上', '地位', '秦朝', '内外', '两种', '系统', '国事', '秦朝', '实行', '中央集权', '体制', '三公九卿', '制

In [5]:
# 构建分类模型
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_size, filter_num=100, filter_size=(3,4,5), dropout=0.5):
        '''
        vocab_size:词典大小
        embedding_dim:词维度大小
        output_size:输出类别数
        filter_num:卷积核数量
        filter_size(3,4,5):三种卷积核，size为3,4,5，每个卷积核有filter_num个，卷积核的宽度都是embedding_dim
        '''
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # conv2d(in_channel,out_channel,kernel_size,stride,padding),stride默认为1，padding默认为0
        self.convs = nn.ModuleList([nn.Conv2d(1, filter_num,(k, embedding_dim)) for k in filter_size])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(filter_num * len(filter_size), output_size)

    def forward(self, x):
        # x :(batch, seq_len)
        x = self.embedding(x) # [batch,word_num,embedding_dim] = [N,H,W]
        x = x.unsqueeze(1) # [batch, channel, word_num, embedding_dim] = [N,C,H,W] 
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs] # len(filter_size) * (N, filter_num, H)
        # MaxPool1d(kernel_size, stride=None, padding=0, dilation=1, return_indices=False, ceil_mode=False),stride默认为kernal_size
        x = [F.max_pool1d(output,output.shape[2]).squeeze(2) for output in x] # len(filter_size) * (N, filter_num)
        x = torch.cat(x, 1) # (N, filter_num * len(filter_size))
        x = self.dropout(x)
        x = self.fc(x)
        return x

1.卷积后的shape计算完整公式：

input_shape:(batch,channel,height,width)

$Input:(N,C,H_{in},W_{in})$

$Output:(N,C,H_{out},W_{out})$

$H_{out}=\lfloor\frac{H_{in} + 2 * padding[0] - dilation[0] * (kernel_-size[0] - 1) - 1}{stride[0]} + 1\rfloor$

$W_{out}=\lfloor\frac{W_{in} + 2 * padding[1] - dilation[1] * (kernel_-size[1] - 1) -1 }{stride[1]} + 1\rfloor$

2.池化max_pool1d计算完整公式：
input_shape:(batch, channel, lin)

$Input:(N,C,L_{in})$

$Output:(N,C,L_{out})$

$L_{out}=\lfloor\frac{L_{in} + 2 * padding - dilation * (kernel_-size - 1) - 1}{stride} + 1\rfloor$

In [6]:

'''
评估
'''
def evaluate(model, criterion):
    model.eval()  # 评估模型，切断dropout与batchnorm
    epoch_loss = 0
    with torch.no_grad():  # 不更新梯度
        for i, batch in enumerate(val_iterator):
            train_text = batch.text  
            train_label = batch.label
            train_label = train_label.float()

            train_text = train_text.to(DEVICE)
            train_label = train_label.to(DEVICE)

            out = model(train_text)
            loss = criterion(out, train_label)
            epoch_loss += float(loss.item())
    print('evaluate loss:{}'.format(epoch_loss/len(val_iterator)))
    
#对所有模块和子模块进行权重初始化
def init_weights(model):
    for name,param in model.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

In [7]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter(os.getcwd()+'/log', comment='textcnn')

# 训练

# 构建model
model = TextCNN(len(TEXT.vocab),TEXT.vocab.vectors.shape[1],73).to(DEVICE)
#初始化权重
model.apply(init_weights)
# 利用预训练模型初始化embedding，requires_grad=True，可以fine-tune
model.embedding.weight.data.copy_(TEXT.vocab.vectors)
# 训练模式
model.train()
# 优化和损失
#optimizer = torch.optim.Adam(model.parameters(),lr=0.1, weight_decay=0.01)
optimizer = torch.optim.SGD(model.parameters(),lr=0.1, momentum=0.9, nesterov=True)
criterion = nn.BCEWithLogitsLoss()

with writer:
    for iter in range(300):
        for i, batch in enumerate(train_iterator):
            train_text = batch.text
            train_label = batch.label
            train_label = train_label.float()
            
            train_text = train_text.to(DEVICE)
            train_label = train_label.to(DEVICE)
            out = model(train_text)
            loss = criterion(out, train_label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (iter+1) % 10 == 0:
                    print ('iter [{}/{}], Loss: {:.4f}'.format(iter+1, 300, loss.item()))
            #writer.add_graph(model, input_to_model=train_text,verbose=False)
            writer.add_scalar('loss',loss.item(),global_step=iter+1)
        if (iter+1) % 10 == 0:
            evaluate(model, criterion)
    writer.flush()
    writer.close()
            
model_path = os.path.join(os.getcwd(), "model.h5")
torch.save(model.state_dict(), model_path)


iter [10/300], Loss: 0.0816
iter [10/300], Loss: 0.0827
iter [10/300], Loss: 0.0863
iter [10/300], Loss: 0.0817
iter [10/300], Loss: 0.0855
iter [10/300], Loss: 0.0837
iter [10/300], Loss: 0.0871
iter [10/300], Loss: 0.0844
iter [10/300], Loss: 0.0866
iter [10/300], Loss: 0.0859
iter [10/300], Loss: 0.0860
iter [10/300], Loss: 0.0777
iter [10/300], Loss: 0.0808
iter [10/300], Loss: 0.0890
iter [10/300], Loss: 0.0802
iter [10/300], Loss: 0.0814
iter [10/300], Loss: 0.0900
iter [10/300], Loss: 0.0843
iter [10/300], Loss: 0.0835
iter [10/300], Loss: 0.0880
iter [10/300], Loss: 0.0809
iter [10/300], Loss: 0.0843
iter [10/300], Loss: 0.0806
iter [10/300], Loss: 0.0874
iter [10/300], Loss: 0.0840
iter [10/300], Loss: 0.0824
iter [10/300], Loss: 0.0845
iter [10/300], Loss: 0.0798
iter [10/300], Loss: 0.0820
iter [10/300], Loss: 0.0855
iter [10/300], Loss: 0.0902
iter [10/300], Loss: 0.0861
iter [10/300], Loss: 0.0814
iter [10/300], Loss: 0.0894
iter [10/300], Loss: 0.0818
iter [10/300], Loss:

iter [60/300], Loss: 0.0224
iter [60/300], Loss: 0.0256
iter [60/300], Loss: 0.0252
iter [60/300], Loss: 0.0227
iter [60/300], Loss: 0.0241
iter [60/300], Loss: 0.0217
iter [60/300], Loss: 0.0257
iter [60/300], Loss: 0.0230
iter [60/300], Loss: 0.0252
iter [60/300], Loss: 0.0248
iter [60/300], Loss: 0.0237
iter [60/300], Loss: 0.0242
iter [60/300], Loss: 0.0225
iter [60/300], Loss: 0.0220
iter [60/300], Loss: 0.0242
iter [60/300], Loss: 0.0242
iter [60/300], Loss: 0.0239
iter [60/300], Loss: 0.0231
iter [60/300], Loss: 0.0243
iter [60/300], Loss: 0.0245
iter [60/300], Loss: 0.0229
iter [60/300], Loss: 0.0269
iter [60/300], Loss: 0.0247
iter [60/300], Loss: 0.0227
iter [60/300], Loss: 0.0228
iter [60/300], Loss: 0.0232
iter [60/300], Loss: 0.0242
iter [60/300], Loss: 0.0231
iter [60/300], Loss: 0.0205
iter [60/300], Loss: 0.0234
iter [60/300], Loss: 0.0250
iter [60/300], Loss: 0.0220
iter [60/300], Loss: 0.0226
iter [60/300], Loss: 0.0245
iter [60/300], Loss: 0.0221
iter [60/300], Loss:

iter [110/300], Loss: 0.0147
iter [110/300], Loss: 0.0155
iter [110/300], Loss: 0.0151
iter [110/300], Loss: 0.0147
iter [110/300], Loss: 0.0145
iter [110/300], Loss: 0.0149
iter [110/300], Loss: 0.0149
iter [110/300], Loss: 0.0133
iter [110/300], Loss: 0.0140
iter [110/300], Loss: 0.0137
iter [110/300], Loss: 0.0171
iter [110/300], Loss: 0.0141
iter [110/300], Loss: 0.0164
iter [110/300], Loss: 0.0162
iter [110/300], Loss: 0.0151
iter [110/300], Loss: 0.0149
iter [110/300], Loss: 0.0174
iter [110/300], Loss: 0.0175
iter [110/300], Loss: 0.0174
iter [110/300], Loss: 0.0169
iter [110/300], Loss: 0.0184
iter [110/300], Loss: 0.0162
iter [110/300], Loss: 0.0140
iter [110/300], Loss: 0.0168
iter [110/300], Loss: 0.0164
iter [110/300], Loss: 0.0156
iter [110/300], Loss: 0.0149
iter [110/300], Loss: 0.0148
iter [110/300], Loss: 0.0168
iter [110/300], Loss: 0.0158
iter [110/300], Loss: 0.0159
iter [110/300], Loss: 0.0174
iter [110/300], Loss: 0.0159
iter [110/300], Loss: 0.0168
iter [110/300]

iter [160/300], Loss: 0.0104
iter [160/300], Loss: 0.0123
iter [160/300], Loss: 0.0104
iter [160/300], Loss: 0.0146
iter [160/300], Loss: 0.0128
iter [160/300], Loss: 0.0119
iter [160/300], Loss: 0.0142
iter [160/300], Loss: 0.0114
iter [160/300], Loss: 0.0121
iter [160/300], Loss: 0.0101
iter [160/300], Loss: 0.0117
iter [160/300], Loss: 0.0102
iter [160/300], Loss: 0.0113
iter [160/300], Loss: 0.0112
iter [160/300], Loss: 0.0116
iter [160/300], Loss: 0.0121
iter [160/300], Loss: 0.0130
iter [160/300], Loss: 0.0111
iter [160/300], Loss: 0.0128
iter [160/300], Loss: 0.0113
iter [160/300], Loss: 0.0129
iter [160/300], Loss: 0.0124
iter [160/300], Loss: 0.0114
iter [160/300], Loss: 0.0099
iter [160/300], Loss: 0.0126
iter [160/300], Loss: 0.0107
iter [160/300], Loss: 0.0113
iter [160/300], Loss: 0.0135
iter [160/300], Loss: 0.0126
iter [160/300], Loss: 0.0101
iter [160/300], Loss: 0.0121
iter [160/300], Loss: 0.0121
iter [160/300], Loss: 0.0117
iter [160/300], Loss: 0.0101
iter [160/300]

iter [210/300], Loss: 0.0090
iter [210/300], Loss: 0.0083
iter [210/300], Loss: 0.0095
iter [210/300], Loss: 0.0090
iter [210/300], Loss: 0.0098
iter [210/300], Loss: 0.0102
iter [210/300], Loss: 0.0109
iter [210/300], Loss: 0.0105
iter [210/300], Loss: 0.0093
iter [210/300], Loss: 0.0094
iter [210/300], Loss: 0.0077
iter [210/300], Loss: 0.0088
iter [210/300], Loss: 0.0093
iter [210/300], Loss: 0.0106
iter [210/300], Loss: 0.0086
iter [210/300], Loss: 0.0091
iter [210/300], Loss: 0.0091
iter [210/300], Loss: 0.0083
iter [210/300], Loss: 0.0093
iter [210/300], Loss: 0.0108
iter [210/300], Loss: 0.0099
iter [210/300], Loss: 0.0083
iter [210/300], Loss: 0.0090
iter [210/300], Loss: 0.0088
iter [210/300], Loss: 0.0087
iter [210/300], Loss: 0.0083
iter [210/300], Loss: 0.0082
iter [210/300], Loss: 0.0094
iter [210/300], Loss: 0.0097
iter [210/300], Loss: 0.0094
iter [210/300], Loss: 0.0086
iter [210/300], Loss: 0.0084
iter [210/300], Loss: 0.0091
iter [210/300], Loss: 0.0093
iter [210/300]

iter [260/300], Loss: 0.0070
iter [260/300], Loss: 0.0074
iter [260/300], Loss: 0.0066
iter [260/300], Loss: 0.0075
iter [260/300], Loss: 0.0074
iter [260/300], Loss: 0.0082
iter [260/300], Loss: 0.0078
iter [260/300], Loss: 0.0087
iter [260/300], Loss: 0.0080
iter [260/300], Loss: 0.0076
iter [260/300], Loss: 0.0089
iter [260/300], Loss: 0.0077
iter [260/300], Loss: 0.0072
iter [260/300], Loss: 0.0081
iter [260/300], Loss: 0.0074
iter [260/300], Loss: 0.0082
iter [260/300], Loss: 0.0076
iter [260/300], Loss: 0.0077
iter [260/300], Loss: 0.0072
iter [260/300], Loss: 0.0078
iter [260/300], Loss: 0.0072
iter [260/300], Loss: 0.0072
iter [260/300], Loss: 0.0072
iter [260/300], Loss: 0.0075
iter [260/300], Loss: 0.0077
iter [260/300], Loss: 0.0062
iter [260/300], Loss: 0.0069
iter [260/300], Loss: 0.0078
iter [260/300], Loss: 0.0072
iter [260/300], Loss: 0.0071
iter [260/300], Loss: 0.0085
iter [260/300], Loss: 0.0086
iter [260/300], Loss: 0.0076
iter [260/300], Loss: 0.0068
iter [260/300]

![textcnn模型](img/loss.png)

In [8]:
# 释放gpu显存
torch.cuda.empty_cache()