In [None]:
# 最后训练时间较长，调试好后转为py以命令行的方式运行

In [None]:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras.layers import Concatenate as concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, MaxPooling1D, AveragePooling1D, Flatten, Dropout, Dense, Input
from tensorflow.keras.layers import Layer, InputSpec
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, Callback
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn import metrics
import gensim
import time
from sklearn.metrics import f1_score
import logging
import pickle
from tensorflow.keras.layers import Flatten, Layer, InputSpec
import tensorflow as tf

In [None]:
# Flip to True for a quick smoke run with fewer folds and epochs.
test_mode = False

# (number of CV folds, maximum training epochs) for the selected mode.
fold_num, max_epoch = (2, 3) if test_mode else (5, 50)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Display names of the 14 classes (used as target_names in the classification report).
target_names = [
    '科技', '股票', '体育', '娱乐', '时政', '社会', '教育',
    '财经', '家居', '游戏', '房产', '时尚', '彩票', '星座',
]

In [None]:
class Evaluator(Callback):
    """Keras callback that computes macro-F1 on a held-out set after every epoch.

    The score is stored in the epoch ``logs`` dict under the key ``'val_f1'``;
    the training cell uses that key as the ``monitor`` of its other callbacks,
    so this callback has to be listed before them.

    Args:
        validation_data: pair ``(x_val, y_val)``. ``x_val`` is passed to
            ``model.predict``; ``y_val`` holds the class labels that are
            compared with the arg-max of the predictions.
    """

    def __init__(self, validation_data):
        super().__init__()
        self.best_val_f1 = 0.
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    def evaluate(self):
        """Return the macro-averaged F1 of the current model on the held-out set."""
        y_true = self.y_val
        y_pred = self.model.predict(self.x_val).argmax(axis=1)
        return f1_score(y_true, y_pred, average='macro')

    def on_epoch_end(self, epoch, logs=None):
        """Score the model, track the best F1 seen so far and publish ``val_f1``."""
        val_f1 = self.evaluate()
        if val_f1 > self.best_val_f1:
            self.best_val_f1 = val_f1
        # `logs` defaults to None in the signature; only publish the metric when a
        # dict was actually passed, instead of failing with a TypeError.
        if logs is not None:
            logs['val_f1'] = val_f1
        print(f'val_f1: {val_f1:.5f}, best_val_f1: {self.best_val_f1:.5f}')

In [None]:
#读取训练集、测试集，并合并为total_df
train_df = pd.read_csv('../data/train_set.csv', sep='\t')
test_df = pd.read_csv('../data/test_a.csv', sep='\t')
total_df = pd.concat([train_df['text'], test_df['text']], axis=0)

# # 需要使用xxx.ipynb预先生成cut_train.csv和cut_test.csv
# cut_train_df = pd.read_csv('../data/cut_train.csv', sep='\t')
# cut_test_df = pd.read_csv('../data/cut_test.csv', sep='\t')

In [None]:
# Run this cell on the first run; later runs can load the cached pickles instead.
# max_len is the fixed length every sequence is padded / truncated to.
max_len = 3000
tokenizer = Tokenizer()            # maps every token to a positive integer id
tokenizer.fit_on_texts(total_df)   # number the tokens: the more frequent, the smaller the id
vocab = tokenizer.word_index       # token -> id mapping

# Use the frames that this notebook actually loads (`train_df` / `test_df`, column
# 'text'). The former `train_set_df['text_set']` / `test_set_df['text_set']` are
# never defined here and raised NameError on a fresh kernel.
# NOTE(review): the cache file names contain "tpre1500_tpost1500", which suggests a
# head+tail truncation step that is not present in this notebook - confirm whether
# such preprocessing should be applied before tokenizing.
x_test_word_ids = tokenizer.texts_to_sequences(test_df['text'])
x_test_padded_seqs = pad_sequences(x_test_word_ids, maxlen=max_len, padding="post", truncating="post")

raw_train_word_ids = tokenizer.texts_to_sequences(train_df['text'])
# Training only needs raw_train_padded_seqs and the labels
raw_train_padded_seqs = pad_sequences(raw_train_word_ids, maxlen=max_len, padding="post", truncating="post")
raw_train_onehot_labels = keras.utils.to_categorical(train_df['label'], num_classes=14)

# Cache the intermediate data so later runs can skip tokenization.
# `with` closes (and flushes) every file handle once the dump is done.
with open("../tmp_data/textcnn_maxlen3000_tpre1500_tpost1500_ppost_seq_test.pickle", "wb") as fh:
    pickle.dump(x_test_padded_seqs, fh, protocol=4)
with open("../tmp_data/textcnn_maxlen3000_tpre1500_tpost1500_ppost_seq_train.pickle", "wb") as fh:
    pickle.dump(raw_train_padded_seqs, fh, protocol=4)
# The next two do not need to be saved every time; saving them again does not change the result
with open("../tmp_data/textcnn_onehot_label_train.pickle", "wb") as fh:
    pickle.dump(raw_train_onehot_labels, fh)
with open("../tmp_data/textcnn_tokenizer.pickle", "wb") as fh:
    pickle.dump(tokenizer, fh)

In [None]:
# # 第二次跑同样实验时候可以加载存储的中间数据
# # x_test_padded_seqs = pickle.load(open("../tmp_data/textcnn_maxlen3000_tpre1500_tpost1500_ppost_seq_test.pickle", "rb"))
# raw_train_padded_seqs = pickle.load(open("../tmp_data/textcnn_maxlen3000_tpre1500_tpost1500_ppost_seq_train.pickle", "rb"))
# raw_train_onehot_labels = pickle.load(open("../tmp_data/textcnn_onehot_label_train.pickle", "rb"))
# tokenizer = pickle.load(open("../tmp_data/textcnn_tokenizer.pickle", "rb"))
# vocab = tokenizer.word_index

In [None]:
# Load the pre-trained word2vec vectors (optional step)

logging.info("Loading word2vec weight...")
# The word2vec model has to be trained beforehand with xxx.ipynb
vectorPath = '../tmp_data/word2vec.d300.sg.w5.model'  # local path of the saved model
Word2VecModel = gensim.models.Word2Vec.load(vectorPath)  # load the trained model

# Row `index` receives the vector of the token with that id; rows that are never
# assigned (ids without a word2vec entry) stay all-zero.
embeddings_matrix = np.zeros((len(vocab) + 1, Word2VecModel.vector_size))
print(embeddings_matrix.shape)
word_vectors = Word2VecModel.wv
for word, index in vocab.items():
    # `word in wv` is supported by gensim 3.x and 4.x alike, whereas the
    # `wv.vocab` dict used previously was removed in gensim 4.
    if word in word_vectors:
        embeddings_matrix[index] = word_vectors[word]

In [None]:
# Embedding width constant (not referenced by the other visible cells).
EMBEDDING_DIM = 300
# Name of the metric watched by the checkpoint / early-stopping / LR callbacks.
monitor = 'val_f1'
# Number of cross-validation folds, taken from the run configuration.
k_fold = fold_num

In [None]:
# Stratified K-fold training of a TextCNN; test probabilities are summed over folds.


def _build_textcnn(seq_len, embedding_weights, kernel_sizes=(3, 4, 5, 10),
                   num_filters=256, dropout_rate=0.2, num_classes=14):
    """Build and compile the TextCNN classifier that is trained on every fold.

    Args:
        seq_len: length of the padded input sequences.
        embedding_weights: 2-D array of pre-trained vectors, one row per token id;
            it initialises a frozen Embedding layer.
        kernel_sizes: one Conv1D + max-pooling branch is created per entry.
        num_filters: number of filters of every Conv1D branch.
        dropout_rate: dropout applied to the concatenated pooled features.
        num_classes: size of the softmax output.

    Returns:
        A compiled ``Model`` (categorical cross-entropy loss, Adam optimizer).
    """
    # The inputs are integer token ids, so declare an integer input rather than float64.
    main_input = Input(shape=(seq_len,), dtype='int32')

    # Embedding layer initialised with the pre-trained word vectors and kept frozen.
    embedder = Embedding(input_dim=len(embedding_weights),
                         output_dim=embedding_weights.shape[1],
                         weights=[embedding_weights],
                         input_length=seq_len,
                         trainable=False)
    embed = embedder(main_input)

    # One convolution + pooling-over-the-whole-sequence branch per kernel size.
    # (An additional average-pooling branch was tried earlier and left disabled.)
    pooled = []
    for kernel_size in kernel_sizes:
        conv = Conv1D(num_filters, kernel_size, padding='same', strides=1, activation='relu')(embed)
        pooled.append(MaxPooling1D(pool_size=seq_len)(conv))

    cnn = concatenate(axis=-1)(pooled)
    flat = Flatten()(cnn)
    drop = Dropout(dropout_rate)(flat)
    main_output = Dense(num_classes, activation='softmax')(drop)

    model = Model(inputs=main_input, outputs=main_output)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


num_classes = len(target_names)
skf = StratifiedKFold(n_splits=k_fold, random_state=2020, shuffle=True)
test_pred = np.zeros((test_df.shape[0], num_classes), dtype=np.float32)

logging.info("Start training...")
total_start = time.time()

for idx, (train_index, valid_index) in enumerate(skf.split(raw_train_padded_seqs, train_df['label'])):
    # Drop the Keras state left over from the previous fold before building a new model.
    keras.backend.clear_session()

    x_train_padded_seqs, x_val_padded_seqs = raw_train_padded_seqs[train_index], raw_train_padded_seqs[valid_index]
    one_hot_labels, val_one_hot_labels = raw_train_onehot_labels[train_index], raw_train_onehot_labels[valid_index]
    y_train, y_val = train_df['label'].values[train_index], train_df['label'].values[valid_index]

    # Callbacks all monitor the macro-F1 that Evaluator writes into the epoch logs.
    model_path = '../models/textcnn_maxlen3000_tpre1500_tpost1500_ppost_{}.h5'.format(idx+1)
    checkpoint = ModelCheckpoint(model_path, monitor=monitor, verbose=1, save_best_only=True, mode='max', save_weights_only=True)
    earlystopping = EarlyStopping(monitor=monitor, patience=5, verbose=1, mode='max')
    reduce_lr = ReduceLROnPlateau(monitor=monitor, factor=0.5, patience=2, mode='max', verbose=1)

    # Build the TextCNN for this fold.
    model = _build_textcnn(max_len, embeddings_matrix, num_classes=num_classes)

    logging.info("Fold {} fitting...".format(idx+1))
    start = time.time()
    model.fit(x_train_padded_seqs,
              one_hot_labels,
              batch_size=256,
              validation_data=(x_val_padded_seqs, val_one_hot_labels),
              # Evaluator must come first: it adds 'val_f1' to the logs the others monitor.
              callbacks=[Evaluator(validation_data=(x_val_padded_seqs, y_val)), checkpoint, reduce_lr, earlystopping],
              verbose=2,
              shuffle=True,
              epochs=max_epoch)
    end = time.time()
    logging.info("fold {0} train {1:.3f} min".format(idx+1, ((end - start) / 60)))

    # After fit() the model holds the weights of the last epoch, which is `patience`
    # epochs past the best one when early stopping fired. Restore the best-val_f1
    # checkpoint so the saved model, the report and the test predictions all use it.
    model.load_weights(model_path)
    model.save('../models/textcnn_maxlen3000_tpre1500_tpost1500_ppost_model_{}.h5'.format(idx+1))

    y_val_predict_result = model.predict(x_val_padded_seqs)
    y_val_predict_label = np.argmax(y_val_predict_result, axis=1)
    logging.info("\n{}\n".format(classification_report(y_val, y_val_predict_label, target_names=target_names)))

    # Sum the class probabilities over folds; the final label is the arg-max of the sum.
    test_pred += model.predict(x_test_padded_seqs)

total_end = time.time()
logging.info("Total train time: {:.3f} min".format((total_end - total_start)/60))

In [None]:
# Write the predicted test labels to a CSV file.
# (The heading used to be bare text rather than a comment; Python parsed it as an
# identifier and the cell stopped with a NameError before anything was written.)
y_test_predict_label = np.argmax(test_pred, axis=1)
test_df['label'] = y_test_predict_label
test_df.to_csv('../results/textcnn_maxlen3000_tpre1500_tpost1500_ppost.csv', index=False, columns=['label'])