In [1]:
import pandas as pd
import numpy as np

from tensorflow import keras
from tensorflow.keras.layers import Concatenate as concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, MaxPooling1D, AveragePooling1D, Flatten, Dropout, Dense, Input
from tensorflow.keras.layers import Layer, InputSpec
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, Callback
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import classification_report

from sklearn import metrics
import gensim
import time
from sklearn.metrics import f1_score
import logging
import scipy.sparse as sp
import pickle

from tensorflow.keras.layers import Flatten, Layer, InputSpec
import tensorflow as tf

In [2]:
# Root logger setup: INFO and above, each record rendered as
# "<timestamp> <LEVEL>: <message>" with a second-resolution timestamp.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)s: %(message)s',
)

In [3]:
# Sequence length that inputs are padded/truncated to before they reach the model.
max_len = 3000

# Load the fitted tokenizer. The context manager guarantees the file handle is
# closed even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load tokenizer
# files that come from a trusted source.
with open("../tmp_data/textcnn_maxlen3000_tokenizer.pickle", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Token -> integer index mapping taken from the tokenizer.
vocab = tokenizer.word_index

In [4]:
vectorPath = '../tmp_data/word2vec.d300.sg.w5.model'  # path to the local word2vec model
Word2VecModel = gensim.models.Word2Vec.load(vectorPath)  # load the pretrained word vectors

# One row per tokenizer index, one column per embedding dimension. Rows that
# are never assigned below (tokens absent from the word2vec vocabulary) stay
# all-zero. The extra "+ 1" row presumably exists because tokenizer indices
# start at 1 — TODO confirm against the tokenizer.
embeddings_matrix = np.zeros((len(vocab) + 1, Word2VecModel.vector_size))
print(embeddings_matrix.shape)
for word, index in vocab.items():
    # Test membership on the KeyedVectors object itself: this works on both
    # gensim 3.x and 4.x, whereas `wv.vocab` was removed in gensim 4.
    if word in Word2VecModel.wv:
        embeddings_matrix[index] = Word2VecModel.wv[word]  # fill the embedding row

2020-09-08 17:48:43 INFO: loading Word2Vec object from ../tmp_data/word2vec.d300.sg.w5.model
2020-09-08 17:48:44 INFO: loading wv recursively from ../tmp_data/word2vec.d300.sg.w5.model.wv.* with mmap=None
2020-09-08 17:48:44 INFO: setting ignored attribute vectors_norm to None
2020-09-08 17:48:44 INFO: loading vocabulary recursively from ../tmp_data/word2vec.d300.sg.w5.model.vocabulary.* with mmap=None
2020-09-08 17:48:44 INFO: loading trainables recursively from ../tmp_data/word2vec.d300.sg.w5.model.trainables.* with mmap=None
2020-09-08 17:48:44 INFO: setting ignored attribute cum_table to None
2020-09-08 17:48:44 INFO: loaded ../tmp_data/word2vec.d300.sg.w5.model


(6978, 300)


In [5]:
# Read the tab-separated test file and convert its 'cut_text' column to
# integer token sequences with the tokenizer loaded earlier.
test_df = pd.read_csv('../data/cut_test_b.csv', sep='\t')
x_test_word_ids = tokenizer.texts_to_sequences(test_df['cut_text'])

# Bring every sequence to exactly max_len ids: shorter ones are padded at the
# end, longer ones are cut at the end.
# NOTE(review): the checkpoint file names used later contain
# "tpre1500_tpost1500"; if training kept the first and last 1500 tokens,
# truncating="post" here would not match that preprocessing — confirm.
x_test_padded_seqs = pad_sequences(
    x_test_word_ids,
    maxlen=max_len,
    truncating="post",
    padding="post",
)

In [6]:
def build_model(max_len, embeddings_matrix):
    """Build and compile the multi-kernel TextCNN classifier.

    The network embeds a fixed-length sequence of token ids with a frozen,
    pretrained embedding matrix, runs four parallel 1-D convolutions
    (256 filters each, kernel sizes 3, 4, 5 and 10), max-pools each branch
    with a window of ``max_len`` positions, concatenates the pooled branches,
    applies dropout (rate 0.2) and ends in a 14-unit softmax layer.

    Args:
        max_len: Length of every input sequence; also used as the pooling
            window of each convolution branch.
        embeddings_matrix: 2-D array of pretrained vectors. Its number of rows
            is used as the embedding input dimension and its second axis as
            the embedding output dimension.

    Returns:
        A Keras ``Model`` compiled with categorical cross-entropy loss, the
        Adam optimizer and the accuracy metric.
    """
    vocab_size = len(embeddings_matrix)
    embed_dim = embeddings_matrix.shape[1]

    # NOTE(review): token ids are declared as float64 here; an integer dtype
    # is the usual choice for Embedding input. Kept as in the original.
    main_input = Input(shape=(max_len,), dtype='float64')

    # Embedding layer initialised from the pretrained vectors and kept frozen.
    embed = Embedding(
        input_dim=vocab_size,
        output_dim=embed_dim,
        weights=[embeddings_matrix],
        input_length=max_len,
        trainable=False,
    )(main_input)

    # One convolution + max-pooling branch per kernel size. Layers are created
    # in the same order as before (conv, pool, conv, pool, ...).
    pooled_branches = []
    for kernel_size in (3, 4, 5, 10):
        conv = Conv1D(256, kernel_size, padding='same', strides=1, activation='relu')(embed)
        pooled_branches.append(MaxPooling1D(pool_size=max_len)(conv))

    merged = concatenate(axis=-1)(pooled_branches)
    features = Dropout(0.2)(Flatten()(merged))
    main_output = Dense(14, activation='softmax')(features)

    model = Model(inputs=main_input, outputs=main_output)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [7]:
# Ensemble the saved fold checkpoints: rebuild the network for each fold, load
# that fold's weights, and accumulate an equal-weight average of the predicted
# class probabilities over the test set.
n_folds = 5
checkpoint_template = '../models/textcnn_maxlen3000_tpre1500_tpost1500_ppost_{}.h5'

test_pred = np.zeros((len(test_df), 14))
for fold in range(1, n_folds + 1):  # checkpoint files are numbered from 1
    model = build_model(max_len, embeddings_matrix)
    model.load_weights(checkpoint_template.format(fold))
    prob = model.predict(x_test_padded_seqs, batch_size=256, verbose=1)
    test_pred += prob / n_folds



In [8]:
# For each test row, take the index of the class with the highest averaged probability.
predict_label = test_pred.argmax(axis=1)

In [9]:
# Attach the predicted class ids to the test frame, then write only that
# column (without the index) to the results CSV.
submission_path = "../results/textcnn_maxlen3000_tpre1500_tpost1500_ppost_+f20_5fold_test_b.csv"
test_df['label'] = predict_label
test_df[['label']].to_csv(submission_path, index=False)

In [None]:
# Keep only the rows whose 'label' and 'predict_label' columns differ.
# NOTE(review): train_df is not defined in any visible cell of this notebook —
# confirm where it is supposed to come from before running this cell.
is_mismatch = train_df['label'] != train_df['predict_label']
temp1 = train_df[is_mismatch]

In [None]:
# Histogram of the 'label' values among mismatched rows whose 'predict_prob' is below 0.5.
temp1.loc[temp1['predict_prob'] < 0.5, 'label'].hist()

In [None]:
# Build a "<label>_<predict_label>" string for every row of temp1.
# axis=1 belongs to DataFrame.apply (so the lambda receives one row at a time),
# not to str.format; the original line also had an unbalanced parenthesis.
temp2 = temp1.apply(
    lambda row: "{0}_{1}".format(row['label'], row['predict_label']),
    axis=1,
)

In [None]:
# Scratch expression: an anonymous function that returns x when x > y and y
# otherwise. It is not bound to a name, so evaluating the cell only displays
# the function object.
lambda x, y: x if x>y else y

In [None]:
# Scratch value: a list containing one single-element tuple, (1,).
temp_test = [(1,)]