In [1]:
import gc
import json
import os
import warnings

import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import tensorflow as tf
import keras
from keras.layers import *
from keras.models import *
from keras.optimizers import *
from keras.callbacks import *
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import KFold
import dask.dataframe as dd
warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [2]:
# Text splits: with sep="\n" every line of the file becomes one row of the
# single "fact" column.
train, val, test = [
    pd.read_table(path, sep="\n", names=["fact"])
    for path in ("../smalldata/train_word.txt",
                 "../smalldata/val_word.txt",
                 "../smalldata/test_word.txt")
]

# Label splits: one line per sample, read into an unnamed column 0
# (header=None), row-aligned with the text files above.
train_label, val_label, test_label = [
    pd.read_table(path, header=None, sep="\n")
    for path in ("../smalldata/train_label",
                 "../smalldata/val_label",
                 "../smalldata/test_label")
]

In [3]:
# Sanity check: each text split should have the same row count as its label split.
print(*(frame.shape for frame in (train, val, test, train_label, val_label, test_label)))

(154592, 1) (17131, 1) (32508, 1) (154592, 1) (17131, 1) (32508, 1)


In [4]:
def w2v_pad(train,val, test, maxlen_,victor_size):
    """Tokenise and pad the three splits and build a Word2Vec embedding matrix.

    Parameters
    ----------
    train, val, test : DataFrame
        Frames with a ``'fact'`` column of space-separated tokens.
    maxlen_ : int
        Length every id sequence is padded / truncated to.
    victor_size : int
        Dimensionality of the Word2Vec vectors (also part of the cache name).

    Returns
    -------
    train_, val_, test_ : ndarray
        Padded id sequences, one row per document, ``maxlen_`` columns.
    word_index : dict
        The tokenizer's full word -> id mapping.
    embedding_matrix : ndarray
        ``(len(word_index) + 1, victor_size)`` matrix; row ``i`` holds the
        vector of the word with id ``i`` (row 0 stays all-zero).

    Side effects: prints progress, and trains + saves a Word2Vec model under
    ``../cache/`` when no cached model for this ``victor_size`` exists.
    """
    max_features = 50000
    cache_dir = '../cache/'
    count = 0

    # Build the combined corpus once; it is needed by both the tokenizer and
    # the Word2Vec trainer.
    corpus = pd.concat([train,val,test])['fact'].tolist()

    # Step 1: map words to integer ids and pad to a fixed length.
    print("step 1:transform the  text into idx")

    tokenizer = text.Tokenizer(num_words=max_features, lower=True)
    tokenizer.fit_on_texts(corpus)

    def _encode(frame):
        # Convert one split's documents to padded id sequences.
        ids = tokenizer.texts_to_sequences(frame['fact'].tolist())
        return sequence.pad_sequences(ids, maxlen=maxlen_)

    train_ = _encode(train)
    val_ = _encode(val)
    test_ = _encode(test)
    word_index = tokenizer.word_index

    print("step 1 run finish")

    # Step 2: train the word vectors, or load them from the on-disk cache.
    print("step 2: train the word2vec")
    file_name = cache_dir + 'Word2Vec_' + str(victor_size) +'.model'
    if not os.path.exists(file_name):
        print("train word2vec")
        # Make sure the cache directory exists *before* the (long) training
        # run, so model.save() cannot fail afterwards on a missing folder.
        os.makedirs(cache_dir, exist_ok=True)
        model = Word2Vec([line.split(" ") for line in corpus],
                         size=victor_size, window=5, iter=15, workers=11, seed=2018, min_count=5)
        model.save(file_name)
    else:
        model = Word2Vec.load(file_name)
    print("step 2 run finish")

    # Step 3: assemble the embedding matrix indexed by tokenizer id.
    print("step 3: transform into embedding matrix")

    # Look words up through the KeyedVectors object; indexing the Word2Vec
    # model directly (model[word] / word in model) is deprecated.
    word_vectors = model.wv

    # NOTE(review): the matrix has one row per word the tokenizer has seen,
    # although num_words=max_features caps the ids the tokenizer emits; rows
    # beyond that cap look unused -- confirm before shrinking, since callers
    # size the Embedding layer from this shape.
    embedding_matrix = np.zeros((len(word_index) + 1, victor_size))
    for word, i in word_index.items():
        if word in word_vectors:
            count += 1
            embedding_matrix[i] = word_vectors[word]
        else:
            # Out-of-vocabulary word: uniform noise in [0, 0.5) shifted to
            # zero mean. Drawn from the global NumPy RNG, which is not seeded
            # here, so these rows differ between runs.
            unk_vec = np.random.random(victor_size) * 0.5
            unk_vec = unk_vec - unk_vec.mean()
            embedding_matrix[i] = unk_vec

    print("step 3 run finish")

    # Last value: fraction of matrix rows backed by a trained Word2Vec vector.
    print(embedding_matrix.shape, train_.shape, count * 1.0 / embedding_matrix.shape[0])
    return train_,val_,test_, word_index, embedding_matrix

# Padded sequence length and Word2Vec vector size used by the rest of the notebook.
word_seq_len, victor_size = 400, 300

train_, val_, test_, word2idx, word_embedding = w2v_pad(
    train=train,
    val=val,
    test=test,
    maxlen_=word_seq_len,
    victor_size=victor_size)
print("word to idx finish")



step 1:transform the  text into idx
step 1 run finish
step 2: train the word2vec
step 2 run finish
step 3: transform into embedding matrix
step 3 run finish
(579810, 300) (154592, 400) 0.1985374519239061
word to idx finish


In [5]:
# Each label line holds one or more space-separated labels; turn column 0 of
# every split into a list of label lists.
# Note: this cell rebinds the *_label names (DataFrame -> arrays), so it cannot
# be re-run without re-running the loading cell first.
train_label, val_label, test_label = [
    [line.split(" ") for line in frame[0]]
    for frame in (train_label, val_label, test_label)
]

# Fit on the union of all three splits so every label gets a column.
mlb = MultiLabelBinarizer()
mlb.fit(train_label + test_label + val_label)

# Multi-hot encode each split with the shared binarizer.
train_label, val_label, test_label = [
    mlb.transform(rows) for rows in (train_label, val_label, test_label)
]
label_name = mlb.classes_

print(train_label.shape, test_label.shape)

(154592, 202) (32508, 202)


Model section: TextCNN definition, training and evaluation

In [8]:
def TextCNN(sent_length,embeddings_weight,num_classes=202):
    """Build and compile a multi-kernel 1-D CNN text classifier.

    Parameters
    ----------
    sent_length : int
        Length of the (padded) input id sequences.
    embeddings_weight : ndarray
        Pre-trained embedding matrix of shape (vocab rows, embedding dim);
        loaded into a frozen (non-trainable) Embedding layer.
    num_classes : int, optional
        Number of output units. Defaults to 202, the value that was
        previously hard-coded.

    Returns
    -------
    keras Model
        Compiled with Adam, categorical cross-entropy and accuracy. The model
        summary is printed as a side effect.
    """
    content = Input(shape=(sent_length,), dtype='int32')
    embedding = Embedding(
                            name="word_embedding",
                            input_dim=embeddings_weight.shape[0],
                            weights=[embeddings_weight],
                            output_dim=embeddings_weight.shape[1],
                            trainable=False)
    x=embedding(content)

    # One Conv1D + MaxPool1D branch per kernel size, all reading the same
    # embedded sequence. Layers are created in the same order as before
    # (conv, pool, conv, pool, ...), so auto-generated layer names match.
    branches = []
    for kernel_size in (1, 2, 3, 4):
        conv = Conv1D(filters=64, kernel_size=kernel_size, padding='same')(x)
        branches.append(MaxPool1D(pool_size=32)(conv))

    cnn = concatenate(branches, axis=-1)
    fc = Flatten()(cnn)

    # Fully connected head. BatchNormalization and Dropout(0.2) after the
    # dense layers were tried at some point and are currently disabled.
    fc=Dense(512)(fc)
    fc=Activation(activation="relu")(fc)

    fc=Dense(256)(fc)
    fc = Activation(activation="relu")(fc)

    # NOTE(review): the targets in this notebook come from
    # MultiLabelBinarizer (multi-hot rows), yet the head is softmax with
    # categorical cross-entropy; sigmoid + binary cross-entropy is the usual
    # multi-label setup -- confirm this choice is intentional.
    output = Dense(num_classes,  activation="softmax")(fc)


    model = Model(inputs=content, outputs=output)
    model.compile(loss= "categorical_crossentropy",
              optimizer='adam',
              metrics=['accuracy'])
    model.summary()
    return model

In [12]:
from sklearn.model_selection import train_test_split


# Where ModelCheckpoint would store the best model.
file_path = "../model/TextCNN.hdf"

model = TextCNN(sent_length=word_seq_len, embeddings_weight=word_embedding)

# These callbacks are constructed but deliberately NOT passed to fit() below
# (the `callbacks=` argument is commented out), so they currently have no effect.
early_stopping = EarlyStopping(patience=6, monitor='val_acc')
plateau = ReduceLROnPlateau(monitor="val_acc",
                            mode='max',
                            factor=0.5,
                            patience=3,
                            verbose=1)
checkpoint = ModelCheckpoint(file_path,
                             monitor='val_acc',
                             mode='max',
                             save_best_only=True,
                             save_weights_only=False,
                             verbose=1)

# Disabled alternative: skip training when a checkpoint exists at file_path
# and call model.load_weights(file_path) instead.
# To re-enable the callbacks, pass
#     callbacks=[early_stopping, plateau, checkpoint]
# to the fit() call.
model.fit(x=train_,
          y=train_label,
          validation_data=(val_, val_label),
          batch_size=128,
          epochs=20,
          verbose=2)


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 400)          0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 400, 300)     173943000   input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 400, 64)      19264       word_embedding[0][0]             
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 400, 64)      38464       word_embedding[0][0]             
__________________________________________________________________________________________________
conv1d_3 (

<keras.callbacks.History at 0x7fb4f4376f60>

In [16]:
# Per-class scores for every test document (one row per sample).
test_pred = model.predict(x=test_, batch_size=128)

In [14]:
# Single-label evaluation: predict only the arg-max class for each sample.
from sklearn.preprocessing import label_binarize
from sklearn.metrics import *
# Take the number of classes from the prediction matrix instead of repeating
# the hard-coded 202, so this cell follows the model's output width.
n_classes = test_pred.shape[1]
test_pred_onelabel=label_binarize(np.argmax(test_pred,axis=1),classes=list(range(n_classes)))
print('sklearn Macro-F1-Score:', f1_score(test_label, test_pred_onelabel, average='macro'))
print('sklearn Macro-precision-Score:', precision_score(test_label, test_pred_onelabel, average='macro'))
print('sklearn Macro-recall-Score:', recall_score(test_label, test_pred_onelabel, average='macro'))
print('sklearn hamming_loss:', hamming_loss(test_label, test_pred_onelabel))

sklearn Macro-F1-Score: 0.5554373854255992
sklearn Macro-precision-Score: 0.6289731656984494
sklearn Macro-recall-Score: 0.5299730109888571
sklearn hamming_loss: 0.0026159592703456393


In [15]:
from sklearn.metrics import *


# Macro-averaged metrics to report at every threshold, paired with the exact
# label that is printed in front of each value.
macro_metrics = (
    ('sklearn Macro-F1-Score:', f1_score),
    ('sklearn Macro-precision-Score:', precision_score),
    ('sklearn Macro-recall-Score:', recall_score),
)

# Sweep the decision threshold over 0.00, 0.05, ..., 0.95: scores below the
# threshold become 0, the rest become 1, and the result is scored as a
# multi-label prediction.
for i in np.arange(0,1,0.05):
    print(i)
    temp = test_pred.copy()
    below = temp < i
    temp[below] = 0
    # The second mask is evaluated on the already-zeroed array, as in the
    # two-step in-place binarisation this replaces.
    at_or_above = temp >= i
    temp[at_or_above] = 1
    for label, scorer in macro_metrics:
        print(label, scorer(test_label, temp, average='macro'))
    print('sklearn hamming_loss:', hamming_loss(test_label, temp))
      

0.0
sklearn Macro-F1-Score: 0.010407896254786374
sklearn Macro-precision-Score: 0.005287350440470405
sklearn Macro-recall-Score: 1.0
sklearn hamming_loss: 0.9947126495595296
0.05
sklearn Macro-F1-Score: 0.5144346725699229
sklearn Macro-precision-Score: 0.43670731440793775
sklearn Macro-recall-Score: 0.6900723564290342
sklearn hamming_loss: 0.005386640546668178
0.1
sklearn Macro-F1-Score: 0.5554054258904327
sklearn Macro-precision-Score: 0.5098931264996042
sklearn Macro-recall-Score: 0.6542859792928142
sklearn hamming_loss: 0.0038505373239428042
0.15000000000000002
sklearn Macro-F1-Score: 0.571610897014695
sklearn Macro-precision-Score: 0.5532853513895253
sklearn Macro-recall-Score: 0.6283328974819034
sklearn hamming_loss: 0.0032223598882590364
0.2
sklearn Macro-F1-Score: 0.5761391952984019
sklearn Macro-precision-Score: 0.5809014556283769
sklearn Macro-recall-Score: 0.6054050710455529
sklearn hamming_loss: 0.002887331922561027
0.25
sklearn Macro-F1-Score: 0.5797097039336729
sklearn Mac