In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from gensim.models import Word2Vec
from tensorflow.keras.layers import (Bidirectional,
                                     Embedding,
                                     GRU, 
                                     GlobalAveragePooling1D,
                                     GlobalMaxPooling1D,
                                     Concatenate,
                                     SpatialDropout1D,
                                     BatchNormalization,
                                     Dropout,
                                     Dense,
                                     Activation,
                                     concatenate,
                                     Input
                                    )
                        

In [2]:
#读取数据集
train = pd.read_csv('../data/train_set.csv')
test = pd.read_csv('../data/test_set.csv')

# 数据准备

## 1、 Text to Sequence

In [3]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=50000, 
                                                  lower=False,filters="")
tokenizer.fit_on_texts(train['word_seg'].tolist()+test['word_seg'].tolist())

In [4]:
train_ = tokenizer.texts_to_sequences(train['word_seg'].values)
test_ = tokenizer.texts_to_sequences(test['word_seg'].values)

## 2、数据统计

In [5]:
np.percentile(list(map(lambda x: len(x),train_)),95)

1822.199999999997

## 3、数据截断、补全

In [6]:
train_ = tf.keras.preprocessing.sequence.pad_sequences(train_, maxlen=1800,
                                                      padding='pre',truncating='pre',value=0.0)
test_ = tf.keras.preprocessing.sequence.pad_sequences(test_, maxlen=1800,
                                                     padding='pre',truncating='pre',value=0.0)

## 4、词表

In [7]:
word_vocab = tokenizer.word_index

## 5、label处理

In [8]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
lb = LabelEncoder()
train_label = lb.fit_transform(train['class'].values)
train_label = to_categorical(train_label)

## 6、Dataset数据读取

In [9]:
from sklearn.model_selection import train_test_split
X_train,X_val,y_train,y_val = train_test_split(train_,train_label,
                                               test_size=0.1,random_state=666)

In [10]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(10000).batch(64)
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(128)

# Word Embedding

## 1、word2vec模型

In [11]:
all_data=pd.concat([train['article'],test['article']])
file_name = '../embedding/Word2Vec_word_200.model'
if not os.path.exists(file_name):
    model = Word2Vec([document.split(' ')for document in all_data.values],
                     size=200, 
                     window=5,
                     iter=10, 
                     workers=11, 
                     seed=2018, 
                     min_count=2)

    model.save(file_name)
else:
    model = Word2Vec.load(file_name)
print("add word2vec finished....")    

add word2vec finished....


## 2、word embedding构建

In [12]:
count = 0

embedding_matrix = np.zeros((len(word_vocab) + 1, 200))
for word, i in word_vocab.items():
    embedding_vector = model.wv[word] if word in model else None
    if embedding_vector is not None:
        count += 1
        embedding_matrix[i] = embedding_vector
    else:
        unk_vec = np.random.random(200) * 0.5
        unk_vec = unk_vec - unk_vec.mean()
        embedding_matrix[i] = unk_vec

  """


# 模型构建

In [13]:
def build_model(sent_length, embeddings_weight,class_num):
    content = Input(shape=(sent_length,), dtype='int32')
    embedding = Embedding(
        name="word_embedding",
        input_dim=embeddings_weight.shape[0],
        weights=[embeddings_weight],
        output_dim=embeddings_weight.shape[1],
        trainable=False)

    x = SpatialDropout1D(0.2)(embedding(content))

    x = Bidirectional(GRU(200, return_sequences=True))(x)
    x = Bidirectional(GRU(200, return_sequences=True))(x)

    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)

    conc = concatenate([avg_pool, max_pool])
    
    x = Dense(1000)(conc)
    x = BatchNormalization()(x)
    x = Activation(activation="relu")(x)
    x = Dropout(0.2)(x)
    x = Dense(500)(x)
    x = BatchNormalization()(x)
    x = Activation(activation="relu")(x)
    output = Dense(19, activation="softmax")(x)

    model = tf.keras.models.Model(inputs=content, outputs=output)
    model.compile(loss='categorical_crossentropy', 
                  optimizer='adam', 
                  metrics=['accuracy'])
    return model

In [14]:
model = build_model(1800, embedding_matrix,19)

In [15]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1800)]       0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 1800, 200)    254292200   input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d (SpatialDropo (None, 1800, 200)    0           word_embedding[0][0]             
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 1800, 400)    482400      spatial_dropout1d[0][0]          
______________________________________________________________________________________________

## 模型训练

In [16]:
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau,ModelCheckpoint,TensorBoard
# 检查点保存至的目录
checkpoint_dir = './training_checkpoints_bs64'

# 检查点的文件名
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")


In [None]:

early_stopping = EarlyStopping(monitor='val_accuracy', patience=6)
plateau = ReduceLROnPlateau(monitor="val_accuracy", 
                            verbose=1, 
                            mode='max', 
                            factor=0.5, 
                            patience=3)
checkpoint = ModelCheckpoint(checkpoint_prefix, 
                             monitor='val_accuracy',
                             verbose=2, 
                             save_best_only=True,
                             mode='max',
                             save_weights_only=True)

model.fit(train_ds,
          epochs=30,
          validation_data=val_ds,
          callbacks=[early_stopping, plateau, checkpoint],
          verbose=2)



## 加载模型

In [17]:
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fbf68c13b38>

## 模型线下验证

In [18]:
valid_prob = model.predict(val_ds)
valid_pred = np.argmax(valid_prob,axis=1)
valid_pred = lb.inverse_transform(valid_pred)

val_true = np.argmax(y_val,axis=1)
val_true = lb.inverse_transform(val_true)

from sklearn.metrics import f1_score
print ("valid's macro_f1: %s" % f1_score(val_true,valid_pred,average='macro'))


valid's macro_f1: 0.7650360570681052


## 结果提交

In [19]:
test_ds = tf.data.Dataset.from_tensor_slices((test_,np.zeros((test_.shape[0],19)))).batch(128)
test_prob = model.predict(test_ds)
test_pred = np.argmax(test_prob,axis=1)
test['class'] = lb.inverse_transform(test_pred)
test[["id","class"]].to_csv("submission_dnn_baseline.csv",index=False,header=True,encoding='utf-8')
