In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import gc
from gensim.models import Word2Vec
from tensorflow.keras.layers import (Bidirectional,
                                     Embedding,
                                     GRU, 
                                     GlobalAveragePooling1D,
                                     GlobalMaxPooling1D,
                                     Concatenate,
                                     SpatialDropout1D,
                                     BatchNormalization,
                                     Dropout,
                                     Dense,
                                     Activation,
                                     concatenate,
                                     Input
                                    )
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau,ModelCheckpoint
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score


# 数据准备

In [2]:

# Load the training and test sets.
train = pd.read_csv('../data/train_set.csv')
test = pd.read_csv('../data/test_set.csv')

train_texts = train['word_seg'].values
test_texts = test['word_seg'].values

# One tokenizer is fitted on the text of both splits so they share a single
# vocabulary; lower-casing and character filtering are switched off.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=50000, lower=False, filters="")
tokenizer.fit_on_texts(list(train_texts) + list(test_texts))

# Convert every document to a sequence of word indices and pad/truncate it to
# a fixed length of 1800.
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
train_ = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=1800)
test_ = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=1800)

# word -> integer index mapping, used later to build the embedding matrix.
word_vocab = tokenizer.word_index


# Encode the class column as consecutive integers, then one-hot encode it.
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
lb = LabelEncoder()
train_label = to_categorical(lb.fit_transform(train['class'].values))


In [None]:
# Note: the data is no longer loaded in this cell.

# Word Embedding构建

In [3]:


# Train a 200-dimensional Word2Vec model on the train + test text, or reuse the
# copy saved by an earlier run.
all_data = pd.concat([train['word_seg'], test['word_seg']])
file_name = '../embedding/Word2Vec_word_200.model'
if os.path.exists(file_name):
    # A saved model is already on disk: load it instead of retraining.
    model = Word2Vec.load(file_name)
else:
    # Each document is split on single spaces into its tokens.
    sentences = [document.split(' ') for document in all_data.values]
    # NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
    # renamed them to `vector_size` and `epochs` -- confirm the installed version.
    model = Word2Vec(sentences,
                     size=200,
                     window=5,
                     iter=10,
                     workers=11,
                     seed=2018,
                     min_count=2)
    model.save(file_name)
print("add word2vec finished....")



# Build the embedding matrix: row i holds the vector of the word whose
# tokenizer index is i. Row 0 is not assigned by the loop and stays all-zero
# (presumably the padding index -- confirm against the tokenizer).
count = 0  # number of vocabulary words that have a trained Word2Vec vector

# Dedicated, seeded generator so the random vectors given to words without a
# trained embedding are identical on every run (the original drew them from
# the unseeded global numpy RNG).
unk_rng = np.random.RandomState(2018)

embedding_matrix = np.zeros((len(word_vocab) + 1, 200))
for word, i in word_vocab.items():
    # Membership is tested on the KeyedVectors (`model.wv`), not on the
    # Word2Vec object itself: `word in model` is deprecated in gensim 3.x and
    # raises a TypeError in gensim 4, while `model.wv` works in both.
    if word in model.wv:
        count += 1
        embedding_matrix[i] = model.wv[word]
    else:
        # No trained vector (e.g. the word fell below min_count): use a small
        # random vector drawn from [0, 0.5) and shifted to zero mean.
        unk_vec = unk_rng.random_sample(200) * 0.5
        embedding_matrix[i] = unk_vec - unk_vec.mean()
        


add word2vec finished....




# 模型构建

In [5]:
def build_model(sent_length, embeddings_weight, num_classes=19):
    """Build and compile the two-layer bidirectional GRU text classifier.

    Args:
        sent_length: Length of the (padded) input index sequences.
        embeddings_weight: 2-D array of shape (vocabulary rows, embedding dim)
            used to initialise the frozen embedding layer.
        num_classes: Number of units in the softmax output layer. Defaults to
            19, the value that was previously hard-coded.

    Returns:
        A compiled ``tf.keras`` model (categorical cross-entropy loss, Adam
        optimizer, accuracy metric) mapping index sequences to class
        probabilities.
    """
    content = Input(shape=(sent_length,), dtype='int32')

    # Pre-trained vectors are loaded as-is and kept frozen (trainable=False).
    embedding = Embedding(
        name="word_embedding",
        input_dim=embeddings_weight.shape[0],
        weights=[embeddings_weight],
        output_dim=embeddings_weight.shape[1],
        trainable=False)
    embedded = SpatialDropout1D(0.2)(embedding(content))

    # Two stacked bidirectional GRUs, both returning the full sequence so the
    # pooling layers below can summarise every time step.
    encoded = Bidirectional(GRU(200, return_sequences=True))(embedded)
    encoded = Bidirectional(GRU(200, return_sequences=True))(encoded)

    # Summarise the sequence with both average and max pooling over time.
    avg_pool = GlobalAveragePooling1D()(encoded)
    max_pool = GlobalMaxPooling1D()(encoded)
    pooled = concatenate([avg_pool, max_pool])

    # Classification head: Dense -> BatchNorm -> ReLU, twice, with dropout
    # after the first stage only.
    hidden = Dense(1000)(pooled)
    hidden = BatchNormalization()(hidden)
    hidden = Activation(activation="relu")(hidden)
    hidden = Dropout(0.2)(hidden)
    hidden = Dense(500)(hidden)
    hidden = BatchNormalization()(hidden)
    hidden = Activation(activation="relu")(hidden)
    output = Dense(num_classes, activation="softmax")(hidden)

    model = tf.keras.models.Model(inputs=content, outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# 模型交叉验证

In [None]:
# 10-fold cross-validation: train (or restore) one model per fold, score it on
# the held-out split, and store its probabilities for the held-out rows and
# for the whole test set.
kf = KFold(n_splits=10, shuffle=True, random_state=666)

# Out-of-fold class probabilities for every training row.
train_pre_matrix = np.zeros((train.shape[0], 19))
# Test-set probabilities, one slice per fold: (fold, test row, class).
test_pre_matrix = np.zeros((10, test.shape[0], 19))
cv_scores = []  # macro-F1 on the validation split of each fold

for i, (train_fold, test_fold) in enumerate(kf.split(train_)):
    print("第%s的结果"%i)
    X_train, X_valid = train_[train_fold, :], train_[test_fold, :]
    y_train, y_valid = train_label[train_fold], train_label[test_fold]

    # Assemble the tf.data pipelines for this fold. The test dataset carries
    # all-zero placeholder labels so it has the same (features, labels)
    # structure as the other two.
    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(10000).batch(64)
    val_ds = tf.data.Dataset.from_tensor_slices((X_valid, y_valid)).batch(128)
    test_ds = tf.data.Dataset.from_tensor_slices((test_, np.zeros((test_.shape[0], 19)))).batch(128)

    # Directory the checkpoints of this fold are written to.
    checkpoint_dir = './cv_checkpoints/cv_'+str(i)+'/'
    # Checkpoint file name pattern (one file per saved epoch).
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
    model = build_model(1800, embedding_matrix)

    early_stopping = EarlyStopping(monitor='val_accuracy', patience=6)
    plateau = ReduceLROnPlateau(monitor="val_accuracy", verbose=1, mode='max', factor=0.5, patience=3)
    checkpoint = ModelCheckpoint(checkpoint_prefix, monitor='val_accuracy',
                                 verbose=2, save_best_only=True, mode='max', save_weights_only=True)

    # Train only when this fold has no saved checkpoint yet. Checking for an
    # actual checkpoint (rather than for the directory) means a directory left
    # empty by an interrupted run triggers retraining instead of a failing
    # load_weights(None).
    if tf.train.latest_checkpoint(checkpoint_dir) is None:
        model.fit(train_ds,
                  epochs=30,
                  validation_data=val_ds,
                  callbacks=[early_stopping, plateau, checkpoint],
                  verbose=2)

    # With save_best_only=True a checkpoint is written only when val_accuracy
    # improves, so the most recent checkpoint is the best one of the fold.
    best_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if best_checkpoint is None:
        raise RuntimeError("No checkpoint found in %s after training" % checkpoint_dir)
    model.load_weights(best_checkpoint)

    # Validation-split predictions, mapped back to the original class labels.
    valid_prob = model.predict(val_ds)
    valid_pred = np.argmax(valid_prob, axis=1)
    valid_pred = lb.inverse_transform(valid_pred)

    y_valid = np.argmax(y_valid, axis=1)
    y_valid = lb.inverse_transform(y_valid)

    f1_score_ = f1_score(y_valid, valid_pred, average='macro')
    print ("valid's f1-score: %s" %f1_score_)
    # Record the fold score; the list was declared for this purpose but was
    # never filled in.
    cv_scores.append(f1_score_)

    train_pre_matrix[test_fold, :] = valid_prob

    test_pre_matrix[i, :, :] = model.predict(test_ds)

    # Release the model and reset the Keras session before the next fold so
    # memory held by this fold's model can be freed.
    del model; gc.collect()
    tf.keras.backend.clear_session()

np.save("cv_test_result.npy", test_pre_matrix)
# Model fusion (ensembling of the per-fold results) follows.


# 10折概率结果
## 结果融合
 1、 概率求平均，然后取最大值！作为结果
 
 2、每一折，先取最大值，作为结果
 这个结果有10个
 【1，1，1，1，2，3，5，9，0，1】 ——>  0:1 1:5 2:1 3:1 5:1 9:1 -> 类别 1 得票最多（5 票），1 作为结果


# 结果融合

## 方法一

In [17]:
# Load the per-fold test-set probabilities saved as "cv_test_result.npy" by the
# cross-validation cell.
res = np.load("cv_test_result.npy")

In [19]:
# Display the array's dimensions: (folds, test rows, classes).
res.shape

(10, 102277, 19)

In [21]:
# Method 1: average the class probabilities over the fold axis (axis 0), then
# display the shape of the averaged array.
res = np.load("cv_test_result.npy")
res_mean = np.mean(res, axis=0)
res_mean.shape

(102277, 19)

In [22]:
# For every test row take the class with the highest averaged probability and
# map its column index back to the original label with the fitted encoder.
best_class_idx = np.argmax(res_mean, axis=1)
test_pred = lb.inverse_transform(best_class_idx)
test['class'] = test_pred

# Write the (id, class) submission file.
submission = test[["id", "class"]]
submission.to_csv("submission_baseline_dnn_cv.csv", index=False, header=True, encoding='utf-8')

## 方法二

In [24]:
# Method 2: reload the per-fold probabilities and allocate an integer matrix
# with one row per test sample and one column per fold (10 folds).
res = np.load("cv_test_result.npy")
test_pred_matrix = np.zeros((len(test), 10), dtype=int)


In [25]:
# For each fold, take the most probable class of every test row and store the
# decoded label in that fold's column.
for fold in range(10):
    fold_best_idx = res[fold].argmax(axis=1)
    test_pred_matrix[:, fold] = lb.inverse_transform(fold_best_idx)

In [26]:
# Display the vote matrix: one row per test sample, one predicted label per fold.
test_pred_matrix

array([[ 5,  5,  5, ...,  5,  5,  5],
       [ 4,  4,  4, ...,  4,  4,  4],
       [12, 13, 13, ...,  1,  3, 13],
       ...,
       [ 6,  6,  6, ...,  6,  6,  6],
       [14, 14, 14, ..., 14, 14, 14],
       [13, 13, 13, ..., 13, 13, 13]])

In [27]:
# Majority-vote demo: bincount counts how often each label occurs and argmax
# returns the label with the highest count (5 occurs four times here).
a = [5, 5, 5, 5, 1, 1, 1, 2, 3, 4]
np.bincount(a).argmax()

5

In [28]:
# Majority vote across the folds for every test row. argmax returns the first
# maximum, so a tie goes to the smallest label.
test_pred = [np.argmax(np.bincount(fold_votes)) for fold_votes in test_pred_matrix]
test['class'] = test_pred

# Write the (id, class) submission file for the voting variant.
submission_v2 = test[["id", "class"]]
submission_v2.to_csv("submission_baseline_dnn_cv_v2.csv", index=False, header=True, encoding='utf-8')