In [2]:
import pandas as pd

# Columns kept for modelling: the two Chinese titles and the class label.
cols = ['title1_zh', 'title2_zh', 'label']

# NOTE: astype(str) is applied to the whole frame before the column selection,
# so a missing value ends up as the literal string 'nan' instead of NaN.
train = (
    pd.read_csv('./train.csv', index_col=0, encoding='utf-8')
    .astype(str)
    .loc[:, cols]
)

import jieba.posseg as pseg
import time

def jieba_tokenizer(text):
    """Segment `text` with jieba's POS tagger and return the words space-joined.

    Every (word, flag) pair produced by `pseg.cut` whose flag is 'x' is
    discarded (presumably punctuation and other non-word symbols -- confirm
    against jieba's tag set); the remaining words keep their original order.

    Args:
        text: the string to segment.

    Returns:
        A single string with the kept words separated by one space; an empty
        string when nothing is kept.
    """
    kept = []
    for word, flag in pseg.cut(text):
        if flag == 'x':
            continue
        kept.append(word)
    return ' '.join(kept)

# Segment both title columns; the space-joined result is stored next to the
# raw text as `title1_tokenized` / `title2_tokenized`.
for title in ('title1', 'title2'):
    train[title + '_tokenized'] = train[title + '_zh'].apply(jieba_tokenizer)

import keras

# Vocabulary cap handed to the Keras tokenizer and the fixed length every
# title is padded/truncated to.
MAX_NUM_WORDS = 10000
MAX_SEQUENCE_LENGTH = 20

# The vocabulary is learned from title A and title B together so both sides
# of a pair share one word -> index mapping.
corpus_x1 = train['title1_tokenized']
corpus_x2 = train['title2_tokenized']
corpus = pd.concat([corpus_x1, corpus_x2])

tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(corpus)

# Words -> integer indices -> fixed-length arrays. pad_sequences is called
# with its defaults apart from maxlen (Keras pads and truncates at the start
# of a sequence by default).
x1_train = keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(corpus_x1),
    maxlen=MAX_SEQUENCE_LENGTH,
)
x2_train = keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(corpus_x2),
    maxlen=MAX_SEQUENCE_LENGTH,
)

import numpy as np

# Integer index assigned to each class name.
label_to_index = dict(unrelated=0, agreed=1, disagreed=2)

# Translate every label string to its index (an unknown label raises
# KeyError), then one-hot encode the result for the softmax output layer.
y_train = np.asarray(
    [label_to_index[name] for name in train['label']],
    dtype='float32',
)
y_train = keras.utils.to_categorical(y_train)

from sklearn.model_selection import train_test_split

# Hold out a fraction of the pairs for validation. All three arrays go through
# a single train_test_split call so title A, title B and label rows stay
# aligned; the fixed random_state makes the split repeatable.
# NOTE: x1_train / x2_train / y_train are overwritten with the training
# portion, so re-running only this statement splits the already-reduced
# arrays again.
VALIDATION_RATIO = 0.1
RANDOM_STATE = 50
(x1_train, x1_val,
 x2_train, x2_val,
 y_train, y_val) = train_test_split(
    x1_train, x2_train, y_train,
    test_size=VALIDATION_RATIO,
    random_state=RANDOM_STATE,
)

# Load the test set. No rows are dropped: every test Id has to survive to the
# prediction/submission step, and a dropna(how='any') over all columns would
# also discard rows whose only missing value is in a column this notebook
# never reads.
test = pd.read_csv('./test.csv', index_col=0, encoding='utf-8')

# Only the two Chinese titles are consumed below. A missing title becomes an
# empty string, which tokenizes to an empty sequence and is then padded to
# all zeros; astype(str) guarantees the tokenizer always receives a string.
for col in ('title1_zh', 'title2_zh'):
    test[col] = test[col].fillna('').astype(str)

# The steps below are applied to news title A and title B alike.
# Word segmentation
test['title1_tokenized'] = test.loc[:, 'title1_zh'].apply(jieba_tokenizer)
test['title2_tokenized'] = test.loc[:, 'title2_zh'].apply(jieba_tokenizer)

# Convert the word sequences to index sequences using the tokenizer that was
# fitted on the training titles.
x1_test = tokenizer.texts_to_sequences(test.title1_tokenized)
x2_test = tokenizer.texts_to_sequences(test.title2_tokenized)

# Zero-pad / truncate the index sequences to the fixed model input length.
x1_test = keras.preprocessing.sequence.pad_sequences(x1_test, maxlen=MAX_SEQUENCE_LENGTH)
x2_test = keras.preprocessing.sequence.pad_sequences(x2_test, maxlen=MAX_SEQUENCE_LENGTH)

Building prefix dict from the default dictionary ...
I0506 14:22:11.548615  4780 __init__.py:111] Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\MCUCC\AppData\Local\Temp\jieba.cache
I0506 14:22:11.548615  4780 __init__.py:131] Loading model from cache C:\Users\MCUCC\AppData\Local\Temp\jieba.cache
Loading model cost 0.636 seconds.
I0506 14:22:12.184971  4780 __init__.py:163] Loading model cost 0.636 seconds.
Prefix dict has been built succesfully.
I0506 14:22:12.188971  4780 __init__.py:164] Prefix dict has been built succesfully.


In [3]:
# 建立孿生 LSTM 架構（Siamese LSTM）
import tensorflow as tf
from keras import Input
from keras.layers import LSTM, concatenate, Dense
from keras.models import Model
import utils
from elmo import ELMoEmbedding

# Basic settings: number of target classes.
NUM_CLASSES = 3
# Vocabulary size cap (not referenced again in this cell).
MAX_NUM_WORDS = 10000
# Maximum number of tokens in one title; also the model's input length.
MAX_SEQUENCE_LENGTH = 20
# Word-vector width. NOTE(review): not referenced in this cell -- the
# embedding width comes from the ELMo layer, not from this constant.
NUM_EMBEDDING_DIM = 256
# Size of the vector the LSTM emits for one title.
NUM_LSTM_UNITS = 128

# One integer-index input per title of the pair.
top_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int64')
bm_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int64')

# NOTE(review): presumably this index -> word table must agree with the Keras
# tokenizer that produced the integer inputs; verify in utils.get_idx2word.
idx2word = utils.get_idx2word()

# Siamese setup: the same embedding layer and the same LSTM are applied to
# both titles, so the two branches share all of their weights.
embedding_layer = ELMoEmbedding(idx2word=idx2word, output_mode="elmo", trainable=True)
top_embedded = embedding_layer(top_input)
bm_embedded = embedding_layer(bm_input)

share_lstm = LSTM(NUM_LSTM_UNITS)
top_output = share_lstm(top_embedded)
bm_output = share_lstm(bm_embedded)

# Join the two title vectors side by side into one feature vector.
merged = concatenate([top_output, bm_output], axis=-1)

# Fully connected layer with softmax activation: for each title pair it
# returns the probability of belonging to each of the 3 classes.
dense = Dense(units=NUM_CLASSES, activation='softmax')
predictions = dense(merged)

# The model is the whole chain of layers that turns the two index sequences
# into the 3 class probabilities.
model = Model(inputs=[top_input, bm_input], outputs=predictions)

# Optional: render the architecture diagram.
# from keras.utils import plot_model
# plot_model(
#     model,
#     to_file='model.png',
#     show_shapes=True,
#     show_layer_names=False,
#     rankdir='LR')

# The targets are one-hot vectors over mutually exclusive classes and the
# output is a softmax, so the matching loss is categorical cross-entropy.
# With 'binary_crossentropy' Keras also resolves the 'accuracy' metric to
# binary accuracy, which overstates multi-class accuracy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0506 14:39:09.206950  4780 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0506 14:39:09.494951  4780 saver.py:1483] Saver not created because there are no variables in the graph to restore


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
el_mo_embedding_2 (ELMoEmbeddin (None, 20, 1024)     0           input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 128)          590336      el_mo_embedding_2[0][0]          
          

In [4]:
# Train the model.
BATCH_SIZE = 512
NUM_EPOCHS = 10

history = model.fit(
    # Inputs are the two padded index sequences of each title pair.
    [x1_train, x2_train],
    y_train,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    # Loss and accuracy on the validation split are computed after every epoch.
    validation_data=([x1_val, x2_val], y_val),
    # No `shuffle` argument is passed, so Keras' default applies (the training
    # data is reshuffled before each epoch).
)

Instructions for updating:
Use tf.cast instead.


W0506 14:39:09.950957  4780 deprecation.py:323] From c:\users\mcucc\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\ops\math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.


Train on 288496 samples, validate on 32056 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Predict with the trained model.
# NOTE: `predictions` was the symbolic output tensor in the model-building
# cell; from here on it holds the array of predicted class probabilities.
predictions = model.predict([x1_test, x2_test], batch_size=20)

# Reverse lookup: class index -> class name.
index_to_label = dict((idx, name) for name, idx in label_to_index.items())

# Most probable class per row, stored on the test frame as its label string.
predicted_ids = np.argmax(predictions, axis=1)
test['Category'] = [index_to_label[class_id] for class_id in predicted_ids]

# Two-column submission table: the test frame's index becomes `Id`.
submission = test.loc[:, ['Category']].reset_index()
submission.columns = ['Id', 'Category']

print(submission.head())

In [12]:
# Persist the model to 'my_model.h5' in the current working directory.
# NOTE(review): this cell's execution count (12) is out of sequence with the
# cells above, so it was run at a different point in the session; re-run the
# notebook top to bottom before relying on the saved file.
# NOTE(review): the model contains the project-local ELMoEmbedding layer;
# loading the file back presumably needs it passed via `custom_objects` --
# confirm against the Keras version in use.
model.save('my_model.h5')