In [83]:
import numpy as np
import pandas as pd
import jieba.posseg as pseg
from tensorflow import keras
from tensorflow.keras import layers, Input, regularizers
from tensorflow.keras.backend import square, abs, sqrt
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
TRAIN_CSV_PATH = 'fake-news-pair-classification-challenge/train.csv'

In [2]:
train = pd.read_csv(TRAIN_CSV_PATH)
train.head(3)

Unnamed: 0,id,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,label
0,0,0,1,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,There are two new old-age insurance benefits f...,"Police disprove ""bird's nest congress each per...",unrelated
1,3,2,3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outstrips Hong Kong? Shenzhen S...,unrelated
2,1,2,4,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,"""If you do not come to Shenzhen, sooner or lat...",The GDP overtopped Hong Kong? Shenzhen clarifi...,unrelated


In [3]:
# Narrow the frame to the two Chinese titles plus the target label; these are
# the only training columns the cells shown in this notebook go on to use.
cols = ['title1_zh', 'title2_zh', 'label']
train = train[cols]
train.head(n=3)

Unnamed: 0,title1_zh,title2_zh,label
0,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,unrelated
1,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,unrelated
2,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,unrelated


## Text Segmentation

In [4]:
# Small demo: run jieba's part-of-speech segmenter on one sample sentence and
# materialise the (word, flag) pairs it yields.
text = "我是小牛奶，是東吳巨資大四生"
words = pseg.cut(text)
list(words)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/fk/0dy994z53xdf6qfz76jn986c0000gn/T/jieba.cache
Loading model cost 0.520 seconds.
Prefix dict has been built successfully.


[pair('我', 'r'),
 pair('是', 'v'),
 pair('小', 'a'),
 pair('牛奶', 'n'),
 pair('，', 'x'),
 pair('是', 'v'),
 pair('東吳', 'ns'),
 pair('巨資', 'n'),
 pair('大四', 'm'),
 pair('生', 'vn')]

In [5]:
def jieba_tokenizer(text):
    """Segment ``text`` with jieba and return the kept words joined by spaces.

    Every (word, flag) pair produced by ``pseg.cut`` whose flag is ``'x'`` is
    dropped; in the demo output of this notebook that is the flag carried by
    the comma. All other words are kept in their original order.
    """
    kept = []
    for token, pos_flag in pseg.cut(text):
        if pos_flag == 'x':
            continue
        kept.append(token)
    return " ".join(kept)

In [6]:
# Segment both title columns. The cast to str happens before jieba sees the
# value — presumably so non-string cells (e.g. missing titles) do not break it.
train['title1_tokenized'] = train['title1_zh'].astype(str).apply(jieba_tokenizer)
train['title2_tokenized'] = train['title2_zh'].astype(str).apply(jieba_tokenizer)

In [7]:
train.iloc[:, [0, 3]].head(3)

Unnamed: 0,title1_zh,title1_tokenized
0,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,2017 养老保险 又 新增 两项 农村 老人 人人 可 申领 你 领到 了 吗
1,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",你 不 来 深圳 早晚 你 儿子 也 要 来 不出 10 年 深圳 人均 GDP 将 超 香港
2,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",你 不 来 深圳 早晚 你 儿子 也 要 来 不出 10 年 深圳 人均 GDP 将 超 香港


In [8]:
train.iloc[:, [1, 4]].head(3)

Unnamed: 0,title2_zh,title2_tokenized
0,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,警方 辟谣 鸟巢 大会 每人 领 5 万 仍 有 老人 坚持 进京
1,深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,深圳 GDP 首 超 香港 深圳 统计局 辟谣 只是 差距 在 缩小
2,GDP首超香港？深圳澄清：还差一点点……,GDP 首 超 香港 深圳 澄清 还 差 一点点


## Tokenize

In [9]:
MAX_NUM_WORDS = 10000
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS)

In [10]:
# Stack the two tokenized title columns into one Series so the vocabulary is
# fitted on every title, whichever side of the pair it sits on.
corpus1, corpus2 = train['title1_tokenized'], train['title2_tokenized']
corpus = pd.concat([corpus1, corpus2])
corpus.shape

(641104,)

In [11]:
tokenizer.fit_on_texts(corpus)

In [12]:
# Turn each segmented title into a list of vocabulary ids, one list per row.
x1_train, x2_train = (
    tokenizer.texts_to_sequences(texts) for texts in (corpus1, corpus2)
)
len(x1_train)

320552

In [13]:
x1_train[:1]

[[217, 1268, 32, 1178, 5967, 25, 489, 2877, 116, 5559, 4, 1850, 2, 13]]

In [14]:
print([tokenizer.index_word[idx] for idx in x1_train[0]])

['2017', '养老保险', '又', '新增', '两项', '农村', '老人', '人人', '可', '申领', '你', '领到', '了', '吗']


## zero padding

In [15]:
for seq in x1_train[:10]:
    print(len(seq), seq[:5], '...')

14 [217, 1268, 32, 1178, 5967] ...
19 [4, 10, 47, 678, 2558] ...
19 [4, 10, 47, 678, 2558] ...
19 [4, 10, 47, 678, 2558] ...
9 [31, 320, 3372, 3062, 1] ...
19 [4, 10, 47, 678, 2558] ...
6 [7, 2221, 1, 2072, 7] ...
19 [4, 10, 47, 678, 2558] ...
14 [1281, 1211, 427, 3, 3245] ...
9 [31, 320, 3372, 3062, 1] ...


In [16]:
# Length of the longest id sequence on the title1 side
# (x2_train is not inspected by this cell).
max_seq_len = max(map(len, x1_train))
max_seq_len

61

In [17]:
# Bring every sequence to exactly MAX_SEQ_LEN ids. Shorter ones are filled
# with zeros on the left (see the array printed a few cells below); since the
# longest title1 sequence measured above is longer than this, some are cut.
MAX_SEQ_LEN = 30
x1_train, x2_train = (
    keras.preprocessing.sequence.pad_sequences(seqs, maxlen=MAX_SEQ_LEN)
    for seqs in (x1_train, x2_train)
)

In [18]:
# Sanity check: every padded sequence in BOTH inputs must have MAX_SEQ_LEN ids.
# pad_sequences returns NumPy arrays, so `x1_train + x2_train` would add the
# two matrices element-wise rather than concatenate them, and the loop would
# only ever look at the summed rows. Check each array on its own instead.
for seq_matrix in (x1_train, x2_train):
    assert seq_matrix.ndim == 2 and seq_matrix.shape[1] == MAX_SEQ_LEN, (
        f"expected (n, {MAX_SEQ_LEN}) but got {seq_matrix.shape}"
    )
print(f'all seqs are len {MAX_SEQ_LEN}')

all seqs are len 30


In [19]:
x1_train[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,  217, 1268,   32, 1178, 5967,   25,
         489, 2877,  116, 5559,    4, 1850,    2,   13],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           4,   10,   47,  678, 2558,    4,  166,   34,   17,   47, 5150,
          63,   15,  678, 4502, 3211,   23,  284, 1181],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           4,   10,   47,  678, 2558,    4,  166,   34,   17,   47, 5150,
          63,   15,  678, 4502, 3211,   23,  284, 1181],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           4,   10,   47,  678, 2558,    4,  166,   34,   17,   47, 5150,
          63,   15,  678, 4502, 3211,   23,  284, 1181],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   31,
         320, 3372, 3062,    1, 

## One-Hot encoding

In [20]:
train['label'][:5]

0    unrelated
1    unrelated
2    unrelated
3    unrelated
4       agreed
Name: label, dtype: object

In [21]:
# Integer id assigned to each of the three label strings.
label_to_index = {
    'unrelated': 0,
    'agreed': 1,
    'disagreed': 2,
}
# Look every label up in the mapping (an unknown label raises KeyError) and
# hand the ids back as a float32 NumPy array.
y_train = (
    train['label']
    .apply(lambda name: label_to_index[name])
    .astype('float32')
    .to_numpy()
)
y_train[:5]

array([0., 0., 0., 0., 1.], dtype=float32)

In [22]:
y_train = keras.utils.to_categorical(y_train)
y_train[:5]

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.]], dtype=float32)

In [23]:
VALIDATION_RATION = 0.2
RANDOM_STATE = 9527
# Split both padded inputs and the one-hot labels in a single call so the rows
# of the three arrays stay aligned; the fixed seed makes the split repeatable.
(
    x1_train, x1_val,
    x2_train, x2_val,
    y_train, y_val,
) = train_test_split(
    x1_train, x2_train, y_train,
    test_size=VALIDATION_RATION,
    random_state=RANDOM_STATE,
)

In [24]:
# Report the shapes of the training and validation arrays.
# NOTE(review): the second group of shapes is the validation split but has no
# header of its own, and the closing "Test Set" header is followed by nothing.
print(
    "Training Set",
    "-" * 10,
    f"x1_train: {x1_train.shape}",
    f"x2_train: {x2_train.shape}",
    f"y_train : {y_train.shape}",
    "-" * 10,
    f"x1_val:   {x1_val.shape}",
    f"x2_val:   {x2_val.shape}",
    f"y_val :   {y_val.shape}",
    "-" * 10,
    "Test Set",
    sep="\n",
)

Training Set
----------
x1_train: (256441, 30)
x2_train: (256441, 30)
y_train : (256441, 3)
----------
x1_val:   (64111, 30)
x2_val:   (64111, 30)
y_val :   (64111, 3)
----------
Test Set


In [25]:
# lstm = layers.LSTM()

## Word Embedding

In [62]:
# Model hyper-parameters, gathered in one place.
NUM_CLASSES = 3           # units of the final softmax layer (one per label)
MAX_NUM_WORDS = 10_000    # rows of the embedding table; same value the Tokenizer was given
MAX_SEQ_LEN = 30          # token ids per padded title
NUM_EMBEDDING_DIM = 256   # width of each embedding vector
NUM_LSTM_UNITS = 128      # units of the shared LSTM

In [63]:
## normal
# top_input = Input(shape=(MAX_SEQ_LEN, ), dtype="int32")
# bm_input = Input(shape=(MAX_SEQ_LEN, ), dtype="int32")

# embedding_layer = layers.Embedding(MAX_NUM_WORDS, NUM_EMBEDDING_DIM)
# top_embedding = embedding_layer(top_input)
# bm_embedding = embedding_layer(bm_input)

# shared_LSTM = layers.LSTM(NUM_LSTM_UNITS)
# top_output = shared_LSTM(top_embedding)
# bm_output = shared_LSTM(bm_embedding)

# merged = layers.concatenate([top_output, bm_output], axis=-1)

# dense = layers.Dense(units=NUM_CLASSES, activation='softmax')
# predictions = dense(merged)

# model = Model(inputs=[top_input, bm_input], outputs=predictions)

In [64]:
## add
# top_input = Input(shape=(MAX_SEQ_LEN, ), dtype="int32")
# bm_input = Input(shape=(MAX_SEQ_LEN, ), dtype="int32")

# embedding_layer = layers.Embedding(MAX_NUM_WORDS, NUM_EMBEDDING_DIM)
# top_embedding = embedding_layer(top_input)
# bm_embedding = embedding_layer(bm_input)

# shared_LSTM = layers.LSTM(NUM_LSTM_UNITS)
# top_output = shared_LSTM(top_embedding)
# bm_output = shared_LSTM(bm_embedding)

# added = layers.Add()([top_output, bm_output])
# # merged = layers.concatenate([top_output, bm_output], axis=-1)

# dense = layers.Dense(units=NUM_CLASSES, activation='softmax')
# predictions = dense(added)

# model = Model(inputs=[top_input, bm_input], outputs=predictions)

In [84]:
def Euclidean_distance(a, b):
    """Return the element-wise squared difference of ``a`` and ``b``.

    Despite the name, nothing is summed and no square root is taken: the
    result keeps one entry per element of the inputs instead of collapsing
    them into a single distance value.
    """
    delta = a - b
    return square(delta)

# Two-branch network: both titles pass through the same embedding layer and
# the same LSTM, so the two branches share every weight.
# NOTE(review): the padded inputs use 0 as filler and the Embedding layer is
# created without mask_zero, so padded positions reach the LSTM like any other
# token id — confirm this is intended.
top_input = Input(shape=(MAX_SEQ_LEN,), dtype="int32")
bm_input = Input(shape=(MAX_SEQ_LEN,), dtype="int32")

embedding_layer = layers.Embedding(
    input_dim=MAX_NUM_WORDS,
    output_dim=NUM_EMBEDDING_DIM,
)
top_embedding = embedding_layer(top_input)
bm_embedding = embedding_layer(bm_input)

shared_LSTM = layers.LSTM(units=NUM_LSTM_UNITS)
top_output = shared_LSTM(top_embedding)
bm_output = shared_LSTM(bm_embedding)

# Combine the two title encodings through Euclidean_distance; the Lambda layer
# receives both LSTM outputs as a two-element list.
merged = layers.Lambda(
    lambda pair: Euclidean_distance(pair[0], pair[1]),
)([top_output, bm_output])

# Softmax classifier over the merged features, with an L2 penalty on its kernel.
dense = layers.Dense(
    units=NUM_CLASSES,
    activation='softmax',
    kernel_regularizer=regularizers.l2(l=0.001),
)
predictions = dense(merged)

model = Model(inputs=[top_input, bm_input], outputs=predictions)


In [57]:
# def abs_distance(a, b):
#     return abs(a - b)

# top_input = Input(shape=(MAX_SEQ_LEN, ), dtype="int32")
# bm_input = Input(shape=(MAX_SEQ_LEN, ), dtype="int32")

# embedding_layer = layers.Embedding(MAX_NUM_WORDS, NUM_EMBEDDING_DIM)
# top_embedding = embedding_layer(top_input)
# bm_embedding = embedding_layer(bm_input)

# shared_LSTM = layers.LSTM(NUM_LSTM_UNITS)
# top_output = shared_LSTM(top_embedding)
# bm_output = shared_LSTM(bm_embedding)

# merged = layers.Lambda(
#     lambda x: abs_distance(x[0], x[1]), # x is list of input shape
# )([top_output, bm_output])

# dense = layers.Dense(units=NUM_CLASSES, activation='softmax')
# predictions = dense(merged)

In [85]:
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           [(None, 30)]         0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           [(None, 30)]         0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 30, 256)      2560000     input_17[0][0]                   
                                                                 input_18[0][0]                   
__________________________________________________________________________________________________
lstm_8 (LSTM)                   (None, 128)          197120      embedding_8[0][0]          

In [86]:
# Categorical cross-entropy pairs with the one-hot labels and the softmax
# output; accuracy is tracked as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [87]:
BATCH_SIZE = 256
NUM_EPOCHS = 7

# Train on the two padded title inputs and pass the held-out split as
# validation data; the returned History object is kept in `history`.
history = model.fit(
    x=[x1_train, x2_train],
    y=y_train,
    validation_data=([x1_val, x2_val], y_val),
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    shuffle=True,
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


## test data

In [88]:
TEST_CSV_PATH = "fake-news-pair-classification-challenge/test.csv"
test = pd.read_csv(TEST_CSV_PATH, index_col=0)
test.head(3)

Unnamed: 0_level_0,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
321187,167562,59521,萨拉赫人气爆棚!埃及总统大选未参选获百万选票 现任总统压力山大,辟谣！里昂官方否认费基尔加盟利物浦，难道是价格没谈拢？,egypt 's presidential election failed to win m...,Lyon! Lyon officials have denied that Felipe F...
321190,167564,91315,萨达姆被捕后告诫美国的一句话，发人深思,10大最让美国人相信的荒诞谣言，如蜥蜴人掌控着美国,A message from Saddam Hussein after he was cap...,The Top 10 Americans believe that the Lizard M...
321189,167563,167564,萨达姆此项计划没有此国破坏的话，美国还会对伊拉克发动战争吗,萨达姆被捕后告诫美国的一句话，发人深思,Will the United States wage war on Iraq withou...,A message from Saddam Hussein after he was cap...


In [89]:
# Keep only the two Chinese title columns, selected by NAME rather than by
# position. With positions [2, 3] the cell is not safe to re-run: once the
# tokenized columns have been appended, those positions silently pick the
# tokenized columns instead, and on the narrowed two-column frame they raise.
# Selecting by name also mirrors how the training frame is narrowed.
# .copy() gives an independent frame before new columns are assigned to it.
test = test.loc[:, ['title1_zh', 'title2_zh']].copy()
test.head(3)

Unnamed: 0_level_0,title1_zh,title2_zh
id,Unnamed: 1_level_1,Unnamed: 2_level_1
321187,萨拉赫人气爆棚!埃及总统大选未参选获百万选票 现任总统压力山大,辟谣！里昂官方否认费基尔加盟利物浦，难道是价格没谈拢？
321190,萨达姆被捕后告诫美国的一句话，发人深思,10大最让美国人相信的荒诞谣言，如蜥蜴人掌控着美国
321189,萨达姆此项计划没有此国破坏的话，美国还会对伊拉克发动战争吗,萨达姆被捕后告诫美国的一句话，发人深思


In [90]:
# Segment the test titles with the same jieba_tokenizer applied to the
# training titles, casting each cell to str first.
test['title1_tokenized'] = test.loc[:, 'title1_zh'].astype(str).apply(jieba_tokenizer)
test['title2_tokenized'] = test.loc[:, 'title2_zh'].astype(str).apply(jieba_tokenizer)

In [100]:
# Map the segmented test titles to ids with the tokenizer that was fitted on
# the training corpus (it is not re-fitted here).
x1_test, x2_test = (
    tokenizer.texts_to_sequences(test[column])
    for column in ('title1_tokenized', 'title2_tokenized')
)

In [101]:
# Pad/cut the test sequences to MAX_SEQ_LEN, the same length used for training.
x1_test, x2_test = (
    keras.preprocessing.sequence.pad_sequences(seqs, maxlen=MAX_SEQ_LEN)
    for seqs in (x1_test, x2_test)
)

In [103]:
predictions = model.predict([x1_test, x2_test])

In [107]:
predictions[:10]

array([[9.9564725e-01, 9.7066441e-06, 4.3429662e-03],
       [9.2297775e-01, 3.5294704e-04, 7.6669298e-02],
       [9.1102898e-02, 9.0695900e-01, 1.9381513e-03],
       [9.2342085e-01, 8.2459586e-04, 7.5754516e-02],
       [9.6152264e-01, 7.7072706e-04, 3.7706628e-02],
       [2.3397334e-01, 7.6446277e-01, 1.5638802e-03],
       [9.9343687e-01, 3.4138761e-04, 6.2217303e-03],
       [2.2440670e-01, 7.7466857e-01, 9.2467858e-04],
       [7.6061058e-01, 2.2778060e-01, 1.1608930e-02],
       [8.5276204e-01, 1.4637919e-01, 8.5875957e-04]], dtype=float32)

In [105]:
# Invert the label mapping, then give each row the label of its
# highest-probability class.
index_to_label = {index: label for label, index in label_to_index.items()}
test['Category'] = [index_to_label[idx] for idx in predictions.argmax(axis=1)]

# The frame's index (the row id) becomes the first column of the submission.
submission = test[['Category']].reset_index()
submission.columns = ['Id', 'Category']
submission.head(n=3)

Unnamed: 0,Id,Category
0,321187,unrelated
1,321190,unrelated
2,321189,agreed


In [106]:
submission.to_csv('submission.csv', index=False)