In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import gensim as gs
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import TweetTokenizer
import sklearn.metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import fasttext

warnings.filterwarnings('ignore')
tknzr = TweetTokenizer()

In [2]:
# Load the comma-separated train and validation splits from disk.
train_dataset = pd.read_csv('../train_data/final_train_dataset', sep=',')
validation_dataset = pd.read_csv('../train_data/final_validation_dataset', sep=',')

In [3]:
# Load the previously trained gensim FastText model; its `.wv` vectors are what
# create_embedding() looks tokens up in.
# NOTE(review): the file name suggests 50-dimensional vectors — confirm against the model.
embedding_model = gs.models.FastText.load('../train_embedding_models/fasttext_embedding_50d_all_signals')

In [4]:
# Number of token rows every embedded title is padded (or truncated) to.
MAX_PAD = 31


def sequence_padding(X_DIM, value):
    """Zero-pad a 2-D array along its first axis until it has MAX_PAD rows.

    Args:
        X_DIM: Current number of rows in `value`; must not exceed MAX_PAD
            (np.pad rejects a negative pad width).
        value: 2-D array-like whose rows are kept as-is.

    Returns:
        A new array with MAX_PAD rows: the original rows followed by all-zero rows.
    """
    missing_rows = MAX_PAD - X_DIM
    # Append rows only after the last one; the second axis is left untouched.
    pad_widths = [(0, missing_rows), (0, 0)]
    return np.pad(value, pad_widths, mode='constant')

In [5]:
def create_embedding(value):
    """Turn one title into a fixed-size matrix of token embeddings.

    The value is coerced to text with str(), tokenized with the module-level
    TweetTokenizer, truncated to at most MAX_PAD tokens, looked up in the
    FastText vectors and zero-padded to exactly MAX_PAD rows.

    Args:
        value: The raw cell value. Non-string values are stringified first, so a
            missing value (float NaN) is embedded as the literal token 'nan'.

    Returns:
        A 2-D array with MAX_PAD rows and one column per embedding dimension.
        A value that yields no tokens (e.g. whitespace only) gives an all-zero matrix.
    """
    # Truncate up front so no more than MAX_PAD tokens are ever looked up.
    tokens = tknzr.tokenize(str(value))[:MAX_PAD]

    if not tokens:
        # An empty key list cannot be stacked into a (0, dim) matrix by the lookup
        # below, so return pure padding instead of raising.
        # float32 is assumed to match the dtype of the gensim vectors — TODO confirm.
        return np.zeros((MAX_PAD, embedding_model.wv.vector_size), dtype=np.float32)

    embedding = embedding_model.wv[tokens]

    if embedding.shape[0] < MAX_PAD:
        return sequence_padding(embedding.shape[0], embedding)

    # Exactly MAX_PAD rows already: nothing to pad.
    return embedding

In [6]:
# Embed both title columns of every training row and collect the labels.
train_article_title = []
train_table_title = []
train_label = []

# Walk the three columns in lockstep rather than DataFrame.iterrows(), which
# builds a Series object for every row; `total` lets tqdm show percentage and ETA.
train_rows = zip(
    train_dataset['article_page_title'],
    train_dataset['table_page_title'],
    train_dataset['label'],
)

for article_text, table_text, row_label in tqdm(train_rows, total=len(train_dataset)):

    train_article_title.append(create_embedding(article_text))
    train_table_title.append(create_embedding(table_text))
    train_label.append(row_label)

169500it [01:01, 2757.67it/s]


In [7]:
# Convert the three accumulated Python lists into NumPy arrays, rebinding the same names.
train_article_title, train_table_title, train_label = (
    np.array(collected)
    for collected in (train_article_title, train_table_title, train_label)
)

In [8]:
# Sanity check: one (MAX_PAD, embedding_dim) matrix per training row.
train_article_title.shape

(169500, 31, 50)

In [9]:
# Embed both title columns of every validation row and collect the labels.
validation_article_title = []
validation_table_title = []
validation_label = []

# Walk the three columns in lockstep rather than DataFrame.iterrows(), which
# builds a Series object for every row; `total` lets tqdm show percentage and ETA.
validation_rows = zip(
    validation_dataset['article_page_title'],
    validation_dataset['table_page_title'],
    validation_dataset['label'],
)

for article_text, table_text, row_label in tqdm(validation_rows, total=len(validation_dataset)):

    validation_article_title.append(create_embedding(article_text))
    validation_table_title.append(create_embedding(table_text))
    validation_label.append(row_label)

18834it [00:06, 2875.43it/s]


In [10]:
# Convert the three accumulated Python lists into NumPy arrays, rebinding the same names.
validation_article_title, validation_table_title, validation_label = (
    np.array(collected)
    for collected in (validation_article_title, validation_table_title, validation_label)
)

In [11]:
# Sanity check: one (MAX_PAD, embedding_dim) matrix per validation row.
validation_article_title.shape

(18834, 31, 50)

In [12]:
def coattention_method(query, key, value):
    """Unscaled dot-product attention: softmax(query @ key^T) @ value.

    Args:
        query: Tensor whose rows are compared against the rows of `key`.
        key: Tensor providing the rows each query row is scored against.
        value: Tensor that is mixed according to the resulting weights.

    Returns:
        The tensor obtained by multiplying the softmax-normalised score matrix by `value`.
    """
    # Dot product between every query row and every key row.
    affinity = tf.matmul(query, key, transpose_b=True)
    # Normalise over the last axis (tf.nn.softmax's default axis).
    weights = tf.nn.softmax(affinity, axis=-1)
    return tf.matmul(weights, value)

In [13]:
def attention_model(sequence_length=31, embedding_dim=50, rnn_units=32):
    """Build the two-branch co-attention classifier over a pair of embedded titles.

    Branch 1 applies co-attention directly to the input embeddings and encodes each
    result with a bidirectional GRU. Branch 2 first encodes each title with a
    bidirectional GRU, applies co-attention to the encoded sequences, then encodes
    again. The four resulting sequences are flattened, concatenated and passed
    through a 64 -> 32 -> 1 MLP (dropout 0.5 after each hidden layer) ending in a
    sigmoid unit.

    Args:
        sequence_length: Rows per input matrix; the default matches MAX_PAD (31).
        embedding_dim: Columns per input matrix; the default is 50.
        rnn_units: Units per direction in every GRU; the default is 32.

    Returns:
        An uncompiled tf.keras.Model taking [article_title, table_title] and
        producing one sigmoid output per sample.
    """

    def bi_gru(sequence):
        """Encode `sequence` with a new (unshared) bidirectional GRU, keeping every timestep."""
        recurrent = tf.keras.layers.GRU(rnn_units, return_sequences=True)
        return tf.keras.layers.Bidirectional(recurrent)(sequence)

    input_shape = (sequence_length, embedding_dim)
    article_title = tf.keras.Input(shape=input_shape, dtype='float32')
    table_title = tf.keras.Input(shape=input_shape, dtype='float32')

    # Branch 1: co-attention on the raw embeddings, then recurrent encoding.
    # NOTE(review): the `value` argument is the query-side sequence rather than the
    # key-side one in every co-attention call below — confirm this is intended.
    affinity_titleA_titleB = coattention_method(article_title, table_title, article_title)
    affinity_titleB_titleA = coattention_method(table_title, article_title, table_title)

    context_coattentionA = bi_gru(affinity_titleA_titleB)
    context_coattentionB = bi_gru(affinity_titleB_titleA)

    learned_vector1 = tf.keras.layers.Flatten()(context_coattentionA)
    learned_vector2 = tf.keras.layers.Flatten()(context_coattentionB)

    # Branch 2: recurrent encoding first, co-attention on the encodings, then encode again.
    context_titleA = bi_gru(article_title)
    context_titleB = bi_gru(table_title)

    affinity1_titleA_titleB = coattention_method(context_titleA, context_titleB, context_titleA)
    affinity1_titleB_titleA = coattention_method(context_titleB, context_titleA, context_titleB)

    context2_coattentionA = bi_gru(affinity1_titleA_titleB)
    context2_coattentionB = bi_gru(affinity1_titleB_titleA)

    learned_vector3 = tf.keras.layers.Flatten()(context2_coattentionA)
    learned_vector4 = tf.keras.layers.Flatten()(context2_coattentionB)

    final_vector = tf.keras.layers.Concatenate()(
        [learned_vector1, learned_vector2, learned_vector3, learned_vector4]
    )

    # Classification head.
    MLP_input = tf.keras.layers.Dense(64, activation='relu')(final_vector)
    dropout1 = tf.keras.layers.Dropout(0.5, name="dropout1")(MLP_input)
    MLP_hidden = tf.keras.layers.Dense(32, activation='relu')(dropout1)
    dropout2 = tf.keras.layers.Dropout(0.5, name="dropout2")(MLP_hidden)
    MLP_output = tf.keras.layers.Dense(1, activation='sigmoid')(dropout2)

    model = tf.keras.Model(inputs=[article_title, table_title], outputs=MLP_output)

    return model

In [14]:
# Instantiate the (still uncompiled) co-attention network.
model = attention_model()

In [15]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is tracked
# so the checkpoint callback can monitor 'val_accuracy'.
model.compile(loss="binary_crossentropy",optimizer="adam", metrics=['accuracy'])

In [16]:
# Write a checkpoint only when validation accuracy reaches a new maximum; the epoch
# number and the score are embedded in the file name.
filepath = "coattention_several_spaces_model_title_{epoch:02d}_{val_accuracy:.4f}.h5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    mode='max',
)
callbacks_list = [checkpoint]

In [17]:
# Train for 20 epochs, evaluating on the validation split after each epoch so the
# checkpoint callback can react to 'val_accuracy'.
history = model.fit(
    x=[train_article_title, train_table_title],
    y=train_label,
    batch_size=32,
    epochs=20,
    callbacks=callbacks_list,
    validation_data=([validation_article_title, validation_table_title], validation_label),
)

Train on 169500 samples, validate on 18834 samples
Epoch 1/20
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 00001: val_accuracy improved from -inf to 0.92508, saving model to coattention_several_spaces_model_title_01_0.9251.h5
Epoch 2/20
Epoch 00002: val_accuracy improved from 0.92508 to 0.93023, saving model to coattention_several_spaces_model_title_02_0.9302.h5
Epoch 3/20
Epoch 00003: val_accuracy improved from 0.93023 to 0.93464, saving model to coattention_several_spaces_model_title_03_0.9346.h5
Epoch 4/20
Epoch 00004: val_accuracy improved from 0.93464 to 0.93852, saving model to coattention_several_spaces_model_title_04_0.9385.h5
Epoch 5/20
Epoch 00005: val_accuracy did not improve from 0.93852
Epoch 6/20
Epoch 00006: val_accuracy improved from 0.93852 to 0.93990, saving model to coattention_several_spaces_model_title_06_0.9399.h5
Epoch 7/20
Epoch 00007: val_accuracy did not improve from 0.93990
Epoch 8/20
Epoch 00008: val_acc

In [None]:
# Accuracy learning curves: one line per split, drawn in legend order.
for metric_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_key])
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.title('model accuracy')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Loss learning curves: one line per split, drawn in legend order.
for metric_key in ('loss', 'val_loss'):
    plt.plot(history.history[metric_key])
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('model loss')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Replace the in-memory model with one loaded from disk before evaluating.
# NOTE(review): this file name does not match the checkpoint pattern written during
# training ("coattention_several_spaces_model_title_{epoch}_{val_accuracy}.h5"), so the
# metrics below presumably describe a model produced elsewhere — confirm which
# checkpoint is meant to be evaluated here.
model = tf.keras.models.load_model('affinity_model_train_cossine.h5')

In [None]:
# Score the validation pairs and threshold the sigmoid output at 0.5.
y_prob = model.predict([validation_article_title, validation_table_title])
# `np.int` was only an alias for the builtin `int`; it was deprecated in NumPy 1.20 and
# removed in 1.24, so use the builtin directly (same resulting dtype).
y_predict = (y_prob > 0.5).astype(int)
accuracy_score(validation_label, y_predict)

In [None]:
# Confusion matrix of the thresholded validation predictions, drawn as an annotated
# heatmap (seaborn and sklearn.metrics are imported in the notebook's import cell).
confusion_matrix = sklearn.metrics.confusion_matrix(validation_label, y_predict)

plt.figure(figsize=(10, 8))
sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Confusion matrix")
plt.ylabel('True label')
plt.xlabel('Predict label')
plt.show()