In [1]:
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import gensim as gs
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
warnings.filterwarnings('ignore')

In [2]:
train_dataset_1_1 = pd.read_csv('dataset/train_model_050_1M.csv', delimiter=',')

In [3]:
train_dataset, validation_dataset = train_test_split(train_dataset_1_1, test_size=0.10, random_state=42)

In [4]:
# embedding_model = gs.models.FastText.load_fasttext_format('pre_trained_models/cc.en.300.bin')
embedding_model = gs.models.FastText.load('pre_trained_models/embedding_model_50d')

In [5]:
MAX_PAD = 31

def sequence_padding(X_DIM, value):
    
    value_padding = np.pad(value, ((0,MAX_PAD - X_DIM),(0,0)), 'constant')
    
    return value_padding

In [6]:
def create_embedding(value):

    value = tknzr.tokenize(str(value))
    
    if len(value) < MAX_PAD:
        
        embedding = embedding_model.wv[value]
        
        padding_embedding = sequence_padding(embedding.shape[0],embedding)
        
        return padding_embedding
        
    else:
        
        embedding = embedding_model.wv[value[0:MAX_PAD]]
        
        return embedding

In [7]:
train_article_title = []
train_table_title = []
train_label = []

for i,row in tqdm(train_dataset.iterrows()):
    
    train_article_title.append(create_embedding(row['article_title']))
    train_table_title.append(create_embedding(row['table_title']))
    train_label.append(row['label'])

999928it [06:25, 2592.41it/s]


In [8]:
train_article_title = np.array(train_article_title)
train_table_title = np.array(train_table_title)
train_label = np.array(train_label)

In [9]:
train_article_title.shape

(999928, 31, 50)

In [10]:
validation_article_title = []
validation_table_title = []
validation_label = []

for i, row in tqdm(validation_dataset.iterrows()):
    
    validation_article_title.append(create_embedding(row['article_title']))
    validation_table_title.append(create_embedding(row['table_title']))  
    validation_label.append(row['label'])

111104it [00:42, 2612.62it/s]


In [11]:
validation_article_title = np.array(validation_article_title)
validation_table_title = np.array(validation_table_title)
validation_label = np.array(validation_label)

In [12]:
validation_article_title.shape

(111104, 31, 50)

In [13]:
def coattention_method(query, document):
    
    transpose_document = tf.transpose(document,perm=[0, 2, 1])
    
    affinity = tf.matmul(query,transpose_document)

#     transpose_affinity = tf.transpose(affinity, perm=[0, 2, 1])
    
#     row_normalized = tf.nn.softmax(affinity, axis=1)
    
#     column_normalized = tf.nn.softmax(transpose_affinity, axis=1)
    
#     context_vector = tf.matmul(row_normalized,document)
    
#     concatenate_context_query = tf.concat([context_vector,query],axis=-1)
     
#     context_document = tf.matmul(column_normalized,concatenate_context_query)
    
#     co_attention = tf.concat([document,context_document],axis=-1)
    
    return affinity

In [14]:
def attention_model():
    
    article_title = tf.keras.Input(shape=(31,50), dtype='float32')
    table_title = tf.keras.Input(shape=(31,50), dtype='float32')
    
    encodding_article_title = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32, return_sequences=True))(article_title)
    encodding_table_title = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32, return_sequences=True))(table_title)
    
    attention_title_title = coattention_method(encodding_article_title,encodding_table_title)
    
    encodding_attention = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32, return_sequences=True))(attention_title_title)
    
    learned_vector = tf.keras.layers.Flatten()(encodding_attention)
   
    dense1 = tf.keras.layers.Dense(64, activation='relu')(learned_vector)
    dropout1 = tf.keras.layers.Dropout(0.5, name="dropout1")(dense1)
#     dense2 = tf.keras.layers.Dense(64, activation='relu')(dropout1)
#     dropout2 = tf.keras.layers.Dropout(0.5, name="dropout2")(dense2)
    prediction = tf.keras.layers.Dense(1,activation='sigmoid')(dropout1)
    
    model = tf.keras.Model(inputs=[article_title,table_title],outputs=prediction)
    
    return model

In [15]:
model = attention_model()

In [16]:
model.compile(loss="binary_crossentropy",optimizer="adam", metrics=['accuracy'])

In [17]:
checkpoint = tf.keras.callbacks.ModelCheckpoint('GRU_affinity_model_050_1M.h5', monitor='val_acc',verbose=1, save_the_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [18]:
history = model.fit([train_article_title, train_table_title], train_label, 
          epochs=10, 
          batch_size=32, 
          validation_data=([validation_article_title, validation_table_title], validation_label),
          callbacks=callbacks_list)

Train on 999928 samples, validate on 111104 samples
Epoch 1/10
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
195040/999928 [====>.........................] - ETA: 13:43 - loss: 0.0768 - accuracy: 0.9753
Epoch 00001: saving model to GRU_affinity_model_050_1M.h5
195072/999928 [====>.........................] - ETA: 13:43 - loss: 0.0768 - accuracy: 0.9753

KeyboardInterrupt: 

In [None]:
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
y_prob = model.predict([validation_article_title,validation_table_title])
y_predict = (y_prob > 0.5).astype(np.int)
accuracy_score(validation_label, y_predict)

In [None]:
import seaborn as sns
import sklearn.metrics
confusion_matrix = sklearn.metrics.confusion_matrix(validation_label, y_predict)

plt.figure(figsize=(10, 8))
sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="YlGnBu")#, annot_kws={"size": 15});
plt.title("Confusion matrix")
plt.ylabel('True label')
plt.xlabel('Predict label')
plt.show()

In [None]:
# count = 0
# wrong = []
# for i in range(len(validation_label)):
    
#     if validation_label[i] != y_predict[i]:
#         if validation_label[i] == 1:
#             wrong.append(i)
#             count = count + 1
            
# print(count)

In [None]:
# for i in wrong:
#     print(validation_dataset.iloc[i]['article_title']," - ",validation_dataset.iloc[i]['table_title'])
#     print("")