In [None]:
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import gensim as gs
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer
# Shared tokenizer instance; create_embedding uses it to split text into tokens.
tknzr = TweetTokenizer()
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries imported in this cell.
warnings.filterwarnings('ignore')
import fasttext

In [None]:
# Read the training and validation splits; both are comma-separated files
# stored without a file extension.
train_dataset = pd.read_csv(
    '../train_data/train_data_1_1',
    sep=',',
)
validation_dataset = pd.read_csv(
    '../train_data/validation_data_1_1',
    sep=',',
)

In [None]:
# Load the saved gensim FastText model; create_embedding looks tokens up through its `wv` attribute.
# NOTE(review): the file name suggests 50-dimensional vectors, which would match the
# (161, 50) input shape declared in attention_model — confirm.
embedding_model = gs.models.FastText.load('../train_embedding_models/fasttext_embedding_50d_all_signals')

In [None]:
# Number of rows (tokens) every embedded sequence is padded to.
MAX_PAD_TITLE = 161


def sequence_padding(X_DIM, value):
    """Zero-pad a 2-D array along its first axis up to MAX_PAD_TITLE rows.

    Parameters
    ----------
    X_DIM : int
        Current number of rows in ``value``; the caller passes ``value.shape[0]``.
    value : numpy.ndarray
        2-D array to pad. Rows are appended at the end; the second axis is
        left untouched.

    Returns
    -------
    numpy.ndarray
        Copy of ``value`` followed by ``MAX_PAD_TITLE - X_DIM`` rows of zeros.

    Notes
    -----
    ``X_DIM`` larger than ``MAX_PAD_TITLE`` yields a negative pad width, which
    ``np.pad`` rejects with an error, so callers must truncate first.
    """
    missing_rows = MAX_PAD_TITLE - X_DIM
    pad_widths = ((0, missing_rows), (0, 0))
    return np.pad(value, pad_widths, mode='constant')

In [None]:
def create_embedding(value):
    """Turn a piece of text into a fixed-size matrix of token embeddings.

    The input is coerced to ``str``, tokenized with the module-level ``tknzr``,
    truncated to at most ``MAX_PAD_TITLE`` tokens, looked up in
    ``embedding_model.wv`` and zero-padded at the end, so the result always
    has exactly ``MAX_PAD_TITLE`` rows.

    Parameters
    ----------
    value : object
        Text to embed; anything accepted by ``str()``.

    Returns
    -------
    numpy.ndarray
        ``float16`` array with ``MAX_PAD_TITLE`` rows and one column per
        embedding dimension. Text that produces no tokens yields an all-zero
        matrix.
    """
    # Truncate first so both short and long texts share one code path.
    tokens = tknzr.tokenize(str(value))[:MAX_PAD_TITLE]

    if not tokens:
        # Looking up an empty token list fails inside gensim (there is nothing
        # to stack), so token-less text is represented as pure padding.
        return np.zeros((MAX_PAD_TITLE, embedding_model.wv.vector_size), dtype='float16')

    embedding = embedding_model.wv[tokens].astype('float16')

    # After truncation there are at most MAX_PAD_TITLE rows, so the pad amount
    # is never negative; a full-length sequence is simply padded by zero rows.
    return sequence_padding(embedding.shape[0], embedding)

In [None]:
# Build the model inputs for the training split: one embedded "catch-all" text
# per article and per table, plus the label of each article/table pair.
train_article_catch_all = []
train_table_catch_all = []
train_label = []

# iterrows() is a generator without a length, so pass total= explicitly to let
# tqdm show a percentage and an ETA.
for i, row in tqdm(train_dataset.iterrows(), total=len(train_dataset)):

    # str() guards against missing values (NaN floats), which cannot be
    # concatenated with strings; this mirrors how the table columns are handled.
    catch_all_article = str(row['article_page_title'])+" "+str(row['article_meta_description'])+" "+str(row['article_keywords'])
    train_article_catch_all.append(create_embedding(catch_all_article))

    catch_all_tables = str(row['table_page_title'])+" "+str(row['table_page_summary'])+" "+str(row['table_page_keywords'])
    train_table_catch_all.append(create_embedding(catch_all_tables))

    train_label.append(row['label'])

In [None]:
# Stack the per-row embedding matrices (and the labels) into single arrays
# that can be passed to model.fit.
train_article_catch_all = np.array(train_article_catch_all, dtype=np.float16)
train_table_catch_all = np.array(train_table_catch_all, dtype=np.float16)
train_label = np.array(train_label)

In [None]:
# Sanity check: display the shape of the stacked training article inputs.
# Presumably (n_rows, MAX_PAD_TITLE, embedding_dim) given create_embedding's padding — confirm.
train_article_catch_all.shape

In [None]:
# Build the model inputs for the validation split: one embedded "catch-all"
# text per article and per table, plus the label of each article/table pair.
validation_article_catch_all = []
validation_table_catch_all = []
validation_label = []

# iterrows() is a generator without a length, so pass total= explicitly to let
# tqdm show a percentage and an ETA.
for i, row in tqdm(validation_dataset.iterrows(), total=len(validation_dataset)):

    # str() guards against missing values (NaN floats), which cannot be
    # concatenated with strings; this mirrors how the table columns are handled.
    catch_all_article = str(row['article_page_title'])+" "+str(row['article_meta_description'])+" "+str(row['article_keywords'])
    validation_article_catch_all.append(create_embedding(catch_all_article))

    catch_all_tables = str(row['table_page_title'])+" "+str(row['table_page_summary'])+" "+str(row['table_page_keywords'])
    validation_table_catch_all.append(create_embedding(catch_all_tables))

    validation_label.append(row['label'])

In [None]:
# Stack the per-row embedding matrices (and the labels) into single arrays
# that can be passed to model.fit as validation data.
validation_article_catch_all = np.array(validation_article_catch_all, dtype=np.float16)
validation_table_catch_all = np.array(validation_table_catch_all, dtype=np.float16)
validation_label = np.array(validation_label)

In [None]:
# Sanity check: display the shape of the stacked validation article inputs.
# Presumably (n_rows, MAX_PAD_TITLE, embedding_dim) given create_embedding's padding — confirm.
validation_article_catch_all.shape

In [None]:
def attention_method(query, key, value):
    """Unscaled, unmasked dot-product attention.

    Computes ``softmax(query @ key^T) @ value``. The softmax uses the default
    axis of ``tf.nn.softmax`` (the last one), i.e. it normalizes each query
    row over the key positions.

    Parameters
    ----------
    query, key, value : tensor-like
        Inputs accepted by ``tf.matmul``. ``query`` and ``key`` must agree on
        their last dimension, and ``value`` must have as many rows as ``key``.

    Returns
    -------
    Tensor
        The attention-weighted combination of ``value``.
    """
    weights = tf.nn.softmax(tf.matmul(query, key, transpose_b=True))
    return tf.matmul(weights, value)

In [None]:
def attention_model(seq_len=None, embedding_dim=50):
    """Build the two-input article/table attention classifier.

    Each input sequence is encoded by its own bidirectional GRU. The article
    encoding attends over the table encoding, the result is re-encoded by a
    third bidirectional GRU, flattened and fed to a small MLP ending in a
    single sigmoid unit.

    Parameters
    ----------
    seq_len : int, optional
        Number of time steps of both inputs. Defaults to ``MAX_PAD_TITLE``,
        the length the embedded sequences are padded to, so the model stays
        in sync with the preprocessing.
    embedding_dim : int, optional
        Size of each token embedding. Defaults to 50.

    Returns
    -------
    tf.keras.Model
        Uncompiled model taking ``[article, table]`` inputs of shape
        ``(seq_len, embedding_dim)`` and producing one probability per sample.
    """
    if seq_len is None:
        # Resolved at call time so the padding constant is the single source
        # of truth for the sequence length.
        seq_len = MAX_PAD_TITLE

    # inputs
    article_catch_all = tf.keras.Input(shape=(seq_len, embedding_dim), dtype='float32')
    table_catch_all = tf.keras.Input(shape=(seq_len, embedding_dim), dtype='float32')

    # article and table signal representations (separate encoders, no weight sharing)
    rep_article_catch_all = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32, return_sequences=True))(article_catch_all)
    rep_table_catch_all = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32, return_sequences=True))(table_catch_all)

    # attention: article positions are the queries, table positions the keys.
    # NOTE(review): the weights are a distribution over table positions, yet
    # they are applied to the article representation as `value`. This only
    # type-checks because both sequences share the same length — confirm
    # whether rep_table_catch_all was intended as the value.
    att_A_catch_T_catch1 = attention_method(rep_article_catch_all, rep_table_catch_all, rep_article_catch_all)
    rep_att_A_catch_T_catch1 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32, return_sequences=True))(att_A_catch_T_catch1)
    vector1 = tf.keras.layers.Flatten()(rep_att_A_catch_T_catch1)

    # MLP classification head
    MLP_input = tf.keras.layers.Dense(128, activation='relu')(vector1)
    dropout1 = tf.keras.layers.Dropout(0.5, name="dropout1")(MLP_input)
    MLP_hidden = tf.keras.layers.Dense(64, activation='relu')(dropout1)
    dropout2 = tf.keras.layers.Dropout(0.5, name="dropout2")(MLP_hidden)
    MLP_output = tf.keras.layers.Dense(1, activation='sigmoid')(dropout2)

    model = tf.keras.Model(inputs=[article_catch_all, table_catch_all], outputs=MLP_output)

    return model

In [None]:
# Instantiate the two-input attention network defined by attention_model().
model = attention_model()

In [None]:
# Binary cross-entropy pairs with the model's single sigmoid output unit; accuracy is
# tracked so that `val_accuracy` is available to the checkpoint callback.
model.compile(loss="binary_crossentropy",optimizer="adam", metrics=['accuracy'])

In [None]:
# Write a checkpoint each time validation accuracy improves; Keras fills the
# epoch number and the val_accuracy value into the file name.
filepath="attention_model_catch_all_1_1_{epoch:02d}_{val_accuracy:.4f}.h5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# Train on the embedded article/table pairs, evaluating on the validation split
# after every epoch; the checkpoint callback saves the best models.
history = model.fit(
    x=[train_article_catch_all, train_table_catch_all],
    y=train_label,
    validation_data=(
        [validation_article_catch_all, validation_table_catch_all],
        validation_label,
    ),
    batch_size=32,
    epochs=100,
    callbacks=callbacks_list,
    verbose=1,
)