In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import gensim as gs
from nltk.tokenize import TweetTokenizer
# Shared tokenizer instance used by the create_embedding_* functions below.
tknzr = TweetTokenizer()
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer, TFBertMainLayer, BertConfig

In [None]:
# NOTE(review): MAX_TOKENS is not referenced by any cell visible in this notebook
# (the padding lengths actually used are MAX_PAD_TITLE and MAX_PAD_MAIN_PASSAGE) —
# confirm whether it is still needed.
MAX_TOKENS = 250

In [None]:
# Load the training and validation splits. Later cells read the columns
# 'article_page_title', 'table_page_title', 'article_meta_description',
# 'table_page_summary', 'article_keywords', 'table_page_keywords' and 'label'.
train_dataset = pd.read_csv(
    '../../../train_data/train_data_T.csv',
    sep=',',
)
validation_dataset = pd.read_csv(
    '../../../train_data/validation_data_T.csv',
    sep=',',
)

In [None]:
# Debug aid: uncomment the two lines below to run the notebook on the first 10 rows only.
# train_dataset = train_dataset.head(10)
# validation_dataset = validation_dataset.head(10)

In [None]:
# Load the pre-trained fastText word-embedding model

In [None]:
# embedding_model = gs.models.FastText.load('../../train_embedding_models/fasttext_embedding_50d_all_signals')

# `FastText.load_fasttext_format` is deprecated in gensim 3.8 and no longer
# exists in gensim 4.x; `gensim.models.fasttext.load_facebook_model` is its
# replacement and also returns a model exposing `.wv`. Prefer the new loader
# and fall back to the legacy one only when running on an older gensim.
_FASTTEXT_BIN_PATH = '../../../pre_trained_models/cc.en.300.bin'
_load_facebook_model = getattr(gs.models.fasttext, 'load_facebook_model', None)

if _load_facebook_model is not None:
    embedding_model = _load_facebook_model(_FASTTEXT_BIN_PATH)
else:
    embedding_model = gs.models.FastText.load_fasttext_format(_FASTTEXT_BIN_PATH)

In [None]:
# Fixed number of token rows for title-like signals.
MAX_PAD_TITLE = 30


def sequence_padding_title(X_DIM, value):
    """Append zero rows to a 2-D matrix so it has MAX_PAD_TITLE rows.

    Parameters
    ----------
    X_DIM : int
        Current number of rows in ``value``.
    value : array-like, 2-D
        Matrix to pad; the second axis is left untouched.

    Returns
    -------
    numpy.ndarray
        ``value`` followed by ``MAX_PAD_TITLE - X_DIM`` rows of zeros.
    """
    rows_to_add = MAX_PAD_TITLE - X_DIM
    # Pad only after the existing rows (axis 0); never pad the columns (axis 1).
    pad_widths = ((0, rows_to_add), (0, 0))
    return np.pad(value, pad_widths, mode='constant')

In [None]:
def create_embedding_title(value):
    """Embed a short text field as a fixed-size float16 matrix.

    The text is tokenized with the module-level ``tknzr``, cut to the first
    ``MAX_PAD_TITLE`` tokens, looked up in ``embedding_model.wv`` and
    zero-padded at the end so that every result has exactly
    ``MAX_PAD_TITLE`` rows.

    Parameters
    ----------
    value : object
        Raw cell content. Non-string values are converted with ``str``.
        ``None`` / NaN (a missing cell) is treated as empty text.

    Returns
    -------
    numpy.ndarray
        float16 array with ``MAX_PAD_TITLE`` rows and one column per
        embedding dimension; all zeros when there is nothing to embed.
    """
    # A missing cell would otherwise become the literal word "nan" after str()
    # and be embedded as if it were real text.
    is_missing = value is None or (isinstance(value, float) and np.isnan(value))
    tokens = [] if is_missing else tknzr.tokenize(str(value))[:MAX_PAD_TITLE]

    if not tokens:
        # Nothing to look up (missing or whitespace-only text): return the
        # all-padding matrix directly instead of indexing the model with an
        # empty token list, which cannot be stacked into a matrix.
        return np.zeros((MAX_PAD_TITLE, embedding_model.wv.vector_size), dtype='float16')

    embedding = embedding_model.wv[tokens].astype('float16')

    # Truncation above guarantees at most MAX_PAD_TITLE rows; pad the rest.
    if embedding.shape[0] < MAX_PAD_TITLE:
        embedding = sequence_padding_title(embedding.shape[0], embedding)

    return embedding

In [None]:
# Fixed number of token rows for the longer passage/summary signals.
MAX_PAD_MAIN_PASSAGE = 55


def sequence_padding_main_passage(X_DIM, value):
    """Append zero rows to a 2-D matrix so it has MAX_PAD_MAIN_PASSAGE rows.

    Parameters
    ----------
    X_DIM : int
        Current number of rows in ``value``.
    value : array-like, 2-D
        Matrix to pad; the second axis is left untouched.

    Returns
    -------
    numpy.ndarray
        ``value`` followed by ``MAX_PAD_MAIN_PASSAGE - X_DIM`` rows of zeros.
    """
    rows_to_add = MAX_PAD_MAIN_PASSAGE - X_DIM
    # Pad only after the existing rows (axis 0); never pad the columns (axis 1).
    pad_widths = ((0, rows_to_add), (0, 0))
    return np.pad(value, pad_widths, mode='constant')

In [None]:
def create_embedding_main_passage(value):
    """Embed a passage-length text field as a fixed-size float16 matrix.

    The text is tokenized with the module-level ``tknzr``, cut to the first
    ``MAX_PAD_MAIN_PASSAGE`` tokens, looked up in ``embedding_model.wv`` and
    zero-padded at the end so that every result has exactly
    ``MAX_PAD_MAIN_PASSAGE`` rows.

    Parameters
    ----------
    value : object
        Raw cell content. Non-string values are converted with ``str``.
        ``None`` / NaN (a missing cell) is treated as empty text.

    Returns
    -------
    numpy.ndarray
        float16 array with ``MAX_PAD_MAIN_PASSAGE`` rows and one column per
        embedding dimension; all zeros when there is nothing to embed.
    """
    # A missing cell would otherwise become the literal word "nan" after str()
    # and be embedded as if it were real text.
    is_missing = value is None or (isinstance(value, float) and np.isnan(value))
    tokens = [] if is_missing else tknzr.tokenize(str(value))[:MAX_PAD_MAIN_PASSAGE]

    if not tokens:
        # Nothing to look up (missing or whitespace-only text): return the
        # all-padding matrix directly instead of indexing the model with an
        # empty token list, which cannot be stacked into a matrix.
        return np.zeros((MAX_PAD_MAIN_PASSAGE, embedding_model.wv.vector_size), dtype='float16')

    embedding = embedding_model.wv[tokens].astype('float16')

    # Truncation above guarantees at most MAX_PAD_MAIN_PASSAGE rows; pad the rest.
    if embedding.shape[0] < MAX_PAD_MAIN_PASSAGE:
        embedding = sequence_padding_main_passage(embedding.shape[0], embedding)

    return embedding

In [None]:
# Per-row embedding matrices for every text signal of the training split.
train_article_title = []
train_article_main_passage = []
train_article_keywords = []

train_table_title = []
train_table_main_passage = []
train_table_keywords = []

train_label = []

# iterrows() is a generator without a length, so the row count is passed
# explicitly; without it tqdm can only show a running counter (no % / ETA).
for _, row in tqdm(train_dataset.iterrows(), total=len(train_dataset)):

    train_article_title.append(create_embedding_title(row['article_page_title']))
    train_table_title.append(create_embedding_title(row['table_page_title']))

    train_article_main_passage.append(create_embedding_main_passage(row['article_meta_description']))
    train_table_main_passage.append(create_embedding_main_passage(row['table_page_summary']))

    # Keyword fields reuse the title embedder, i.e. they are cut/padded to MAX_PAD_TITLE tokens.
    train_article_keywords.append(create_embedding_title(row['article_keywords']))
    train_table_keywords.append(create_embedding_title(row['table_page_keywords']))

    train_label.append(row['label'])

In [None]:
# Turn each list of equally-sized per-row matrices into a single float16 array
# (first axis = dataset row) so it can be passed to model.fit.
train_article_title = np.asarray(train_article_title, dtype=np.float16)
train_article_main_passage = np.asarray(train_article_main_passage, dtype=np.float16)
train_article_keywords = np.asarray(train_article_keywords, dtype=np.float16)

train_table_title = np.asarray(train_table_title, dtype=np.float16)
train_table_main_passage = np.asarray(train_table_main_passage, dtype=np.float16)
train_table_keywords = np.asarray(train_table_keywords, dtype=np.float16)

# Labels keep whatever dtype numpy infers from the 'label' column values.
train_label = np.asarray(train_label)

In [None]:
# Per-row embedding matrices for every text signal of the validation split.
validation_article_title = []
validation_article_main_passage = []
validation_article_keywords = []

validation_table_title = []
validation_table_main_passage = []
validation_table_keywords = []

validation_label = []

# iterrows() is a generator without a length, so the row count is passed
# explicitly; without it tqdm can only show a running counter (no % / ETA).
for _, row in tqdm(validation_dataset.iterrows(), total=len(validation_dataset)):

    validation_article_title.append(create_embedding_title(row['article_page_title']))
    validation_table_title.append(create_embedding_title(row['table_page_title']))

    validation_article_main_passage.append(create_embedding_main_passage(row['article_meta_description']))
    validation_table_main_passage.append(create_embedding_main_passage(row['table_page_summary']))

    # Keyword fields reuse the title embedder, i.e. they are cut/padded to MAX_PAD_TITLE tokens.
    validation_article_keywords.append(create_embedding_title(row['article_keywords']))
    validation_table_keywords.append(create_embedding_title(row['table_page_keywords']))

    validation_label.append(row['label'])

In [None]:
# Turn each list of equally-sized per-row matrices into a single float16 array
# (first axis = dataset row) so it can be passed to model.fit as validation data.
validation_article_title = np.asarray(validation_article_title, dtype=np.float16)
validation_article_main_passage = np.asarray(validation_article_main_passage, dtype=np.float16)
validation_article_keywords = np.asarray(validation_article_keywords, dtype=np.float16)

validation_table_title = np.asarray(validation_table_title, dtype=np.float16)
validation_table_main_passage = np.asarray(validation_table_main_passage, dtype=np.float16)
validation_table_keywords = np.asarray(validation_table_keywords, dtype=np.float16)

# Labels keep whatever dtype numpy infers from the 'label' column values.
validation_label = np.asarray(validation_label)

In [None]:
# Rebind the name to an empty list so this cell no longer holds a reference to
# the loaded embedding model (all features were already computed above).
# NOTE: after this cell runs, the create_embedding_* functions cannot be called
# again until the model-loading cell is re-executed.
embedding_model = []

In [None]:
# Width of each token vector fed to the network.
# Presumably the fastText vector size (cc.en.300) — verify if the embedding model changes.
EMBEDDING_DIM = 300
# Hidden units per direction in every GRU encoder.
GRU_UNITS = 32

layers = tf.keras.layers


def _signal_input(num_tokens):
    """Create the input placeholder for one text signal of `num_tokens` token vectors."""
    return tf.keras.Input(shape=(num_tokens, EMBEDDING_DIM), dtype='float32')


def _bigru_encoder():
    """Create a new (unshared) bidirectional GRU that returns its full output sequence."""
    return layers.Bidirectional(layers.GRU(GRU_UNITS, return_sequences=True))


# --- Inputs: three signals for the article, three for the table -------------
article_title = _signal_input(MAX_PAD_TITLE)
article_main_passage = _signal_input(MAX_PAD_MAIN_PASSAGE)
article_keywords = _signal_input(MAX_PAD_TITLE)

table_title = _signal_input(MAX_PAD_TITLE)
table_main_passage = _signal_input(MAX_PAD_MAIN_PASSAGE)
table_keywords = _signal_input(MAX_PAD_TITLE)

# --- Sequence encoders: one independent BiGRU per signal ---------------------
rep_article_title = _bigru_encoder()(article_title)
rep_article_text = _bigru_encoder()(article_main_passage)
rep_article_keywords = _bigru_encoder()(article_keywords)

rep_table_title = _bigru_encoder()(table_title)
rep_table_text = _bigru_encoder()(table_main_passage)
rep_table_keywords = _bigru_encoder()(table_keywords)

# --- Flatten every encoded sequence into a single vector per signal ----------
vector_A_title = layers.Flatten()(rep_article_title)
vector_A_Text = layers.Flatten()(rep_article_text)
vector_A_keywords = layers.Flatten()(rep_article_keywords)

vector_T_title = layers.Flatten()(rep_table_title)
vector_T_Text = layers.Flatten()(rep_table_text)
vector_T_keywords = layers.Flatten()(rep_table_keywords)

# --- Joint representation: article vectors followed by table vectors ---------
final_vector = layers.Concatenate()([
    vector_A_title,
    vector_A_Text,
    vector_A_keywords,
    vector_T_title,
    vector_T_Text,
    vector_T_keywords,
])

# --- MLP head: 512 -> 256 -> 1, with dropout after each hidden layer ---------
MLP_input = layers.Dense(512, activation='relu')(final_vector)
dropout1 = layers.Dropout(0.5, name="dropout1")(MLP_input)
MLP_hidden = layers.Dense(256, activation='relu')(dropout1)
dropout2 = layers.Dropout(0.5, name="dropout2")(MLP_hidden)
# Single sigmoid unit: the model outputs one value in (0, 1) per article/table pair.
MLP_output = layers.Dense(1, activation='sigmoid')(dropout2)

# The order of `inputs` defines the order in which arrays must be passed to fit/predict.
model = tf.keras.Model(
    inputs=[
        article_title,
        article_main_passage,
        article_keywords,
        table_title,
        table_main_passage,
        table_keywords,
    ],
    outputs=MLP_output,
)

In [None]:
# Adam optimizer with an explicit learning rate of 2e-5.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

In [None]:
# Binary cross-entropy pairs with the model's single sigmoid output unit;
# accuracy is tracked so the checkpoint callback can monitor 'val_accuracy'.
model.compile(loss="binary_crossentropy",optimizer=optimizer, metrics=['accuracy'])

In [None]:
# Print the layer-by-layer architecture of the compiled model.
model.summary()

In [None]:
# One file per improving epoch; the epoch number and validation accuracy are
# substituted into the name by the callback.
filepath = "ablation_model01_{epoch:02d}_{val_accuracy:.4f}.h5"

# `save_format` is not a documented ModelCheckpoint argument, and its previous
# value ('tf') contradicted the '.h5' file name. It has been removed so the
# call only passes arguments the callback actually defines.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,  # only write a file when val_accuracy improves
    mode='max',
)
callbacks_list = [checkpoint]

In [None]:
# Feature arrays must follow the same order as the `inputs` list of the model:
# article title / passage / keywords, then table title / passage / keywords.
train_inputs = [
    train_article_title,
    train_article_main_passage,
    train_article_keywords,
    train_table_title,
    train_table_main_passage,
    train_table_keywords,
]

validation_inputs = [
    validation_article_title,
    validation_article_main_passage,
    validation_article_keywords,
    validation_table_title,
    validation_table_main_passage,
    validation_table_keywords,
]

# Train for 100 epochs in mini-batches of 16, evaluating on the validation
# split after every epoch and running the callbacks in `callbacks_list`.
history = model.fit(
    x=train_inputs,
    y=train_label,
    batch_size=16,
    epochs=100,
    verbose=1,
    callbacks=callbacks_list,
    validation_data=(validation_inputs, validation_label),
)