In [None]:
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import gensim as gs
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
warnings.filterwarnings('ignore')
import fasttext
from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim
from hyperas.distributions import choice, uniform

In [None]:
def sequence_padding(X_DIM, value, max_pad=31):
    """Zero-pad a 2-D token-embedding matrix along its first axis.

    Parameters
    ----------
    X_DIM : int
        Number of rows currently in ``value`` (callers pass ``value.shape[0]``).
    value : numpy.ndarray
        2-D array; rows are padded, columns are left untouched.
    max_pad : int, optional
        Target number of rows. Defaults to 31, the sequence length the rest
        of this notebook uses.

    Returns
    -------
    numpy.ndarray
        Array with ``max_pad`` rows: the original rows followed by rows of
        zeros. Input that is already longer than ``max_pad`` is truncated to
        its first ``max_pad`` rows.
    """
    missing_rows = max_pad - X_DIM

    if missing_rows < 0:
        # np.pad rejects negative pad widths, so over-long input used to
        # raise ValueError; truncate instead, as create_embedding does.
        return np.array(value[:max_pad])

    # Pad only at the end of axis 0; 'constant' mode fills with zeros.
    return np.pad(value, ((0, missing_rows), (0, 0)), 'constant')

In [None]:
def create_embedding(value,embedding_model,tknzr):
    """Turn one text value into a fixed-size matrix of token embeddings.

    The value is stringified, tokenized with ``tknzr``, truncated to at most
    ``MAX_PAD`` tokens, looked up in ``embedding_model.wv`` and zero-padded
    at the end so that every result has exactly ``MAX_PAD`` rows.

    Parameters
    ----------
    value : object
        Raw cell value; converted with ``str()`` before tokenizing.
    embedding_model : object
        Model exposing ``.wv[list_of_tokens]`` and ``.wv.vector_size``
        (the notebook loads a gensim FastText model).
    tknzr : object
        Tokenizer exposing ``tokenize(str) -> list``.

    Returns
    -------
    numpy.ndarray
        ``float16`` array with ``MAX_PAD`` rows and one column per embedding
        dimension. A value that yields no tokens gives an all-zero matrix.
    """
    MAX_PAD = 31

    # Slicing is a no-op for short inputs, so one code path serves both cases.
    tokens = tknzr.tokenize(str(value))[:MAX_PAD]

    if not tokens:
        # Looking up an empty token list fails inside the embedding model
        # (nothing to stack), so return the all-padding matrix directly.
        return np.zeros((MAX_PAD, embedding_model.wv.vector_size), dtype='float16')

    embedding = embedding_model.wv[tokens].astype('float16')

    missing_rows = MAX_PAD - embedding.shape[0]
    if missing_rows > 0:
        # Zero-pad at the end of the token axis only.
        embedding = np.pad(embedding, ((0, missing_rows), (0, 0)), 'constant')

    return embedding

In [None]:
def data():
    """Load the embedding model and both CSV splits, and embed every title.

    Reads the FastText model and the train/validation CSV files from the
    relative paths hard-coded below, converts the 'article_page_title' and
    'table_page_title' columns of every row with create_embedding, and
    collects the 'label' column unchanged.

    Gives back a 6-tuple:
    (train_article_title, train_table_title, train_label,
     validation_article_title, validation_table_title, validation_label),
    where the four title entries are float16 numpy arrays and the two label
    entries are numpy arrays of the raw label values.

    NOTE(review): attention_model uses these exact local names without
    defining them, and this function is handed to optim.minimize as `data`;
    presumably hyperas inlines this body, so renaming a local here would
    break the model function - confirm before refactoring.
    """
    
    tknzr = TweetTokenizer()
    
    # Pre-trained gensim FastText model used for every token lookup.
    embedding_model = gs.models.FastText.load('../train_embedding_models/fasttext_embedding_50d_all_signals')
    
    train_dataset = pd.read_csv('../train_data/train_data_1_1', delimiter=',')
    validation_dataset = pd.read_csv('../train_data/validation_data_1_1', delimiter=',')
        
    train_article_title = []
    train_table_title = []
    train_label = []

    # Training split: one embedding matrix per title, one label per row.
    for i,row in tqdm(train_dataset.iterrows()):

        train_article_title.append(create_embedding(row['article_page_title'],embedding_model,tknzr))
        train_table_title.append(create_embedding(row['table_page_title'],embedding_model,tknzr))
        train_label.append(row['label'])
        
    # Stack the per-row matrices into single float16 arrays.
    train_article_title = np.array(train_article_title,dtype='float16')
    train_table_title = np.array(train_table_title,dtype='float16')
    train_label = np.array(train_label)
    
    # Validation split: same processing as the training split.
    validation_article_title = []
    validation_table_title = []
    validation_label = []

    for i, row in tqdm(validation_dataset.iterrows()):

        validation_article_title.append(create_embedding(row['article_page_title'],embedding_model,tknzr))
        validation_table_title.append(create_embedding(row['table_page_title'],embedding_model,tknzr))  
        validation_label.append(row['label'])
        
    validation_article_title = np.array(validation_article_title,dtype='float16')
    validation_table_title = np.array(validation_table_title,dtype='float16')
    validation_label = np.array(validation_label)
    
    return train_article_title,train_table_title,train_label,validation_article_title,validation_table_title,validation_label
    

In [None]:
def affinity_method(query, document):
    """Softmax-normalized pairwise dot-product scores between two sequences.

    Both arguments are rank-3 tensors (the fixed ``perm=[0, 2, 1]`` requires
    it). ``document`` has its last two axes swapped so that the batched
    matrix product multiplies every ``query`` row with every ``document``
    row; the scores are then normalized with a softmax over the last axis.
    """
    # Swap the last two axes of `document`, leaving the leading axis alone.
    document_t = tf.transpose(document, perm=[0, 2, 1])

    # Batched product: one score per (query row, document row) pair.
    scores = tf.matmul(query, document_t)

    # The last axis is tf.nn.softmax's default; spelled out for readers.
    return tf.nn.softmax(scores, axis=-1)

In [None]:
def attention_model():
    """Hyperas model template: build, train and score one candidate network.

    The double-curly-brace expressions in this body are hyperas search-space
    placeholders (choice / uniform from hyperas.distributions), not ordinary
    Python values; the function is passed to optim.minimize as `model`.

    Architecture, as built below:
      * two inputs of shape (31, 50): article title and table title;
      * one bidirectional GRU shared by both inputs;
      * branch A: affinity_method over the two GRU outputs, then another
        bidirectional GRU, then Flatten;
      * branch B: affinity_method over the raw inputs, then another
        bidirectional GRU, then Flatten;
      * both branches concatenated and fed to Dense, Dropout, Dense,
        Dropout and a final Dense(1, sigmoid).

    The model is compiled with binary cross-entropy and the accuracy
    metric, then fitted for 100 epochs.

    Result: a dict with 'loss' (negated best per-epoch validation
    accuracy), 'status' (STATUS_OK) and 'model' (the fitted Keras model).

    NOTE(review): train_article_title, train_table_title, train_label and
    the validation_* names are not defined in this function; presumably
    hyperas supplies them from data() - confirm.
    """
    
    article_title = tf.keras.Input(shape=(31,50), dtype='float32')
    table_title = tf.keras.Input(shape=(31,50), dtype='float32')
    
    # A single layer instance, so both titles go through the same weights.
    univesal_BiGRU = tf.keras.layers.Bidirectional(tf.keras.layers.GRU({{choice([8,16,32,64,128,256,512])}}, return_sequences=True))
    
    context_titleA = univesal_BiGRU(article_title)
    context_titleB = univesal_BiGRU(table_title)
    
    # Branch A: affinity between the contextualized (GRU) representations.
    affinity_titleA_titleB = affinity_method(context_titleA,context_titleB)
    context_attention = tf.keras.layers.Bidirectional(tf.keras.layers.GRU({{choice([8,16,32,64,128,256,512])}}, return_sequences=True))(affinity_titleA_titleB)
    learned_vectorA = tf.keras.layers.Flatten()(context_attention)
    
    # Branch B: affinity computed directly on the input embeddings.
    affinity_embedding_titleA_titleB = affinity_method(article_title,table_title)
    context_affinity_embedding = tf.keras.layers.Bidirectional(tf.keras.layers.GRU({{choice([8,16,32,64,128,256,512])}}, return_sequences=True))(affinity_embedding_titleA_titleB)
    learned_vectorB = tf.keras.layers.Flatten()(context_affinity_embedding)
    
    final_representation = tf.keras.layers.Concatenate()([learned_vectorA,learned_vectorB])
    
    # MLP classification head.
    # NOTE(review): the dropout rates are searched over uniform(0, 1), so
    # rates close to 1 (dropping nearly every unit) can be sampled.
    MLP_input = tf.keras.layers.Dense({{choice([8,16,32,64,128,256,512])}}, activation={{choice(['relu', 'sigmoid','linear'])}})(final_representation)
    dropout1 = tf.keras.layers.Dropout({{uniform(0, 1)}}, name="dropout1")(MLP_input)
    MLP_hidden = tf.keras.layers.Dense({{choice([8,16,32,64,128,256,512])}}, activation={{choice(['relu', 'sigmoid', 'linear'])}})(dropout1)
    dropout2 = tf.keras.layers.Dropout({{uniform(0, 1)}}, name="dropout2")(MLP_hidden)
    MLP_output = tf.keras.layers.Dense(1,activation='sigmoid')(dropout2)
    
    model = tf.keras.Model(inputs=[article_title,table_title],outputs=MLP_output)
    
    model.compile(loss="binary_crossentropy",optimizer={{choice(['rmsprop', 'adam', 'sgd'])}}, metrics=['accuracy'])
    
    result = model.fit([train_article_title, train_table_title], train_label, 
          epochs=100, 
          batch_size={{choice([32, 64, 128])}},
          verbose=1,
          validation_data=([validation_article_title, validation_table_title], validation_label))
    
    # Score the trial by the highest validation accuracy seen in any epoch.
    # NOTE(review): no checkpoint callback is used, so the returned model
    # holds the weights after the final epoch, which need not be the epoch
    # that produced this accuracy.
    validation_acc = np.amax(result.history['val_accuracy']) 
    print('Best validation acc of epoch:', validation_acc)
    # The accuracy is negated before being reported as the trial's 'loss'.
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': model}

In [None]:
# Run the hyperas search: `attention_model` is the model template, `data`
# provides the arrays, and `functions` lists the helpers both depend on.
# Everything is pinned to the CPU device for the duration of the search.
with tf.device("/cpu:0"):

    best_run, best_model = optim.minimize(
        model=attention_model,
        data=data,
        functions=[sequence_padding, create_embedding, affinity_method],
        algo=tpe.suggest,
        max_evals=5,  # number of hyper-parameter configurations tried
        trials=Trials(),
        verbose=1,
        notebook_name='best_model_attention_parameters_search',
    )

In [None]:
# Persist the model returned by the search to an HDF5 file in the current
# working directory.
best_model.save('best_model_affinity_title_grid_search.h5') 

In [None]:
# Show the hyper-parameter assignment reported by optim.minimize for the best trial.
print(best_run)