In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances
import random
import gensim as gs
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
import tensorflow as tf

In [None]:
# Abort early unless TensorFlow reports a GPU under the expected device name.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('found GPU at {}'.format(device_name))
else:
    raise SystemError('GPU not found')

In [None]:
# List the GPUs visible to TensorFlow; the first entry is used below to configure memory growth.
physical_devices = tf.config.list_physical_devices('GPU')

In [None]:
# Enable memory growth on the first GPU so TensorFlow allocates GPU memory as needed.
# NOTE(review): memory growth has to be configured before the GPU is initialised; an earlier
# cell already calls tf.test.gpu_device_name() — confirm this setting still takes effect.
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [None]:
# Read both CSV files with empty cells kept as "" instead of being converted to NaN.
csv_options = dict(delimiter=',', keep_default_na=False)
articles = pd.read_csv('../../dataset/data_articles_test.csv', **csv_options)
tables = pd.read_csv('../../dataset/distinct_tables_allsignals.csv', **csv_options)

# Discard every table row whose page title is the empty string.
rows_without_title = tables.index[tables.table_page_title == ""]
tables = tables.drop(index=rows_without_title)

## Encoder model created by us

In [None]:
def triplet_loss(y_true, y_pred, alpha = 0.5):
    """Margin-based triplet loss computed on cosine similarities.

    Args:
        y_true: Ignored; present only because Keras passes it to every loss.
        y_pred: Tensor whose columns hold three 1024-wide groups in the order
            anchor (0:1024), positive (1024:2048), negative (2048:3072).
        alpha: Margin added before clipping at zero.

    Returns:
        max((1 - cos(anchor, positive)) - (1 - cos(anchor, negative)) + alpha, 0)
        for each row of ``y_pred``.
    """
    anchor, positive, negative = (
        y_pred[:, start:start + 1024] for start in (0, 1024, 2048)
    )

    def cosine_distance(left, right):
        # Dot(normalize=True) L2-normalises both operands, so the product is
        # the cosine similarity; one minus that is used as the distance.
        similarity = tf.keras.layers.Dot(axes=1, normalize=True)([left, right])
        return 1 - similarity

    # Positive distance should be smaller than the negative distance by at least alpha.
    margin_violation = cosine_distance(anchor, positive) - cosine_distance(anchor, negative) + alpha

    return tf.keras.backend.maximum(margin_violation, 0.0)

In [None]:
# Three inputs of identical shape, one per member of the (anchor, positive, negative) triplet.
# NOTE(review): each input is declared as 161 steps of 50 features; presumably 50 is the
# FastText vector size — confirm against the embedding model.
article_page_title = tf.keras.Input(shape=(161,50), dtype='float32')
true_table_page_title = tf.keras.Input(shape=(161,50), dtype='float32')
false_table_page_title = tf.keras.Input(shape=(161,50), dtype='float32')

# A single bidirectional GRU instance is applied to all three inputs, so the branches share weights.
context_layer = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(512))

# context vectors, one per triplet member
context_article_page_title = context_layer(article_page_title)
context_true_table_page_title = context_layer(true_table_page_title)
context_false_table_page_title = context_layer(false_table_page_title)

# Join the three context vectors along the feature axis in anchor / positive / negative order,
# which is the column order triplet_loss slices apart.
concatenated = tf.keras.layers.Concatenate(axis=-1)([context_article_page_title, context_true_table_page_title, context_false_table_page_title])

# Triplet model: three inputs in, one concatenated vector out.
encoder_model = tf.keras.Model(inputs=[article_page_title,true_table_page_title,false_table_page_title],outputs=concatenated)

In [None]:
# Print the layer-by-layer summary of the triplet encoder.
encoder_model.summary()

In [None]:
# Compile with the custom triplet loss and the Adam optimizer.
encoder_model.compile(loss=triplet_loss,optimizer="adam")

In [None]:
# Restore the encoder weights from an .h5 file in the notebook's working directory.
# NOTE(review): presumably produced by a separate training run — confirm which checkpoint this is.
encoder_model.load_weights('encoder_title_main_passage_05_0.0245.h5')

In [None]:
# Single-branch encoder used at inference time: one padded embedding sequence in,
# the shared bidirectional GRU's context vector out.
# The anchor-branch output tensor is referenced directly instead of looking the
# shared layer up by its auto-generated name ('bidirectional'); that name gets a
# numeric suffix whenever the model-definition cell is re-run in the same kernel,
# which made the name-based lookup fail.
final_encoder_model = tf.keras.Model(inputs=article_page_title, outputs=context_article_page_title)

## FastText embedding

In [None]:
# Load the FastText model saved with gensim; its word vectors are used by create_embedding.
fast_text_embedding = gs.models.FastText.load('../../train_embedding_models/fasttext_embedding_50d_all_signals')

In [None]:
# Number of rows (tokens) every embedding matrix is padded to.
# NOTE(review): the encoder's Input layers in this notebook are declared with 161 time
# steps — confirm that 86 is the intended sequence length for inference.
MAX_PAD = 86

def sequence_padding(X_DIM, value):
    """Zero-pad a 2-D array with extra rows at the bottom.

    Args:
        X_DIM: Current number of rows in ``value``.
        value: 2-D array to pad; its columns are left untouched.

    Returns:
        A new array with ``MAX_PAD - X_DIM`` rows of zeros appended.
    """
    missing_rows = MAX_PAD - X_DIM
    pad_widths = ((0, missing_rows), (0, 0))

    return np.pad(value, pad_widths, mode='constant')

In [None]:
def create_embedding(value):
    """Turn a text into a fixed-height float16 matrix of FastText word vectors.

    The text is tokenized with the module-level TweetTokenizer, cut to at most
    ``MAX_PAD`` tokens, looked up in ``fast_text_embedding.wv`` and zero-padded
    at the bottom up to ``MAX_PAD`` rows.

    Args:
        value: Text to embed; it is converted with ``str()`` first.

    Returns:
        float16 array with ``MAX_PAD`` rows and one column per embedding dimension.
        Text that yields no tokens (e.g. an empty string) gives an all-zero matrix.
    """
    tokens = tknzr.tokenize(str(value))[:MAX_PAD]

    if not tokens:
        # An empty token list cannot be looked up (there is nothing to stack),
        # so return the fully padded matrix directly.
        return np.zeros((MAX_PAD, fast_text_embedding.wv.vector_size), dtype='float16')

    embedding = fast_text_embedding.wv[tokens].astype('float16')

    # Only sequences shorter than MAX_PAD need padding; longer ones were truncated above.
    if embedding.shape[0] < MAX_PAD:
        embedding = sequence_padding(embedding.shape[0], embedding)

    return embedding

In [None]:
# Show the first article row to check the loaded columns.
articles.head(1)

## encode articles using encoder model

In [None]:
# Parallel lists: articles_ids[n] is the page id of the article whose embedding
# matrix is stored in articles_fasttext_embedding[n].
articles_ids = []
articles_fasttext_embedding = []

# iterrows() has no length, so the row count is passed explicitly to give tqdm
# a percentage and an ETA.
for i, row in tqdm(articles.iterrows(), total=len(articles)):

    # Title and meta description are embedded together as one text.
    # (The keywords column was previously read here but never used.)
    catch_all = row['page_title'] + " " + row['meta_description']

    articles_ids.append(row['page_id'])
    articles_fasttext_embedding.append(create_embedding(catch_all))

In [None]:
# Stack the per-article matrices into a single float16 array (one entry per article).
articles_fasttext_embedding = np.array(articles_fasttext_embedding,dtype='float16')

In [None]:
# Encode every article with the single-branch encoder.
articles_vector = final_encoder_model.predict(x=articles_fasttext_embedding)

In [None]:
# Show the first table row to check the loaded columns.
tables.head(1)

In [None]:
# Parallel lists: tables_ids[n] is the id of the table whose embedding matrix
# is stored in tables_fastext_embedding[n].
tables_ids = []
tables_fastext_embedding = []

# iterrows() has no length, so the row count is passed explicitly to give tqdm
# a percentage and an ETA.
for i, row in tqdm(tables.iterrows(), total=len(tables)):

    # Page title and page summary are embedded together as one text.
    # (The table_page_keywords column was previously read here but never used.)
    catch_all = row['table_page_title'] + " " + row['table_page_summary']

    tables_ids.append(row['table_id'])
    tables_fastext_embedding.append(create_embedding(catch_all))

In [None]:
# Stack the per-table matrices into a single float16 array (one entry per table).
tables_fastext_embedding = np.array(tables_fastext_embedding,dtype='float16')

In [None]:
# Encode every table with the same single-branch encoder used for the articles.
tables_vector = final_encoder_model.predict(x=tables_fastext_embedding)

## test the model on final task

In [None]:
def getAccuracy(idRankedTables, idQueryGoal):
    """Report whether the goal id appears among the ranked tables.

    Args:
        idRankedTables: Iterable of rows; only the first element of each row is compared.
        idQueryGoal: The id to look for.

    Returns:
        1 if some row's first element equals ``idQueryGoal``, otherwise 0.
    """
    goal_found = any(candidate[0] == idQueryGoal for candidate in idRankedTables)

    return 1 if goal_found else 0

In [None]:
def saveAccuracy(k,accuracy):
    """Append ``accuracy`` to the module-level result list that belongs to cut-off ``k``.

    Args:
        k: Cut-off; only 1, 5, 10, 20, 50 and 100 have a list, any other value is ignored.
        accuracy: Value to record.
    """
    cutoff_to_list_name = (
        (1, "AverageTop1"),
        (5, "AverageTop5"),
        (10, "AverageTop10"),
        (20, "AverageTop20"),
        (50, "AverageTop50"),
        (100, "AverageTop100"),
    )

    for cutoff, list_name in cutoff_to_list_name:
        if k == cutoff:
            # The lists live at module level and are resolved at call time.
            globals()[list_name].append(accuracy)

In [None]:
# One list per cut-off; saveAccuracy appends a 0/1 hit flag for every article.
AverageTop1 = []
AverageTop5 = []
AverageTop10 = []
AverageTop20 = []
AverageTop50 = []
AverageTop100 = []

topK = [1,5,10,20,50,100]

for i in tqdm(range(len(articles_vector))):

    idQueryGoal = articles_ids[i]

    # Cosine distance from this article to every table. reshape(1, -1) keeps the
    # query as a single row without hard-coding the encoder width.
    distance_vector = pairwise_distances(articles_vector[i].reshape(1, -1), tables_vector, metric='cosine')

    # Build the score table in one call from the id list and the distance row
    # instead of creating one dict per table in a Python loop for every article.
    df_all_tables_scores = pd.DataFrame({"table_id": tables_ids, "table_score": distance_vector[0]})
    df_tables_sorting = df_all_tables_scores.sort_values('table_score')

    #compute the accuracy
    for accuracyK in topK:

        selected_top = df_tables_sorting.head(accuracyK)
        # Despite its name this is the largest distance inside the top-k; every table
        # at or below it is kept so that ties with the k-th table are included.
        min_score = selected_top['table_score'].max()
        draw_tables_socres = df_tables_sorting[df_tables_sorting['table_score'] <= min_score]
        # First column only (table_id), kept 2-D because getAccuracy reads element [0] of each row.
        final_ranked_tables = draw_tables_socres.iloc[:,0:1].values

        accuracy_value = getAccuracy(final_ranked_tables,idQueryGoal)

        #save the accuracy on the list
        saveAccuracy(accuracyK,accuracy_value)

In [None]:
# Print the mean hit rate for each cut-off, rounded to four decimals.
for report_label, hit_flags in (
    ("TOP@1 = ", AverageTop1),
    ("TOP@5 = ", AverageTop5),
    ("TOP@10 = ", AverageTop10),
    ("TOP@20 = ", AverageTop20),
    ("TOP@50 = ", AverageTop50),
    ("TOP@100 = ", AverageTop100),
):
    print(report_label + str(round(np.mean(hit_flags), 4)))