In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances
import random
import gensim as gs
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
import tensorflow as tf

In [2]:
# keep_default_na=False makes pandas keep empty cells as "" instead of converting them to NaN,
# which is what the empty-title filter below relies on.
articles = pd.read_csv('../../dataset/data_articles_test.csv', sep=',', keep_default_na=False)

tables = pd.read_csv('../../dataset/distinct_tables_allsignals.csv', sep=',', keep_default_na=False)
# Remove the tables whose page title is empty.
untitled_table_rows = tables.index[tables['table_page_title'] == ""]
tables = tables.drop(index=untitled_table_rows)

## Encoder model created by us

In [3]:
def triplet_loss(y_true, y_pred, alpha = 0.4, embedding_dim = 64):
    """Triplet margin loss over concatenated (anchor, positive, negative) embeddings.

    Args:
        y_true: Unused; present only because Keras calls losses as ``loss(y_true, y_pred)``.
        y_pred: 2-D tensor whose columns are three embeddings laid side by side:
            ``[0:d]`` anchor, ``[d:2d]`` positive, ``[2d:3d]`` negative, with
            ``d = embedding_dim``.
        alpha: Margin added to ``pos_dist - neg_dist`` before clamping at zero.
        embedding_dim: Width ``d`` of each embedding. The default of 64 reproduces
            the original hard-coded slices (0:64, 64:128, 128:192).

    Returns:
        One loss value per row of ``y_pred``: ``max(pos_dist - neg_dist + alpha, 0)``,
        where each distance is a squared Euclidean distance.
    """
    backend = tf.keras.backend

    anchor = y_pred[:, 0:embedding_dim]
    positive = y_pred[:, embedding_dim:2 * embedding_dim]
    negative = y_pred[:, 2 * embedding_dim:3 * embedding_dim]

    # squared distance between the anchor and the positive
    pos_dist = backend.sum(backend.square(anchor - positive), axis=1)

    # squared distance between the anchor and the negative
    neg_dist = backend.sum(backend.square(anchor - negative), axis=1)

    # hinge: zero once the negative is at least `alpha` farther away than the positive
    return backend.maximum(pos_dist - neg_dist + alpha, 0.0)

In [4]:
# All three inputs have the same shape: a sequence of 31 vectors with 50 features each.
TITLE_INPUT_SHAPE = (31, 50)

# Inputs are created in the same order as their names: article, true table, false table.
article_page_title, true_table_page_title, false_table_page_title = [
    tf.keras.Input(shape=TITLE_INPUT_SHAPE, dtype='float32') for _ in range(3)
]

# One bidirectional GRU instance is applied to every input, so all three share its weights.
context_layer = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32))

# context vectors, one per input, produced by the shared layer
context_article_page_title, context_true_table_page_title, context_false_table_page_title = [
    context_layer(title_input)
    for title_input in (article_page_title, true_table_page_title, false_table_page_title)
]

# Joined along the last axis in (article, true table, false table) order.
concatenated = tf.keras.layers.Concatenate(axis=-1)(
    [context_article_page_title, context_true_table_page_title, context_false_table_page_title]
)

encoder_model = tf.keras.Model(
    inputs=[article_page_title, true_table_page_title, false_table_page_title],
    outputs=concatenated,
)

In [5]:
# Attach the triplet loss and the Adam optimizer to the three-input model.
encoder_model.compile(optimizer="adam", loss=triplet_loss)

In [6]:
# Weights file for the encoder, resolved relative to the notebook's working directory.
ENCODER_WEIGHTS_PATH = 'encoder_title_07_0.0345.h5'
encoder_model.load_weights(ENCODER_WEIGHTS_PATH)

In [8]:
# Single-input encoder used at inference time: maps one padded title to its context vector.
# It reuses the shared GRU's output tensor for the article input directly, rather than
# looking the layer up by its auto-generated name ('bidirectional'). That name gets a
# numeric suffix if the model-building cell is re-run in the same kernel, which would
# make get_layer() fail.
final_encoder_model = tf.keras.Model(inputs=article_page_title, outputs=context_article_page_title)

## FastText embedding

In [9]:
# Previously trained gensim FastText model, loaded from disk.
FASTTEXT_MODEL_PATH = '../../train_embedding_models/fasttext_embedding_50d_all_signals'
fast_text_embedding = gs.models.FastText.load(FASTTEXT_MODEL_PATH)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [10]:
# Number of rows every padded sequence ends up with.
MAX_PAD = 31

def sequence_padding(X_DIM, value):
    """Zero-pad a 2-D array at the bottom so it has MAX_PAD rows.

    Args:
        X_DIM: Current number of rows in ``value``; ``MAX_PAD - X_DIM`` rows are appended.
        value: 2-D array to pad. Its columns are left untouched.

    Returns:
        A new array with the original rows first, followed by all-zero rows.
    """
    missing_rows = MAX_PAD - X_DIM
    # (before, after) padding per axis: rows grow at the end only, columns not at all
    pad_widths = ((0, missing_rows), (0, 0))
    return np.pad(value, pad_widths, mode='constant')

In [14]:
def create_embedding(value):
    """Turn a text value into a fixed-size (MAX_PAD, vector_size) float16 matrix.

    The value is converted to ``str``, tokenized with ``tknzr``, cut to at most
    MAX_PAD tokens, and each token is looked up in ``fast_text_embedding.wv``.
    Shorter sequences are zero-padded at the end via ``sequence_padding``.

    Args:
        value: Text to embed (any object; ``str(value)`` is what gets tokenized).

    Returns:
        A float16 array with MAX_PAD rows. If tokenization yields no tokens
        (e.g. an empty string), an all-zero matrix is returned instead of
        attempting a lookup with an empty token list.
    """
    # Truncate up front so short and long inputs share one code path.
    tokens = tknzr.tokenize(str(value))[:MAX_PAD]

    if not tokens:
        # Nothing to look up: return pure padding, which is also what a padded row looks like.
        return np.zeros((MAX_PAD, fast_text_embedding.wv.vector_size), dtype='float16')

    embedding = fast_text_embedding.wv[tokens].astype('float16')

    if len(tokens) < MAX_PAD:
        embedding = sequence_padding(embedding.shape[0], embedding)

    return embedding

## encode articles using encoder model

In [15]:
articles_ids = []
articles_fasttext_embedding = []

# The loop variables are deliberately NOT called `article_page_title`: that name is bound
# to the Keras Input tensor of the encoder, and rebinding it to a string here would break
# any later re-run of the cell that builds `final_encoder_model`.
# Only two columns are needed, so they are iterated directly; `total` lets tqdm show a real bar.
for page_id, page_title in tqdm(zip(articles['page_id'], articles['page_title']), total=len(articles)):

    # fast text embedding of the title, padded/truncated to a fixed size
    title_fast_text_embedding = create_embedding(page_title)

    # keep ids and embeddings aligned by position
    articles_ids.append(page_id)
    articles_fasttext_embedding.append(title_fast_text_embedding)

1026it [00:00, 4270.32it/s]


In [20]:
# Stack the per-article matrices into one float16 array so the model can take them as a batch.
articles_fasttext_embedding = np.asarray(articles_fasttext_embedding, dtype='float16')

In [22]:
# Encode every article title on the CPU device.
with tf.device("/cpu:0"):
    articles_vector = final_encoder_model.predict(articles_fasttext_embedding)

In [25]:
tables_ids = []
tables_fastext_embedding = []

# Walk the id and title columns together; ids and embeddings stay aligned by position.
for table_id, table_title in tqdm(zip(tables['table_id'], tables['table_page_title'])):

    # fast text embedding of the table's page title
    tables_ids.append(table_id)
    tables_fastext_embedding.append(create_embedding(table_title))

85900it [00:16, 5205.96it/s]


In [26]:
# Stack the per-table matrices into one float16 array so the model can take them as a batch.
tables_fastext_embedding = np.asarray(tables_fastext_embedding, dtype='float16')

In [27]:
# Encode every table title on the CPU device.
with tf.device("/cpu:0"):
    tables_vector = final_encoder_model.predict(tables_fastext_embedding)

## Test the model on the final task

In [29]:
def getAccuracy(idRankedTables, idQueryGoal):
    """Report whether the goal id appears among the ranked rows.

    Args:
        idRankedTables: Iterable of rows; only the first element of each row is compared.
        idQueryGoal: Id to look for.

    Returns:
        1 if some row's first element equals ``idQueryGoal``, otherwise 0.
    """
    # any() stops at the first match, like the original early exit
    goal_found = any(ranked_row[0] == idQueryGoal for ranked_row in idRankedTables)
    return 1 if goal_found else 0

In [37]:
def saveAccuracy(k,accuracy):
    """Append ``accuracy`` to the module-level AverageTop<k> list that matches ``k``.

    Args:
        k: Cut-off to record for. Only 1, 5, 10, 20, 50 and 100 have a list;
            any other value is silently ignored.
        accuracy: Value to append.
    """
    # Each list is resolved lazily through a lambda, so only the one matching `k` is
    # looked up, and always under its current module-level binding.
    buckets = (
        (1, lambda: AverageTop1),
        (5, lambda: AverageTop5),
        (10, lambda: AverageTop10),
        (20, lambda: AverageTop20),
        (50, lambda: AverageTop50),
        (100, lambda: AverageTop100),
    )

    for cutoff, resolve_bucket in buckets:
        if k == cutoff:
            resolve_bucket().append(accuracy)

In [38]:
# One list of 0/1 hits per cut-off; saveAccuracy() appends to these by name.
AverageTop1 = []
AverageTop5 = []
AverageTop10 = []
AverageTop20 = []
AverageTop50 = []
AverageTop100 = []

topK = [1,5,10,20,50,100]

for i in tqdm(range(len(articles_vector))):

    # NOTE(review): the goal is the article's page_id and it is matched against table ids —
    # presumably the two share an id space in this dataset; confirm against the data files.
    idQueryGoal = articles_ids[i]

    # reshape(1, -1) keeps this independent of the encoder's output width
    distance_vector = pairwise_distances(articles_vector[i].reshape(1, -1), tables_vector, metric='cosine')

    # Build the score table in one columnar step instead of one Python dict per table.
    # Column order matters: table_id first, table_score second.
    df_all_tables_scores = pd.DataFrame({"table_id": tables_ids, "table_score": distance_vector[0]})
    df_tables_sorting = df_all_tables_scores.sort_values('table_score')

    # compute the accuracy
    for accuracyK in topK:

        # Largest distance that still falls inside the top-K ...
        cutoff_score = df_tables_sorting['table_score'].head(accuracyK).max()
        # ... and every table at or below it, so tables tied with the K-th one are kept too.
        tied_top_tables = df_tables_sorting[df_tables_sorting['table_score'] <= cutoff_score]
        # 2-D (n, 1) array: getAccuracy reads the id as row[0]
        final_ranked_tables = tied_top_tables[['table_id']].values

        accuracy_value = getAccuracy(final_ranked_tables,idQueryGoal)

        # save the accuracy on the list
        saveAccuracy(accuracyK,accuracy_value)

100%|██████████| 1026/1026 [02:36<00:00,  6.41it/s]


In [39]:
# Mean of the 0/1 hits per cut-off, rounded to four decimals.
topk_report = (
    ("TOP@1 = ", AverageTop1),
    ("TOP@5 = ", AverageTop5),
    ("TOP@10 = ", AverageTop10),
    ("TOP@20 = ", AverageTop20),
    ("TOP@50 = ", AverageTop50),
    ("TOP@100 = ", AverageTop100),
)

for report_label, hits in topk_report:
    print(report_label + str(round(np.mean(hits), 4)))

TOP@1 = 0.0926
TOP@5 = 0.1481
TOP@10 = 0.1715
TOP@20 = 0.2018
TOP@50 = 0.2427
TOP@100 = 0.2953
