In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import pairwise_distances
from heapq import nsmallest

In [3]:
# TF-Hub handle of the Universal Sentence Encoder (version 4); loaded below
# and used to embed both the article texts and the table texts.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"

In [4]:
# Load the encoder from TF-Hub. `embed` is later called with a list of strings
# and its result is iterated row by row to obtain one vector per input text.
embed = hub.load(module_url)

In [5]:
# Load the query articles and the candidate tables.
# keep_default_na=False stops pandas from turning empty / "NA"-like cells into
# NaN, so the text columns stay plain strings and can be concatenated directly
# when the embedding inputs are built.
articles = pd.read_csv(
    '../dataset/test_articles_dataset_newyork.csv',
    sep=',',
    keep_default_na=False,
)
tables = pd.read_csv(
    '../dataset/data_tables_all_signal_newyork.csv',
    sep=',',
    keep_default_na=False,
)

In [6]:
# Inspect the article columns, their non-null counts and dtypes.
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148 entries, 0 to 147
Data columns (total 10 columns):
article_full_text           148 non-null object
article_key_match           148 non-null object
article_keywords            148 non-null object
article_main_passage        148 non-null object
article_meta_description    148 non-null object
article_meta_keywords       148 non-null object
article_summary             148 non-null object
article_tags                148 non-null object
article_title               148 non-null object
article_url                 148 non-null object
dtypes: object(10)
memory usage: 11.6+ KB


## Creating the article embeddings

In [18]:
# Ground-truth key of each article: the table id it should be matched with
# (compared against the ranked table ids during evaluation).
articles_id = articles['article_key_match'].tolist()

# Text fed to the encoder for each article: its title followed by the first
# 1000 characters of its main passage.
articles_title = [
    title + " " + passage[0:1000]
    for title, passage in zip(articles['article_title'], articles['article_main_passage'])
]

In [19]:
# Encode every article text in one call; the result is iterated row by row
# in the next cell to extract one vector per article.
embedding_articles = embed(articles_title)

In [20]:
# Convert each article embedding to a NumPy array, keeping the article order
# so that position i lines up with articles_id[i].
article_dense_vector = [vector.numpy() for vector in embedding_articles]

In [10]:
# Inspect the table columns, their non-null counts and dtypes.
tables.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52426 entries, 0 to 52425
Data columns (total 13 columns):
table_body                     52426 non-null object
table_caption                  52426 non-null object
table_domain_score             52426 non-null float64
table_header                   52426 non-null object
table_page_content             52426 non-null object
table_page_full_text           52426 non-null object
table_page_keywords            52426 non-null object
table_page_meta_description    52426 non-null object
table_page_meta_keywords       52426 non-null object
table_page_summary             52426 non-null object
table_page_tags                52426 non-null object
table_page_title               52426 non-null object
table_page_url                 52426 non-null object
dtypes: float64(1), object(12)
memory usage: 5.2+ MB


## creating the tables embeddings

In [21]:
tables_ids = []
tables_title = []

# One pass over the table rows (tqdm only reports progress).
# id   -> URL of the page the table comes from
# text -> page title followed by the first 1000 characters of the page content
for page_url, page_title, page_content in tqdm(
    zip(tables['table_page_url'], tables['table_page_title'], tables['table_page_content'])
):
    tables_ids.append(page_url)
    tables_title.append(page_title + " " + page_content[0:1000])

52426it [00:06, 8636.09it/s]


In [22]:
# Encode every table text with the same encoder used for the articles.
# NOTE(review): the whole list is passed in a single call; if memory becomes
# a problem, embedding in batches may be needed — TODO confirm.
embedding_tables = embed(tables_title)

In [23]:
# Convert each table embedding to a NumPy array, keeping the table order
# so that position j lines up with tables_ids[j].
tables_dense_vector = [vector.numpy() for vector in embedding_tables]

## Helper functions for scoring the ranked tables and recording the accuracy

In [24]:
def get_accuracy(id_ranked_tables, id_query_goal):
    """Return 1 if the expected table id is among the ranked tables, else 0.

    Parameters
    ----------
    id_ranked_tables : iterable of indexable rows
        Ranked candidates; only the first element of each row is inspected
        and compared with ``id_query_goal``.
    id_query_goal : object
        Identifier of the table the query is expected to retrieve.

    Returns
    -------
    int
        1 on a hit, 0 otherwise (including when there are no candidates).
    """
    found = any(candidate[0] == id_query_goal for candidate in id_ranked_tables)
    return 1 if found else 0

In [25]:
def save_accuracy(k, accuracy):
    """Append ``accuracy`` to the module-level result list for cut-off ``k``.

    Recognised cut-offs are 1, 5, 10 and 20, stored in ``average_top1``,
    ``average_top5``, ``average_top10`` and ``average_top20`` respectively.
    Any other ``k`` is ignored. The target list must already exist at
    module level when the function is called.
    """
    if k == 1:
        bucket = average_top1
    elif k == 5:
        bucket = average_top5
    elif k == 10:
        bucket = average_top10
    elif k == 20:
        bucket = average_top20
    else:
        # Unknown cut-off: nothing is recorded.
        return

    bucket.append(accuracy)

## Computing the cosine distance and the top-k accuracy

In [26]:
# Per-query hit indicators (1 = expected table found within the top k),
# filled by save_accuracy() and averaged in the next cell.
average_top1 = []
average_top5 = []
average_top10 = []
average_top20 = []

# Cut-offs evaluated; save_accuracy() only records these four values.
top_k = [1,5,10,20]

# Stack the table embeddings and ids once instead of rebuilding them for
# every query: row j of the matrix belongs to tables_ids[j].
table_matrix = np.asarray(tables_dense_vector)
table_id_array = np.asarray(tables_ids, dtype=object)

# Only the best max(top_k) candidates are ever inspected.
deepest_cutoff = max(top_k)

for i in tqdm(range(len(article_dense_vector))):

    # Table id this article is expected to retrieve.
    idQueryGoal = articles_id[i]

    # Cosine distance from the article to every table (smaller = closer).
    # reshape(1, -1) makes the query a single-row matrix without hard-coding
    # the embedding width.
    distance_vector = pairwise_distances(
        article_dense_vector[i].reshape(1, -1), table_matrix, metric='cosine'
    )[0]

    # Rank tables by ascending distance. A stable sort keeps equal distances
    # in their original table order, so the ranking is reproducible.
    # NOTE: ties at a cut-off are not expanded; exactly k candidates are kept.
    best_positions = np.argsort(distance_vector, kind='stable')[:deepest_cutoff]

    # get_accuracy() reads the first element of each row, so expose the ids
    # as a single-column 2-D array.
    ranked_table_ids = table_id_array[best_positions].reshape(-1, 1)

    # Compute and record the hit indicator for every cut-off.
    for accuracyK in top_k:

        final_ranked_tables = ranked_table_ids[:accuracyK]

        accuracy_value = get_accuracy(final_ranked_tables, idQueryGoal)

        save_accuracy(accuracyK, accuracy_value)

100%|██████████| 148/148 [00:32<00:00,  4.46it/s]


In [27]:
# Mean hit rate at top-1, top-5, top-10 and top-20 (in that order),
# rounded to four decimals, one value per line.
for hit_indicators in (average_top1, average_top5, average_top10, average_top20):
    print(str(round(np.mean(hit_indicators), 4)))

0.2838
0.5338
0.5946
0.6419
