In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
import gensim as gs
import tensorflow as tf
import warnings
import tensorflow_hub as hub
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
warnings.filterwarnings('ignore')
from sklearn.metrics import pairwise_distances
from heapq import nsmallest
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
test_articles = pd.read_csv('../dataset/test_articles_ourdata.csv', delimiter=',')

In [3]:
fixed_index = pd.read_csv('../dataset/fixed_test_set_ourdata.csv', delimiter=',')

In [4]:
fixed_index.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94721 entries, 0 to 94720
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   label_index              94721 non-null  int64 
 1   table_page_id            94721 non-null  int64 
 2   table_page_title         94721 non-null  object
 3   table_page_main_passage  94721 non-null  object
 4   table_page_keywords      94721 non-null  object
dtypes: int64(2), object(3)
memory usage: 3.6+ MB


In [5]:
corpus = []

for i, row in tqdm(test_articles.iterrows()):
    
    corpus.append(row['article_title'])
    corpus.append(str(row['article_meta_description']))
    corpus.append(str(row['article_keywords']))
    
for i, row in fixed_index.iterrows():
    
    corpus.append(row['table_page_title'])
    corpus.append(str(row['table_page_main_passage']))
    corpus.append(str(row['table_page_keywords']))

vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)

948it [00:00, 9801.80it/s]


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [6]:
def get_accuracy(ID_goal,ranked_tables_ID):
    
    accuracy = 0
    
    for table_ID in ranked_tables_ID:
        
        if table_ID[0] == ID_goal:
    
            accuracy = 1
            break;

    return accuracy

In [7]:
def get_mrr(ID_goal,ranked_tables_ID):
    
    accuracy = 0
    index_match = 1

    for idTable in ranked_tables_ID:
        
        if idTable[0] == ID_goal:
    
            accuracy = 1/index_match
            break;
        
        index_match = index_match + 1
   
    return accuracy

In [8]:
test_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 948 entries, 0 to 947
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   article_id                948 non-null    int64 
 1   article_title             948 non-null    object
 2   article_full_text         948 non-null    object
 3   article_meta_description  948 non-null    object
 4   article_summary           948 non-null    object
 5   article_keywords          948 non-null    object
 6   article_meta_key_words    662 non-null    object
 7   article_tags              207 non-null    object
dtypes: int64(1), object(7)
memory usage: 59.4+ KB


In [12]:
fixed_index.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94721 entries, 0 to 94720
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   label_index              94721 non-null  int64 
 1   table_page_id            94721 non-null  int64 
 2   table_page_title         94721 non-null  object
 3   table_page_main_passage  94721 non-null  object
 4   table_page_keywords      94721 non-null  object
dtypes: int64(2), object(3)
memory usage: 3.6+ MB


In [15]:
evaluate_topk = [1,5,10,20,50]

result = []

#evaluating each topk value
for TOP_K in evaluate_topk:

    accuracy = []
    mrr = []

    for i, row in tqdm(test_articles.iterrows()):

        #current article values
        article_ID = row['article_id']
        article_title_text = str(row['article_title'])
        article_meta_description_text = str(row['article_meta_description'])
        article_keywords_text = str(row['article_keywords'])

        #embedding and model variables
        article_title = []
        title_table = []
        ranked_tables_model = []

        #return index
        return_index = fixed_index.loc[fixed_index['label_index'] == row['article_id']]
        
        tfidf_vector = vectorizer.transform([article_title_text+" "+article_meta_description_text+" "+article_keywords_text])
       
        article_title.append(tfidf_vector[0].toarray()[0])

        #creating embedding 
        for i, row in return_index.iterrows():
            
            tfidf_vector = vectorizer.transform([str(row['table_page_title'])+" "+str(row['table_page_main_passage'])+" "+str(row['table_page_keywords'])])
            title_table.append(tfidf_vector[0].toarray()[0])
        
        distance_vector = pairwise_distances(article_title, title_table, metric='cosine')
        
        #creating the final dataframe
        for i in range(0,len(distance_vector[0])):

            ranked_tables_model.append([return_index.iloc[i]['table_page_id'],return_index.iloc[i]['table_page_title'],distance_vector[0][i]]) 

        data_frame = pd.DataFrame(ranked_tables_model, columns = ['table_ID', 'table_title','table_ranking']) 
        data_frame_sorting = data_frame.sort_values('table_ranking')
        
        selected_top = data_frame_sorting.head(TOP_K)
#         max_score = selected_top['table_ranking'].max()
#         draw_tables_socres = data_frame_sorting[data_frame_sorting['table_ranking'] <= max_score]
        final_ranked_tables = selected_top.iloc[:,0:1].values
    
#         print("")
#         print("query:" +article_title_text)
#         print("")
#         print(draw_tables_socres)

        #getting topk accuracy
        accuracy.append(get_accuracy(article_ID, final_ranked_tables))

        #testing mean reciprocal rank at k = 50
        if TOP_K == 50:

            mrr.append(get_mrr(article_ID, final_ranked_tables))

    result.append(["Acc@"+str(TOP_K),str(round(np.mean(accuracy),4))])

print("TF-IDF METHOD")
print(result[0])
print(result[1])
print(result[2])
print(result[3])
print(result[4])
print("MRR: "+str(round(np.mean(mrr),4)) )
print(mrr)
print("")

948it [04:25,  3.58it/s]
948it [04:27,  3.55it/s]
948it [04:26,  3.55it/s]
948it [04:28,  3.53it/s]
948it [04:29,  3.51it/s]

TF-IDF METHOD
['Acc@1', '0.5601']
['Acc@5', '0.7985']
['Acc@10', '0.865']
['Acc@20', '0.9209']
['Acc@50', '0.9662']
MRR: 0.6679
[0.08333333333333333, 0.04, 1.0, 0, 0.2, 1.0, 0.5, 1.0, 1.0, 1.0, 0.3333333333333333, 0.3333333333333333, 0.058823529411764705, 1.0, 1.0, 0.125, 0.3333333333333333, 0.3333333333333333, 0.25, 0.3333333333333333, 1.0, 0.3333333333333333, 1.0, 0.1111111111111111, 1.0, 0.25, 1.0, 1.0, 0, 1.0, 0.5, 0.5, 0.25, 0.25, 1.0, 1.0, 0.5, 1.0, 1.0, 1.0, 1.0, 0.25, 0.030303030303030304, 0.3333333333333333, 1.0, 0.5, 0.03571428571428571, 1.0, 1.0, 0.25, 0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 0.3333333333333333, 1.0, 0.023809523809523808, 0, 1.0, 0.16666666666666666, 1.0, 1.0, 0.5, 1.0, 1.0, 0.5, 1.0, 1.0, 1.0, 1.0, 0.5, 0.14285714285714285, 1.0, 1.0, 0.16666666666666666, 0.1, 0.5, 1.0, 0, 0.2, 1.0, 0.5, 0.1, 1.0, 1.0, 0.08333333333333333, 0.5, 0.05555555555555555, 0.1111111111111111, 1.0, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 0.1111111111111111, 0.25, 0.25, 1.0, 0.


