In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
import tensorflow as tf
import warnings
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
warnings.filterwarnings('ignore')
import matchzoo as mz

In [None]:
# Test-set articles; each row is later used as one ranking query.
test_articles = pd.read_csv('../dataset/test_articles_ourdata.csv', sep=',')

In [None]:
# Fixed candidate tables for the test set; rows are selected per article
# through the 'label_index' column during evaluation.
fixed_index = pd.read_csv('../dataset/fixed_test_set_ourdata.csv', sep=',')

In [None]:
# Load the 300-dimensional GloVe embedding through MatchZoo's dataset helper.
# It is used below to build the embedding matrix for the fitted vocabulary.
glove_embedding = mz.datasets.embeddings.load_glove_embedding(dimension=300)

In [None]:
# Read the training pairs and replace missing values with a single space in
# one expression, so the un-cleaned frame is never bound to a name.
train_dataset = (
    pd.read_csv('../train_data/train_data_T.csv', sep=',')
    .replace(np.nan, ' ', regex=True)
)

In [None]:
# One record per training row: the article fields form the left text, the
# table fields form the right text, and 'label' is carried over unchanged.
# Each text is the space-separated concatenation of three stringified columns.
list_data1 = [
    {
        'id_left': str(record['article_id']),
        'text_left': " ".join(
            str(record[column])
            for column in ('article_page_title', 'article_meta_description', 'article_keywords')
        ),
        'id_right': str(record['table_id']),
        'text_right': " ".join(
            str(record[column])
            for column in ('table_page_title', 'table_page_summary', 'table_page_keywords')
        ),
        'label': record['label'],
    }
    for _, record in train_dataset.iterrows()
]

df1 = pd.DataFrame(list_data1)
train_pack = mz.pack(df1)

In [None]:
# Fit the preprocessor on the training pack. The same fitted instance is
# reused at evaluation time to transform the test pairs.
preprocessor = mz.preprocessors.BasicPreprocessor(
    fixed_length_left=130,
    fixed_length_right=130,
    remove_stop_words=True,
)
train_processed = preprocessor.fit_transform(train_pack, verbose=0)

In [None]:
# Build the embedding matrix from the term index stored in the fitted
# preprocessor's vocabulary unit.
embedding_matrix = glove_embedding.build_matrix(
    preprocessor.context['vocab_unit'].state['term_index']
)

In [None]:
def get_accuracy(ID_goal,ranked_tables_ID):
    """Return 1 if ``ID_goal`` is among the ranked candidates, otherwise 0.

    ``ranked_tables_ID`` is a sequence of indexable entries; only the first
    element of each entry is compared against ``ID_goal``.
    """
    for candidate in ranked_tables_ID:
        if candidate[0] == ID_goal:
            return 1
    return 0

In [None]:
def get_mrr(ID_goal,ranked_tables_ID):
    """Return the reciprocal rank of ``ID_goal`` in the ranked candidates.

    Ranks are 1-based. The first element of each entry is compared against
    ``ID_goal``; the result is ``1/rank`` for the first match, or 0 when
    there is no match.
    """
    for rank, candidate in enumerate(ranked_tables_ID, start=1):
        if candidate[0] == ID_goal:
            return 1/rank
    return 0

In [None]:
# Saved model directories to evaluate, in this order. Entries that are
# commented out are currently disabled.
evaluate_models = [
    # '03_ARCI/ARCI_result_title_main_passage_keywords/ARCI_title_main_passage_keywords8',
    # '04_ARCII/ARCII_result_title_main_passage_keywords/ARCII_title_main_passage_keywords1',
    # '05_MVLSTM/MVLSTM_result_title_main_passage_keywords/MVLSTM_title_main_passage_keywords8',
    '09_KNRM/KNRM_result_title_main_passage_keywords/KNRM_title_main_passage_keywords13',
    '11_CONVKNRM/CONVKNRM_result_title_main_passage_keywords/CONVKNRM_title_main_passage_keywords8',
    '10_DUET/DUET_result_title_main_passage_keywords/DUET_title_main_passage_keywords99',
]

In [None]:
# Print a summary of the test articles frame as a quick check before evaluation.
test_articles.info()

In [None]:
with tf.device("/cpu:0"):

    # Cut-offs at which top-k accuracy is reported.
    evaluate_topk = [1,5,10,20,50]
    # Rank cut-off for the mean reciprocal rank. Kept separate from
    # evaluate_topk so MRR no longer depends on 50 being the last entry.
    mrr_topk = 50

    for model_path in evaluate_models:

        print("current_model: "+ model_path)

        #loading the current model
        ranking_model = mz.load_model(model_path)
        ranking_model.load_embedding_matrix(embedding_matrix)

        # Per-article hit flags for each cut-off, and per-article reciprocal ranks.
        hits_at_k = {top_k: [] for top_k in evaluate_topk}
        mrr = []

        for _, article in test_articles.iterrows():

            #current article values
            article_ID = article['article_id']
            # str() on every field so a missing (NaN) title cannot raise a
            # TypeError during concatenation; the other fields were already wrapped.
            article_text = " ".join([
                str(article['article_title']),
                str(article['article_meta_description']),
                str(article['article_keywords']),
            ])

            #candidate tables registered for this article
            return_index = fixed_index.loc[fixed_index['label_index'] == article_ID]

            #one (article, candidate table) pair per candidate row
            class_list = [
                {
                    'id_left': article_ID,
                    'text_left': article_text,
                    'id_right': candidate['table_page_id'],
                    'text_right': " ".join([
                        str(candidate['table_page_title']),
                        str(candidate['table_page_main_passage']),
                        str(candidate['table_page_keywords']),
                    ]),
                }
                for _, candidate in return_index.iterrows()
            ]

            test_pack = mz.pack(pd.DataFrame(class_list))
            valid_processed = preprocessor.transform(test_pack, verbose=0)
            test_x, _ = valid_processed.unpack()

            # The scores do not depend on the cut-off, so predict once per
            # article and reuse the ranking for every top-k value.
            table_ranking_model = ranking_model.predict(test_x)

            #creating the final dataframe
            # Scores are paired with candidates by position, i.e. predictions are
            # assumed to come back in the row order of return_index.
            ranked_tables_model = [
                [table_id, table_title, score[0]]
                for table_id, table_title, score in zip(
                    return_index['table_page_id'],
                    return_index['table_page_title'],
                    table_ranking_model,
                )
            ]

            data_frame = pd.DataFrame(ranked_tables_model, columns = ['table_ID', 'table_title','table_ranking'])
            data_frame_sorting = data_frame.sort_values('table_ranking', ascending=False)

            #getting topk accuracy
            for top_k in evaluate_topk:
                # Single-column 2-D array of table IDs, the shape get_accuracy/get_mrr index into.
                final_ranked_tables = data_frame_sorting.head(top_k).iloc[:,0:1].values
                hits_at_k[top_k].append(get_accuracy(article_ID, final_ranked_tables))

            #mean reciprocal rank over the top mrr_topk candidates
            mrr.append(get_mrr(article_ID, data_frame_sorting.head(mrr_topk).iloc[:,0:1].values))

        result = [
            ["Acc@"+str(top_k), str(round(np.mean(hits_at_k[top_k]),4))]
            for top_k in evaluate_topk
        ]

        print("")
        # One line per cut-off, however many cut-offs are configured.
        for entry in result:
            print(entry)
        print("MRR: "+str(round(np.mean(mrr),4)) )
        print(mrr)
        print("")