In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
import gensim as gs
import tensorflow as tf
import warnings
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
warnings.filterwarnings('ignore')
from transformers import TFBertModel, BertTokenizer, TFBertMainLayer, BertConfig

In [None]:
# Test articles: the evaluation loop below iterates over these rows and reads
# their 'article_key_match', 'article_title', 'article_meta_description' and
# 'article_keywords' columns.
test_articles = pd.read_csv('../dataset/test_articles_newyork.csv', delimiter=',')

In [None]:
# Fixed candidate set: for every article, the rows whose 'label_index' equals the
# article's 'article_key_match' are the tables that get scored and ranked.
fixed_index = pd.read_csv('../dataset/fixed_test_set_newyork.csv', delimiter=',')

In [None]:
# Tokenizer of the pretrained "bert-base-cased" checkpoint; used in the evaluation
# loop to encode each (article text, table text) pair.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [None]:
# Passed as `max_length` to the BERT tokenizer when encoding article/table pairs.
MAX_TOKENS = 250

In [None]:
def get_accuracy(ID_goal,ranked_tables_ID):
    """Tell whether the expected table shows up in a ranked list.

    Parameters
    ----------
    ID_goal : identifier of the table that should be retrieved.
    ranked_tables_ID : iterable of indexable entries; only the first element
        of each entry (the table identifier) is compared against ``ID_goal``.

    Returns
    -------
    int : 1 if any entry matches ``ID_goal``, otherwise 0.
    """
    found = any(candidate[0] == ID_goal for candidate in ranked_tables_ID)

    return 1 if found else 0

In [None]:
def get_mrr(ID_goal,ranked_tables_ID):
    """Reciprocal rank of the expected table inside a ranked list.

    Parameters
    ----------
    ID_goal : identifier of the table that should be retrieved.
    ranked_tables_ID : iterable of indexable entries, best-ranked first; only
        the first element of each entry is compared against ``ID_goal``.

    Returns
    -------
    ``1/position`` (1-based) of the first matching entry, or 0 when no entry
    matches.
    """
    for position, candidate in enumerate(ranked_tables_ID, start=1):

        if candidate[0] == ID_goal:
            return 1/position

    return 0

In [None]:
# Alternative kept for reference: a 50-d FastText model saved in gensim's own format.
# embedding_model = gs.models.FastText.load('../train_embedding_models/fasttext_embedding_50d_all_signals')

# Load the FastText binary 'cc.en.300.bin'; `embedding_model.wv` is what the
# create_embedding_* helpers index with token lists.
# NOTE(review): `load_fasttext_format` appears to be deprecated in newer gensim
# releases in favour of `gensim.models.fasttext.load_facebook_vectors` /
# `load_facebook_model` -- confirm against the installed gensim version.
embedding_model = gs.models.FastText.load_fasttext_format('../pre_trained_models/cc.en.300.bin')

In [None]:
# Number of token rows every title/keywords embedding matrix is padded to.
MAX_PAD_TITLE = 30

def sequence_padding_title(X_DIM, value):
    """Append zero rows to a 2-D embedding matrix.

    Parameters
    ----------
    X_DIM : int
        Row count used to size the padding; ``MAX_PAD_TITLE - X_DIM`` zero rows
        are appended (callers pass ``value.shape[0]``).
    value : array-like, 2-D
        Matrix to pad; its columns are left untouched.

    Returns
    -------
    numpy.ndarray : ``value`` followed by the zero rows.
    """
    rows_to_add = MAX_PAD_TITLE - X_DIM
    pad_widths = ((0, rows_to_add), (0, 0))

    return np.pad(value, pad_widths, mode='constant')

In [None]:
def create_embedding_title(value):
    """Turn a text into a fixed-size FastText embedding matrix.

    The text is converted with ``str()``, tokenized with ``tknzr``, truncated to
    at most ``MAX_PAD_TITLE`` tokens, embedded with ``embedding_model.wv`` and
    zero-padded so the result always has ``MAX_PAD_TITLE`` rows.

    Parameters
    ----------
    value : any
        Text to embed (non-string values are stringified first).

    Returns
    -------
    numpy.ndarray of dtype float16 with ``MAX_PAD_TITLE`` rows and one column
    per embedding dimension.
    """
    tokens = tknzr.tokenize(str(value))[:MAX_PAD_TITLE]

    if not tokens:
        # Indexing the vectors with an empty token list fails (there is nothing
        # to stack), so an empty text maps to an all-padding matrix instead.
        return np.zeros((MAX_PAD_TITLE, embedding_model.wv.vector_size), dtype='float16')

    embedding = embedding_model.wv[tokens].astype('float16')

    # After truncation the matrix has at most MAX_PAD_TITLE rows; pad only when short.
    if embedding.shape[0] < MAX_PAD_TITLE:
        embedding = sequence_padding_title(embedding.shape[0], embedding)

    return embedding

In [None]:
# Number of token rows every main-passage embedding matrix is padded to.
MAX_PAD_MAIN_PASSAGE = 55

def sequence_padding_main_passage(X_DIM, value):
    """Append zero rows to a 2-D embedding matrix.

    Parameters
    ----------
    X_DIM : int
        Row count used to size the padding; ``MAX_PAD_MAIN_PASSAGE - X_DIM``
        zero rows are appended (callers pass ``value.shape[0]``).
    value : array-like, 2-D
        Matrix to pad; its columns are left untouched.

    Returns
    -------
    numpy.ndarray : ``value`` followed by the zero rows.
    """
    rows_to_add = MAX_PAD_MAIN_PASSAGE - X_DIM
    pad_widths = ((0, rows_to_add), (0, 0))

    return np.pad(value, pad_widths, mode='constant')

In [None]:
def create_embedding_main_passage(value):
    """Turn a text into a fixed-size FastText embedding matrix.

    The text is converted with ``str()``, tokenized with ``tknzr``, truncated to
    at most ``MAX_PAD_MAIN_PASSAGE`` tokens, embedded with ``embedding_model.wv``
    and zero-padded so the result always has ``MAX_PAD_MAIN_PASSAGE`` rows.

    Parameters
    ----------
    value : any
        Text to embed (non-string values are stringified first).

    Returns
    -------
    numpy.ndarray of dtype float16 with ``MAX_PAD_MAIN_PASSAGE`` rows and one
    column per embedding dimension.
    """
    tokens = tknzr.tokenize(str(value))[:MAX_PAD_MAIN_PASSAGE]

    if not tokens:
        # Indexing the vectors with an empty token list fails (there is nothing
        # to stack), so an empty text maps to an all-padding matrix instead.
        return np.zeros((MAX_PAD_MAIN_PASSAGE, embedding_model.wv.vector_size), dtype='float16')

    embedding = embedding_model.wv[tokens].astype('float16')

    # After truncation the matrix has at most MAX_PAD_MAIN_PASSAGE rows; pad only when short.
    if embedding.shape[0] < MAX_PAD_MAIN_PASSAGE:
        embedding = sequence_padding_main_passage(embedding.shape[0], embedding)

    return embedding

In [None]:
# Paths of the saved Keras models to evaluate; the evaluation loop loads each in turn.
evaluate_models = [
    'bert_based_models/model_ablation/model_ablation05_02_0.9847.h5',
]

In [None]:
# Sanity check: print a summary of the loaded test-article frame before evaluating.
test_articles.info()

In [None]:
# Evaluate every model in `evaluate_models` on the fixed test set.
#
# For each article the candidate tables are scored once with the ranking model
# and sorted by score; Acc@k is then read off that single ranking for every k in
# `evaluate_topk`, and the reciprocal rank is taken within the top MRR_TOP_K.
# Scoring does not depend on k, so it is done once per article rather than once
# per (k, article) pair.

evaluate_topk = [1,5,10,20,50]

#cut-off used for the mean reciprocal rank
MRR_TOP_K = 50

for model_path in evaluate_models:

    print("current_model: "+ model_path)

    #loading the current model
    ranking_model = tf.keras.models.load_model(model_path)

    #per-article hits (0/1) for every top-k cut-off, and per-article reciprocal ranks
    accuracy_by_topk = {TOP_K: [] for TOP_K in evaluate_topk}
    mrr = []

    for _, article_row in tqdm(test_articles.iterrows(), total=len(test_articles)):

        #current article values; str() keeps a missing value (NaN) from breaking
        #the string concatenation below (it becomes the text "nan")
        article_ID = article_row['article_key_match']
        article_title_text = str(article_row['article_title'])
        article_main_passage_text = str(article_row['article_meta_description'])
        article_keywords_text = str(article_row['article_keywords'])
        article_text = article_title_text+" "+article_main_passage_text+" "+article_keywords_text

        #candidate tables of the current article
        return_index = fixed_index.loc[fixed_index['label_index'] == article_ID]
        total_candidates = len(return_index)

        #article-side fasttext embeddings are identical for every candidate: compute once
        article_title_embedding = create_embedding_title(article_title_text)
        article_main_passage_embedding = create_embedding_main_passage(article_main_passage_text)
        article_keywords_embedding = create_embedding_title(article_keywords_text)

        #bert inputs and table-side embeddings, one entry per candidate table
        test_title_ids = []
        test_title_mask = []
        test_title_seg = []
        table_title = []
        table_main_passage = []
        table_keywords = []

        for _, table_row in return_index.iterrows():

            table_text = str(table_row['table_page_title'])+" "+str(table_row['table_page_main_passage'])+" "+str(table_row['table_page_keywords'])

            #bert
            return_tokenizer1 = bert_tokenizer.encode_plus(
                article_text,
                table_text,
                max_length=MAX_TOKENS,
                add_special_tokens=True,
                return_token_type_ids=True,
                pad_to_max_length=True,
                return_attention_mask=True,
            )

            test_title_ids.append(return_tokenizer1['input_ids'])
            test_title_mask.append(return_tokenizer1['attention_mask'])
            test_title_seg.append(return_tokenizer1['token_type_ids'])

            #fasttext embedding
            table_title.append(create_embedding_title(table_row['table_page_title']))
            table_main_passage.append(create_embedding_main_passage(str(table_row['table_page_main_passage'])))
            table_keywords.append(create_embedding_title(table_row['table_page_keywords']))

        #bert
        test_title_ids = np.array(test_title_ids)
        test_title_mask = np.array(test_title_mask)
        test_title_seg = np.array(test_title_seg)

        #fasttext (the article embeddings are repeated once per candidate)
        article_title = np.array([article_title_embedding] * total_candidates, dtype='float16')
        article_main_passage = np.array([article_main_passage_embedding] * total_candidates, dtype='float16')
        article_keywords = np.array([article_keywords_embedding] * total_candidates, dtype='float16')

        table_title = np.array(table_title, dtype='float16')
        table_main_passage = np.array(table_main_passage, dtype='float16')
        table_keywords = np.array(table_keywords, dtype='float16')

        table_ranking_model = ranking_model.predict([test_title_ids,test_title_mask,test_title_seg, article_title, article_main_passage, article_keywords, table_title, table_main_passage, table_keywords])

        #creating the final dataframe: one row per candidate with its predicted score
        ranked_tables_model = [
            [table_id, table_title_text, score[0]]
            for table_id, table_title_text, score in zip(
                return_index['table_page_id'],
                return_index['table_page_title'],
                table_ranking_model,
            )
        ]

        data_frame = pd.DataFrame(ranked_tables_model, columns = ['table_ID', 'table_title','table_ranking'])
        data_frame_sorting = data_frame.sort_values('table_ranking', ascending=False)

        #table IDs ordered by decreasing score, shape (n_candidates, 1)
        ranked_table_ids = data_frame_sorting.iloc[:,0:1].values

        #getting topk accuracy for every cut-off from the same ranking
        for TOP_K in evaluate_topk:

            accuracy_by_topk[TOP_K].append(get_accuracy(article_ID, ranked_table_ids[:TOP_K]))

        #mean reciprocal rank at k = MRR_TOP_K
        mrr.append(get_mrr(article_ID, ranked_table_ids[:MRR_TOP_K]))

    result = [["Acc@"+str(TOP_K),str(round(np.mean(accuracy_by_topk[TOP_K]),4))] for TOP_K in evaluate_topk]

    print("")
    for entry in result:
        print(entry)
    print("MRR: "+str(round(np.mean(mrr),4)) )
    print(mrr)
    print("")