In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
import gensim as gs
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
indexing_tables = Elasticsearch(timeout=30, max_retries=10, retry_on_timeout=True)
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

In [2]:
tf.__version__

'2.2.0'

In [3]:
embedding_model = gs.models.FastText.load('../../train_embedding_models/fasttext_embedding_50d_all_signals')

In [4]:
MAX_PAD = 55

def sequence_padding(X_DIM, value):
    
    value_padding = np.pad(value, ((0,MAX_PAD - X_DIM),(0,0)), 'constant')
    
    return value_padding

In [5]:
def create_embedding(value):

    value = tknzr.tokenize(str(value))
    
    if len(value) < MAX_PAD:
        
        embedding = embedding_model.wv[value]
        embedding = embedding.astype('float16')
        
        padding_embedding = sequence_padding(embedding.shape[0],embedding)
        
        return padding_embedding
        
    else:
        
        embedding = embedding_model.wv[value[0:MAX_PAD]]
        embedding = embedding.astype('float16')
        
        return embedding

In [None]:
#affinity_model_title_main_passage_keywords_attributes_correlation_1_1_08_0.9767.h5
#attention_model_title_main_passage_keywords_attributes_correlation_1_1_17_0.9859.h5
#coattention_model_title_main_passage_keywords_attributes_correlation_1_1_10_0.9604.h5

In [14]:
ranking_model = tf.keras.models.load_model('../../learning_to_rank_models/model_title_main_passage_keywords_attributes_correlation/attention_model_title_main_passage_keywords_attributes_correlation_1_1_17_0.9859.h5')



In [8]:
read_data = pd.read_csv('dataset_top100_bert.csv', delimiter=',')

In [9]:
unique_articles = read_data.article_url.unique()

In [10]:
total_articles = len(unique_articles)

In [11]:
accuracy = []

def get_accuracy(current_dataframe):
    
    for i in range(len(current_dataframe)):
        
        if current_dataframe[i] == 1:
            
            accuracy.append(1)
    
    return accuracy

In [15]:
for i in tqdm(range(len(unique_articles))):
    
    article_title = []
    article_main_passage = []
    article_keywords = []

    title_table = []
    table_page_content = []
    table_keywords = []
    
    ranked_tables_model = []
    
    current_article = unique_articles[i]
    
    rslt_df = read_data[read_data['article_url'] == current_article]
    
    for j,row in rslt_df.iterrows():
        
        article_url = row['article_url']
        article_title_text = row['article_title']
        article_meta_description_text = row['article_meta_description']
        article_keywords_text = row['article_keywords']
        
        table_url = row['table_url']
        table_title_text = row['table_title']
        table_summary_text = row['table_summary']
        table_keywords_text = row['table_keywords']
        
        match = row['match']
        old_rank = row['old_rank']
        
        #creating embedding
        article_title_embedding = create_embedding(article_title_text)
        article_main_passage_embedding = create_embedding(article_meta_description_text)
        article_keywords_embedding = create_embedding(article_keywords_text)
        
        table_title_embedding = create_embedding(table_title_text)
        table_page_content_embedding = create_embedding(table_summary_text)
        table_page_keywords_embedding = create_embedding(table_keywords_text)
        
        #adding to list
        article_title.append(article_title_embedding)
        article_main_passage.append(article_main_passage_embedding)
        article_keywords.append(article_keywords_embedding)

        title_table.append(table_title_embedding)
        table_page_content.append(table_page_content_embedding)
        table_keywords.append(table_page_keywords_embedding)
        
    
    #transforming in numpy
    article_title = np.array(article_title)
    article_main_passage = np.array(article_main_passage)
    article_keywords = np.array(article_keywords)

    title_table = np.array(title_table)
    table_page_content = np.array(table_page_content)
    table_keywords = np.array(table_keywords)
    
    #predicting
    table_ranking_model = ranking_model.predict([article_title,article_main_passage,article_keywords,title_table,table_page_content,table_keywords])
    
    for k in range(len(table_ranking_model)):
        
        article_url2 = rslt_df[k:k+1]['article_url'].iloc[0]
        table_url2 = rslt_df[k:k+1]['table_url'].iloc[0]
        match2 = rslt_df[k:k+1]['match'].iloc[0]
        old_rank2 = rslt_df[k:k+1]['old_rank'].iloc[0]
        
        ranked_tables_model.append([article_url2,table_url2,match2,old_rank2,table_ranking_model[k]]) 
    
    
    data_frame = pd.DataFrame(ranked_tables_model, columns = ['article_url', 'table_url','match','old_rank','table_ranking']) 
    data_frame_sorting = data_frame.sort_values('table_ranking', ascending=False)
    final_ranked_tables = data_frame_sorting.iloc[0:20,2:3].values
    
    accuracy = get_accuracy(final_ranked_tables[0])

100%|██████████| 1875/1875 [10:19<00:00,  3.03it/s]


In [16]:
result_top1 = len(accuracy)/total_articles
result_top1

0.49706666666666666

In [None]:
attention = 0.3642666666666667

In [None]:
affinity = 0.27

In [None]:
coattention = 0.1328