In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
import gensim as gs
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from nltk.tokenize import TweetTokenizer
# Shared tokenizer instance used by the embedding helpers defined later in the notebook.
tknzr = TweetTokenizer()
# Elasticsearch client (no host is given, so the client library's default applies).
# Requests time out after 30 s and are retried on timeout, up to 10 retries.
indexing_distinct_tables = Elasticsearch(timeout=30, max_retries=10, retry_on_timeout=True)
import tensorflow as tf
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings that may matter.
warnings.filterwarnings('ignore')

In [2]:
# Display the TensorFlow version this notebook was executed with.
tf.__version__

'2.0.0-rc0'

In [3]:
# Load the test articles. Later cells read the page_id, page_title,
# meta_description and keywords columns of this frame.
test_articles = pd.read_csv('../dataset/data_articles_test.csv', delimiter=',')

In [4]:
# Load the pre-trained gensim FastText model used to embed tokens
# (the file name suggests 50-dimensional vectors — TODO confirm).
embedding_model = gs.models.FastText.load('../train_embedding_models/fasttext_embedding_50d_all_signals')

In [5]:
# Set the default similarity of the 'distinct_tables' index to "classic".
# The index is closed while the setting is applied and reopened afterwards.
# NOTE(review): verify that the "classic" similarity is supported by the
# Elasticsearch server version in use.
_index_admin = indexing_distinct_tables.indices
_similarity_settings = {"index": {"similarity": {"default": {"type": "classic"}}}}

_index_admin.close(index='distinct_tables')
_index_admin.put_settings(index='distinct_tables', body=_similarity_settings)
_index_admin.open(index='distinct_tables')

{'acknowledged': True, 'shards_acknowledged': True}

In [6]:
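# Field names of the documents in the 'distinct_tables' index
# (presumably the full list; only some of them are used by the queries below):
# "tablePgID"
# "tablePgTitle"
# "tablePgFullText"
# "tablePgMetaDescription"
# "tablePgSummary"
# "tablePgKeywords"
# "tableSectionTitle"
# "tableCaption"
# "tableHeader"
# "tableBody"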

In [7]:
def search_indexing(query):
    """Run a full-text search for `query` against the 'distinct_tables' index.

    Sends a ``multi_match`` query of type ``most_fields`` over the
    ``tablePgFullText`` field and requests the first 100 hits. Each hit's
    ``_source`` is restricted to ``tablePgID``, ``tablePgTitle`` and
    ``tablePgSummary``.

    Parameters
    ----------
    query : str
        Text to match against the indexed page full text.

    Returns
    -------
    The raw response returned by ``indexing_distinct_tables.search``.
    """
    full_text_match = {
        "multi_match": {
            "type": "most_fields",
            "query": query,
            "fields": ["tablePgFullText"],
        }
    }

    request_body = {
        "_source": ["tablePgID", "tablePgTitle", "tablePgSummary"],
        "from": 0,
        "size": 100,
        "query": full_text_match,
    }

    return indexing_distinct_tables.search(index="distinct_tables", body=request_body)

In [8]:
def get_accuracy(ID_goal,ranked_tables_ID):
    """Return 1 if the goal ID is among the ranked tables, otherwise 0.

    Parameters
    ----------
    ID_goal
        Identifier to look for.
    ranked_tables_ID
        Iterable of indexable entries; only the first element of each entry
        is compared with `ID_goal`.

    Returns
    -------
    int
        1 when some entry's first element equals `ID_goal`, else 0
        (also 0 for an empty iterable).
    """
    # any() stops at the first match, like the explicit loop with break.
    found = any(entry[0] == ID_goal for entry in ranked_tables_ID)
    return 1 if found else 0

In [9]:
# Number of rows every title embedding matrix is padded to.
MAX_PAD_TITLE = 55

def sequence_padding_title(X_DIM, value):
    """Append zero rows to a 2-D array so it reaches MAX_PAD_TITLE rows.

    Parameters
    ----------
    X_DIM : int
        Current number of rows; ``MAX_PAD_TITLE - X_DIM`` rows are appended.
    value : array-like
        2-D array to pad. The second axis is left untouched.

    Returns
    -------
    numpy.ndarray
        `value` followed by constant (zero) rows along axis 0.
    """
    rows_to_add = MAX_PAD_TITLE - X_DIM
    pad_widths = ((0, rows_to_add), (0, 0))  # pad only after the last row
    return np.pad(value, pad_widths, mode='constant')

In [10]:
def create_embedding_title(value):
    """Embed a title as a float16 matrix with exactly MAX_PAD_TITLE rows.

    The value is converted with ``str()``, tokenized with ``tknzr`` and
    truncated to the first MAX_PAD_TITLE tokens. Each token is looked up in
    ``embedding_model.wv``; shorter sequences are zero-padded at the end.

    Parameters
    ----------
    value
        Title text (any object; it is passed through ``str()`` first).

    Returns
    -------
    numpy.ndarray
        float16 array with MAX_PAD_TITLE rows and one column per embedding
        dimension.
    """
    tokens = tknzr.tokenize(str(value))[:MAX_PAD_TITLE]

    if not tokens:
        # An empty token list (e.g. an empty title) cannot be looked up:
        # the multi-key lookup stacks one vector per token and fails when
        # there is nothing to stack. Return an all-padding matrix instead.
        return np.zeros((MAX_PAD_TITLE, embedding_model.wv.vector_size), dtype='float16')

    embedding = embedding_model.wv[tokens].astype('float16')

    # Only sequences shorter than the target length need padding.
    if embedding.shape[0] < MAX_PAD_TITLE:
        embedding = sequence_padding_title(embedding.shape[0], embedding)

    return embedding

In [11]:
# Number of rows every main-passage embedding matrix is padded to.
MAX_PAD_MAIN_PASSAGE = 55

def sequence_padding_main_passage(X_DIM, value):
    """Append zero rows to a 2-D array so it reaches MAX_PAD_MAIN_PASSAGE rows.

    Parameters
    ----------
    X_DIM : int
        Current number of rows; ``MAX_PAD_MAIN_PASSAGE - X_DIM`` rows are
        appended.
    value : array-like
        2-D array to pad. The second axis is left untouched.

    Returns
    -------
    numpy.ndarray
        `value` followed by constant (zero) rows along axis 0.
    """
    rows_to_add = MAX_PAD_MAIN_PASSAGE - X_DIM
    pad_widths = ((0, rows_to_add), (0, 0))  # pad only after the last row
    return np.pad(value, pad_widths, mode='constant')

In [12]:
def create_embedding_main_passage(value):
    """Embed a main passage as a float16 matrix with exactly MAX_PAD_MAIN_PASSAGE rows.

    The value is converted with ``str()``, tokenized with ``tknzr`` and
    truncated to the first MAX_PAD_MAIN_PASSAGE tokens. Each token is looked
    up in ``embedding_model.wv``; shorter sequences are zero-padded at the end.

    Parameters
    ----------
    value
        Passage text (any object; it is passed through ``str()`` first).

    Returns
    -------
    numpy.ndarray
        float16 array with MAX_PAD_MAIN_PASSAGE rows and one column per
        embedding dimension.
    """
    tokens = tknzr.tokenize(str(value))[:MAX_PAD_MAIN_PASSAGE]

    if not tokens:
        # An empty token list (e.g. an empty summary) cannot be looked up:
        # the multi-key lookup stacks one vector per token and fails when
        # there is nothing to stack. Return an all-padding matrix instead.
        return np.zeros((MAX_PAD_MAIN_PASSAGE, embedding_model.wv.vector_size), dtype='float16')

    embedding = embedding_model.wv[tokens].astype('float16')

    # Only sequences shorter than the target length need padding.
    if embedding.shape[0] < MAX_PAD_MAIN_PASSAGE:
        embedding = sequence_padding_main_passage(embedding.shape[0], embedding)

    return embedding

In [13]:
def search_index(query):
    """Search the index and return the hits as plain lists.

    Parameters
    ----------
    query : str
        Text forwarded to ``search_indexing``.

    Returns
    -------
    list
        One ``[tablePgID, tablePgTitle, tablePgSummary]`` list per hit, in the
        order the hits appear in the search response. Empty when there are no
        hits.
    """
    hits = search_indexing(query)['hits']['hits']

    return [
        [source['tablePgID'], source['tablePgTitle'], source['tablePgSummary']]
        for source in (hit['_source'] for hit in hits)
    ]

In [23]:
# Load the trained Keras ranking model from an HDF5 file in the working directory.
# NOTE(review): this cell's execution count (23) is out of sequence with its
# neighbours, so it was re-run after later cells; confirm the notebook still
# works with Restart Kernel -> Run All.
ranking_model = tf.keras.models.load_model('attention_model_title_main_passage_27_0.9501.h5')

In [15]:
# Peek at the first test article to check which columns are available.
test_articles.head(1)

Unnamed: 0,full_text,keywords,meta_description,meta_keywords,page_id,page_title,summary,tags
0,when comes pioneers progressive rock handful b...,faces anderson album chris jon went record tre...,yes among year rock roll hall fame inductees c...,,4125505,rock roll hall famers jon anderson trevor rabi...,because good photos the best yes album coverss...,


In [24]:
# Accumulates one ["Acc@k", "<mean accuracy>"] entry per run_search call.
result = []

# Article columns concatenated (in this order) into the Elasticsearch query.
_QUERY_COLUMNS = ('page_title', 'meta_description', 'keywords')


def _build_query(row):
    """Join the non-missing query columns of an article row with single spaces."""
    # Plain `+` concatenation raises TypeError when a column is NaN (a float),
    # so missing values are skipped instead.
    return " ".join(str(row[column]) for column in _QUERY_COLUMNS if pd.notna(row[column]))


def _rank_candidates(article_title_text, article_main_passage_text, candidates):
    """Score the retrieved tables with the ranking model, best score first."""
    n_candidates = len(candidates)

    # The article side is identical for every candidate: embed it once and
    # repeat it so all four model inputs have one row per candidate.
    article_title_embedding = create_embedding_title(article_title_text)
    article_main_passage_embedding = create_embedding_main_passage(article_main_passage_text)

    article_title = np.array([article_title_embedding] * n_candidates, dtype='float16')
    article_main_passage = np.array([article_main_passage_embedding] * n_candidates, dtype='float16')
    table_title = np.array(
        [create_embedding_title(str(title)) for _, title, _ in candidates],
        dtype='float16')
    table_main_passage = np.array(
        [create_embedding_main_passage(str(passage)) for _, _, passage in candidates],
        dtype='float16')

    predictions = ranking_model.predict(
        [article_title, article_main_passage, table_title, table_main_passage])

    # The first value of each prediction row is used as the table's score.
    scored = [
        [candidates[position][0], candidates[position][1], predictions[position][0]]
        for position in range(len(predictions))
    ]

    ranked = pd.DataFrame(scored, columns=['table_ID', 'table_title', 'table_ranking'])
    return ranked.sort_values('table_ranking', ascending=False)


def run_search(k):
    """Compute Acc@k over `test_articles` and record it in the global `result`.

    For every article, candidate tables are retrieved from Elasticsearch,
    re-scored by `ranking_model`, and the article counts as a hit when its
    `page_id` equals the ID of a table whose score is at least the k-th best
    score (tables tied with the k-th score are therefore all included).

    Parameters
    ----------
    k : int
        Cut-off rank.

    Returns
    -------
    list
        The ``["Acc@<k>", "<mean accuracy rounded to 4 decimals>"]`` entry
        that was appended to `result`.
    """
    accuracy = []

    for _, row in tqdm(test_articles.iterrows(), total=len(test_articles)):

        candidates = search_index(_build_query(row))

        if not candidates:
            # NOTE(review): articles without any hit are skipped rather than
            # counted as misses, so they do not lower the reported accuracy.
            continue

        ranked = _rank_candidates(row['page_title'], row['meta_description'], candidates)

        # Keep every table scoring at least as high as the k-th best one.
        min_score = ranked.head(k)['table_ranking'].min()
        tied_or_better = ranked[ranked['table_ranking'] >= min_score]
        final_ranked_tables = tied_or_better.iloc[:, 0:1].values

        accuracy.append(get_accuracy(row['page_id'], final_ranked_tables))

    # With no scored article the mean is undefined; report NaN explicitly.
    mean_accuracy = np.mean(accuracy) if accuracy else float('nan')

    entry = ["Acc@"+str(k), str(round(mean_accuracy, 4))]
    result.append(entry)

    return entry

In [25]:
# Cut-off ranks at which Acc@k is evaluated.
accuracy_K = [1,5,10,20]

# run_search appends to the shared `result` list. Empty it first so that
# re-running this cell does not leave duplicate Acc@k entries behind.
result.clear()

for k in accuracy_K:

    run_search(k)

1026it [05:13,  3.30it/s]
1026it [05:08,  3.35it/s]
1026it [02:50,  6.70it/s]
1026it [02:34,  6.67it/s]


In [26]:
# Show the collected ["Acc@k", accuracy] entries.
result

[['Acc@1', '0.386'],
 ['Acc@5', '0.4103'],
 ['Acc@10', '0.4366'],
 ['Acc@20', '0.5019']]