In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
from elasticsearch import Elasticsearch
from elasticsearch import helpers
# Shared Elasticsearch client used by the index-settings and search cells.
# No host is passed, so the client library's default connection target is used
# (presumably a local node — confirm for your environment).
# Requests time out after 30 s and are retried on timeout, with max_retries=10.
indexingTables = Elasticsearch(timeout=30, max_retries=10, retry_on_timeout=True)

In [2]:
def get_accuracy(idRankedTables, idQueryGoal):
    """Tell whether the expected table shows up among the ranked results.

    Parameters
    ----------
    idRankedTables : iterable
        Identifiers of the retrieved tables; each entry is compared to the
        goal with ``==``.
    idQueryGoal : object
        Identifier of the table that should have been retrieved.

    Returns
    -------
    int
        ``1`` as soon as one entry equals ``idQueryGoal``, ``0`` if none does
        (including when ``idRankedTables`` is empty).
    """
    for candidate in idRankedTables:
        if candidate == idQueryGoal:
            # A single match is enough; the remaining entries are not inspected.
            return 1
    return 0

In [3]:
# Set the default similarity of the 'data_table_newyork' index to BM25.
# The index is closed before the settings update and reopened afterwards
# (presumably because similarity settings cannot be changed on an open index —
# confirm against the Elasticsearch index-settings documentation).
# Keep the three calls in this order.
indexingTables.indices.close(index='data_table_newyork')
indexingTables.indices.put_settings(index='data_table_newyork', body={"index": {"similarity": {"default": {"type": "BM25"}}}})
indexingTables.indices.open(index='data_table_newyork')

{'acknowledged': True, 'shards_acknowledged': True}

In [4]:
def searchIndexing(query, tableField, topK):
    """Run a ``multi_match`` (``most_fields``) query on ``data_table_newyork``.

    Parameters
    ----------
    query : str
        Free-text query.
    tableField : str or sequence of str
        Index field(s) to match against. A single field name is wrapped in a
        list. An empty value (``""``, ``None``, ``[]``) falls back to
        ``["table_page_title"]``, the field this function previously always
        queried, so existing callers that pass ``""`` get the same results.
    topK : int
        Maximum number of hits requested (the ``size`` of the search).

    Returns
    -------
    dict
        The raw response of ``indexingTables.search``; only the ``table_url``
        source field is requested for each hit.
    """
    # Previously this argument was ignored and the field list was hard-coded.
    if not tableField:
        fields = ["table_page_title"]
    elif isinstance(tableField, str):
        fields = [tableField]
    else:
        fields = list(tableField)

    search_body = {
        "_source": ["table_url"],
        "from": 0,
        "size": topK,
        "query": {
            "multi_match": {
                "type": "most_fields",
                "query": query,
                "fields": fields,
            }
        },
    }

    return indexingTables.search(index="data_table_newyork", body=search_body)

In [5]:
# Load the test articles. executeSearch reads 'article_title' as the query text
# and 'article_key_match' as the table URL that the search is expected to return.
articles = pd.read_csv('../dataset/test_articles_dataset_newyork.csv', delimiter=',')

In [6]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148 entries, 0 to 147
Data columns (total 10 columns):
article_full_text           141 non-null object
article_key_match           148 non-null object
article_keywords            143 non-null object
article_main_passage        148 non-null object
article_meta_description    132 non-null object
article_meta_keywords       58 non-null object
article_summary             141 non-null object
article_tags                54 non-null object
article_title               148 non-null object
article_url                 148 non-null object
dtypes: object(10)
memory usage: 11.6+ KB


In [7]:
def _top_k_table_ids(hits, topK):
    """Return the ``table_url`` of every hit scoring at least as high as the topK-th best hit."""
    ranked = sorted(hits, key=lambda hit: hit['_score'], reverse=True)
    if not ranked or topK < 1:
        return []
    # Score of the last hit inside the top-K window; hits tied with it are kept.
    cutoff = ranked[:topK][-1]['_score']
    return [hit['_source']['table_url'] for hit in ranked if hit['_score'] >= cutoff]


def executeSearch(tableField, topK):
    """Compute and print Acc@topK over the module-level ``articles`` frame.

    For each article, ``article_title`` is sent to ``searchIndexing`` and the
    article scores 1 when its ``article_key_match`` is among the returned
    table URLs, otherwise 0.

    Parameters
    ----------
    tableField : str or sequence of str
        Passed through unchanged to ``searchIndexing``.
    topK : int
        Rank cutoff, also forwarded as the number of hits to request.

    Returns
    -------
    float
        Mean accuracy over all articles (``nan`` when ``articles`` is empty).
        The value, rounded to 4 decimals, is also printed.

    Notes
    -----
    * An article whose query returns no hits counts as a miss. Skipping it,
      as was done before, removes it from the denominator and inflates the
      reported accuracy.
    * The hit list itself is inspected rather than ``hits.total``, whose
      format differs between Elasticsearch versions (plain integer vs. an
      object with a ``value`` key).
    """
    accuracy = []

    for title, expected_table in zip(articles['article_title'], articles['article_key_match']):
        result = searchIndexing(title, tableField, topK)
        ranked_ids = _top_k_table_ids(result['hits']['hits'], topK)

        if ranked_ids:
            accuracy.append(get_accuracy(ranked_ids, expected_table))
        else:
            # Nothing retrieved: the expected table cannot have been found.
            accuracy.append(0)

    # Avoid numpy's empty-mean warning; the printed text is still "nan".
    mean_accuracy = np.mean(accuracy) if accuracy else float('nan')
    print(str(round(mean_accuracy, 4)))
    return float(mean_accuracy)

In [8]:
# Evaluation configuration.
# tableField: names of the index fields considered for matching. In this notebook the
#             list is only referenced by the commented-out per-field sweep.
# topK: rank cutoffs at which Acc@k is reported.
tableField = ["table_page_title","table_page_content","table_page_full_text","table_page_summary","table_page_keywords","table_caption","table_header","table_body"]
topK = [1,5,10,20]

In [26]:
# for k in topK:
    
#     print("")
    
#     for field in tableField:
        
#         print("Acc@"+str(k)+" - "+field)
        
#         executeSearch(field,k)

In [9]:
# Print Acc@k over all test articles for each cutoff in topK.
for k in topK:
    
    print("")
         
    print("Acc@"+str(k))
        
    # An empty string is passed as tableField; which field is actually queried
    # is decided inside searchIndexing.
    executeSearch("",k)


Acc@1
0.2905

Acc@5
0.4324

Acc@10
0.5203

Acc@20
0.5811
