In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
from elasticsearch import Elasticsearch
from elasticsearch import helpers
# Elasticsearch client shared by the index-settings and search cells below.
# No host is passed, so the client library's default connection is used.
indexing_distinct_tables = Elasticsearch(
    retry_on_timeout=True,
    max_retries=10,
    timeout=30,
)

In [2]:
def get_accuracy(idRankedTables, idQueryGoal):
    """Return 1 if any ranked row starts with the goal id, otherwise 0.

    Each element of ``idRankedTables`` is indexed with ``[0]`` and that value
    is compared with ``idQueryGoal``; scanning stops at the first match.
    """
    matched = any(ranked_row[0] == idQueryGoal for ranked_row in idRankedTables)
    return 1 if matched else 0

In [3]:
# Switch the default similarity of the 'distinct_tables' index to "classic".
# The index is closed around the settings update and reopened afterwards.
indexing_distinct_tables.indices.close(index='distinct_tables')
try:
    indexing_distinct_tables.indices.put_settings(index='distinct_tables', body={"index": {"similarity": {"default": {"type": "classic"}}}})
finally:
    # Reopen even when the settings update raises, so a failed update never
    # leaves the index closed (and unsearchable) for the rest of the notebook.
    reopen_response = indexing_distinct_tables.indices.open(index='distinct_tables')

# Keep the reopen acknowledgement as the cell's displayed value.
reopen_response

{'acknowledged': True, 'shards_acknowledged': True}

In [17]:
def search_indexing(query,tableField):
    """Run a ``most_fields`` multi_match query on the ``distinct_tables`` index.

    Parameters
    ----------
    query : str
        Free-text query string.
    tableField : str or sequence of str
        Field name(s) to match against. An empty value (``""``, ``None``,
        ``[]``) falls back to the default pair
        ``["tablePgTitle", "tablePgFullText"]``.

    Returns
    -------
    The response of ``indexing_distinct_tables.search``: at most 300 hits
    starting at offset 0, with ``_source`` limited to ``tablePgID``.
    """
    # Previously the argument was ignored and the two default fields were
    # always searched, so per-field evaluations all produced the same ranking.
    if not tableField:
        fields = ["tablePgTitle","tablePgFullText"]
    elif isinstance(tableField, str):
        fields = [tableField]
    else:
        fields = list(tableField)

    result= indexing_distinct_tables.search(
        index="distinct_tables", 
        body = {
        "_source": ["tablePgID"],
        "from" : 0,
        "size" : 300,
        "query": {
            "multi_match":{
              "type": "most_fields",
              "query":    query, 
              "fields": fields
            }
        }
    })
    
    return result

In [5]:
# Load the test articles; execute_search reads page_id, page_title and
# meta_description from each row of this frame.
articles = pd.read_csv(
    'dataset/data_articles_test.csv',
    delimiter=',',
)

In [10]:
# Preview the first article row to check the loaded columns.
articles.head(n=1)

Unnamed: 0,full_text,keywords,meta_description,meta_keywords,page_id,page_title,summary,tags
0,when comes pioneers progressive rock handful b...,faces anderson album chris jon went record tre...,yes among year rock roll hall fame inductees c...,,4125505,rock roll hall famers jon anderson trevor rabi...,because good photos the best yes album coverss...,


In [16]:
def execute_search(tableField, topK):
    """Print and return Acc@topK of the table search over the ``articles`` frame.

    For every row of the global ``articles`` DataFrame the query is the page
    title and meta description joined by a space. The hits returned by
    ``search_indexing(query, tableField)`` are cut at the ``topK``-th highest
    score, keeping every hit tied with that score, and ``get_accuracy`` checks
    whether the article's ``page_id`` is among the kept ``tablePgID`` values.

    Parameters
    ----------
    tableField : forwarded unchanged to ``search_indexing``.
    topK : int
        Number of top-scoring hits that define the score cut-off.

    Returns
    -------
    float
        Unrounded mean of the per-article 0/1 results (NaN when no article
        produced any hit). The value rounded to 4 decimals is also printed.

    Notes
    -----
    Articles whose query returns no hits are left out of the mean rather than
    counted as misses, as in the original implementation.
    """
    accuracy = []

    for _, row in articles.iterrows():

        articl_key = row['page_id']

        # Skip missing (NaN) parts: plain ``str + NaN`` raises TypeError.
        query_parts = (row['page_title'], row['meta_description'])
        query = " ".join(str(part) for part in query_parts if pd.notna(part))

        result = search_indexing(query, tableField)
        hits = result['hits']['hits']

        # Test the hit list itself instead of ``hits.total``; ``total`` is an
        # int on older Elasticsearch versions but an object on newer ones.
        if not hits:
            continue

        scored_tables = [(hit['_source']['tablePgID'], hit['_score']) for hit in hits]

        # Score of the topK-th best hit; slicing mirrors DataFrame.head(topK),
        # including the empty result for topK == 0.
        top_scores = sorted((score for _, score in scored_tables), reverse=True)[:topK]
        min_score = min(top_scores) if top_scores else float('nan')

        # Keep every table scoring at least the cut-off (ties included), as
        # one-element rows because get_accuracy reads ``row[0]``.
        final_ranked_tables = [[table_id] for table_id, score in scored_tables if score >= min_score]

        accuracy.append(get_accuracy(final_ranked_tables, articl_key))

    mean_accuracy = np.mean(accuracy) if accuracy else float('nan')
    print(str(round(mean_accuracy,4)))

    return mean_accuracy

In [7]:
# Index fields iterated by the per-field evaluation loop.
tableField = [
    "tablePgTitle",
    "tablePgFullText",
    "tablePgSummary",
    "tablePgKeywords",
    "tableSectionTitle",
    "tableCaption",
    "tableHeader",
    "tableBody",
]
# Cut-offs K for which Acc@K is reported.
topK = [1, 5, 10, 20, 100]

In [None]:
# Report Acc@k for every (k, field) combination; a blank line precedes each k.
for k in topK:
    print("")
    for field in tableField:
        print("Acc@{} - {}".format(k, field))
        execute_search(field, k)

In [18]:
# Report Acc@k for each cut-off, passing an empty field name to execute_search.
for k in topK:
    print("Acc@{}".format(k))
    execute_search("", k)

Acc@1
0.2885
Acc@5
0.4084
Acc@10
0.4581
Acc@20
0.4922
Acc@100
0.577
