In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
from elasticsearch import Elasticsearch
from elasticsearch import helpers
indexingTables = Elasticsearch(timeout=30, max_retries=10, retry_on_timeout=True)

In [2]:
def get_accuracy(idRankedTables, idQueryGoal):
    
    accuracy = 0

    for idTable in idRankedTables:
    
        if idTable == idQueryGoal:
    
            accuracy = 1
            break;

    return accuracy

In [3]:
indexingTables.indices.close(index='tables')
indexingTables.indices.put_settings(index='tables', body={"index": {"similarity": {"default": {"type": "classic"}}}})
indexingTables.indices.open(index='tables')

{'acknowledged': True, 'shards_acknowledged': True}

In [4]:
def searchIndexing(query, tableField, topK):
    
    result= indexingTables.search(
        index="tables", 
        body = {
        "_source": ["tablePgID"],
        "from" : 0,
        "size" : topK,
        "query": {
            "multi_match":{
              "type": "most_fields",
              "query":    query, 
              "fields": ["tablePgTitle"] 
              #"fields": ["tablePgTitle","tableHeader","tableBody"] 
              #"fields": [tableField]
            }
        }
    })
    
    return result

In [5]:
articles = pd.read_csv('dataset/test_dataset_all_articles', delimiter=',', header=None)
formattedArticles = articles.iloc[:,:].values

In [6]:
def executeSearch(tableField, topK):

    accuracy = []

    for articles in (formattedArticles[0:50]):
    
        articl_key = articles[0]
        article_title = str(articles[1])
        article_full_text = str(articles[2][0:6000])
        article_meta_description = str(articles[3])
        article_summary = str(articles[4])
        keywords = str(articles[5])
        
        catch = article_title +" "+article_meta_description+" "+keywords
        
        result = searchIndexing(catch,tableField,topK)
    
        idRankedTables = []
    
        for hit in result['hits']['hits']:
    
            tablePgId = hit['_source']['tablePgID']
    
            idRankedTables.append(tablePgId)
    
        accuracy.append(get_accuracy(idRankedTables,articl_key))

    print(str(round(np.mean(accuracy),4)))

In [7]:
tableField = ["tablePgTitle","tableSectionTitle","tableCaption","tableHeader","tableBody"]
topK = [1,5,10,20,50,100,1000]

In [8]:
for k in topK:
    
    print("")
    
    for field in tableField:
        
        print("Acc@"+str(k)+" - "+field)
        
        executeSearch(field,k)


Acc@1 - tablePgTitle
0.34
Acc@1 - tableSectionTitle
0.34
Acc@1 - tableCaption
0.34
Acc@1 - tableHeader
0.34
Acc@1 - tableBody
0.34

Acc@5 - tablePgTitle
0.44
Acc@5 - tableSectionTitle
0.44
Acc@5 - tableCaption
0.44
Acc@5 - tableHeader
0.44
Acc@5 - tableBody
0.44

Acc@10 - tablePgTitle
0.5
Acc@10 - tableSectionTitle
0.5
Acc@10 - tableCaption
0.5
Acc@10 - tableHeader
0.5
Acc@10 - tableBody
0.5

Acc@20 - tablePgTitle
0.52
Acc@20 - tableSectionTitle
0.52
Acc@20 - tableCaption
0.52
Acc@20 - tableHeader
0.52
Acc@20 - tableBody
0.52

Acc@50 - tablePgTitle
0.58
Acc@50 - tableSectionTitle
0.58
Acc@50 - tableCaption
0.58
Acc@50 - tableHeader
0.58
Acc@50 - tableBody
0.58

Acc@100 - tablePgTitle
0.62
Acc@100 - tableSectionTitle
0.62
Acc@100 - tableCaption
0.62
Acc@100 - tableHeader
0.62
Acc@100 - tableBody
0.62

Acc@1000 - tablePgTitle
0.7
Acc@1000 - tableSectionTitle
0.7
Acc@1000 - tableCaption
0.7
Acc@1000 - tableHeader
0.7
Acc@1000 - tableBody
0.7


In [14]:
for k in topK:
    
    print("Acc@"+str(k))
    
    executeSearch("",k)

Acc@1
0.34
Acc@5
0.44
Acc@10
0.5
Acc@20
0.52
Acc@50
0.58
Acc@100
0.62
Acc@1000
0.7
