In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
from elasticsearch import Elasticsearch
from elasticsearch import helpers
# Elasticsearch client used by every index-management and search call in this notebook.
# No host is passed here, so whatever connection defaults the client library has apply.
# NOTE(review): `max_content_length` is handed to the client as an extra keyword —
# confirm the installed elasticsearch-py version actually honours it.
indexingTables = Elasticsearch(max_content_length=1000)

In [2]:
# Load the table corpus. The file is read without a header row, so the
# resulting columns are addressed by position (0, 1, 2, ...).
tables = pd.read_csv(
    "articlesDataset/cleanDataTables",
    sep=",",
    header=None,
)

In [3]:
# Expose the whole table frame as an array of rows for positional access.
formattedTables = tables.values

In [4]:
def getAccuracy(idRankedTables, idQueryGoal):
    """Report whether the goal id occurs anywhere in a ranked list of ids.

    Parameters
    ----------
    idRankedTables : iterable
        Ids of the retrieved tables, in ranking order.
    idQueryGoal : object
        Id that counts as the correct answer for the query.

    Returns
    -------
    int
        1 if any element of ``idRankedTables`` compares equal to
        ``idQueryGoal``, otherwise 0 (also 0 for an empty ranking).
    """
    # any() short-circuits on the first matching id.
    found = any(idTable == idQueryGoal for idTable in idRankedTables)
    return 1 if found else 0

In [5]:
def createIndexing():
    """Yield one bulk-index action for every row of ``formattedTables``.

    Each action targets index ``tables`` (type ``tables``). Column 0 of the
    row is stored unchanged as ``tablePgID``; columns 1-5 are converted with
    ``str()`` and stored under the text field names listed below, in order.
    Progress over the rows is reported through ``tqdm``.

    Yields
    ------
    dict
        An action dict with ``_index``, ``_type`` and ``_source`` keys.
    """
    # Text fields in the same order as row columns 1..5.
    textFields = (
        "tablePgTitle",
        "tableSectionTitle",
        "tableCaption",
        "tableHeader",
        "tableBody",
    )

    for row in tqdm(formattedTables):
        document = {"tablePgID": row[0]}
        for position, fieldName in enumerate(textFields, start=1):
            document[fieldName] = str(row[position])

        yield {
            "_index": "tables",
            "_type": "tables",
            "_source": document,
        }

In [6]:
# Drop any previous copy of the 'tables' index so the bulk load starts from scratch.
# ignore_unavailable=True lets this cell succeed when the index does not exist yet
# (first run / fresh cluster) instead of aborting the notebook with a not-found error.
indexingTables.indices.delete(index='tables', ignore_unavailable=True)

{'acknowledged': True}

In [7]:
# Send every action produced by the createIndexing() generator to Elasticsearch
# through the bulk helper, using the shared client.
helpers.bulk(indexingTables, createIndexing())

100%|██████████| 298793/298793 [01:51<00:00, 2689.64it/s]


(298793, [])

In [8]:
# Close the 'tables' index; the similarity setting is changed while it is closed
# and the index is reopened afterwards.
indexingTables.indices.close(index='tables')

{'acknowledged': True}

In [9]:
# Set the default similarity of the 'tables' index to the "classic" type.
# This runs between the close() and open() calls on the same index.
indexingTables.indices.put_settings(index='tables', body={"index": {"similarity": {"default": {"type": "classic"}}}})

{'acknowledged': True}

In [10]:
# Reopen the 'tables' index so it can be queried again after the settings change.
indexingTables.indices.open(index='tables')

{'acknowledged': True, 'shards_acknowledged': True}

In [11]:
def searchIndexingMLT(query, tableField, topK):
    """Run a ``more_like_this`` query against one field of the ``tables`` index.

    Parameters
    ----------
    query : str
        Text to find similar documents for. It is wrapped in literal square
        brackets by string concatenation before being sent as ``like``.
    tableField : str
        Name of the single index field to compare against.
    topK : int
        Maximum number of hits to request (``size``), starting at offset 0.

    Returns
    -------
    The raw response of ``indexingTables.search``; only ``tablePgID`` is
    requested in each hit's ``_source``.
    """
    moreLikeThis = {
        "fields": [tableField],
        "like": "[" + query + "]",
        "min_term_freq": 0,
        "min_doc_freq": 0,
        "max_query_terms": 50,
    }

    searchBody = {
        "_source": ["tablePgID"],
        "from": 0,
        "size": topK,
        "query": {"more_like_this": moreLikeThis},
    }

    return indexingTables.search(index="tables", body=searchBody)

In [12]:
def searchIndexing(query, tableField, topK):
    """Run a ``multi_match`` (``most_fields``) query on one field of ``tables``.

    Parameters
    ----------
    query : object
        Query text, passed through unchanged as the ``query`` value.
    tableField : str
        Name of the single index field to search.
    topK : int
        Maximum number of hits to request (``size``), starting at offset 0.

    Returns
    -------
    The raw response of ``indexingTables.search``; only ``tablePgID`` is
    requested in each hit's ``_source``.
    """
    multiMatch = {
        "type": "most_fields",
        "query": query,
        "fields": [tableField],
    }

    searchBody = {
        "_source": ["tablePgID"],
        "from": 0,
        "size": topK,
        "query": {"multi_match": multiMatch},
    }

    return indexingTables.search(index="tables", body=searchBody)

In [13]:
# Load the evaluation articles (read without a header row) and expose them
# as an array of rows for positional access.
articles = pd.read_csv(
    "articlesDataset/articlesTestDataset",
    sep=",",
    header=None,
)
formattedArticles = articles.values

In [14]:
def executeSearch(tableField, topK):
    """Evaluate retrieval on one index field over every row of ``formattedArticles``.

    For each article row, column 1 (stringified) is used as the query for
    ``searchIndexing`` and column 0 is the id a hit must carry to count as
    correct. ``getAccuracy`` scores each query as 1 (goal id among the
    returned ``tablePgID`` values) or 0.

    Parameters
    ----------
    tableField : str
        Index field to search.
    topK : int
        Number of hits requested per query.

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-query scores as computed by ``np.mean`` and
        ``np.std`` (unrounded). The same two values, rounded to 4 decimals,
        are also printed as ``"<mean> (±) <std>"``.
    """
    scores = []

    for article in formattedArticles:
        articleKey = article[0]
        articleTitle = str(article[1])

        result = searchIndexing(articleTitle, tableField, topK)

        # Collect the page ids of the returned tables in ranking order.
        idRankedTables = [hit['_source']['tablePgID'] for hit in result['hits']['hits']]

        scores.append(getAccuracy(idRankedTables, articleKey))

    meanScore = np.mean(scores)
    stdScore = np.std(scores)

    print(str(round(meanScore,4))+" (±) "+str(round(stdScore,4)))

    # Returned in addition to printing so callers can tabulate or plot the results.
    return meanScore, stdScore

In [15]:
# Index fields to evaluate; the evaluation loop runs one search pass per field.
tableField = ["tablePgTitle","tableSectionTitle","tableCaption","tableHeader","tableBody"]
# Numbers of hits (k) requested per query; results are reported once per value.
topK = [1,10,100,1000]

In [16]:
# For every cut-off k: print an "Acc@k" header, then one result line per
# field, in the order the fields appear in tableField.
for k in topK:
    print("Acc@" + str(k))
    for field in tableField:
        executeSearch(field, k)

Acc@1
0.1899 (±) 0.3922
0.0035 (±) 0.0591
0.0049 (±) 0.0698
0.0054 (±) 0.0733
0.0138 (±) 0.1167
Acc@10
0.2818 (±) 0.4499
0.0085 (±) 0.0918
0.0132 (±) 0.1141
0.0191 (±) 0.1369
0.0512 (±) 0.2204
Acc@100
0.3786 (±) 0.485
0.0265 (±) 0.1606
0.0395 (±) 0.1948
0.0469 (±) 0.2114
0.1268 (±) 0.3327
Acc@1000
0.466 (±) 0.4988
0.0691 (±) 0.2536
0.0902 (±) 0.2865
0.096 (±) 0.2946
0.2916 (±) 0.4545
