In [17]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
import gensim as gs
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
indexing_tables = Elasticsearch(timeout=30, max_retries=10, retry_on_timeout=True)
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

In [2]:
tf.__version__

'2.0.0-rc0'

In [18]:
test_articles = pd.read_csv('dataset/data_articles_test.csv', delimiter=',')

In [19]:
# embedding_model = gs.models.FastText.load_fasttext_format('pre_trained_models/cc.en.300.bin')
embedding_model = gs.models.FastText.load('embedding_model')

In [20]:
indexing_tables.indices.close(index='tables')
indexing_tables.indices.put_settings(index='tables', body={"index": {"similarity": {"default": {"type": "classic"}}}})
indexing_tables.indices.open(index='tables')

{'acknowledged': True, 'shards_acknowledged': True}

In [21]:
def search_indexing(query):
    
    result= indexing_tables.search(
        index="tables", 
        body = {
        "_source": ["tablePgID","tablePgTitle"],
        "from" : 0,
        "size" : 1000,
        "query": {
            "multi_match":{
              "type": "most_fields",
              "query":    query, 
              "fields": ["tablePgTitle"] 
            }
        }
    })
    
    return result

In [22]:
def get_accuracy(ID_goal,ranked_tables_ID):
    
    accuracy = 0
    
    for table_ID in ranked_tables_ID:
        
        if table_ID == ID_goal:
    
            accuracy = 1
            break;

    return accuracy

In [23]:
MAX_PAD = 9

def sequence_padding(X_DIM, value):
    
    value_padding = np.pad(value, ((0,MAX_PAD - X_DIM),(0,0)), 'constant')
    
    return value_padding

In [24]:
def create_embedding(value):

    value = tknzr.tokenize(str(value))
    
    if len(value) < MAX_PAD:
        
        embedding = embedding_model.wv[value]
        
        padding_embedding = sequence_padding(embedding.shape[0],embedding)
        
        return padding_embedding
        
    else:
        
        embedding = embedding_model.wv[value[0:MAX_PAD]]
        
        return embedding

In [25]:
def search_index(article_title):
    
    tables_index = []

    result_index = search_indexing(article_title)
        
    for hit in result_index['hits']['hits']:
    
        table_ID = hit['_source']['tablePgID']
        
        table_page_title = hit['_source']['tablePgTitle']
    
        tables_index.append([table_ID,table_page_title])
    
    return tables_index

In [35]:
ranking_model = tf.keras.models.load_model('attention_model_1_1.h5')

In [39]:
result = []

def run_search(k):
    
    TOP_K = k
    accuracy = []
    
    for i, row in tqdm(test_articles.iterrows()):
        
        article_ID = row['page_id']
        article_title_text = row['page_title']
       
        article_title = []
        title_table = []
        ranked_tables_model = []

        ranked_tables_index = search_index(article_title_text)
        
        if len(ranked_tables_index) > 0:
        
            article_title_embedding = create_embedding(article_title_text)

            for table_ID, table_title in (ranked_tables_index):

                table_title_embedding = create_embedding(str(table_title))

                article_title.append(article_title_embedding)
                title_table.append(table_title_embedding)

            article_title = np.array(article_title)
            title_table = np.array(title_table)

            table_ranking_model = ranking_model.predict([article_title,title_table])

            for i in range(0,len(table_ranking_model)):

                ranked_tables_model.append([ranked_tables_index[i][0],ranked_tables_index[i][1],table_ranking_model[i][0]]) 

            data_frame = pd.DataFrame(ranked_tables_model, columns = ['table_ID', 'table_title','table_ranking']) 
            data_frame_sorting = data_frame.sort_values('table_ranking', ascending=False)
            final_ranked_tables = data_frame_sorting.iloc[0:TOP_K,0:1].values

            accuracy.append(get_accuracy(article_ID, final_ranked_tables))

    result.append(["Acc@"+str(k),str(round(np.mean(accuracy),4))])

In [40]:
accuracy_K = [10]

for k in accuracy_K:
     
    run_search(k)

1026it [02:09,  7.31it/s]


In [41]:
result

[['Acc@10', '0.259']]

In [15]:
# [['Acc@1', '0.2268'],
#  ['Acc@5', '0.263'],
#  ['Acc@10', '0.2864'],
#  ['Acc@20', '0.3167'],
#  ['Acc@50', '0.3627'],
#  ['Acc@100', '0.4086']]

In [67]:
result = []


    
TOP_K = 10
accuracy = []


row = test_articles.iloc[279]
article_ID = row['page_id']
article_title_text = row['page_title']

article_title = []
title_table = []
ranked_tables_model = []

ranked_tables_index = search_index(article_title_text)

if len(ranked_tables_index) > 0:

    article_title_embedding = create_embedding(article_title_text)

    for table_ID, table_title in (ranked_tables_index):

        table_title_embedding = create_embedding(str(table_title))

        article_title.append(article_title_embedding)
        title_table.append(table_title_embedding)

    article_title = np.array(article_title)
    title_table = np.array(title_table)

    table_ranking_model = ranking_model.predict([article_title,title_table])

    for i in range(0,len(table_ranking_model)):

        ranked_tables_model.append([ranked_tables_index[i][0],ranked_tables_index[i][1],table_ranking_model[i][0]]) 

    data_frame = pd.DataFrame(ranked_tables_model, columns = ['table_ID', 'table_title','table_ranking']) 
    data_frame_sorting = data_frame.sort_values('table_ranking', ascending=False)
    final_ranked_tables = data_frame_sorting.iloc[0:TOP_K,0:1].values

    accuracy.append(get_accuracy(article_ID, final_ranked_tables))


result.append(["Acc@"+str(k),str(round(np.mean(accuracy),4))])

In [68]:
accuracy

[1]

In [61]:
article_title_text

'hobart perth direct flights begin september virgin readies head west'

In [62]:
article_ID

860952

In [65]:
for i, row in data_frame.iterrows():
    if row['table_ID'] == 860952:
        print(row)
    

table_ID                               860952
table_title      hobart international airport
table_ranking                               1
Name: 58, dtype: object
table_ID                               860952
table_title      hobart international airport
table_ranking                               1
Name: 60, dtype: object


In [66]:
data_frame_sorting

Unnamed: 0,table_ID,table_title,table_ranking
0,13699,hobart,1.000000e+00
1,13699,hobart,1.000000e+00
2,13699,hobart,1.000000e+00
58,860952,hobart international airport,1.000000e+00
4,6486153,hobart international,1.000000e+00
60,860952,hobart international airport,1.000000e+00
18,6486153,hobart international,1.000000e+00
86,34186050,sydney hobart yacht race,9.999993e-01
89,34186050,sydney hobart yacht race,9.999993e-01
90,1858574,sydney hobart yacht race,9.999993e-01
