In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
import gensim as gs
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from nltk.tokenize import TweetTokenizer
# Tweet-aware tokenizer from NLTK.
# NOTE(review): `tknzr` is not referenced in the visible cells — confirm it is still needed.
tknzr = TweetTokenizer()
# Elasticsearch client: request timeout of 30 (seconds, per the client API), with
# timed-out requests retried up to 10 times. No host is passed, so the client's
# default connection target is used.
indexing_tables = Elasticsearch(timeout=30, max_retries=10, retry_on_timeout=True)
import tensorflow as tf
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
from transformers import TFBertModel, BertTokenizer, TFBertMainLayer, BertConfig

In [2]:
# Display the installed TensorFlow version so the run can be reproduced.
tf.__version__

'2.2.0'

In [3]:
# Load the test articles (comma-delimited CSV) that the evaluation iterates over.
raw_articles = pd.read_csv('dataset/test_articles_dataset_newyork.csv', delimiter=',')

In [4]:
# Tokenizer for the pretrained "bert-base-cased" checkpoint; used to encode
# (article, table) text pairs before they are scored by the ranking model.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [5]:
# Passed as `max_length` to the tokenizer when encoding each article/table pair.
MAX_TOKENS = 200

In [6]:
# Switch the index's default similarity to BM25. The index is closed while the
# setting is changed and reopened afterwards. The reopen lives in `finally` so
# that a failed settings update cannot leave the index closed, which would make
# every later search in this notebook fail.
indexing_tables.indices.close(index='data_table_newyork')
try:
    indexing_tables.indices.put_settings(index='data_table_newyork', body={"index": {"similarity": {"default": {"type": "BM25"}}}})
finally:
    reopen_response = indexing_tables.indices.open(index='data_table_newyork')
# Keep the acknowledgement as the cell's displayed value.
reopen_response

{'acknowledged': True, 'shards_acknowledged': True}

In [7]:
def search_indexing(query):
    """Query the ``data_table_newyork`` index and return the raw search response.

    The query text is matched with a ``most_fields`` multi-match over the
    table page title, content and keywords. At most 30 hits are requested,
    and each hit's ``_source`` is limited to the table URL, page title and
    page summary.

    Parameters
    ----------
    query : str
        Free-text query to search for.

    Returns
    -------
    The response object returned by ``indexing_tables.search``.
    """
    request_body = {
        "_source": ["table_url", "table_page_title", "table_page_summary"],
        "from": 0,
        "size": 30,
        "query": {
            "multi_match": {
                "type": "most_fields",
                "query": query,
                "fields": ["table_page_title", "table_page_content", "table_page_keywords"],
            }
        },
    }

    return indexing_tables.search(index="data_table_newyork", body=request_body)

In [8]:
def get_accuracy(ID_goal,ranked_tables_ID):
    """Return 1 if ``ID_goal`` occurs in ``ranked_tables_ID``, otherwise 0.

    Parameters
    ----------
    ID_goal
        Identifier of the table that should have been retrieved.
    ranked_tables_ID
        Iterable of retrieved identifiers; each element is compared to
        ``ID_goal`` with ``==``.

    Returns
    -------
    int
        1 on the first match, 0 when nothing matches (including empty input).
    """
    for candidate_ID in ranked_tables_ID:
        if candidate_ID == ID_goal:
            return 1

    return 0

In [9]:
def search_index(article_title):
    """Retrieve candidate tables for a query string.

    Runs ``search_indexing`` on ``article_title`` and flattens every hit into
    a ``[table_url, table_page_title, table_page_summary]`` triple, keeping
    the order in which the hits were returned.

    Parameters
    ----------
    article_title : str
        Query text forwarded unchanged to ``search_indexing``.

    Returns
    -------
    list of list
        One three-element list per hit; empty when there are no hits.
    """
    response = search_indexing(article_title)

    return [
        [
            hit['_source']['table_url'],
            hit['_source']['table_page_title'],
            hit['_source']['table_page_summary'],
        ]
        for hit in response['hits']['hits']
    ]

In [10]:
# Load the saved Keras model used to score (article, table) pairs; it is
# called through `.predict()` during evaluation.
ranking_model = tf.keras.models.load_model('../learning_to_rank_models/bert/bert_model_title_main_passage_mlp')

In [11]:
# Print column names, non-null counts and dtypes of the loaded articles.
raw_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148 entries, 0 to 147
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   article_full_text         141 non-null    object
 1   article_key_match         148 non-null    object
 2   article_keywords          143 non-null    object
 3   article_main_passage      148 non-null    object
 4   article_meta_description  132 non-null    object
 5   article_meta_keywords     58 non-null     object
 6   article_summary           141 non-null    object
 7   article_tags              54 non-null     object
 8   article_title             148 non-null    object
 9   article_url               148 non-null    object
dtypes: object(10)
memory usage: 11.7+ KB


In [14]:
# Collected ["Acc@k", "<value>"] rows; run_search appends one row per call.
result = []

def run_search(k):
    """Measure Acc@k of the BERT re-ranker over every row of ``raw_articles``.

    For each article, candidate tables are fetched with ``search_index``,
    every (article, table) text pair is encoded with ``bert_tokenizer`` and
    scored by ``ranking_model``, and the candidates are sorted by descending
    score. The article counts as a hit (1) when its ``article_key_match``
    is among the ``k`` best-scored table IDs, otherwise as a miss (0).

    Side effect: appends ``["Acc@<k>", "<mean rounded to 4 places>"]`` to the
    module-level ``result`` list. Nothing is returned.

    NOTE(review): retrieval and scoring do not depend on ``k``, so calling this
    once per cut-off repeats the expensive model inference each time.

    Parameters
    ----------
    k : int
        Number of top-ranked tables inspected per article.
    """
    TOP_K = k
    accuracy = []

    # `total` lets tqdm render a real progress bar for the row generator.
    for _, row in tqdm(raw_articles.iterrows(), total=len(raw_articles)):

        article_ID = row['article_key_match']
        article_title_text = str(row['article_title'])
        # Convert to str before slicing so a missing (NaN) passage cannot raise.
        article_main_passage_text = str(row['article_main_passage'])[0:1000]
        # NOTE(review): this reads 'article_title' a second time although the
        # variable name suggests 'article_keywords'. Left unchanged because the
        # recorded accuracies were produced with this query — confirm intent.
        article_keywords_text = str(row['article_title'])

        catch = article_title_text+" "+article_main_passage_text+" "+article_keywords_text

        ranked_tables_index = search_index(catch)

        if not ranked_tables_index:
            # No candidates were retrieved: there is nothing to score, and the
            # model cannot be called on empty inputs. Count the article as a miss.
            accuracy.append(0)
            continue

        input_ids = []
        attention_masks = []
        token_type_ids = []

        for table_ID, table_title_index, table_page_content_index in ranked_tables_index:

            encoded_pair = bert_tokenizer.encode_plus(
              article_title_text+" "+article_main_passage_text,
              table_title_index+" "+table_page_content_index,
              max_length=MAX_TOKENS,
              add_special_tokens=True,
              return_token_type_ids=True,
              pad_to_max_length=True,
              return_attention_mask=True,
                )

            input_ids.append(encoded_pair['input_ids'])
            attention_masks.append(encoded_pair['attention_mask'])
            token_type_ids.append(encoded_pair['token_type_ids'])

        scores = ranking_model.predict(
            [np.array(input_ids), np.array(attention_masks), np.array(token_type_ids)]
        )

        # Pair each candidate with its own score; the first element of each
        # prediction row is used as the ranking score.
        ranked_tables_model = [
            [candidate[0], candidate[1], score[0]]
            for candidate, score in zip(ranked_tables_index, scores)
        ]

        data_frame = pd.DataFrame(ranked_tables_model, columns = ['table_ID', 'table_title','table_ranking'])
        data_frame_sorting = data_frame.sort_values('table_ranking', ascending=False)
        final_ranked_tables = data_frame_sorting.iloc[0:TOP_K,0:1].values

        accuracy.append(get_accuracy(article_ID, final_ranked_tables))

    result.append(["Acc@"+str(k),str(round(np.mean(accuracy),4))])

In [15]:
# Cut-offs k at which accuracy is evaluated.
accuracy_K = [1,5,10,20]

# Each call appends one row to the global `result` list, so re-running this
# cell without re-creating `result` adds duplicate rows.
for k in accuracy_K:
     
    run_search(k)

148it [01:02,  2.36it/s]
148it [00:49,  3.00it/s]
148it [00:49,  2.99it/s]
148it [00:49,  2.97it/s]


In [16]:
# Display the collected [metric name, value] rows, one per evaluated cut-off.
result

[['Acc@1', '0.5'],
 ['Acc@5', '0.6689'],
 ['Acc@10', '0.7635'],
 ['Acc@20', '0.8176']]