In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from lxml import html
import requests
import re
import csv
from sklearn.model_selection import train_test_split
from elasticsearch import Elasticsearch
from elasticsearch import helpers
# Shared Elasticsearch client used by every search in this notebook.
# No host is passed, so the client library's default endpoint is used
# (presumably a local node -- confirm for your environment).
indexing_distinct_tables = Elasticsearch(
    timeout=30,             # per-request timeout handed to the client
    max_retries=10,         # retry budget handed to the client
    retry_on_timeout=True,  # timeouts count as retryable failures
)

In [None]:
def search_indexing_match(query, size=100, fields=("tablePgTitle", "tablePgSummary")):
    """Run a ``multi_match`` search against the ``distinct_tables`` index.

    Args:
        query: Free-text query string.
        size: Maximum number of hits requested from the index. Defaults to
            100.
        fields: Names of the index fields the query is matched against.
            Defaults to the table page title and the table page summary.

    Returns:
        The response of ``indexing_distinct_tables.search`` exactly as the
        client returns it. Callers in this notebook read
        ``response['hits']['max_score']`` and ``response['hits']['hits']``.
    """
    # Document fields sent back with every hit (the "_source" filter).
    returned_fields = [
        "tablePgID",
        "tablePgTitle",
        "tablePgFullText",
        "tablePgMetaDescription",
        "tablePgSummary",
        "tablePgKeywords",
        "tableSectionTitle",
        "tableCaption",
        "tableHeader",
        "tableBody",
    ]

    search_body = {
        "_source": returned_fields,
        "from": 0,
        "size": size,
        "query": {
            "multi_match": {
                "type": "most_fields",
                "query": query,
                # Copy into a list so any iterable of names can be passed in.
                "fields": list(fields),
            }
        },
    }

    return indexing_distinct_tables.search(index="distinct_tables", body=search_body)

In [None]:
# Training articles. Later cells read the page_id, page_title and
# meta_description columns of this frame.
articles = pd.read_csv(
    '../../dataset/data_articles_train.csv',
    sep=',',
)

In [None]:
# Show the first row as a quick check of the loaded columns.
articles.head(n=1)

In [None]:
# A table only counts as a match when its score is at least this fraction of
# the best score returned for the same query.
MIN_NORMALIZED_SCORE = 0.7

# One dict per (article, table) positive pair; turned into a DataFrame later.
true_pairs_by_cossine = []
# (article id, error description) for every article whose lookup raised.
failed_lookups = []

# total= lets tqdm show a percentage and ETA; iterrows() alone has no length.
for _, row in tqdm(articles.iterrows(), total=len(articles)):

    articleID = row['page_id']
    # A missing cell is not a string, so map it to '' before concatenating.
    articleTitle = '' if pd.isna(row['page_title']) else str(row['page_title'])
    article_meta_description = (
        '' if pd.isna(row['meta_description']) else str(row['meta_description'])
    )

    query = articleTitle + " " + article_meta_description
    if not query.strip():
        # Neither a title nor a description: nothing to search for.
        continue

    try:
        result = search_indexing_match(query)
        max_score = result['hits']['max_score']
        if not max_score:
            # No top score to normalize against (e.g. the search had no hits).
            continue

        for hit in result['hits']['hits']:
            table = hit['_source']

            # Only a table whose page id equals the article's id is a true pair.
            # NOTE(review): the ids are compared as-is; confirm that page_id in
            # the CSV and tablePgID in the index have the same type.
            if table['tablePgID'] != articleID:
                continue
            if hit['_score'] / max_score < MIN_NORMALIZED_SCORE:
                continue

            true_pairs_by_cossine.append({
                "article_id": articleID,
                "article_page_title": articleTitle,
                "article_meta_description": article_meta_description,
                "table_id": table['tablePgID'],
                "table_page_title": table['tablePgTitle'],
                "table_page_summary": table['tablePgSummary'],
                "label": '1',
            })
            # At most one positive pair is kept per article.
            break
    except Exception as exc:
        # Best effort: one failing lookup must not abort the run, but it is
        # recorded rather than hidden. KeyboardInterrupt is not an Exception
        # subclass, so the loop can still be interrupted.
        failed_lookups.append((articleID, repr(exc)))

if failed_lookups:
    print("{} article(s) skipped because the lookup failed; first failure: {}".format(
        len(failed_lookups), failed_lookups[0]))

In [None]:
# Name the columns explicitly so the frame, and the CSV written from it, keeps
# its header even when no pair was collected.
df_true_samples = pd.DataFrame(
    true_pairs_by_cossine,
    columns=[
        "article_id",
        "article_page_title",
        "article_meta_description",
        "table_id",
        "table_page_title",
        "table_page_summary",
        "label",
    ],
)

In [None]:
# Write the positive pairs to disk; index=False keeps the row index out of the file.
df_true_samples.to_csv(
    path_or_buf='true_pairs_by_cosine_070.csv',
    index=False,
)

In [None]:
# Number of positive pairs collected (rows of the frame).
df_true_samples.shape[0]