In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from lxml import html
import requests
import re
import csv
from sklearn.model_selection import train_test_split
from elasticsearch import Elasticsearch
from elasticsearch import helpers
# Shared Elasticsearch client used by every search in this notebook. No host is
# passed, so the client library's default connection settings apply. Requests
# use a timeout of 30 and are retried on timeout, with max_retries=10.
indexing_distinct_tables = Elasticsearch(
    retry_on_timeout=True,
    max_retries=10,
    timeout=30,
)

In [2]:
def search_indexing_match(query, size=100, index="distinct_tables", client=None):
    """Search the table index for pages whose title/summary match ``query``.

    Runs a ``multi_match`` query of type ``most_fields`` over the
    ``tablePgTitle`` and ``tablePgSummary`` fields and returns the first
    ``size`` hits (offset 0).

    Parameters
    ----------
    query : str
        Free text to match against the table page title and summary.
    size : int, optional
        Maximum number of hits to request. Defaults to 100.
    index : str, optional
        Name of the index to search. Defaults to ``"distinct_tables"``.
    client : optional
        Object exposing ``search(index=..., body=...)``. When ``None`` the
        notebook-level ``indexing_distinct_tables`` client is used.

    Returns
    -------
    The response returned by ``client.search`` unchanged. Callers in this
    notebook read ``['hits']['max_score']`` and ``['hits']['hits']`` from it.
    """
    # Resolve the global lazily so the function can be driven by a stub client
    # without a running cluster.
    es_client = indexing_distinct_tables if client is None else client

    request_body = {
        # Restrict the returned document to the fields used downstream.
        "_source": ["tablePgID","tablePgTitle","tablePgFullText","tablePgMetaDescription","tablePgSummary","tablePgKeywords","tableSectionTitle","tableCaption","tableHeader","tableBody"],
        "from": 0,
        "size": size,
        "query": {
            "multi_match": {
                "type": "most_fields",
                "query": query,
                "fields": ["tablePgTitle", "tablePgSummary"],
            }
        },
    }

    return es_client.search(index=index, body=request_body)

In [3]:
# Load the training articles; the loop further down reads the `page_id`,
# `page_title`, `meta_description` and `keywords` columns.
articles = pd.read_csv(
    '../../dataset/data_articles_train.csv',
    sep=',',
)

In [4]:
# Peek at the first record to check the column layout.
articles.head(n=1)

Unnamed: 0,full_text,keywords,meta_description,meta_keywords,page_id,page_title,summary,tags
0,reality star heidi montag recently sat talk ex...,recently surgery implants reality size shes ge...,reality star heidi montag recently sat talk ex...,health problems breast augmentation heidi mont...,5765329,heidi montag gets plastic surgery removes over...,reality star heidi montag recently sat talk ex...,


In [5]:
# Build positive (label '1') article/table pairs: an article is paired with a
# table page when that page has the same id as the article and its search
# score is close enough to the best hit returned for the article's query.

# Minimum score, relative to the best hit of the same query, that the
# matching table page must reach to be kept.
MIN_NORMALIZED_SCORE = 0.7

true_pairs_by_cossine = []

# (page_id, reason) for every article that produced no lookup result because
# of missing text or an error. These articles contribute no pair, but they are
# recorded here instead of being dropped silently.
skipped_articles = []

# iterrows() has no length, so pass `total` explicitly to get a real
# progress bar and ETA instead of a bare iteration counter.
for _, row in tqdm(articles.iterrows(), total=len(articles)):

    articleID = row['page_id']
    articleTitle = row['page_title']
    article_meta_description = row['meta_description']
    article_keywords = row['keywords']

    # Missing CSV values are not strings and cannot be concatenated into a
    # query; skip such rows explicitly instead of failing mid-run.
    text_fields = (articleTitle, article_meta_description, article_keywords)
    if not all(isinstance(field, str) for field in text_fields):
        skipped_articles.append((articleID, "missing text field"))
        continue

    query = articleTitle+" "+article_meta_description+" "+article_keywords

    try:

        result = search_indexing_match(query)
        max_score = result['hits']['max_score']

        for hit in result['hits']['hits']:

            tableID = hit['_source']['tablePgID']
            tableTitle = hit['_source']['tablePgTitle']
            tablePgSummary = hit['_source']['tablePgSummary']
            tablePgKeywords = hit['_source']['tablePgKeywords']

            # Score of this hit relative to the top hit for the same query.
            normalized_score = hit['_score']/max_score

            if (articleID == tableID) and (normalized_score >= MIN_NORMALIZED_SCORE):

                true_pairs_by_cossine.append({
                    "article_id": articleID,
                    "article_page_title": articleTitle,
                    "article_meta_description": article_meta_description,
                    "article_keywords": article_keywords,
                    "table_id": tableID,
                    "table_page_title": tableTitle,
                    "table_page_summary": tablePgSummary,
                    "table_page_keywords": tablePgKeywords,
                    "label": '1'})

                # At most one pair is kept per article: the first qualifying hit.
                break

    except Exception as error:
        # Best effort: one failing lookup must not abort the whole run.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        skipped_articles.append((articleID, repr(error)))

if skipped_articles:
    print(f"{len(skipped_articles)} of {len(articles)} articles were skipped; "
          "see `skipped_articles` for the reasons")

204107it [42:09, 80.69it/s]


In [6]:
# One row per collected pair; columns come from the keys of the pair dicts.
df_true_samples = pd.DataFrame(data=true_pairs_by_cossine)

In [7]:
# Persist the positive pairs without writing the DataFrame index as a column.
df_true_samples.to_csv(
    'true_pairs_by_cosine_070_title_main_passage_keywords.csv',
    index=False,
)

In [8]:
# Number of positive pairs collected.
len(df_true_samples.index)

94167