In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from lxml import html
import requests
import re
import csv
from elasticsearch import Elasticsearch
from elasticsearch import helpers
indexingTables = Elasticsearch(timeout=30, max_retries=10, retry_on_timeout=True)
from sklearn.model_selection import train_test_split

In [2]:
def search_indexing_match(articleTitle):
    
    result= indexingTables.search(
        index="tables", 
        body = {
        "_source": ["tablePgID","tablePgTitle","tableSectionTitle","tableCaption","tableHeader","tableBody"],
        "from" : 0,
        "size" : 1000,
        "query": {
            "multi_match":{
              "type": "most_fields",
              "query":    articleTitle, 
              "fields": ["tablePgTitle"] 
            }
        }
    })
    
    return result

In [3]:
def search_index_random():
    
    result= indexingTables.search(
        index="tables", 
        body = {
        "_source": ["tablePgID","tablePgTitle","tableSectionTitle","tableCaption","tableHeader","tableBody"],
        "from" : 0,
        "size" : 100,
        "query": {
            "function_score": {
                "random_score": {}
            }
        }
    })
    
    return result

In [4]:
read = pd.read_csv('../dataset/train_dataset_all_articles', delimiter=',', header=None)
train_data_articles = read.iloc[:,:].values

In [5]:
read = pd.read_csv('../dataset/test_dataset_all_articles', delimiter=',', header=None)
validation_data_articles = read.iloc[:,:].values

In [6]:
train_dataset = []

In [7]:
for article in tqdm(train_data_articles):
    
    articleID = article[0]
    articleTitle = article[1]
    articleText = article[2]
    meta_description = article[3]
    keywords = article[5]
    
    catch = articleTitle+" "+meta_description+" "+keywords
    
    #getting the table title match
    contMatch = 0
    contNonMatch = 0
    result = search_indexing_match(articleTitle)
    max_score = result['hits']['max_score']
    
    
    for hit in result['hits']['hits']:
    
        tableID = hit['_source']['tablePgID']
        tableTitle = (hit['_source']['tablePgTitle'])
        tableSectionTitle = (hit['_source']['tableSectionTitle'])
        tableCaption = (hit['_source']['tableCaption'])
        tableHeader = (hit['_source']['tableHeader'])
        tableBody = (hit['_source']['tableBody'])
        table_score = hit['_score']
        
        normalized_score = table_score/max_score
        
        #getting match data
        if (articleID == tableID) and (normalized_score >= 0.7): 
            
            if contMatch < 5:
            
                tableIDMatch = tableID
                tableTitleMatch = tableTitle
                tableSectionTitleMatch = tableSectionTitle
                tableCaptionMatch = tableCaption
                tableHeaderMatch = tableHeader
                tableBodyMatch = tableBody
    
                train_dataset.append([articleID,articleTitle, meta_description, keywords, tableIDMatch, tableTitleMatch,1])
                
                contMatch = contMatch + 1
                
            else:
                
                contMatch = 0
                break

100%|██████████| 184619/184619 [37:20<00:00, 82.38it/s] 


In [9]:
len(train_dataset)

154249

In [13]:
for article in tqdm(train_data_articles):
    
    articleID = article[0]
    articleTitle = article[1]
    articleText = article[2]
    meta_description = article[3]
    keywords = article[5]
    
    catch = articleTitle+" "+meta_description+" "+keywords
    
    #getting the table title match
    contMatch = 0
    contNonMatch = 0
    result = search_index_random()
    max_score = result['hits']['max_score']
    
    
    for hit in result['hits']['hits']:
    
        tableID = hit['_source']['tablePgID']
        tableTitle = (hit['_source']['tablePgTitle'])
        tableSectionTitle = (hit['_source']['tableSectionTitle'])
        tableCaption = (hit['_source']['tableCaption'])
        tableHeader = (hit['_source']['tableHeader'])
        tableBody = (hit['_source']['tableBody'])
        table_score = hit['_score']
        
        normalized_score = table_score/max_score
        
        #getting the table title non-match
        if (articleID != tableID) : 
            
            if contNonMatch < 2:
                
                tableIDNoMatch = tableID
                tableTitleNoMatch = tableTitle
                tableSectionTitleNoMatch = tableSectionTitle
                tableCaptionNoMatch = tableCaption
                tableHeaderNoMatch = tableHeader
                tableBodyNoMatch = tableBody

                train_dataset.append([articleID,articleTitle, meta_description, keywords, tableIDNoMatch,tableTitleNoMatch,0])
                
                contNonMatch = contNonMatch + 1
            else:
                
                contNonMatch = 0
                break
                        

100%|██████████| 184619/184619 [11:10<00:00, 275.40it/s]


In [14]:
len(train_dataset)

540559

In [15]:
with open('train_dataset_random_1_1', 'w') as myfile:
          
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)

    for i in tqdm(range(0,len(train_dataset))):
    
        articlePgID = train_dataset[i][0]
        articleTitle = str(train_dataset[i][1])
        article_meta_description = str(train_dataset[i][2])
        article_keywords = str(train_dataset[i][3])
        tableID = train_dataset[i][4]
        tableTitle = str(train_dataset[i][5])
        match = train_dataset[i][6]
        
        row = [articlePgID, articleTitle, article_meta_description, article_keywords, tableID, tableTitle, match]
        
        wr.writerow(row)

100%|██████████| 540559/540559 [00:02<00:00, 184011.00it/s]


In [None]:
validation_dataset = []

In [10]:
for article in tqdm(validation_data_articles):
    
    articleID = article[0]
    articleTitle = article[1]
    articleText = article[2]
    meta_description = article[3]
    keywords = article[5]
    
    catch = articleTitle+" "+meta_description+" "+keywords
    
    #getting the table title match
    contMatch = 0
    contNonMatch = 0
    result = search_indexing_match(articleTitle)
    max_score = result['hits']['max_score']
    
    
    for hit in result['hits']['hits']:
    
        tableID = hit['_source']['tablePgID']
        tableTitle = (hit['_source']['tablePgTitle'])
        tableSectionTitle = (hit['_source']['tableSectionTitle'])
        tableCaption = (hit['_source']['tableCaption'])
        tableHeader = (hit['_source']['tableHeader'])
        tableBody = (hit['_source']['tableBody'])
        table_score = hit['_score']
        
        normalized_score = table_score/max_score
        
        #getting match data
        if (articleID == tableID) and (normalized_score >= 0.7): 
            
            if contMatch < 5:
            
                tableIDMatch = tableID
                tableTitleMatch = tableTitle
                tableSectionTitleMatch = tableSectionTitle
                tableCaptionMatch = tableCaption
                tableHeaderMatch = tableHeader
                tableBodyMatch = tableBody
    
                train_dataset.append([articleID,articleTitle, meta_description, keywords, tableIDMatch, tableTitleMatch,1])
                
                contMatch = contMatch + 1
                
            else:
                
                contMatch = 0
                break

100%|██████████| 20514/20514 [04:19<00:00, 79.12it/s] 


In [12]:
len(train_dataset)

171321

In [None]:
for article in tqdm(validation_data_articles):
    
    articleID = article[0]
    articleTitle = article[1]
    articleText = article[2]
    meta_description = article[3]
    keywords = article[5]
    
    catch = articleTitle+" "+meta_description+" "+keywords
    
    #getting the table title match
    contMatch = 0
    contNonMatch = 0
    result = search_index_random()
    max_score = result['hits']['max_score']
    
    
    for hit in result['hits']['hits']:
    
        tableID = hit['_source']['tablePgID']
        tableTitle = (hit['_source']['tablePgTitle'])
        tableSectionTitle = (hit['_source']['tableSectionTitle'])
        tableCaption = (hit['_source']['tableCaption'])
        tableHeader = (hit['_source']['tableHeader'])
        tableBody = (hit['_source']['tableBody'])
        table_score = hit['_score']
        
        normalized_score = table_score/max_score
        
        #getting the table title non-match
        if (articleID != tableID) : 
            
            if contNonMatch < 5:
                
                tableIDNoMatch = tableID
                tableTitleNoMatch = tableTitle
                tableSectionTitleNoMatch = tableSectionTitle
                tableCaptionNoMatch = tableCaption
                tableHeaderNoMatch = tableHeader
                tableBodyNoMatch = tableBody

                validation_dataset.append([articleID,articleTitle, meta_description, keywords, tableIDNoMatch,tableTitleNoMatch,0])
                
                contNonMatch = contNonMatch + 1
            else:
                
                contNonMatch = 0
                break
                        

In [None]:
len(validation_dataset)

In [None]:
with open('validation_dataset_random_1_5', 'w') as myfile:
          
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)

    for i in tqdm(range(0,len(validation_dataset))):
    
        articlePgID = validation_dataset[i][0]
        articleTitle = str(validation_dataset[i][1])
        article_meta_description = str(validation_dataset[i][2])
        article_keywords = str(validation_dataset[i][3])
        tableID = validation_dataset[i][4]
        tableTitle = str(validation_dataset[i][5])
        match = validation_dataset[i][6]
        
        row = [articlePgID, articleTitle, article_meta_description, article_keywords, tableID, tableTitle, match]
        
        wr.writerow(row)