In [37]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from lxml import html
import requests
import re
import csv
from elasticsearch import Elasticsearch
from elasticsearch import helpers
indexingTables = Elasticsearch()
from sklearn.model_selection import train_test_split

In [38]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english')) 

In [39]:
def removeStopwords(example_sent):
    
    word_tokens = word_tokenize(example_sent) 
  
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
  
    filtered_sentence = [] 
  
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w)
    
    formattedText = ""

    for word in filtered_sentence:
        
        if (len(word)>2):
            
            formattedText = formattedText + " " +word
    
    
    formattedText = formattedText.lstrip()
    
    formattedText = formattedText.rstrip()
        
    return formattedText.lower()

In [None]:
articles = pd.read_csv('../dataset/formatted_data_articles_notnull', delimiter=',', header=None)
dataArticles = articles.iloc[:,:].values

In [None]:
distinctArticlesIDs = []
distinctArticles = []

for articles in tqdm(dataArticles):
    
    articlePgID = articles[0]
    
    if articlePgID not in distinctArticlesIDs:
        
        distinctArticlesIDs.append(articlePgID)
        
        distinctArticles.append(articles)

In [None]:
len(distinctArticles)

In [None]:
with open('distinct_formatted_articles_notnull', 'w') as myfile:
        
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)

    for articles in tqdm(distinctArticles):
        
        wr.writerow(articles)

In [40]:
#distinctArticles = pd.read_csv('../dataset/distinct_formatted_articles_notnull', delimiter=',', header=None)
#dataDistinctArticles = distinctArticles.iloc[:,:].values

distinctArticles = pd.read_csv('../dataset/formatted_data_articles_notnull', delimiter=',', header=None)
dataDistinctArticles = distinctArticles.iloc[:,:].values

In [41]:
len(distinctArticles)

205133

## Shuffle the data in train, validation and test

In [42]:
x_train, test_dataset = train_test_split(dataDistinctArticles, test_size=0.10, random_state=42)

## creating the test dataset

In [43]:
with open('test_dataset_all', 'w') as myfile:
          
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)

    for article in tqdm(test_dataset):
        
        wr.writerow(article)

100%|██████████| 20514/20514 [00:00<00:00, 24713.88it/s]


## creating the train dataset

In [44]:
with open('train_dataset_all', 'w') as myfile:
          
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)

    for article in tqdm(x_train):
        
        wr.writerow(article)

100%|██████████| 184619/184619 [00:07<00:00, 25041.25it/s]


In [45]:
read = pd.read_csv('../dataset/train_dataset_all', delimiter=',', header=None)
train_data_articles = read.iloc[:,:].values

In [46]:
train_dataset, validation_dataset = train_test_split(train_data_articles, test_size=0.05, random_state=42)

In [4]:
def searchIndexingMatch(articleID):
    
    result= indexingTables.search(
        index="tables", 
        body = {
        "_source": ["tablePgID","tablePgTitle","tableSectionTitle","tableCaption","tableHeader","tableBody"],
        "from" : 0,
        "size" : 1,
        "query": {
            "match" : {
                "tablePgID" : {
                    "query" : articleID
                    }
                }
        }
    })
    
    return result

In [47]:
def searchIndexingNoMatch(articleTitle):
    
    result= indexingTables.search(
        index="tables", 
        body = {
        "_source": ["tablePgID","tablePgTitle","tableSectionTitle","tableCaption","tableHeader","tableBody"],
        "from" : 0,
        "size" : 1000,
        "query": {
            "multi_match":{
              "type": "most_fields",
              "query":    articleTitle, 
              "fields": ["tablePgTitle"] 
            }
        }
    })
    
    return result

## creating the validation dataset

In [48]:
final_validation_dataset = []

for article in tqdm(validation_dataset):
    
    articleID = article[0]
    articleTitle = article[1]
    articleText = article[2]
    
    #getting the table title match
    contMatch = 0
    contNonMatch = 0
    result = searchIndexingNoMatch(articleTitle)
    max_score = result['hits']['max_score']
    
    
    for hit in result['hits']['hits']:
    
        tableID = hit['_source']['tablePgID']
        tableTitle = (hit['_source']['tablePgTitle'])
        tableSectionTitle = (hit['_source']['tableSectionTitle'])
        tableCaption = (hit['_source']['tableCaption'])
        tableHeader = (hit['_source']['tableHeader'])
        tableBody = (hit['_source']['tableBody'])
        table_score = hit['_score']
        
        normalized_score = table_score/max_score
        
        #getting match data
        if (articleID == tableID) and (normalized_score >= 0.8): 
            
            if contMatch < 1:
            
                tableIDMatch = tableID
                tableTitleMatch = tableTitle
                tableSectionTitleMatch = tableSectionTitle
                tableCaptionMatch = tableCaption
                tableHeaderMatch = tableHeader
                tableBodyMatch = tableBody
    
                final_validation_dataset.append([articleID,articleTitle, tableIDMatch, tableTitleMatch,1])
                
                contMatch = contMatch + 1
                
            else:
                
                contMatch = 0
                break
                
        
        #getting the table title non-match
        if (articleID != tableID) and (normalized_score < 0.3): 
            
            if contNonMatch < 5:
                
                tableIDNoMatch = tableID
                tableTitleNoMatch = tableTitle
                tableSectionTitleNoMatch = tableSectionTitle
                tableCaptionNoMatch = tableCaption
                tableHeaderNoMatch = tableHeader
                tableBodyNoMatch = tableBody

                final_validation_dataset.append([articleID,articleTitle, tableIDNoMatch,tableTitleNoMatch,0])
                
                contNonMatch = contNonMatch + 1
            else:
                
                contNonMatch = 0
                break
                        

100%|██████████| 9231/9231 [02:27<00:00, 62.60it/s]


In [49]:
len_validation  = len(final_validation_dataset)

In [50]:
len_validation

16632

In [51]:
with open('validation_dataset_1_5_all', 'w') as myfile:
          
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)

    for i in tqdm(range(0,len_validation)):
    
        articlePgID = final_validation_dataset[i][0]
        articleTitle = str(final_validation_dataset[i][1])
        #articleText = str(final_validation_dataset[i][2])
        
        tableID = final_validation_dataset[i][2]
        tableTitle = str(final_validation_dataset[i][3])
        #tableSectionTitle = str(final_validation_dataset[i][5])
        #tableCaption = str(final_validation_dataset[i][6])
        #tableHeader = str(final_validation_dataset[i][7])
        #tableBody = str(final_validation_dataset[i][8])
        match = final_validation_dataset[i][4]
        
        row = [articlePgID,articleTitle, tableID, tableTitle, match]
        
        wr.writerow(row)

100%|██████████| 16632/16632 [00:00<00:00, 321652.46it/s]


## creating the train dataset

In [None]:
# train_dataset = []

# for articles in tqdm(dataDistinctArticles[9902:len(dataDistinctArticles)]):
    
#     train_dataset.append(articles)

In [52]:
final_train_dataset = []

for article in tqdm(train_dataset):
    
    articleID = article[0]
    articleTitle = article[1]
    articleText = article[2]
    
    #getting the table title match
    contMatch = 0
    contNonMatch = 0
    result = searchIndexingNoMatch(articleTitle)
    max_score = result['hits']['max_score']
    
    for hit in result['hits']['hits']:
    
        tableID = hit['_source']['tablePgID']
        tableTitle = (hit['_source']['tablePgTitle'])
        tableSectionTitle = (hit['_source']['tableSectionTitle'])
        tableCaption = (hit['_source']['tableCaption'])
        tableHeader = (hit['_source']['tableHeader'])
        tableBody = (hit['_source']['tableBody'])
        table_score = hit['_score']
        
        normalized_score = table_score/max_score
        
        if (articleID == tableID) and (normalized_score >= 0.8): 
            
            if contMatch < 1:
            
                tableIDMatch = tableID
                tableTitleMatch = tableTitle
                tableSectionTitleMatch = tableSectionTitle
                tableCaptionMatch = tableCaption
                tableHeaderMatch = tableHeader
                tableBodyMatch = tableBody
    
                final_train_dataset.append([articleID,articleTitle, tableIDMatch, tableTitleMatch,1])
                
                contMatch = contMatch + 1
                
            else:
                
                contMatch = 0
                break
        
        if (articleID != tableID) and (normalized_score < 0.3): 
            
            if contNonMatch < 5:
                
                tableIDNoMatch = tableID
                tableTitleNoMatch = tableTitle
                tableSectionTitleNoMatch = tableSectionTitle
                tableCaptionNoMatch = tableCaption
                tableHeaderNoMatch = tableHeader
                tableBodyNoMatch = tableBody

                final_train_dataset.append([articleID,articleTitle, tableIDNoMatch,tableTitleNoMatch,0])
                
                contNonMatch = contNonMatch + 1
            else:
                
                contNonMatch = 0
                break
               

  7%|▋         | 12304/175388 [02:43<57:39, 47.13it/s]  

RequestError: RequestError(400, 'index_closed_exception', 'closed')

In [None]:
len_train = len(final_train_dataset)

In [None]:
len_train

In [None]:
with open('train_dataset_1_5_all', 'w') as myfile:
          
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)

    for i in tqdm(range(0,len_train)):
    
        articlePgID = final_train_dataset[i][0]
        articleTitle = str(final_train_dataset[i][1])
        #articleText = str(final_train_dataset[i][2])
        
        tableID = final_train_dataset[i][2]
        tableTitle = str(final_train_dataset[i][3])
        #tableSectionTitle = str(final_train_dataset[i][5])
        #tableCaption = str(final_train_dataset[i][6])
        #tableHeader = str(final_train_dataset[i][7])
        #tableBody = str(final_train_dataset[i][8])
        match = final_train_dataset[i][4]
        
        row = [articlePgID,articleTitle, tableID, tableTitle, match]
        
        wr.writerow(row)

  7%|▋         | 12304/175388 [03:00<57:39, 47.13it/s]