In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from newspaper import Article
import re
import nltk
import csv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# English stop-word list, held as a set so the membership test in
# remove_stopwords is a hash lookup rather than a list scan.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be available
# locally before this cell runs — confirm the environment setup.
stop_words = set(stopwords.words('english')) 

In [None]:
def remove_stopwords(example_sent):
    """Drop English stop words and very short tokens from a piece of text.

    The text is split with ``word_tokenize``. A token is kept only when it is
    longer than two characters and its lower-cased form is not in the
    module-level ``stop_words`` set.

    Args:
        example_sent: The text to filter.

    Returns:
        The surviving tokens, lower-cased and joined by single spaces. An
        empty string is returned when no token survives.
    """
    # The stop-word lookup uses the lower-cased token: the result is
    # lower-cased anyway, and the NLTK English stop-word list is lower-case,
    # so comparing the original casing lets capitalised stop words such as
    # "The" or "And" slip through and come out as "the" / "and".
    kept_tokens = [
        token.lower()
        for token in word_tokenize(example_sent)
        if len(token) > 2 and token.lower() not in stop_words
    ]

    # Joining non-empty tokens never produces leading or trailing blanks, so
    # no extra stripping is needed.
    return " ".join(kept_tokens)

In [None]:
def clear_string(text):
    """Reduce ``text`` to ASCII letters separated by single spaces.

    Every run of characters other than A-Z / a-z (digits, punctuation,
    whitespace, ...) is collapsed into one space, and whitespace at both ends
    of the result is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty when ``text`` contains no ASCII letters.
    """
    letters_only = re.sub('[^A-Za-z]+', ' ', text)
    return letters_only.strip()

In [None]:
def article_parse(url):
    """Fetch the page at ``url`` and return four cleaned text signals.

    An ``Article`` is built for the URL and its ``download()``, ``parse()``
    and ``nlp()`` steps are run in that order before any attribute is read.
    Each signal is passed through ``clear_string`` and then
    ``remove_stopwords``.

    Args:
        url: Address of the page to process.

    Returns:
        A 4-tuple ``(full_text, meta_description, summary, keywords)`` of
        cleaned strings. ``keywords`` is the article's keyword list flattened
        into one space-separated string before cleaning.

    Exceptions raised by the ``Article`` calls are not handled here and
    propagate to the caller.
    """
    def _clean(raw_text):
        # Shared clean-up pipeline applied to every signal.
        return remove_stopwords(clear_string(raw_text))

    page = Article(url)
    page.download()
    page.parse()
    page.nlp()

    body_text = _clean(page.text)
    description = _clean(page.meta_description)
    abstract = _clean(page.summary)

    # Flatten the keyword list into a single string so it can go through the
    # same clean-up as the other signals.
    keyword_text = _clean(" ".join(page.keywords))

    return body_text, description, abstract, keyword_text

In [None]:
# Load the table records from a comma-separated file (relative path, so it is
# resolved against the current working directory). The scraping loop further
# down reads the columns table_id, table_page_title, table_section_title,
# table_caption, table_header, table_body and table_url from every row.
distinct_tables = pd.read_csv('distinct_tables_plus_url.csv', delimiter=',')

In [None]:
# Build one output record per table: the original table columns plus the text
# signals scraped from the page at table_url.
tables_all_signals = []

# total= gives the progress bar a length; iterrows() is a generator, so tqdm
# cannot work it out on its own.
for i, row in tqdm(distinct_tables.iterrows(), total=len(distinct_tables)):

    table_id = row['table_id']
    table_page_title = row['table_page_title']
    table_section_title = row['table_section_title']
    table_caption = row['table_caption']
    table_header = row['table_header']
    table_body = row['table_body']
    table_url = row['table_url']

    try:
        full_text, meta_description, summary, keywords = article_parse(table_url)
    except Exception:
        # Best effort: a page that cannot be downloaded or parsed must not
        # drop the table from the output, so its page signals stay empty.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupt able to stop a long scraping run.
        full_text = meta_description = summary = keywords = ""

    # Key order fixes the column order of the resulting DataFrame / CSV.
    line = {"table_id":table_id,
            "table_page_title":table_page_title,
            "table_page_full_text":full_text,
            "table_page_meta_description":meta_description,
            "table_page_summary":summary,
            "table_page_keywords":keywords,
            "table_section_title":table_section_title,
            "table_caption":table_caption,
            "table_header":table_header,
            "table_body":table_body,
            "table_url":table_url
            }

    # Every row is recorded; the loop runs over the whole table list.
    tables_all_signals.append(line)

In [None]:
# Turn the collected records into a DataFrame: one row per dict in
# tables_all_signals, one column per dict key.
df_distinct_tables_allsignals = pd.DataFrame(tables_all_signals)
# Write the result to disk; index=False keeps the DataFrame's row index out
# of the CSV file.
df_distinct_tables_allsignals.to_csv('distinct_tables_allsignals.csv',index=False)