In [None]:
import pandas as pd
import numpy as np
import requests
import json
import collections
import time
import re
import string
import os
import argparse
from datetime import datetime

import nltk
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.corpus import wordnet

import bs4
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common import exceptions

from pdfminer import high_level

In [None]:
# Fetch the NLTK resources this notebook relies on (no-op when already present),
# then build the shared lemmatizer and the English stopword set.
for _resource in ("stopwords", "wordnet", "punkt", 'averaged_perceptron_tagger'):
    nltk.download(_resource, quiet = True)

lemmatizer = WordNetLemmatizer()
english_stopwords = set(nltk.corpus.stopwords.words('english'))

In [None]:
def clean_text(text):
    """
    Normalise raw text: lowercase it, then strip possessive 's and apostrophes.
    
    args:
        text (str) : the raw input text
        
    return:
        str : the lowercased text with 's suffixes and apostrophes removed
    """
    # Applied in order; each rule works on the output of the previous one.
    substitutions = (
        (r'\'s*\s', ' '),  # apostrophe (plus any run of "s") before whitespace -> one space
        (r'\'s\Z', ''),    # 's at the very end of the text
        (r'\'', ''),       # every apostrophe that is still left
    )
    result = text.lower()
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result

def tokenize(cleaned_text):
    """
    Split cleaned text into alphanumeric tokens.
    
    The text is first tokenized with nltk.word_tokenize; every resulting token
    is then split again on any character that is not an ASCII letter or digit,
    and empty fragments are discarded.
    
    args:
        cleaned_text (str): the input text, output from clean_text
        
    return:
        List[str] : the non-empty alphanumeric tokens, in order of appearance
    """
    fragments = []
    for word in nltk.word_tokenize(cleaned_text):
        for fragment in re.split(r'[^a-zA-Z0-9]', word):
            if fragment != '':
                fragments.append(fragment)
    return fragments

def lemmatize(tokens, stopwords = None):
    """
    Lemmatize each token in an input list of tokens, then drop stopwords and
    lemmas shorter than two characters.
    
    Every token is POS-tagged in isolation (as a one-word sentence) and the
    tag decides which WordNet part of speech is used for lemmatization.
    
    args:
        tokens (List[str]) : a list of token, output from tokenize
    
    kwargs:
        stopwords (Set[str]) : the set of stopwords to exclude;
            None (the default) excludes nothing
    
    return:
        List[str] : a list of lemmatized and filtered tokens
    """
    def tag_to_part(tag):
        # Map the first letter of the POS tag to a WordNet part of speech:
        # adjective, verb, adverb, and noun as the fallback.
        if tag.startswith('J'):
            return 'a'
        elif tag.startswith('V'):
            return 'v'
        elif tag.startswith('R'):
            return 'r'
        else:
            return 'n'

    # Avoid a mutable default argument; an empty frozenset filters nothing.
    if stopwords is None:
        stopwords = frozenset()

    lemmatizer = WordNetLemmatizer()

    # Tag each token as its own one-word sentence, exactly like calling
    # nltk.pos_tag([token]) per token, but in a single batched call so the
    # tagger is not set up again for every token.
    tagged_sentences = nltk.pos_tag_sents([[token] for token in tokens])

    lemmatized = list()
    for token, tagged in zip(tokens, tagged_sentences):
        part = tag_to_part(tagged[0][1])
        lemma = lemmatizer.lemmatize(token, pos = part)
        if lemma not in stopwords and len(lemma) >= 2:
            lemmatized.append(lemma)

    return lemmatized

def preprocess_text(text, stopwords = {}):
    """
    Run the whole preprocessing pipeline on a raw string:
    clean_text, then tokenize, then lemmatize.
    
    args:
        text (str) : the raw input text
    
    kwargs:
        stopwords (Set[str]) : stopwords forwarded to lemmatize
    
    return:
        List[str] : the lemmatized, filtered tokens of the text
    """
    return lemmatize(tokenize(clean_text(text)), stopwords)

In [None]:
def retrieve_url(url):
    """
    Download the page at the given URL and parse it as HTML.
    
    args:
        url (str) : the address of the page to fetch
    
    return:
        BeautifulSoup : the parsed document, built from the raw response bytes
    """
    response = requests.get(url)
    return BeautifulSoup(response.content, 'html.parser')

# Location of the chromedriver executable used by init_chromedriver.
# Set the CHROMEDRIVER_PATH environment variable to override it on other
# machines; when unset, the original hard-coded location is used.
PATH_TO_CHROMEDRIVER = os.environ.get("CHROMEDRIVER_PATH", "C:/Users/jerryzhou3/Documents/chromedriver.exe")

def init_chromedriver(debug = False):
    """
    Create a Chrome WebDriver using the chromedriver at PATH_TO_CHROMEDRIVER.
    
    kwargs:
        debug (bool) : when False (the default) Chrome is started headless,
            with the sandbox flags below and a fixed user-agent argument;
            when True no extra arguments are passed
    
    return:
        selenium.webdriver.Chrome : the newly started driver
    """
    if debug:
        extra_arguments = ()
    else:
        extra_arguments = (
            '--headless',
            '--no-sandbox',
            "--disable-setuid-sandbox",
            # NOTE(review): the doubled quotes around the user-agent value look
            # unintended (possibly an export artifact) - confirm before changing.
            '--user-agent=""Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36""',
        )

    chrome_options = webdriver.ChromeOptions()
    for argument in extra_arguments:
        chrome_options.add_argument(argument)
    return webdriver.Chrome(executable_path = PATH_TO_CHROMEDRIVER, options = chrome_options)

In [None]:
def parse_page_nature(url):
    """
    Parse a single Nature news article at the given URL
    
    args:
        url (str) : the article URL
    
    return:
        Dict[str, Any] : the parsed information stored in JSON format, which includes:
            Title (str), Author (List[str]), Published Date (str, yyyy-mm-dd),
            Summary (str) and Content (List[str], one entry per non-empty paragraph)
    
    raises:
        AttributeError : if one of the expected page elements is not found
        KeyError : if the month in the published date is not an English month name
        IndexError : if the published date has fewer than three parts
    """
    month_to_num = {"January": "01", "February": "02", "March": "03", "April": "04", "May": "05", "June": "06", "July": "07", "August": "08", "September": "09", "October": "10", "November": "11", "December": "12"}

    # The page is fetched with plain requests; no browser driver is involved.
    soup = BeautifulSoup(requests.get(url).text, "html.parser")

    # The date text is split on whitespace and read as: day, month name, year.
    date = soup.find("time", {"itemprop": "datePublished"}).get_text().strip().split()
    year = date[2]
    month = month_to_num[date[1]]
    # Zero-pad the day so the result is a real yyyy-mm-dd string; unpadded
    # days (e.g. "2020-03-5") sort incorrectly when compared as text.
    day = date[0].zfill(2)
    publishedDate = year + '-' + month + '-' + day

    title = soup.find("h1", {"itemprop": "headline"}).get_text().strip()

    author = soup.find_all("h3", {"data-tooltip": "Show author information"})
    # Drop the "&" separator that appears in the author elements' text.
    authors = [re.sub(r'&', '', a.get_text()).strip() for a in author]

    summary = soup.find("div", class_ = "article-item__teaser-text serif").get_text().strip()

    # Keep the text of body paragraphs, skipping those that contain an <aside>
    # and those that are empty once stripped.
    paragraphs = soup.find("div", class_ = "article__body serif cleared").find_all("p")
    content = [p.get_text().strip() for p in paragraphs if p.find("aside") is None]
    content = [text for text in content if text]

    return {"Title": title, "Author": authors, "Published Date": publishedDate, "Summary": summary, "Content": content}

def extract_nature_articles(start_date, end_date, base_url = "https://www.nature.com"):
    """
    Collect every coronavirus-related News article of the Nature journal whose
    listed date lies within the given period, parsing each one with
    parse_page_nature.
    
    args:
        start_date (str): the lower bound (inclusive) of the date range to filter
            articles, has the format yyyy-mm-dd
        end_date (str): the upper bound (inclusive) of the date range to filter
            articles, has the format yyyy-mm-dd
    
    kwargs:
        base_url (str): prefix put in front of each search result's href to
            build the article URL
    
    return:
        List[Dict] : the parsed articles, ordered by (Published Date, Title)
    """
    month_to_num = {"January": "01", "February": "02", "March": "03", "April": "04", "May": "05", "June": "06", "July": "07", "August": "08", "September": "09", "October": "10", "November": "11", "December": "12"}
    # Search results whose title contains one of these are not parsed.
    skipped_series = ("Daily briefing", "Podcast", "Backchat")

    search_url = "https://www.nature.com/search?title=coronavirus&order=date_asc&article_type=news&journal=nature"
    # The query only carries the year range; exact dates are checked per result.
    search_url += "&date_range=" + start_date[:4] + "-" + end_date[:4]

    # The number of result pages is read from the JSON found after the first
    # "=" in the dataLayer script tag (its last character is cut off).
    first_page = BeautifulSoup(requests.get(search_url).text, "html.parser")
    data_layer = first_page.find("script", {"data-test": "dataLayer"}).string.strip()
    payload = json.loads(data_layer[data_layer.index('=') + 1:  -1])
    total_pages = payload[0]["page"]["search"]["totalPages"]

    articles = []
    for page_number in range(1, total_pages + 1):
        page_url = search_url +"&page=" + str(page_number)
        listing = BeautifulSoup(requests.get(page_url).text, "html.parser")
        for link in listing.find_all("a", {"data-track-action": "search result"}):
            headline = link.get_text().strip()
            if any(series in headline for series in skipped_series):
                continue

            # Date text of the closest preceding <time>: day, month name, year.
            parts = link.find_previous("time").get_text().strip().split()
            month_name = parts[1]
            day = parts[0].rjust(2, '0')
            year = parts[2]
            listed_date = year + '-' + month_to_num[month_name] + '-' + day

            # yyyy-mm-dd strings compare chronologically as plain text.
            if start_date <= listed_date <= end_date:
                articles.append(parse_page_nature(base_url + link["href"]))

    articles.sort(key = lambda a: (a["Published Date"], a["Title"]))
    return articles