In [1]:
import newspaper
import requests

In [2]:
def test_url(url):
    try:
        return requests.get(url, timeout=2).ok
    except requests.exceptions.ReadTimeout:
        return False
    except requests.exceptions.ConnectionError:
        return False
    except requests.exceptions.TooManyRedirects:
        return False

In [3]:
newspaper_config = newspaper.Config()
newspaper_config.fetch_images = False
newspaper_config.request_timeout = 2
newspaper_config.memoize_articles = False

In [4]:
newspaper_config.verbose = 1

url = 'https://www.who.int/emergencies/diseases/novel-coronavirus-2019'

In [5]:
# newspaper_obj = newspaper.build(url, config=newspaper_config, request_timeout=3, number_threads=2)

In [6]:
def get_article(article):
    article.download()
    html = article.html
    article.parse()
    article_data = {}
    article.url = article.url.strip()
    
    article_data['title'] = article.title
    article_data['text'] = article.text
    article_data['url'] = article.url
    article_data['html'] = html
    return article_data

In [7]:
# art_data = get_article(newspaper_obj.articles[10])

In [8]:
headers = {
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
            }
def get_html(url):
    url_lower = url.lower()
    for ending in ['pdf', 'jpg', 'jpeg', 'png']:
        if url_lower.endswith(ending):
            return ''
    try:
        response = requests.get(url, headers=headers, timeout=5.0)
    except:
        return ''
    if response.status_code != 200:
        return ''
    if 'text/html' not in response.headers['content-type']:
#         print(response.headers['content-type'])
        return ''
    html = response.text
    if html.startswith('%PDF'):
        return ''
    return html

In [9]:
from bs4 import BeautifulSoup
from urllib.parse import urlparse

def get_domain(url):
    parsed_url = urlparse(url)
    domain = '{uri.netloc}'.format(uri=parsed_url)
    return domain


def scrape_sub_urls(url):
    html = get_html(url)
    soup = BeautifulSoup(html)
    my_domain = get_domain(url)
    links = [link.attrs['href'] for link in soup(['a']) if 'href' in link.attrs and get_domain(link.attrs['href']) == my_domain]
    return set(links), html

In [10]:
def scrape_urls(root_url, max_depth=3):
    all_urls = {}
    def scrape_recursive(url, all_urls, depth):
        if depth > max_depth:
            return 
        sub_urls, html = scrape_sub_urls(url)
        all_urls[url] = html
        new_urls = sub_urls.difference(all_urls.keys())
        all_urls.update(dict.fromkeys(new_urls))
#         print(url, len(new_urls))
        for u in new_urls:
            scrape_recursive(u, all_urls, depth+1)
            
    all_urls[root_url] = None
    scrape_recursive(root_url, all_urls, 0)
    
    n_leafs = sum([html is None for html in all_urls.values()])
    print('downloading %d leaf nodes' % n_leafs)
    for url, html in all_urls.items():
        if all_urls[url] is None:
            all_urls[url] = get_html(url)
    return all_urls

In [11]:
from tqdm import tqdm
def crawler_urls(url, max_depth=1):
    urls = scrape_urls(url, max_depth=max_depth)
    articles = {}
    for u, html in tqdm(urls.items()):
        art = newspaper.Article(url)
        if html == '' or html is None:
            continue
        try:
            art.download(input_html=html)
            art.parse()
            art.nlp()
            articles[u] = art
        except:
            print(u)
            pass
    return articles

In [23]:
def crawler_newspaper(root_url, max_depth=5):
    all_articles = {}
    def crawl(url, all_articles, depth):
        my_domain = get_domain(url)
        if depth > max_depth:
            return
        newspaper_obj = newspaper.build(url, config=newspaper_config, request_timeout=8, number_threads=4)
        
        new_dict = dict(zip(newspaper_obj.article_urls(), newspaper_obj.articles))
        new_dict = {u: a for u, a in new_dict.items() if get_domain(url) == crawl}
        new_keys = set(new_dict.keys()).difference(all_articles)
        print(url, len(newspaper_obj.articles), len(new_keys))
#         if depth >= 1:
#             print(new_keys)
        
        all_articles.update(new_dict)
#         print(url, len(newspaper_obj.articles))
        for u in newspaper_obj.article_urls():
            if u in new_keys:
                crawl(u, all_articles, depth+1)
                
        
    crawl(root_url, all_articles, 0)
    
    return all_articles

In [24]:
def get_all_articles(url, max_depth=3):
    print('scraping by url')
    articles_urls = crawler_urls(url, max_depth=max_depth)
    print('scraping using newspaper')
    articles_newspaper = crawler_newspaper(url, max_depth=max_depth)

    to_remove = []
    for url, art in tqdm(articles_newspaper.items()):
        if url in articles_urls:
            to_remove.append(url)
            continue
        try:
            art.download()
            art.parse()
            art.nlp()
        except:
            to_remove.append(url)
    for u in to_remove:
        del articles_newspaper[u]

    all_articles = dict(articles_urls)
    all_articles.update(articles_newspaper)

    all_articles = {u: a for u, a in all_articles.items() if a.text != ''}

    print('scraped %d articles' % len(all_articles))
    return all_articles

In [43]:
def crawler_newspaper(root_url, max_depth=5):
    all_articles = {}
    my_domain = get_domain(root_url)
    def crawl(url, my_domain, all_articles, depth):
        
        if depth > max_depth:
            return
        newspaper_obj = newspaper.build(url, config=newspaper_config, request_timeout=8, number_threads=4)
        
        new_dict = dict(zip(newspaper_obj.article_urls(), newspaper_obj.articles))
        new_dict = {u: a for u, a in new_dict.items() if get_domain(url) == my_domain}
        new_keys = set(new_dict.keys()).difference(all_articles)
        print(url, len(newspaper_obj.articles), len(new_keys))
#         if depth >= 1:
#             print(new_keys)
        
        all_articles.update(new_dict)
#         print(url, len(newspaper_obj.articles))
        for u in newspaper_obj.article_urls():
            if u in new_keys:
                crawl(u, my_domain, all_articles, depth+1)
                
        
    crawl(root_url,my_domain, all_articles, 0)
    
    return all_articles

In [44]:
urls = {
    
    'Unicef': 'https://www.unicef.org/coronavirus/covid-19',
    'DWB':'https://www.doctorswithoutborders.org/facts-and-figures-about-coronavirus-disease-outbreak-covid-19',
    'NIH': 'https://www.nih.gov/health-information/coronavirus',
    'cov gov': 'https://www.coronavirus.gov/',
    'CDC': 'https://www.cdc.gov/coronavirus/2019-ncov/index.html',
    'cov uk': 'https://www.gov.uk/coronavirus',
    'WHO': 'https://www.who.int/emergencies/diseases/novel-coronavirus-2019',
}

In [51]:
results = {}

In [None]:
name

In [None]:
import pickle
import os
for name, url in urls.items():
    fn = name+'.pkl'
    if os.path.isfile(fn):
        continue
    all_articles = get_all_articles(url, max_depth=3)
    results[name] = all_articles
    all_article_data = {url: get_article_data(art) for url, art in all_articles.items()}
    with open(fn, 'wb') as f:
        pickle.dump(all_article_data, f, -1)

In [67]:
for a in all_articles.values():
    break

In [84]:
for name, all_articles in results.items():
    fn = name+'.pkl'
    all_article_data = {url: get_article_data(art) for url, art in all_articles.items() if art.meta_lang in ['', 'en']}
    with open(fn, 'wb') as f:
        pickle.dump(all_article_data, f, 4)

In [81]:
from collections import Counter
for name, all_articles in results.items():
    langs = [a.meta_lang for a in all_articles.values()]
    c = Counter(langs)
    print(name)
    print(c.most_common(20))

Unicef
[('en', 208), ('fr', 39), ('es', 31), ('zh', 29), ('ar', 27), ('', 7)]
DWB
[('en', 31)]
NIH
[('en', 253), ('', 33), ('pl', 7), ('es', 5)]
cov gov
[('en', 16)]
CDC
[('en', 417), ('es', 144), ('', 4), ('zh', 1)]
cov uk
[('en', 407)]
WHO
[('en', 440), ('', 142)]


In [74]:
def get_article_data(article):
    article_data = {}
    article.url = article.url.strip()
    article_data['title'] = article.title
    article_data['text'] = article.text
    article_data['url'] = article.url.strip()
    article_data['keywords'] = article.keywords
    article_data['summary'] = article.summary
    article_data['meta_lang'] = article.meta_lang
    article_data['meta_keywords'] = article.meta_keywords
    return article_data

In [48]:
kwds = ['covid19', 'coronavirus', 'wuhan', 'sars', 'mers', 'covid-19']

In [49]:
for u, art in all_articles.items():
    if len(art.text) > 200 and any([kw in art.text.lower() for kw in kwds]):
        print(u)

https://www.unicef.org/coronavirus/covid-19
https://www.unicef.org/innovation/stories/UReportCoronavirusIndonesia
https://www.unicef.org/appeals/covid-2019.html
http://www.unicef.org/wash
https://www.unicef.org/stories/novel-coronavirus-outbreak-what-parents-should-know
https://www.unicef.org/coronavirus/how-talk-your-child-about-coronavirus-covid-19
https://www.unicef.org/coronavirus/6-ways-parents-can-support-their-kids-through-coronavirus-covid-19
https://www.unicef.org/coronavirus/how-teenagers-can-protect-their-mental-health-during-coronavirus-covid-19
https://www.unicef.org/press-releases/covid-19-children-heightened-risk-abuse-neglect-exploitation-and-violence-amidst
https://www.unicef.org/coronavirus/everything-you-need-know-about-washing-your-hands-protect-against-coronavirus-covid-19
https://www.unicef.org/coronavirus/how-teachers-can-talk-children-about-coronavirus-disease-covid-19
https://www.unicef.org/press-releases/un-releases-15-million-help-vulnerable-countries-battle-

In [None]:
print()