# News Scraping

In [1]:
import requests
from bs4 import BeautifulSoup
import regex as re
import pandas as pd
import time
import numpy as np
from datetime import datetime

News sites to scrape:
- Associated Press News ('https://apnews.com/')
- BBC ('https://www.bbc.com/news')
- NPR ('https://www.npr.org/')

In [2]:
url = ['https://apnews.com/',
      'https://www.bbc.com/news',
      'https://www.npr.org/'
      ]
    

## Associated Press News

In [3]:
# Scrapping apnews
html = requests.get('https://apnews.com').text
soup = BeautifulSoup(html)
coverpage_news = soup.find_all('a', class_='Component-headline-0-2-56')

apnews_headline = []
apnews_url = []
apnews_published_date = []
apnews_article = []

for line in coverpage_news:
    href = line['href']
    article_url = 'https://apnews.com' + href
    if article_url in apnews_url:
        continue
    else:
        article_html = requests.get(article_url).text
        article_soup = BeautifulSoup(article_html)
        article_headline = article_soup.find('h1').get_text()
        article_date_check = article_soup.find_all('script', type='application/ld+json')[0].get_text().split(',')
        if 'datePublished' in article_date_check[4]:
            article_published_date = article_date_check[4].split('T')[0].split('\"')[-1]
        elif 'datePublished' in article_date_check[5]:
            article_published_date = article_date_check[5].split('T')[0].split('\"')[-1]
        else:
            article_published_date = 'error'
        article_tags = article_soup.find_all('p')
        article_text = [tag.get_text().strip() for tag in article_tags]
    
        # Filter out sentences that contain newline characters '\n' or don't contain periods.
        sentence_list = [sentence for sentence in article_text if not '\n' in sentence]
        sentence_list = [sentence for sentence in sentence_list if '.' in sentence]
        # Combine list items into string.
        article = ' '.join(sentence_list)
        apnews_headline.append(article_headline)
        apnews_url.append(article_url)
        apnews_published_date.append(article_published_date)
        apnews_article.append(article)
        
apnews = pd.DataFrame(list(zip(apnews_headline,apnews_url,apnews_published_date,apnews_article)),
                      columns=['headline', 'url', 'published_date', 'article'])

apnews['source'] = 'Associated Press News'

In [4]:
apnews

Unnamed: 0,headline,url,published_date,article,source
0,GOP strains to contain Bolton fallout in impea...,https://apnews.com/34f8a36cfc3a15ffeede3a74cd0...,2020-01-29,WASHINGTON (AP) — President Donald Trump’s imp...,Associated Press News
1,GOP squirms as Bolton prepares to dish on Trum...,https://apnews.com/cc7316d191075ac57e0bcd34f05...,2020-01-29,WASHINGTON (AP) — For much of the last 20 year...,Associated Press News
2,Virus cases in China top SARS as evacuations b...,https://apnews.com/a980aeb44b3840341c674a6e67c...,2020-01-29,BEIJING (AP) — Countries began evacuating thei...,Associated Press News
3,Israeli Cabinet postpones vote on West Bank an...,https://apnews.com/27d373978c683097ca06137bea8...,2020-01-29,JERUSALEM (AP) — A senior Israeli minister sai...,Associated Press News
4,Kobe Bryant helicopter lacked recommended safe...,https://apnews.com/86ff5b1a17513d79c7a2bbf2e7b...,2020-01-28,LOS ANGELES (AP) — The helicopter carrying Kob...,Associated Press News
5,Leaked report shows United Nations suffered hack,https://apnews.com/0d958e15d7f5081dd612f07482f...,2020-01-29,GENEVA (AP) — Sophisticated hackers infiltrate...,Associated Press News
6,"Trump looks to sign trade deal, show wins duri...",https://apnews.com/e6ede49e1c07c7e928238c778fd...,2020-01-29,WASHINGTON (AP) — President Donald Trump is ea...,Associated Press News
7,"Confined at home, Chinese get creative to beat...",https://apnews.com/bb8aba68717061e34c5fde28816...,2020-01-29,BANGKOK (AP) — Chinese around the country conf...,Associated Press News
8,Allies worry as US ponders cutting military fo...,https://apnews.com/53c70dbee943ac99dbac6d7b35b...,2020-01-29,"DAKAR, Senegal (AP) — As extremist violence gr...",Associated Press News
9,Hall of Fame DE Doleman dies at age 58,https://apnews.com/2d20863dd89a02746dc67659c93...,2020-01-29,MINNEAPOLIS (AP) — Hall of Fame defensive end ...,Associated Press News


## BBC

In [5]:
# Scrapping bbc
html = requests.get('https://www.bbc.com/news').text
soup = BeautifulSoup(html)
coverpage_news = soup.find_all('a', class_='gs-c-promo-heading')

bbc_headline = []
bbc_url = []
bbc_published_date = []
bbc_article = []

for line in coverpage_news:
    href = line['href']
    if 'http' in href:
        continue
    if 'live' in href:
        continue
    else:
        article_url = 'https://www.bbc.com' + href
        if article_url in bbc_url:
            continue
        else:
            article_html = requests.get(article_url).text
            article_soup = BeautifulSoup(article_html)
            article_headline = article_soup.find('title').get_text()
            if 'sport' in href:
                article_published_date = article_soup.find_all('meta', property="rnews:datePublished")[0]['content'].split(' ')[0]
            else:
                article_published_date = article_soup.find_all('script')[1].get_text().split(',')[8].split('T')[0].split('\"')[-1]
            article_tags = article_soup.find_all('p')
            article_text = [tag.get_text().strip() for tag in article_tags]
    
            # Filter out sentences that contain newline characters '\n' or don't contain periods.
            sentence_list = [sentence for sentence in article_text if not '\n' in sentence]
            sentence_list = [sentence for sentence in sentence_list if '.' in sentence]
            # Combine list items into string.
            article = ' '.join(sentence_list)
            bbc_headline.append(article_headline)
            bbc_url.append(article_url)
            bbc_published_date.append(article_published_date)
            bbc_article.append(article)

        
bbc = pd.DataFrame(list(zip(bbc_headline,bbc_url,bbc_published_date,bbc_article)),
                      columns=['headline', 'url', 'published_date', 'article'])

drop_rows = bbc[(bbc['published_date'].map(len)>10)|(bbc['published_date'].map(len)==0)].index
bbc.drop(drop_rows, axis=0, inplace=True)

bbc['source'] = 'BBC'

In [6]:
bbc

Unnamed: 0,headline,url,published_date,article,source
0,Brexit: MEPs say goodbye to UK ahead of Brexit...,https://www.bbc.com/news/uk-politics-51287430,2020-01-29,Members of the European Parliament are bidding...,BBC
1,Coronavirus: Britons on Wuhan flights to be qu...,https://www.bbc.com/news/uk-51292590,2020-01-29,Hundreds of British citizens being flown back ...,BBC
2,Bala Devi: Rangers' new signing is India's fir...,https://www.bbc.com/sport/football/51298687,2020/01/29,India top scorer Bala Devi's groundbreaking mo...,BBC
4,"Shaheen Bagh: Anurag Thakur, Parvesh Varma pen...",https://www.bbc.com/news/world-asia-india-5127...,2020-01-29,Two MPs belonging to India's ruling party have...,BBC
5,Trump Middle East plan: Palestinians reject 'c...,https://www.bbc.com/news/world-middle-east-512...,2020-01-29,Palestinians have dismissed US President Donal...,BBC
7,Kunal Kamra: Airline ban on India comedian spa...,https://www.bbc.com/news/world-asia-india-5129...,2020-01-29,That is the burning question currently being d...,BBC
8,Coronavirus: Denmark in cartoon bust-up with C...,https://www.bbc.com/news/world-europe-51295225,2020-01-29,A Danish newspaper has rejected China's demand...,BBC
9,Australian Open: Dominic Thiem stuns Rafael Na...,https://www.bbc.com/sport/tennis/51295106,2020/01/29,Top seed Rafael Nadal was denied a place in th...,BBC
10,Syria war: Army 'recaptures' key town from opp...,https://www.bbc.com/news/world-middle-east-512...,2020-01-29,The Syrian army says it has recaptured the str...,BBC
11,5G: EU issues guidance on 'high-risk' supplier...,https://www.bbc.com/news/technology-51294691,2020-01-29,The European Commission has issued its guidanc...,BBC


## NPR

In [7]:
# Scrapping npr
html = requests.get('https://www.npr.org').text
soup = BeautifulSoup(html)
coverpage_news = soup.find_all('a', href=re.compile('^https://www.npr.org/2020'))

npr_headline = []
npr_url = []
npr_published_date = []
npr_article = []

for line in coverpage_news:
    href = line['href']
    article_url = href
    if article_url in npr_url:
        continue
    else:
        article_html = requests.get(article_url).text
        article_soup = BeautifulSoup(article_html)
        article_headline = article_soup.find_all('meta')[10]['content']
        article_published_date = article_soup.find_all('meta')[8]['content']
        article_tags = article_soup.find_all('p')
        article_text = [tag.get_text().strip() for tag in article_tags]
    
        # Filter out sentences that contain newline characters '\n' or don't contain periods.
        sentence_list = [sentence for sentence in article_text if not '\n' in sentence]
        sentence_list = [sentence for sentence in sentence_list if '.' in sentence]
        # Combine list items into string.
        article = ' '.join(sentence_list)
        npr_headline.append(article_headline)
        npr_url.append(article_url)
        npr_published_date.append(article_published_date)
        npr_article.append(article)
        
npr = pd.DataFrame(list(zip(npr_headline,npr_url,npr_published_date,npr_article)),
                      columns=['headline', 'url', 'published_date', 'article'])

npr['source'] = 'NPR'

In [8]:
npr

Unnamed: 0,headline,url,published_date,article,source
0,"Impeachment Trial Moves To Question Phase, Whi...",https://www.npr.org/2020/01/29/799371386/impea...,2020-01-29,House Democratic impeachment manager Rep. Adam...,NPR
1,"Impeachment Recap: As Defense Rests, Focus Shi...",https://www.npr.org/2020/01/28/800568636/impea...,2020-01-28,President Trump's lawyer Jay Sekulow rides an ...,NPR
2,McConnell: Republicans Do Not Yet Have Votes T...,https://www.npr.org/2020/01/28/799370839/trump...,2020-01-28,White House counsel Pat Cipollone speaks durin...,NPR
3,1 Simple Step Could Help Election Security. Go...,https://www.npr.org/2020/01/29/800131854/1-sim...,2020-01-29,"Computer mouse pads with ""Secure the Vote"" log...",NPR
4,NPR Seeks 'Clarification' From State Departmen...,https://www.npr.org/2020/01/28/800538653/trump...,2020-01-28,"""That reporter couldn't have done too good a j...",NPR
5,Florida's Brightline Railroad Is Nation's Dead...,https://www.npr.org/2020/01/29/799962246/brigh...,2020-01-29,A Brightline train approaches a railroad cross...,NPR
6,"Space Traffic Is Surging, And Critics Worry Th...",https://www.npr.org/2020/01/29/800433686/space...,2020-01-29,SpaceX has launched dozens of satellites as pa...,NPR
7,"Starbucks Closes More Than 2,000 Stores In Chi...",https://www.npr.org/2020/01/28/800636364/starb...,2020-01-28,Updated at 8:20 p.m. ET Starbucks has temporar...,NPR
8,"Powerful Earthquake Strikes Caribbean, But No ...",https://www.npr.org/2020/01/28/800547848/power...,2020-01-28,Waves splash in a pool during an earthquake in...,NPR
9,What Iowa Looks Like Ahead Of The Caucuses,https://www.npr.org/2020/01/29/800549553/what-...,2020-01-29,"Edward Kennedy, 11, of Waukee, Iowa, and mayor...",NPR


## Consolidation

In [9]:
news = pd.concat([apnews, bbc, npr], axis=0, ignore_index=True)
news['published_date'] = pd.to_datetime(news['published_date'])
news = news.sort_values('published_date', ascending=False).reset_index(drop=True)
news

Unnamed: 0,headline,url,published_date,article,source
0,GOP strains to contain Bolton fallout in impea...,https://apnews.com/34f8a36cfc3a15ffeede3a74cd0...,2020-01-29,WASHINGTON (AP) — President Donald Trump’s imp...,Associated Press News
1,Brexit: MEPs say goodbye to UK ahead of Brexit...,https://www.bbc.com/news/uk-politics-51287430,2020-01-29,Members of the European Parliament are bidding...,BBC
2,Australian Open: Dominic Thiem stuns Rafael Na...,https://www.bbc.com/sport/tennis/51295106,2020-01-29,Top seed Rafael Nadal was denied a place in th...,BBC
3,Coronavirus: Denmark in cartoon bust-up with C...,https://www.bbc.com/news/world-europe-51295225,2020-01-29,A Danish newspaper has rejected China's demand...,BBC
4,Kunal Kamra: Airline ban on India comedian spa...,https://www.bbc.com/news/world-asia-india-5129...,2020-01-29,That is the burning question currently being d...,BBC
...,...,...,...,...,...
88,Americans flown from China virus zone arrive i...,https://apnews.com/e3f13d6f778eae1dd6a062499ed...,2020-01-28,"RIVERSIDE, Calif. (AP) — A plane evacuating 20...",Associated Press News
89,Tokushoryu win: Underdog sumo wrestler celebra...,https://www.bbc.com/news/world-asia-51283431,2020-01-28,No-one could have predicted the victory of sum...,BBC
90,China coronavirus: Misinformation spreads onli...,https://www.bbc.com/news/blogs-trending-51271037,2020-01-28,More than 100 people have now been killed by t...,BBC
91,"Virus in China affects sports events, Olympic ...",https://apnews.com/ca92d5f5ad02c17ea31399b026a...,2020-01-28,GENEVA (AP) — Amid growing concern about the s...,Associated Press News


In the following cells below, I am essentially removing any scraped articles that are older than 90 days.

In [10]:
news['published_date'].unique()

array(['2020-01-29T00:00:00.000000000', '2020-01-28T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [11]:
today = datetime.today()
today

datetime.datetime(2020, 1, 30, 0, 46, 50, 903982)

In [12]:
old_news = news[today - news['published_date']>'90days'].index
news.drop(old_news, inplace=True)
news

Unnamed: 0,headline,url,published_date,article,source
0,GOP strains to contain Bolton fallout in impea...,https://apnews.com/34f8a36cfc3a15ffeede3a74cd0...,2020-01-29,WASHINGTON (AP) — President Donald Trump’s imp...,Associated Press News
1,Brexit: MEPs say goodbye to UK ahead of Brexit...,https://www.bbc.com/news/uk-politics-51287430,2020-01-29,Members of the European Parliament are bidding...,BBC
2,Australian Open: Dominic Thiem stuns Rafael Na...,https://www.bbc.com/sport/tennis/51295106,2020-01-29,Top seed Rafael Nadal was denied a place in th...,BBC
3,Coronavirus: Denmark in cartoon bust-up with C...,https://www.bbc.com/news/world-europe-51295225,2020-01-29,A Danish newspaper has rejected China's demand...,BBC
4,Kunal Kamra: Airline ban on India comedian spa...,https://www.bbc.com/news/world-asia-india-5129...,2020-01-29,That is the burning question currently being d...,BBC
...,...,...,...,...,...
88,Americans flown from China virus zone arrive i...,https://apnews.com/e3f13d6f778eae1dd6a062499ed...,2020-01-28,"RIVERSIDE, Calif. (AP) — A plane evacuating 20...",Associated Press News
89,Tokushoryu win: Underdog sumo wrestler celebra...,https://www.bbc.com/news/world-asia-51283431,2020-01-28,No-one could have predicted the victory of sum...,BBC
90,China coronavirus: Misinformation spreads onli...,https://www.bbc.com/news/blogs-trending-51271037,2020-01-28,More than 100 people have now been killed by t...,BBC
91,"Virus in China affects sports events, Olympic ...",https://apnews.com/ca92d5f5ad02c17ea31399b026a...,2020-01-28,GENEVA (AP) — Amid growing concern about the s...,Associated Press News


In [13]:
news.to_csv('./data/news_articles.csv', index=False)