In [299]:
from bs4 import BeautifulSoup
import requests
import pymongo
from splinter import Browser
import pandas as pd

In [274]:
# Connect to the MongoDB server on localhost (port 27017) through PyMongo
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [275]:
# Handles for the `news_db` database and its `items` collection.
# Dict-style lookup is the same as attribute access in PyMongo.
db = client['news_db']
collection = db['items']

### Scraping yahoo.com

In [313]:
# URL of page to be scraped (Yahoo News)
url = 'https://news.yahoo.com/'

# Retrieve page with the requests module.
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error status instead of parsing an error page
response.raise_for_status()
# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

In [314]:
# CSS class string carried by each item container on the Yahoo News page
item_class = 'Ov(h) Pend(44px) Pstart(25px)'
results = soup.find_all(class_=item_class)

In [315]:
# Headline text of the first scraped item
headline_class = (
    'Fw(b) Fz(20px) Lh(23px) Fz(17px)--sm1024 Lh(19px)--sm1024 '
    'mega-item-header-link Td(n) C(#0078ff):h C(#000) '
    'LineClamp(2,46px) LineClamp(2,38px)--sm1024 not-isInStreamVideoEnabled'
)
first_item = results[0]
first_item.find(class_=headline_class).text

"A 'shocking' two-thirds of patients recently hospitalized in NY had been staying home"

In [316]:
# Article summary text of the first scraped item
article_class = (
    'Fz(14px) Lh(19px) Fz(13px)--sm1024 Lh(17px)--sm1024 '
    'LineClamp(2,38px) LineClamp(2,34px)--sm1024 '
    'M(0) D(n)--sm1024 Bxz(bb) Pb(2px)'
)
first_item = results[0]
first_item.find(class_=article_class).text

'New York Gov. Andrew Cuomo said Wednesday that a survey showed that a "shocking" two-thirds of patients recently hospitalized for coronavirus became infected despite largely staying at home. Hospitals were asked to document where their most recent COVID-19 patients had been staying before admission, Cuomo said, and 66 percent came from their own homes. About 18 percent came from nursing homes, 4 percent from assisted-living facilities, 2 percent were homeless, 2 percent had been at other "congregate" settings, fewer than 1 percent were prison or jail inmates, and 8 percent were classified as "other."'

In [317]:
# Collect the headline and article summary of every item on the Yahoo News page.
# results are returned as an iterable list
results = soup.find_all(class_='Ov(h) Pend(44px) Pstart(25px)')

# Class strings of the headline link and the summary paragraph inside an item
headline_class = (
    'Fw(b) Fz(20px) Lh(23px) Fz(17px)--sm1024 Lh(19px)--sm1024 '
    'mega-item-header-link Td(n) C(#0078ff):h C(#000) '
    'LineClamp(2,46px) LineClamp(2,38px)--sm1024 not-isInStreamVideoEnabled'
)
article_class = (
    'Fz(14px) Lh(19px) Fz(13px)--sm1024 Lh(17px)--sm1024 '
    'LineClamp(2,38px) LineClamp(2,34px)--sm1024 '
    'M(0) D(n)--sm1024 Bxz(bb) Pb(2px)'
)

# Number of items that yielded both a headline and a summary
scraped = 0

# Loop through returned results
for result in results:
    # Identify the headline element and the article summary element
    headline_tag = result.find(class_=headline_class)
    article_tag = result.find(class_=article_class)

    # find() returns None when an item has no such element; skip those items
    # explicitly instead of relying on an AttributeError being raised.
    if headline_tag is None or article_tag is None:
        continue

    headline = headline_tag.text
    article = article_tag.text

    # Run only if headline and article are available (non-empty)
    if not (headline and article):
        continue

    scraped += 1

    # Print results
    print('-------------')
    print(headline)
    print(article)

    # Dictionary to be inserted as a MongoDB document
    post = {
        'headline': headline,
        'article': article
    }
    # Insertion is left disabled, as in the original cell; uncomment to persist.
    # collection.insert_one(post)

# Make the number of dropped items visible
print('-------------')
print('Scraped', scraped, 'of', len(results), 'items;',
      len(results) - scraped, 'skipped for missing headline or article')

-------------
A 'shocking' two-thirds of patients recently hospitalized in NY had been staying home
New York Gov. Andrew Cuomo said Wednesday that a survey showed that a "shocking" two-thirds of patients recently hospitalized for coronavirus became infected despite largely staying at home. Hospitals were asked to document where their most recent COVID-19 patients had been staying before admission, Cuomo said, and 66 percent came from their own homes. About 18 percent came from nursing homes, 4 percent from assisted-living facilities, 2 percent were homeless, 2 percent had been at other "congregate" settings, fewer than 1 percent were prison or jail inmates, and 8 percent were classified as "other."
-------------
Trump denies U.S. role in what Venezuela says was 'mercenary' incursion
President Donald Trump on Tuesday denied any involvement by the U.S. government in what Venezuelan officials have called a failed armed incursion into the South American country that led to the capture of t

### Scraping NPR news archive

I was able to scrape only 12 articles from yahoo.com even though the page showed 25 headlines (many headlines had no accompanying content). At this pace, we would collect only about 50 articles by the end of the project, so I decided to look for other platforms from which to scrape more articles.

In [276]:
# Start a visible (non-headless) Chrome session driven by chromedriver.exe
executable_path = dict(executable_path='chromedriver.exe')
browser = Browser('chrome', headless=False, **executable_path)

I had to change my approach and use splinter's `Browser`. The issue with the `requests` approach was that it could only grab the elements present in the page's initial state. On the NPR news archive, you can load as many articles as you want onto a single page with the 'load more articles' button. With the `Browser` approach, I was able to get the elements that appear after the JavaScript action has been performed.

In [277]:
# Open the NPR news archive page in the automated browser session
url = 'https://www.npr.org/sections/news/archive'
browser.visit(url)

In [290]:
# Snapshot the page as currently rendered in the browser and parse it
html = browser.html
soup = BeautifulSoup(markup=html, features='html.parser')

In [291]:
# Every element with class "item-info" in the parsed page
results = soup.find_all(attrs={'class': 'item-info'})

In [293]:
# `datetime` attribute of the <time> tag in the last item found
last_item = results[-1]
last_item.find('time')['datetime']

'2015-12-31'

In [294]:
# Clean up 'date' from article: remove the element with class "date" from
# every item (presumably so its text does not end up in the scraped teaser;
# TODO confirm against the page markup).
for result in results:
    date_tag = result.find(class_='date')
    # find() returns None when there is no such element, e.g. when this cell
    # is re-run after the elements were already removed. Checking explicitly
    # keeps a re-run from printing one error message per item.
    if date_tag is not None:
        date_tag.decompose()

len(results)

7125

I ran the code below five times, once for each year (2020, 2019, 2018, 2017, 2016). Each year has about 6,000 articles to scrape.

In [310]:
# Pull headline, teaser, date and category out of every NPR archive item.
# results are returned as an iterable list
results = soup.find_all(class_='item-info')

# Items lacking any of the four fields are skipped; count them so the loss
# is visible instead of being silently swallowed.
skipped = 0

# Loop through returned results
for result in results:
    # Locate the elements holding title, teaser, date and category
    title_tag = result.find(class_='title')
    teaser_tag = result.find(class_='teaser')
    time_tag = result.time
    slug_tag = result.find(class_='slug')

    # find() and the .time lookup return None when the element is absent
    if any(tag is None for tag in (title_tag, teaser_tag, time_tag, slug_tag)):
        skipped += 1
        continue

    headline = title_tag.text
    article = teaser_tag.text
    # .get() yields None (instead of raising KeyError) when the <time> tag
    # has no datetime attribute; the emptiness check below then skips it.
    date = time_tag.get('datetime')
    category = slug_tag.text

    # Run only if all objects are available (non-empty)
    if not (headline and article and date and category):
        skipped += 1
        continue

    # Print results
    print('-------------')
    print(headline)
    print(article)
    print(date)
    print(category)

    # Dictionary to be inserted as a MongoDB document
    post = {
        'headline': headline,
        'article': article,
        'date': date,
        'category': category
    }
    # Insertion is left disabled, as in the original cell; uncomment to persist.
    # collection.insert_one(post)

# Report how many items were dropped
print('-------------')
print('Skipped', skipped, 'of', len(results), 'items with missing fields')

-------------
Chief Justice John Roberts Lauds Federal District Judges In Year-End Report
Roberts praised the "selfless, patriotic and brave" lower court judges, writing, "District judges make a difference every day, and leave a lasting legacy, by making our society more fair and just."
2016-12-31
The Two-Way
-------------
Dozens Of People Killed In Attack On Turkish Nightclub; Suspect Still At Large
A gunman opened fire during New Year's celebrations at a club in Istanbul, in what the city's governor called a terror attack. At least 39 people were killed and dozens more were wounded.
2016-12-31
The Two-Way
-------------
Syria And Russia Among Major International Stories Of 2016
Al-Jazeera's D.C. bureau chief Abderrahim Foukara talks about the biggest international stories of 2016 and what's upcoming in 2017, including the Israeli-Palestinian conflict and the fight for Mosul.
2016-12-31
World
-------------
Underwear, Dolls And More: Latin American New Year's Traditions
Radio Ambulante'

In [301]:
# Load every document of the collection into a pandas DataFrame
documents = list(collection.find())
data = pd.DataFrame(documents)
data.head()

Unnamed: 0,_id,headline,article,date,category
0,5eb2fa9c28431efad3712d16,From Loss Of Smell To 'COVID Toes': What Exper...,It's not just a fever and dry cough. For milde...,2020-05-06,Goats and Soda
1,5eb2fa9c28431efad3712d17,"As Businesses Reopen, A Fight Is Brewing Over ...",Republican Senate Leader Mitch McConnell insis...,2020-05-06,Coronavirus Live Updates
2,5eb2fa9c28431efad3712d18,"Michigan Legislature Sues Gov. Whitmer, Seekin...",The legislators say the governor is acting ill...,2020-05-06,Coronavirus Live Updates
3,5eb2fa9c28431efad3712d19,The Risk Of Coronavirus In Afghanistan's Priso...,The Taliban accuse the government of spreading...,2020-05-06,Coronavirus Live Updates
4,5eb2fa9c28431efad3712d1a,Officials Investigating Multiple COVID-19 Deat...,Multiple investigations are underway at the So...,2020-05-06,Coronavirus Live Updates


In [320]:
# Number of rows (documents) loaded into the DataFrame
data.shape[0]

27035

In [302]:
# Write the DataFrame (index included) to scraped_data.csv in the working directory
data.to_csv(path_or_buf='scraped_data.csv')