# NBC News Scraper
### You need Google Chrome to run this notebook

#### When you already have Chrome installed:
1. Go to the website: https://sites.google.com/chromium.org/driver/downloads?authuser=0
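2. Download the ChromeDriver release whose version matches your installed Chrome
3. Double-click the downloaded driver to open it
4. Come back here, point the ChromeDriver path in the link-scraping cell at the driver you downloaded, and start running the notebook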

In [41]:
# Necessary libraries
import csv
import datetime
import random
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from requests import get
from requests.exceptions import RequestException
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Dictionary holding the news links collected by the scraping cell below.
# Links are stored as keys (the value is a placeholder 0); because dictionary
# keys are unique, the same link is never stored twice.
# NOTE: re-running this cell resets the dictionary and discards collected links.
article_links = {}

## Web-scrape links of news

In [45]:
# NBC keeps a news archive with one page per year, which makes link collection simple.
# The archive URL is "https://www.nbcnews.com/archive/articles/<year>", e.g. .../2018.
#
# For each year this cell opens a Chrome window, opens every month listed on the
# year page, pages through that month, and stores every politics link in article_links.
# This cell may fail; when it fails, just re-run the cell. Links collected so far
# stay in article_links, and the browser window is now always closed on failure.
#
# NOTE(review): the find_element_by_* helpers and the positional driver path are the
# Selenium 3 API this notebook was written against; newer Selenium releases dropped them.

# Location of the ChromeDriver binary -- change this to where you saved the driver.
CHROMEDRIVER_PATH = '/Users/wufangzheng/Downloads/chromedriver'
# Only links starting with this prefix are kept.
POLITICS_PREFIX = 'https://www.nbcnews.com/politics/'

for i in reversed(range(2010,2021)):
    url = "https://www.nbcnews.com/archive/articles/" + str(i)
    driver = webdriver.Chrome(CHROMEDRIVER_PATH) #set the webdriver to Chrome driver
    try:
        driver.get(url)
        time.sleep(3)
        months = driver.find_element_by_class_name("YearPage").find_elements_by_tag_name("li")
        months_len = len(months)
        for j in range(months_len):
            # Look the month entry up again on every pass: the page has been
            # navigated away from and reloaded since the previous lookup.
            driver.find_element_by_class_name("YearPage").find_elements_by_tag_name("li")[j].find_element_by_tag_name("a").click()
            time.sleep(3)
            not_politics = 0
            politics = 0
            while True:
                anchors = driver.find_element_by_class_name("MonthPage").find_elements_by_tag_name("a")
                for item in anchors:
                    link = item.get_attribute("href")
                    if link is None:
                        # Anchor without an href: nothing to classify or store.
                        continue
                    if link.startswith(POLITICS_PREFIX):
                        politics+=1
                        # setdefault keeps the first entry, so duplicates are ignored
                        article_links.setdefault(link, 0)
                    else:
                        not_politics+=1

                # A month has another page only when the "next" control exists AND is a link.
                try:
                    next_button = driver.find_element_by_class_name("Pagination__next")
                except NoSuchElementException:
                    next_button = None
                if next_button is not None and next_button.tag_name == "a":
                    next_button.click()
                    time.sleep(3)
                    continue

                # Last page of this month: report the counts and return to the year page.
                print(driver.current_url)
                print(str(politics) + " " + str(not_politics) + " " + str(politics + not_politics))
                # Reload the year page directly instead of replaying browser history,
                # so the return trip does not depend on how many pages were visited.
                driver.get(url)
                time.sleep(2)
                break

        time.sleep(5)
    finally:
        # Always release the browser and driver process, even when scraping raised.
        driver.quit()
    time.sleep(5)

https://www.nbcnews.com/archive/articles/2017/january/2
244 1459 1703
https://www.nbcnews.com/archive/articles/2017/february/2
269 1375 1644
https://www.nbcnews.com/archive/articles/2017/march/2
342 1522 1864
https://www.nbcnews.com/archive/articles/2017/april/2
208 1420 1628
https://www.nbcnews.com/archive/articles/2017/may/2
241 1480 1721
https://www.nbcnews.com/archive/articles/2017/june/2
276 1397 1673
https://www.nbcnews.com/archive/articles/2017/july/2
234 1088 1322
https://www.nbcnews.com/archive/articles/2017/august/2
239 1094 1333
https://www.nbcnews.com/archive/articles/2017/september
237 924 1161
https://www.nbcnews.com/archive/articles/2017/october/2
234 1141 1375
https://www.nbcnews.com/archive/articles/2017/november/2
239 1084 1323
https://www.nbcnews.com/archive/articles/2017/december
229 953 1182
https://www.nbcnews.com/archive/articles/2016/january/3
402 2122 2524
https://www.nbcnews.com/archive/articles/2016/february/2
521 1957 2478
https://www.nbcnews.com/archive/art

In [48]:
# Freeze the collected URLs (the dictionary keys, in insertion order) into an
# indexable list, then display one entry as a quick sanity check.
links = list(article_links)
links[1000]

'https://www.nbcnews.com/politics/politics-news/culture-wars-strain-once-unshakeable-bond-between-republicans-corporate-america-n1262797'

## Web-scrape article text from those links

In [94]:
# Open every link in `links`, scrape the article from it, and append the data to NBC_articles.csv
#
# The data we scrape: title, text (article body), date
# The output data will have the format of: index, date, title, text, label(left/lean left/neutral/lean right/right), link
#
# Prerequisites: `links` must have been built from article_links, and the cell that
# defines convertdate() must have been run before this one.
#
# This cell may also fail
# Every time it fails, just re-run this cell. It will automatically continue with the remaining links

# Seconds to wait for a response before treating the request as a failed attempt.
REQUEST_TIMEOUT = 30

# Resume bookkeeping:
#   real_count = number of links consumed so far (position in `links`)
#   count      = index written to the CSV for the current article
# Both are bumped at the top of every iteration, so after a crash the link that
# crashed has already been counted: the re-run starts after it and `count` is
# pulled back by one so that link does not use up a CSV index.
try:
    difference = real_count - count
except NameError:
    # First run in a fresh kernel: nothing has been processed yet.
    real_count = 0
    count = 0
    difference = 0
current_time = time.time()
start_point = real_count
count = start_point - difference -1
real_count = start_point
for link in links[start_point:]:
    real_count +=1
    count+=1

    # If a link is denied (non-200 status, timeout, or connection error),
    # wait for 5 secs and try again.
    # If the link is denied more than 10 times, skip this link
    err_count = 0
    while True:
        try:
            page = get(link, timeout=REQUEST_TIMEOUT)
        except RequestException:
            # Network-level failure: handled like a denied request and retried.
            page = None
        if page is not None and page.status_code == 200:
            break
        print("Access denied! Reconnect in 5 seconds...")
        err_count+=1
        time.sleep(5)
        if err_count > 10:
            break
    if err_count>10:
        print("This link is not found:")
        print(link)
        count-=1
        continue

    # Get the html source code of this article
    soup = BeautifulSoup(page.content, 'html.parser')
    # If there is no content, skip
    article = soup.find('div', class_='article-body__content')
    if not article:
        count-=1
        print("This link does not contain article:")
        print(link)
        continue

    # Get text: paragraphs and links contribute one line each,
    # bullet lists contribute one line per child of the list.
    text = ""
    for item in article.children:
        if item.name == "p" or item.name == "a":
            text+=item.text+"\n"
        elif item.name == "ul":
            for bp in item:
                text+=bp.text+"\n"

    # Get date: try the two containers this cell knows about, then read the
    # datetime attribute of the <time> tag inside whichever one was found.
    date = soup.find('div', class_ = "article-body__date-source")
    if not date:
        date = soup.find('div', class_="article-hero-headline__timestamp")
    time_tag = date.find('time') if date else None
    raw_date = time_tag.get('datetime') if time_tag is not None else None
    if not raw_date:
        # No container, no <time> tag, or no datetime attribute.
        count-=1
        print("error date")
        print(link)
        continue
    date = convertdate(raw_date)
    if date == "error":
        print("date is not scraped")
        print(link)
        count-=1
        continue

    # A page without a <title> tag gets an empty title instead of crashing the run.
    title = soup.title.text if soup.title is not None else ""

    # Store the data.
    # newline="" is what the csv module requires of the file object it writes to;
    # the explicit encoding keeps non-ASCII article text writable on every platform.
    with open("NBC_articles.csv", "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([str(count),
                         date,
                         title,
                         text,
                         "lean left",
                         link])
    if count%200 == 0:
        print(count)
        print("Time Spent: " + str(time.time()-current_time))
        current_time = time.time()

9200
Time Spent: 36.492724895477295
9400
Time Spent: 134.10739493370056
9600
Time Spent: 94.50657033920288
This link does not contain article:
https://www.nbcnews.com/politics/politics-news/trump-signs-massive-two-year-budget-deal-law-n1038786
9800
Time Spent: 142.48645401000977
10000
Time Spent: 137.95052433013916
This link does not contain article:
https://www.nbcnews.com/politics/2019-special-elections/north-carolina-results
10200
Time Spent: 108.79192519187927
10400
Time Spent: 97.67095184326172
10600
Time Spent: 126.5882248878479
10800
Time Spent: 129.21596384048462
error date
https://www.nbcnews.com/politics/2020-election/stage-set-msnbc-wapo-debate-catch-candidates-n1083941
This link does not contain article:
https://www.nbcnews.com/politics/2019-general-elections/kentucky-results
This link does not contain article:
https://www.nbcnews.com/politics/2019-general-elections/mississippi-results
11000
Time Spent: 141.2536609172821
error date
https://www.nbcnews.com/politics/trump-imp

In [95]:
# Report where the scrape stands: the last CSV index used, the number of
# links consumed, and the most recently processed link.
print(count)
print(real_count)
last_link = links[real_count - 1]
print(last_link)

28093
28377
https://www.nbcnews.com/politics/politics-news/toon-housing-bust-flna1C7101183


In [75]:
# The function used when scraping date from websites

# Month abbreviation -> zero-padded month number.
_MONTH_NUMBERS = {
    "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
    "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
    "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
}


def convertdate(datestr):
    """Convert a scraped date string to "YYYY/MM/DD".

    Two input layouts are understood:

    * numeric, dash-separated dates at the start of the string, e.g.
      "2019-08-02T15:30:00.000Z" -> "2019/08/02";
    * whitespace-separated text whose 2nd, 3rd and 4th tokens are the month
      abbreviation, day and year, e.g.
      "Mon Jun 10 2019 20:45:27 GMT-0400 (EDT)" -> "2019/06/10".

    The layout is decided by whether the string *starts* with a numeric
    date, not by whether it contains a "-" anywhere, so a negative time-zone
    offset in the textual layout no longer sends it down the wrong branch.

    Parameters:
        datestr: the raw date string.

    Returns:
        The date as "year/month/day", or the string "error" (after printing
        "Error!") when the input is not a string, is too short, or has an
        unknown month abbreviation.
    """
    if not isinstance(datestr, str):
        print("Error!")
        return "error"
    datestr = datestr.strip()

    # Numeric layout: take the three leading dash-separated numbers as-is.
    numeric = re.match(r"(\d+)-(\d+)-(\d+)", datestr)
    if numeric:
        return "/".join(numeric.groups())

    # Textual layout: <weekday> <month> <day> <year> ...
    parts = datestr.split()
    if len(parts) < 4 or parts[1] not in _MONTH_NUMBERS:
        print("Error!")
        return "error"
    return parts[3] + "/" + _MONTH_NUMBERS[parts[1]] + "/" + parts[2]