# Fox News Scraper
### You need Google Chrome to run this notebook

#### When you already have Chrome installed:
1. Go to the website: https://sites.google.com/chromium.org/driver/downloads?authuser=0
2. Download the ChromeDriver release whose version matches your installed Chrome
3. Double-click the downloaded driver to open it
4. Come back here and start running the notebook

In [None]:
# Necessary libraries
import csv
import datetime
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from requests import get
from requests.exceptions import RequestException
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Collected article URLs are stored as dictionary keys: a key can only exist
# once, so adding a URL that was already seen never produces a duplicate.
links = dict()

## Web-scrape links of news

In [None]:
# Fox political news has 7 sections, see the list "urls", each url is a section
# You can comment out those sections you don't want to web-scrape

urls = [#"https://www.foxnews.com/category/politics/executive",
#         "https://www.foxnews.com/category/politics/senate",
#         "https://www.foxnews.com/category/politics/house-of-representatives",
#         "https://www.foxnews.com/category/politics/judiciary",
#         "https://www.foxnews.com/category/politics/foreign-policy",
        "https://www.foxnews.com/category/politics/elections",
       "https://www.foxnews.com/category/columns/fox-news-poll"]

# Location of the chromedriver executable; change this to match your machine.
CHROMEDRIVER_PATH = '/Users/wufangzheng/Downloads/chromedriver'
# The link inside the first "load more" button on a section page.
LOAD_MORE_XPATH = "(//div[@class='button load-more js-load-more'])[1]/a"
# The link inside the first 'pf-widget-close' element (presumably the close
# control of an overlay that blocks the page -- confirm against the live site).
WIDGET_CLOSE_XPATH = "(//div[@class='pf-widget-close'])[1]/a"


def _collect_article_links(driver, links):
    """Add the article links currently shown on the page to `links`.

    Looks at the elements with class "m" inside the first "article-list"
    element and stores the href of the first <a> of each one as a key of
    `links`. Anchors without an href are ignored.

    Returns the number of links that were not already in `links`.
    Raises whatever Selenium raises when the list or an anchor is missing
    (IndexError when the page has no "article-list" element).
    """
    article_items = driver.find_elements(By.CLASS_NAME, "article-list")[0].find_elements(By.CLASS_NAME, "m")
    added = 0
    for item in article_items:
        link = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
        if link and link not in links:
            links[link] = 0
            added += 1
    return added


# For each section, this cell will open a chrome window and scrape news LINKS from the page.
# This cell may fail, when it fails, just re-run the cell.
# Basically, the code below keeps clicking "load more" and looks for news that appears.
for url in urls:
    driver = webdriver.Chrome(CHROMEDRIVER_PATH) #set the webdriver to Chrome driver
    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)
        print(url)
        # Failures are counted per section (not per retry), so that a second
        # failure actually reaches the "close the widget" branch below.
        err_count = 0
        for j in range(10):
            try:
                for i in range(1000):
                    element = wait.until(EC.visibility_of_element_located((By.XPATH, LOAD_MORE_XPATH)))
                    element.click()
                    if i%100 == 0:
                        print(i)
                    # Harvesting is done only every 20 clicks because it walks
                    # the whole, ever-growing article list.
                    if i%20 == 0:
                        count = _collect_article_links(driver, links)
                        print("Added " + str(count) + " new article links")
            # Exception (not BaseException) so that interrupting the kernel
            # still stops the cell instead of being swallowed and retried.
            except Exception as err:
                err_count += 1
                print(err)
                if err_count > 1:
                    # Repeated failure: try to close the widget before retrying.
                    # A failure here must not abort the whole section.
                    try:
                        closeButton = wait.until(EC.visibility_of_element_located((By.XPATH, WIDGET_CLOSE_XPATH)))
                        closeButton.click()
                    except Exception as close_err:
                        print(close_err)
                    err_count = 0
                time.sleep(5)

        # Final harvest: picks up the links loaded since the last "every 20
        # clicks" harvest, which would otherwise be lost.
        try:
            count = _collect_article_links(driver, links)
            print("Added " + str(count) + " new article links")
        except Exception as err:
            print(err)
    finally:
        # quit() ends the browser session and the chromedriver process, even
        # when the section failed part-way through.
        driver.quit()

In [None]:
# Report how many links were collected and preview the first few, instead of
# dumping the whole dictionary (potentially thousands of URLs) into the output.
print(len(links))
print(list(links)[:10])

In [None]:
# Spot-check a single collected link by its position.
# The position is checked first so the cell reports the problem instead of
# raising IndexError when fewer links were scraped.
SAMPLE_INDEX = 15060
collected_links = list(links)
if SAMPLE_INDEX < len(collected_links):
    print(collected_links[SAMPLE_INDEX])
else:
    print("Only " + str(len(collected_links)) + " links collected; there is no link at index " + str(SAMPLE_INDEX))

In [None]:
# Store the links, one per row.
# The rows come straight from the keys of `links`, so this cell works on a
# fresh kernel without relying on a variable created by a later cell.
# newline="" lets the csv module control line endings itself (otherwise blank
# rows appear on some platforms); utf-8 keeps the file portable.
with open("Fox News Links.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    for link in links:
        writer.writerow([link])

In [None]:
# Take a list snapshot of the collected links (the dictionary keys)
# and report how many there are.
links_list = [*links]
print(len(links_list))

In [None]:
# Remove the links that start with "https://video.foxnews.com",
# as they do not contain articles
print(len(links_list))
# count_list holds the video links that are being dropped.
count_list = [link for link in links_list if link.startswith('https://video.foxnews.com')]
# Slice assignment filters the same list object in a single pass; calling
# list.remove() once per video link rescans the list every time.
links_list[:] = [link for link in links_list if not link.startswith('https://video.foxnews.com')]
print(len(links_list))

## Web-scrape article text from those links

In [276]:
# Open every link from the links, and web-scrape text from it, and store the data to FOX_articles.csv

# The data we scrape: title, text (article body), date
# The output data will have the format of: index, date, title, text, label(left/lean left/neutral/lean right/right), link

MAX_RETRIES = 10               # a link is skipped once it has failed more than this many times
RETRY_DELAY_SECONDS = 5        # pause between two attempts on the same link
REQUEST_TIMEOUT_SECONDS = 30   # give up on a single request that hangs

# "<number> <unit>" as found in relative timestamps such as "3 hours ago".
_RELATIVE_STAMP = re.compile(r'(\d+)\s+(sec|min|hour|day)')
_TIMEDELTA_ARG = {'sec': 'seconds', 'min': 'minutes', 'hour': 'hours', 'day': 'days'}


def _fetch_page(link):
    """Download `link`, retrying when the request fails.

    Returns the response as soon as the server answers with status 200.
    Returns None once the link has failed more than MAX_RETRIES times
    (a non-200 answer and a network error both count as a failure).
    """
    err_count = 0
    while True:
        try:
            page = get(link, timeout=REQUEST_TIMEOUT_SECONDS)
            if page.status_code == 200:
                return page
            print("Access denied! Reconnect in 5 seconds...")
        except RequestException as err:
            # Timeouts / connection errors are retried like a denied request
            # instead of aborting the whole run.
            print(err)
        err_count += 1
        if err_count > MAX_RETRIES:
            return None
        time.sleep(RETRY_DELAY_SECONDS)


def _parse_article_date(raw, now):
    """Turn the text of an article's <time> element into a datetime at midnight.

    Handled shapes (surrounding/repeated whitespace is ignored):
      * relative: "<n> seconds/minutes/hours/days ago" (singular or plural),
        counted back from `now`
      * month, day and year: "November 26, 2020" or "Nov 26, 2020"
      * month and day only: "December 12" or "Dec 12"; the most recent such
        date that is not after `now` is used (this year, else last year)

    Returns None when the text matches none of these shapes.
    """
    stamp = " ".join(raw.split())
    midnight = datetime.datetime(now.year, now.month, now.day)

    relative = _RELATIVE_STAMP.search(stamp.lower())
    if relative:
        amount = int(relative.group(1))
        moment = now - datetime.timedelta(**{_TIMEDELTA_ARG[relative.group(2)]: amount})
        return datetime.datetime(moment.year, moment.month, moment.day)

    for fmt in ('%B %d, %Y', '%b %d, %Y'):
        try:
            return datetime.datetime.strptime(stamp, fmt)
        except ValueError:
            pass

    # No year in the timestamp. Re-parsing with the previous year (rather
    # than subtracting 365 days) stays correct across leap days, and a
    # "February 29" stamp is still accepted when only one of the two years
    # is a leap year.
    for year in (now.year, now.year - 1):
        for fmt in ('%Y %B %d', '%Y %b %d'):
            try:
                parsed = datetime.datetime.strptime(str(year) + " " + stamp, fmt)
            except ValueError:
                continue
            if parsed <= midnight:
                return parsed
    return None


current_time = time.time()
count = 0   # number of articles written so far; also used as the row index
# The file is opened once, in append mode, for the whole run. newline="" lets
# the csv module control line endings (article text contains newlines).
with open("FOX_articles.csv", "a", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    for link in links_list:
        # If a link is denied, wait for 5 secs and try again.
        # If the link is denied more than 10 times, skip this link
        page = _fetch_page(link)
        if page is None:
            print("This link is not found:")
            print(link)
            continue

        # Get the html source code of this article
        soup = BeautifulSoup(page.content, 'html.parser')
        # If there is no content, skip
        article = soup.find('div', class_='article-body')
        if not article:
            print("This link does not contain article:")
            print(link)
            continue

        # Get text: direct <p> children of the body, leaving out every
        # paragraph that contains a <strong> element.
        paras = article.find_all('p', recursive=False)
        text = "".join(para.text + '\n' for para in paras if not para.find("strong"))

        # Get date. The current time is read per article because relative
        # timestamps ("3 hours ago") refer to the moment the page is fetched
        # and the whole run can take hours.
        date_box = soup.find('div', class_='article-date')
        time_tag = date_box.find('time') if date_box else None
        date = _parse_article_date(time_tag.text, datetime.datetime.now()) if time_tag else None
        if date is None:
            print("This link does not contain a readable date:")
            print(link)
            continue

        # Store the data
        count += 1
        title = soup.title.text if soup.title else ""
        writer.writerow([str(count),
                         date.strftime("%Y/%m/%d"),
                         title,
                         text,
                         "right",
                         link])
        # Flush after every row so an interrupted run keeps what it scraped.
        csvfile.flush()
        if count%1000 == 0:
            print(count)
            print("Time Spent: " + str(time.time()-current_time))
            current_time = time.time()
print(count)

1000
Time Spent: 524.9663081169128
This link does not contain article:
https://www.foxnews.com/us/washingtons-security-preps-high-gear-inauguration
2000
Time Spent: 531.1011960506439
Access denied! Reconnect in 5 seconds...
Access denied! Reconnect in 5 seconds...
Access denied! Reconnect in 5 seconds...
Access denied! Reconnect in 5 seconds...
Access denied! Reconnect in 5 seconds...
Access denied! Reconnect in 5 seconds...
Access denied! Reconnect in 5 seconds...
Access denied! Reconnect in 5 seconds...
Access denied! Reconnect in 5 seconds...
Access denied! Reconnect in 5 seconds...
Access denied! Reconnect in 5 seconds...
This link is not found:
https://www.foxnews.com/fox-friends/bill-bennett-blasts-bidens-foreign-policy-claims-its-speak-softly-carry-no-stick
3000
Time Spent: 616.6402521133423
4000
Time Spent: 543.0081729888916
This link does not contain article:
https://www.foxnews.com/politics/fox-news-poll-8-15
This link does not contain article:
https://www.foxnews.com/politic

This link does not contain article:
https://www.foxnews.com/politics/fox-news-poll-aug-31-2016
This link does not contain article:
https://www.foxnews.com/politics/fox-news-poll-aug-4-2016
This link does not contain article:
https://www.foxnews.com/politics/fox-news-poll-trump-dominates-gop-race-in-florida
This link does not contain article:
https://www.foxnews.com/politics/fox-news-poll-kasich-ahead-in-ohio
This link does not contain article:
https://www.foxnews.com/politics/fox-news-poll-march-16-2017
This link does not contain article:
https://www.foxnews.com/politics/fox-news-poll-march-2-2017
This link does not contain article:
https://www.foxnews.com/politics/fox-news-poll-obama-has-edge-over-romney-in-ohio
This link does not contain article:
https://www.foxnews.com/politics/fox-news-poll-romney-edges-obama-as-approval-of-president-drops
This link does not contain article:
https://www.foxnews.com/politics/fox-news-poll-68-percent-concerned-about-their-health-care-under-new-law
Th

In [277]:
print(count)


14262
