In [1]:
# -*- coding: utf-8 -*-
"""
Created on Oct 20, 2018
by olahosa

based on jasonweinreb's WSJ scrapper: 
https://github.com/jweinreb/python-wsj/blob/master/wsj-parser.py
"""

import pandas as pd
import numpy as np
import datetime, re, time, random

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

In [2]:
def scrape_NYT(to_search, start_date, end_date):
    """
    Scrape NYT for news headlines and summaries.
    inputs:
    - browser -> Firefox browser object (geckodriver) 
    with website with executed 1st page of search results in NYT search tool
    - to_search -> str, search term used
    outputs: df -> with links, dates, headline text, summary text for news
    also saves the df as .csv
    """
    
    # get browser to do the search, provide a path to geckodriver you have donloaded here!
    browser = webdriver.Firefox(executable_path='/Users/olahosa/Desktop/CAP/geckodriver')
    # construct a query to pass to hte NYT search tool
    query = '%22' + to_search.replace('&', '%26').replace(' ', '%20') + '%22'
    # construct the website that would result from entering that query in NYT search tool
    website = 'https://www.nytimes.com/search?endDate=' + end_date + '&query=' + query + '&sort=best&startDate=' + start_date
    # open that website
    browser.get(website)
    # get the number of results as appear on NYT website after search executed
    # use xpath to access it
    num = browser.find_element_by_xpath('//*[@id="site-content"]/div/div/div[1]/p').text.split()[1]
    # clean the result of the xpath result to be only numerical and cast to int
    num = int(re.sub('[^0-9]','', num))

    print(to_search, start_date, end_date, num)

    # init lists to store data
    links, summaries, headlines = [], [], []

    i = 1
    ad = 0
    ad_record = [0]*6

    while True:
        
        time.sleep(random.randint(30,40))
        
        try:
            # scroll down the page
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # wait a bit
            time.sleep(random.randint(1,3))
            # click on 'show more' botton
            showmore = browser.find_element_by_class_name('Search-showMore--Plid0')
            showmore = showmore.find_element_by_tag_name('button')
            showmore.click()
        # if there is no 'show more' button - got to the end of search results or error
        except NoSuchElementException:
            print('No more show more button')
            break
        # if there was 'show more'
        else:
            # get number of search results on the augmented page
            element_num = len(browser.find_elements_by_xpath('//*[@id="site-content"]/div/div/div[2]/div[1]/ol/li'))

            # for every new item (not collected yet)
            for j in range(i, element_num+1):
                try: 
                    # get link, summary and headline
                    link = browser.find_elements_by_xpath(
                        '//*[@id="site-content"]/div/div/div[2]/div[1]/ol/li['+str(j)+']/div/div/a')[0].get_attribute('href')
                    summary = browser.find_elements_by_xpath(
                        '//*[@id="site-content"]/div/div/div[2]/div[1]/ol/li['+str(j)+']/div/div/a/p')[0].text
                    headline = browser.find_elements_by_xpath(
                        '//*[@id="site-content"]/div/div/div[2]/div[1]/ol/li['+str(j)+']/div/div/a/h4')[0].text

                    # only add newly captured stuff if it's not already in our collected list
                    if link not in links:
                        links.append(link)
                        summaries.append(summary)
                        headlines.append(headline)
                    else:
                        print('c!', end=' ')

                # ads don't have links, summaries and headlines
                except:
                    ad += 1

                # inc num of elements
                i += 1

                # print current time every 100 records
                if i%100==0:
                    print(str(datetime.datetime.now()).split(' ')[1][:5], end=', ')

            # work-around for when the bot get stuck on the same page
            # by hte number of ads recognize if anything changes
            # and wait longer the more nothing changes
            if ad==ad_record[-5]:
                time.sleep(random.randint(60,70))
                print('show more fails so sleep a bit')
            elif ad==ad_record[-3]:
                time.sleep(random.randint(40,50))
                print('show more fails so sleep a bit')
            elif ad==ad_record[-2]:
                time.sleep(random.randint(30,40))
                print('show more fails so sleep a bit')
                
            # update number of adds record for the above waiting procedure
            ad_record.append(ad)

            # print number of adds - proxy for augmentations
            # 1 ad per a augmentation: 1 ad per 10 news
            print(ad, end=', ')

    # -----------------------------------------------------------------------
    # POST-PROCESSING
    
    # print total num of ads
    print('num ads:', ad)

    # detect dates in links
    dates = []

    for i, link in enumerate(links):
        try:
            date = re.search(r'\d{4}/\d{2}/\d{2}', str(link))[0]
            dates.append(date)
        except:
            print('\n'+link)
            dates.append('')

    print('Len of links %d, headlines %d, summaries %d and dates %d' % 
          (len(links), len(headlines), len(summaries), len(dates)))
    print('Num unique links %d, headlines %d, summaries %d and dates %d' %
          (len(set(links)), len(set(headlines)), len(set(summaries)), len(set(dates))))

    # make df with all data
    my_df = pd.DataFrame(
        {'link': links,
         'headline': headlines,
         'summary': summaries,
         'date': dates
        })

    # sort by date
    my_df = my_df.sort_values(by='date')

    # save df as csv
    time_span = start_date + '_' + end_date
    my_df.to_csv(to_search.replace(' ','-')+'_'+time_span+'_NYT_url-head-sum.csv')
    
    return my_df

In [3]:
# make a list of comany names and associated lists of:
# - search terms that were used for scraping
# - search intervals for these search terms

comp_names = ['21st-Century-Fox', 'AT&T', 'CBS', 'Comcast', 'Verizon']

all_search_terms = [['News Corp', '21st Century Fox', ], 
                    ['Southwestern Bell', 'SBC', 'AT&T'],
                    ['Viacom', 'CBS'], 
                    ['Comcast_1', 'Comcast_2'], 
                    ['Bell Atlantic', 'Verizon']]

all_search_dates = [[('20031124', '20130705'), ('20130617', '20180831')],
                    [('19930315', '19950509'), ('19941108', '20051121'), ('20051105', '20180831')],
                    [('19930112', '20060109'), ('20060105', '20180831')],
                    [('20011030', '20030110'), ('20021030', '20180831')],
                    [('19930121', '20001114'), ('20000908', '20180831')]]

In [4]:
# for each company (and associated search terms and search time)
for comp_name, search_terms, search_dates in zip(comp_names, all_search_terms, all_search_dates):
    
    for search_term, search_date in zip(search_terms, search_dates):
        
        start_date = search_date[0]
        end_date = search_date[1]
        
        # work-around to deal with the fact that search terms for some companies are the same
        if search_term[-2] == '_':
            to_search = search_term[:-2]
        else:
            to_search = search_term
        
        scrape_NYT(to_search, start_date, end_date)

Comcast 20180801 20181001 27
2, 2, No more show more button
num ads: 2
Len of links 20, headlines 20, summaries 20 and dates 20
Num unique links 20, headlines 20, summaries 20 and dates 13
CBS 20180802 20181011 210
2, 3, 4, 5, 6, 7, 8, 21:02, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 21:09, 19, 20, 21, No more show more button
num ads: 21

https://www.nytimes.com/video/opinion/100000005884108/trump-maga-not-the-way-he-thinks.html

https://www.nytimes.com/paidpost/cbs/why-are-murder-rates-so-high-in-the-rust-belt.html

https://www.nytimes.com/video/opinion/100000005831656/earthrise.html
Len of links 210, headlines 210, summaries 210 and dates 210
Num unique links 210, headlines 208, summaries 192 and dates 63
