In [1]:
# -*- coding: utf-8 -*-
"""
Created on Oct 20, 2018
by olahosa

based on jasonweinreb's WSJ scraper:
https://github.com/jweinreb/python-wsj/blob/master/wsj-parser.py
"""

import csv
import datetime
import glob
import os
import re
import time

import numpy as np
import pandas as pd
from dateutil.parser import parse

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# company of interest (also used to name the output files)
comp_name = "21st Century Fox"

# Columbia login credentials (uni and password).
# They are read from the PROQUEST_LOGIN / PROQUEST_PASSWORD environment
# variables so that real credentials never have to be saved in the notebook;
# the placeholders below are used only when those variables are not set.
my_login = os.environ.get('PROQUEST_LOGIN', 'my_uni')
my_password = os.environ.get('PROQUEST_PASSWORD', 'my_Columbia_password')

# Get links to desired articles

In [3]:
def getPageUrl(elementLinks):
    """Return the ``href`` attribute of each element in ``elementLinks``.

    Every item is asked for ``get_attribute('href')``; the values are
    returned as a list in the same order as the input.
    """
    return [anchor.get_attribute('href') for anchor in elementLinks]

In [4]:
# download 'geckodriver' and provide path to it here
geckodriver_path = '/Users/ola/Desktop/CAP/geckodriver'
browser = webdriver.Firefox(executable_path=geckodriver_path)

# open the ProQuest home page (served through the ezproxy.cul.columbia.edu host)
proquest_home = 'https://search-proquest-com.ezproxy.cul.columbia.edu'
browser.get(proquest_home)

In [5]:
# login credentials
# `find_element(By..., ...)` is used instead of the `find_element_by_*`
# helpers, which newer Selenium releases no longer provide.
# `send_keys` returns None, so its result is not stored.
browser.find_element(By.ID, "username").send_keys(my_login)
browser.find_element(By.ID, "password").send_keys(my_password)
loginReady = browser.find_element(By.CLASS_NAME, "btn-submit")
loginReady.click()

In [6]:
# DO SELECTIONS MANUALLY in Advanced search

# - enter company name
# - select 'Full text'
# - Source type: select 'Historical Newspapers' and 'Newspapers'
# - Language: select 'English'
# - More options: Items per page - select 100; exclude duplicates
# - click 'Search'

# - Publication date: select desired range (enter range when company existed under name)
# - Location - select all variations of US, e.g. USA, United States etc.
# - enter 'Company/Organization' - the same as company name (select all name variations)
# - (Optional) Publication title (e.g. WSJ) - or don't select for all news

In [6]:
# get results count
# `find_element(By.XPATH, ...)` replaces `find_element_by_xpath`, which newer
# Selenium releases no longer provide.
results = browser.find_element(By.XPATH, '//div[@class="resultsPageSpecific"]').text
# the count is taken from the third space-separated token of that text,
# with thousands separators removed
results = results.split(' ')
resultCount = int(results[2].replace(',',''))
resultCount

26

In [7]:
# get page count: ceiling of resultCount / 100 (100 items per page are
# selected in the manual search setup), so an exact multiple of 100 no longer
# adds an extra, empty page; at least one page is always reported
pageCount = max(1, (resultCount + 99) // 100)
pageCount

1

In [8]:
# extract all article urls
articleLinks = []

count = 0   # result slots visited so far, across all pages
bad = 0     # result slots for which no link could be read

for i in range(pageCount):
    for j in range(100):
        # stop as soon as every reported result has been visited
        if count >= resultCount:
            break
        # XPath positions are 1-based: select the (j+1)-th title link on the page
        bub = '(//a[@id="citationDocTitleLink"])['+str(j+1)+']'
        try:
            elementLink = browser.find_elements(By.XPATH, bub)
            link = getPageUrl(elementLink)
            articleLinks.append(link[0])
        # skip articles with bad links (e.g. nothing found at this position,
        # so `link` is empty); `Exception` instead of a bare `except` lets
        # KeyboardInterrupt / SystemExit still stop the loop
        except Exception:
            print('\n', bad, 'bad link - article skipped')
            bad += 1
        count += 1

    print(str(i+1) + '/' + str(pageCount), end=', ')
    try:
        browser.find_element(By.XPATH, '//a[@title="Next page"]').click()
    except NoSuchElementException:
        # no "Next page" link was found on this page
        pass

1/1, 

In [9]:
# how many article links were collected?
print(len(articleLinks))

26


In [10]:
# save links to csv (one link per row under an 'articleLink' header)
out_file = comp_name+'_ProQuest_urls.csv'

# newline='' is what the csv module asks for so that it controls line endings
# itself (otherwise extra blank rows can appear on Windows); utf-8 keeps the
# file independent of the platform's default encoding
with open(out_file, "w", newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=",")
    writer.writerow(['articleLink'])
    writer.writerows([link] for link in articleLinks)

# Download articles

In [11]:
# read the list of links to download (same file name as used when saving the links)
out_file = comp_name+'_ProQuest_urls.csv'
df = pd.read_csv(out_file)

In [12]:
# show the last rows of the loaded links table
df.tail()

Unnamed: 0,articleLink
21,https://search-proquest-com.ezproxy.cul.columb...
22,https://search-proquest-com.ezproxy.cul.columb...
23,https://search-proquest-com.ezproxy.cul.columb...
24,https://search-proquest-com.ezproxy.cul.columb...
25,https://search-proquest-com.ezproxy.cul.columb...


In [13]:
# function to recognize a date in a string
def is_date(string):
    """Return True when ``dateutil.parser.parse`` can parse ``string``.

    Args:
        string: text to test.

    Returns:
        bool: True if parsing succeeds, False if it raises ``ValueError``
        or ``OverflowError``.
    """
    try:
        parse(string)
    # dateutil documents OverflowError (parsed date beyond the largest valid
    # C integer) next to ValueError; both mean "not a usable date" here
    except (ValueError, OverflowError):
        return False
    return True

# get a date from a string
def get_date_from_string(string):
    """Return the first ``day month year`` date found in ``string``.

    A date is a 1-2 digit day, a month name (abbreviated ``%b`` or full
    ``%B``) and a 4-digit year separated by whitespace, e.g. ``14 Oct 2018``
    or ``14 September 2018``.

    Args:
        string: text to search.

    Returns:
        datetime.datetime: the first date found, or the placeholder
        ``datetime.datetime(1000, 10, 10)`` when the text contains no date.
    """
    # placeholder returned when nothing in the text parses as a date
    no_date = datetime.datetime.strptime('10 10 1000', '%d %m %Y')

    # The lookahead produces a candidate at every start position (overlapping
    # candidates included), scanned left to right. Candidates have no length
    # cap, so long month names such as "14 September 2018" are found too.
    candidates = re.finditer(r'(?=(\d{1,2}\s+[^\W\d_]+\s+\d{4}))', string)
    for candidate in candidates:
        for fmt in ('%d %b %Y', '%d %B %Y'):
            try:
                return datetime.datetime.strptime(candidate.group(1), fmt)
            except ValueError:
                # not a valid date in this format; try the next one
                continue
    return no_date

In [14]:
# scrape article
def scrape(articles, browser, i):
    """Open the i-th article link and store its contents in ``articles[i]``.

    The URL is read from the module-level ``df`` (column ``articleLink``,
    row position ``i``).

    Args:
        articles: dict that receives the result under key ``i`` as a dict
            with the keys ``title``, ``abstract``, ``text`` and ``date``.
        browser: Selenium WebDriver used to load the page.
        i: row position of the article in ``df``.

    Raises:
        NoSuchElementException: if the page has no ``documentTitle`` element,
            or has neither a ``fullTextZoneId`` element nor a ``body``.
    """
    browser.get(df.articleLink.iloc[i])

    title = browser.find_element(By.ID, "documentTitle").text

    # get abstract if exists, if not save empty
    try:
        abstract = browser.find_element(By.CLASS_NAME, "abstract").text
    except NoSuchElementException:
        abstract = ''

    # prefer the full-text zone; fall back to the text of the whole page
    try:
        text = browser.find_element(By.ID, 'fullTextZoneId').text
    except NoSuchElementException:
        text = browser.find_element(By.CSS_SELECTOR, 'body').text

    # the date is searched for in the second "titleAuthorETC" span; when fewer
    # than two such spans exist, search the text of whatever was found instead
    # of failing with an IndexError
    details = browser.find_elements(By.XPATH, '//span[@class="titleAuthorETC"]')
    if len(details) > 1:
        details_text = details[1].text
    else:
        details_text = ' '.join(span.text for span in details)

    date = get_date_from_string(details_text)

    articles[i] = {
        "title": title,
        "abstract": abstract,
        "text": text,
        "date": date,
    }

In [15]:
# open a new browser window on the ProQuest home page and log in
# `find_element(By..., ...)` is used instead of the `find_element_by_*`
# helpers, which newer Selenium releases no longer provide.
# `send_keys` returns None, so its result is not stored.
browser = webdriver.Firefox(executable_path = '/Users/ola/Desktop/CAP/geckodriver')
browser.get('https://search-proquest-com.ezproxy.cul.columbia.edu')
browser.find_element(By.ID, "username").send_keys(my_login)
browser.find_element(By.ID, "password").send_keys(my_password)
loginReady = browser.find_element(By.CLASS_NAME, "btn-submit")
loginReady.click()

In [16]:
# download articles
articles = dict()

# give up on an article after this many failed attempts instead of retrying
# (and opening new browsers) forever
max_attempts = 5

for i in range(df.shape[0]):
    print(i, end=', ')

    for attempt in range(max_attempts):
        try:
            scrape(articles, browser, i)
            break
        # if you encounter a captcha: wait 5 seconds and open another browser
        except NoSuchElementException:
            print('catptcha - I sleep')
            time.sleep(5)

            # quit() ends the whole WebDriver session, so the driver started
            # for the old browser is not left behind
            browser.quit()

            browser = webdriver.Firefox(executable_path = '/Users/ola/Desktop/CAP/geckodriver')
            browser.get('https://search-proquest-com.ezproxy.cul.columbia.edu')

            browser.find_element(By.ID, "username").send_keys(my_login)
            browser.find_element(By.ID, "password").send_keys(my_password)
            loginReady = browser.find_element(By.CLASS_NAME, "btn-submit")
            loginReady.click()
    else:
        # every attempt failed: leave this article out of `articles` and go on
        print('\n', i, 'article skipped after', max_attempts, 'failed attempts')

0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 

In [17]:
# turn the scraped dict into a dataframe: one row per article, ordered by
# date, with the former dict key kept as an 'index' column
df_art = (
    pd.DataFrame.from_dict(articles, orient='index')
    .sort_values(by='date')
    .reset_index()
)

# write the dataframe to csv
out_file2 = comp_name+'_ProQuest_scrape.csv'
df_art.to_csv(out_file2)

df_art

Unnamed: 0,index,title,abstract,text,date
0,0,What skills will Tesla’s new chairman need to ...,,Elon Musk has a lot of titles: Chief executive...,2018-10-14
1,1,Sky CEO's €15m deal windfall: BRIEFS,,÷ SKY CEO Jeremy Darroch has pocketed £13.4m (...,2018-10-14
2,2,TV streaming giant casts its net ever wider: T...,,"Last week, reports emerged of what may be the ...",2018-10-14
3,3,Foxtel's 'critical' year: News Corp has Plan B,"On a recent trip to the US, Goldman Sachs anal...",News Corporation could cut its losses and sell...,2018-10-15
4,4,Direct streaming and Fox assets loom large for...,,Between the pending $US71.3 billion ingestion ...,2018-10-15
5,5,The Walt Disney Company Announces Amendments t...,,The Walt Disney Co. issued the following news ...,2018-10-15
6,6,Bank of America Is Missing Out on Wall Street'...,,Bank of America Corp.'s purchase of Merrill Ly...,2018-10-15
7,7,"After Years on the Right, Dennis Miller Wants ...",,"In his stand-up comedy special, ""Fake News, Re...",2018-10-15
8,8,"Venom wins box office, boosts Sony superhero s...",,"Venom, a character from the comic-book world o...",2018-10-16
9,9,Elon Musk fraud settlement approved by US judge,,Elon Musk’s settlement with a US regulator has...,2018-10-16


# Some checks

In [18]:
# read the saved scrape back from disk to check the csv
df_try = pd.read_csv(out_file2)

In [19]:
# print the text of the article in the third row
print(df_try.text.iloc[2])

Last week, reports emerged of what may be the world's first case of Netflix addiction. A 26-year-old man in Bangalore, India, checked himself into a clinic claiming he had suffered fatigue and eye strain through months of compulsive binge-watching.
It is something many of us could relate to. The video-streaming behemoth, which has had hit after hit with glossy TV shows such as Stranger Things and Orange is the New Black, has eaten up so many hours of subscribers' lives that there is seemingly little time left in the evening for it to conquer.
Reed Hastings, Netflix's chief executive, is fond of saying many users only tune out when it is well past bedtime. "You get a show or a movie you're really dying to watch, and you end up staying up late at night," Hastings said last year. "We actually compete with sleep. And we're winning!"
Winning is something Netflix is used to. The company, founded as a DVD rental service 21 years ago (it only started letting users watch shows over the internet

In [20]:
# show the date column as read back from the csv
df_try.date

0     2018-10-14
1     2018-10-14
2     2018-10-14
3     2018-10-15
4     2018-10-15
5     2018-10-15
6     2018-10-15
7     2018-10-15
8     2018-10-16
9     2018-10-16
10    2018-10-16
11    2018-10-16
12    2018-10-17
13    2018-10-18
14    2018-10-18
15    2018-10-18
16    2018-10-18
17    2018-10-18
18    2018-10-18
19    2018-10-18
20    2018-10-18
21    2018-10-18
22    2018-10-19
23    2018-10-19
24    2018-10-19
25    2018-10-20
Name: date, dtype: object