In [1]:
# -*- coding: utf-8 -*-
"""
Created on Oct 20, 2018
by olahosa

based on jasonweinreb's WSJ scraper:
https://github.com/jweinreb/python-wsj/blob/master/wsj-parser.py
"""

import csv
import datetime
import glob
import os
import re

import numpy as np
import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Enter the company name here (kept inside double quotes, as in the original
# query string) and the time period of interest (written as YYYY/MM/DD).

comp_name = '"Comcast"'
start_date = "2018/08/01"
end_date = "2018/10/10"

# Login info (from your WSJ membership).
# Credentials are read from the WSJ_LOGIN / WSJ_PASSWORD environment variables
# so that real secrets never have to be saved inside the notebook; the
# placeholder values below are only used when the variables are not set.
my_login = os.environ.get('WSJ_LOGIN', 'xxx@columbia.edu')
my_password = os.environ.get('WSJ_PASSWORD', 'password')

# Get links to desired WSJ articles

In [3]:
def getPageUrl(elementLinks):
    """Return the ``href`` attribute of every element in ``elementLinks``.

    Parameters
    ----------
    elementLinks : iterable
        Objects exposing ``get_attribute(name)`` (the notebook passes the
        anchor elements found on a search-results page).

    Returns
    -------
    list
        One entry per input element, in the same order, holding whatever
        ``get_attribute('href')`` returned for it.
    """
    return [anchor.get_attribute('href') for anchor in elementLinks]

In [4]:
# Download 'geckodriver' and provide the path to it here.
browser = webdriver.Firefox(executable_path = '/Users/ola/Desktop/CAP/geckodriver')

# Load the WSJ markets home page.
browser.get('http://markets.wsj.com/?mod=Homecle_MDW_MDC')

# ## login Credentials
# login = browser.find_element_by_link_text("Log In").click()
# loginID = browser.find_element_by_id("username").send_keys(my_login)
# loginPass = browser.find_element_by_id("password").send_keys(my_password)
# loginReady = browser.find_element_by_class_name("solid-button")
# loginReady.submit()

# Search for the company name. `until` returns the located element, so the
# search box does not have to be looked up a second time.
search_box = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.ID, "globalHatSearchInput")))
search_box.clear()
search_box.send_keys(comp_name) # Input search keyword

# A bare `WebDriverWait(browser, 5)` only builds a wait object and returns
# immediately; call `.until(...)` so we really wait (up to 5 s) for the search
# button to become clickable before clicking it.
search_button = WebDriverWait(browser, 5).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '.button-search')))
search_button.click()

In [5]:
# DO MANUALLY - close the cookie policy popup in the Firefox window.
## The commented-out code below was an attempt to close it automatically.
# try:
#     browser.find_element_by_class_name("close").click()
# except NoSuchElementException:
#     print('Cookie agreement already acknowledged')

In [7]:
# Open the advanced search panel and enter the desired time period.
browser.find_element_by_link_text("ADVANCED SEARCH").click()

# Looking up the date-period element raises NoSuchElementException if the
# panel is missing, so this lookup is kept even though the element itself
# is not used afterwards.
menuOptions = browser.find_element_by_class_name('datePeriod')

# Fill the "from" field first, then the "to" field.
for date_field, date_text in (("sfrom", start_date), ("sto", end_date)):
    browser.find_element_by_name(date_field).send_keys(date_text)

In [8]:
# Restrict the search to articles whose subject is the company: type the
# company name into the form field with id 'metadata'.
# NOTE(review): presumably this field only exists once the advanced search
# panel has been opened; the lookup raises NoSuchElementException otherwise.
browser.find_element_by_id('metadata').send_keys(comp_name)

In [9]:
# Restrict the search to articles only (exclude videos, blogs, etc).
# Scroll down first, then click each source toggle in the original order.
browser.execute_script("window.scrollTo(0, 500)")
for source_label in ("WSJ Blogs", "WSJ Videos", "WSJ Site Search"):
    browser.find_element_by_link_text(source_label).click()

In [10]:
# Scroll back to the top and press the search button that sits inside the
# keyword search bar.
browser.execute_script("window.scrollTo(0, 0)")
searchArchive = browser.find_element_by_class_name('keywordSearchBar')
search_button = searchArchive.find_element_by_class_name("searchButton")
search_button.click()

In [11]:
# Get the number of result pages and the total number of results.
def _trailing_int(label):
    """Return the last integer that appears in ``label``.

    Thousands separators (``1,234``) are accepted. Raises ``ValueError`` when
    the text contains no digits, as ``int()`` on the raw text would.
    """
    numbers = re.findall(r'\d[\d,]*', label)
    if not numbers:
        raise ValueError('no number found in results-count text: %r' % label)
    return int(numbers[-1].replace(',', ''))

# Query the page once: element [0] carries the result count and element [1]
# the page count (e.g. "of 1").
count_labels = [el.text for el in browser.find_elements_by_class_name("results-count")]
if len(count_labels) < 2:
    raise IndexError("expected two 'results-count' elements, found %d "
                     "(did the search return any results?)" % len(count_labels))

resultCount = _trailing_int(count_labels[0])
pageCount = _trailing_int(count_labels[1])

print(pageCount)

1


In [12]:
## Extract all article urls (one list of hrefs per results page).
articleLinks = []

for j in range(0, pageCount):
    elementLinks = browser.find_elements_by_xpath('//h3[@class="headline"]/a')
    articleLinks.append(getPageUrl(elementLinks))
    print('done with page ' + str(j+1) + ' of ' + str(pageCount))

    # Nothing to click after the last page.
    if j + 1 >= pageCount:
        break

    try:
        browser.find_element_by_class_name("next-page").click()
    except NoSuchElementException:
        # Without a next-page link the browser stays on the current page;
        # carrying on would scrape the same links again and create duplicates.
        print('no next-page link after page ' + str(j+1) + '; stopping early')
        break
    # NOTE(review): there is no explicit wait for the next page to load after
    # the click - confirm the results are refreshed before the next lookup.

done with page 1 of 1


In [13]:
# Flatten the per-page lists into one flat list of urls. Entries that are not
# lists/tuples are kept as they are, so running this cell a second time does
# not split already-flattened url strings into single characters.
articleLinks = [
    url
    for page in articleLinks
    for url in (page if isinstance(page, (list, tuple)) else [page])
]

In [14]:
# Write the list of urls to a csv file for later use. The file name is built
# from the company name (quotes and spaces removed) and the two dates
# (slashes removed).

out_file = comp_name.strip('"').replace(" ", "")+"_"+start_date.replace("/", "")+"_"+end_date.replace("/", "")+"_urls.csv"

# newline="" lets the csv module control line endings itself; without it an
# extra blank row is written after every record on Windows.
with open(out_file, "w", newline="") as csvfile:
    writer = csv.writer(csvfile, delimiter=",")
    writer.writerow(['articleLink'])
    writer.writerows([link] for link in articleLinks)

# Read the urls back; the download loop iterates over df.articleLink.
df = pd.read_csv(out_file)

# Download desired articles

In [15]:
# Open a new Firefox session on the WSJ homepage for the download step
# (adjust the path to geckodriver); the login itself happens in the next cell.
# NOTE(review): `browser` is rebound here without calling quit() on the
# session created earlier for the search, so that window appears to stay
# open - confirm whether it should be closed first.
browser = webdriver.Firefox(executable_path = '/Users/ola/Desktop/CAP/geckodriver')
browser.get('http://www.wsj.com')

# DO MANUALLY - close cookie policy popup
# (the commented-out lines below are earlier attempts to do this in code)
# browser.find_element_by_class_name("close").click()
# browser.find_element_by_class_name("cs-close").click()
# browser.find_element_by_xpath('//div[contains(@class,"ui-dialog") and @aria-describedby="dialogContent2"]//button[@title="Close"]').click()
# alrt = browser.switch_to.alert

In [16]:
# Open the sign-in form.
browser.find_element_by_link_text("Sign In").click()

# The form is not necessarily present the instant the link is clicked; wait
# (up to 10 s) for the username field instead of looking it up immediately.
username_field = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.ID, "username")))
username_field.send_keys(my_login)
browser.find_element_by_id("password").send_keys(my_password)

# Submit the form through its button element.
loginReady = browser.find_element_by_class_name("solid-button")
loginReady.submit()

In [17]:
# How many articles to save? This counts the urls held in `articleLinks`;
# the download loop below reads the same urls back from the csv via `df`.
print(len(articleLinks))

15


In [18]:
# Save articles as a dictionary keyed by their position in the url list.
def _parse_wsj_timestamp(raw):
    """Convert the text of a WSJ timestamp element into a ``datetime``.

    Handles strings such as ``"Updated Sept. 20, 2018 7:27 p.m. ET"`` or
    ``"August 24, 2018"``. Returns ``pd.NaT`` when the text is empty or does
    not match any of the known layouts, so that a single odd page cannot
    abort the whole download loop.
    """
    # Literal replacements (the dots are real dots, not regex wildcards).
    cleaned = raw.replace('Updated ', '').replace(' ET', '')
    cleaned = cleaned.replace('p.m.', 'PM').replace('a.m.', 'AM')
    cleaned = cleaned.replace('Sept.', 'Sep.')  # strptime's %b expects "Sep"
    if "COMMENTS" in cleaned:
        # A comment counter follows the time stamp on a separate line.
        cleaned = cleaned.split("\n")[0]
    cleaned = cleaned.strip()

    if 'AM' in cleaned or 'PM' in cleaned:
        layouts = ('%b. %d, %Y %I:%M %p', '%B %d, %Y %I:%M %p')
    else:
        layouts = ('%B %d, %Y', '%b. %d, %Y')

    for layout in layouts:
        try:
            return datetime.datetime.strptime(cleaned, layout)
        except ValueError:
            continue
    return pd.NaT


articles = {}
article_count = 0

for i, link in enumerate(df.articleLink):
    print(i, end=', ')

    browser.get(link)

    # get headline if it exists (otherwise fall back to the url)
    try:
        headline = browser.find_element_by_class_name("wsj-article-headline").text
    except NoSuchElementException:
        print(str(i) + ' : no headline')
        headline = link

    print(headline)

    # get timestamp if it exists (otherwise empty)
    try:
        raw_timestamp = browser.find_element_by_class_name("timestamp").text
    except NoSuchElementException:
        print(str(i) + ' : no time stamp')
        raw_timestamp = ''

    # missing/unparseable time stamps become NaT instead of raising
    timestamp = _parse_wsj_timestamp(raw_timestamp)
    if timestamp is pd.NaT and raw_timestamp:
        print(str(i) + ' : could not parse time stamp ' + repr(raw_timestamp))

    # extract article text
    try:
        paragraphs = browser.find_element_by_class_name("article-wrap").text
    except NoSuchElementException:
        paragraphs = ''

    # extract sub-headline
    try:
        sub_head = browser.find_element_by_class_name("sub-head").text
    except NoSuchElementException:
        sub_head = ''

    # key order defines the column order of the DataFrame built later
    articles[i] = {
        'headline': headline,
        'date': timestamp,
        'link': link,
        'text': paragraphs,
        'sub-head': sub_head,
    }

    article_count += 1

0, Stocks to Watch: Amazon, GE, Facebook, Target, Ford, UPS, Home Depot, JPMorgan, Comcast
1, Comcast’s Revenge: Never Sell Hulu
2, Disney Gets $15 Billion to Invest in Streaming
3, 21st Century Fox to Sell Sky Stake to Comcast
4, Comcast Shares Slide After Regulators Accept Sky Bid
5, Robert Greenblatt, Chairman of NBC Entertainment, to Step Down
6, Stocks to Watch: Comcast, Disney, Pandora, Newmont, Dell, Chipotle, Michael Kors, Nordstrom, Chevron
7, For Comcast, the Next Test on Sky Is Convincing Investors
8, Sky Takeover Proves a Dream Trade for Hedge Funds
9, Sky Loss Is a Disappointment for Disney Ambitions
10, Game Theory Part of the Art of the Sky Deal
11, Comcast, Fox to Settle $35 Billion Takeover Battle for Sky in Weekend Auction
12, Winning Sky Is Just the Start for Disney or Comcast
13, Fox, Comcast Fight for Sky Could End in Sealed Bids
14, Fox, Comcast Reach New Deal on Big Ten Network


In [19]:
# How many articles were downloaded? `articles` holds one entry per url
# processed by the download loop.
len(articles)

15

In [20]:
# Turn the dictionary into a DataFrame with one row per article, ordered by
# date; reset_index() keeps the original article position as an 'index' column.
df_art = (
    pd.DataFrame.from_dict(articles, orient='index')
    .sort_values(by='date')
    .reset_index()
)
df_art

Unnamed: 0,index,headline,date,link,text,sub-head
0,14,"Fox, Comcast Reach New Deal on Big Ten Network",2018-08-24 19:18:00,http://online.wsj.com/articles/fox-comcast-rea...,Maryland wide receiver Taivon Jacobs speaks du...,Renewal agreement averts a full blackout of th...
1,13,"Fox, Comcast Fight for Sky Could End in Sealed...",2018-09-16 12:00:00,http://online.wsj.com/articles/bidding-war-for...,British broadcasting giant Sky is one of the b...,U.K. takeover rules could require a blind auct...
2,12,Winning Sky Is Just the Start for Disney or Co...,2018-09-20 10:24:00,http://online.wsj.com/articles/winning-sky-is-...,"2 COMMENTS\nBy Stephen Wilmot\nSept. 20, 2018 ...",Whoever buys the European pay-TV leader faces ...
3,11,"Comcast, Fox to Settle $35 Billion Takeover Ba...",2018-09-20 19:27:00,http://online.wsj.com/articles/comcast-fox-to-...,Sky has attracted takeover interest from some ...,The regulator hasn’t run such a process many t...
4,10,Game Theory Part of the Art of the Sky Deal,2018-09-21 05:30:00,http://online.wsj.com/articles/blind-auction-c...,"Sky sells wireless, TV and internet services t...","Comcast, Disney and Fox could use the same tac..."
5,9,Sky Loss Is a Disappointment for Disney Ambitions,2018-09-22 15:45:00,http://online.wsj.com/articles/sky-loss-is-a-d...,Robert Iger’s tenure at Disney has been marked...,Disney must now continue strategy of controlli...
6,8,Sky Takeover Proves a Dream Trade for Hedge Funds,2018-09-23 07:37:00,http://online.wsj.com/articles/sky-takeover-pr...,Several hedge funds have reaped huge profits f...,The U.K. broadcaster had been on investors’ ra...
7,7,"For Comcast, the Next Test on Sky Is Convincin...",2018-09-23 19:27:00,http://online.wsj.com/articles/comcast-wins-bi...,"Manchester City’s Raheem Sterling, right, vies...",Winning $38.8 billion bid in U.K. auction ends...
8,6,"Stocks to Watch: Comcast, Disney, Pandora, New...",2018-09-24 09:23:00,http://online.wsj.com/articles/stocks-to-watch...,"Nordstrom shares dropped 7.9% last week, their...",Here are some of the companies with shares exp...
9,5,"Robert Greenblatt, Chairman of NBC Entertainme...",2018-09-24 13:00:00,http://online.wsj.com/articles/robert-greenbla...,Chairman of NBC Entertainment Robert Greenblat...,Executive who led prime-time revival with ‘Thi...


In [21]:
# Save the DataFrame to csv; the file name is made of the company name
# (quotes and spaces removed) and both dates (slashes removed).
out_file2 = "{}_{}_{}_scraped.csv".format(
    comp_name.strip('"').replace(" ", ""),
    start_date.replace("/", ""),
    end_date.replace("/", ""),
)
df_art.to_csv(out_file2)

# Some checks

In [22]:
# Print the number of characters (length) of headline, text and sub-head
# for every row, preceded by the row position.
row_fields = zip(df_art['headline'], df_art['text'], df_art['sub-head'])
for i, (head_text, body_text, sub_text) in enumerate(row_fields):
    # sub-head is passed through str() before measuring, as before
    print(i, len(head_text), len(body_text), len(str(sub_text)))

0 46 1453 107
1 51 4679 104
2 51 3668 96
3 77 6082 106
4 43 5890 130
5 49 2812 170
6 49 5057 121
7 57 8209 110
8 100 2776 89
9 62 3996 92
10 52 3233 62
11 45 4374 101
12 46 2057 88
13 34 2468 117
14 87 2554 89


In [23]:
# Reload the file that was just written (via out_file2, so the check keeps
# working when the company name or dates in the config cell change) and
# show the text of the article at position 3.
df_try = pd.read_csv(out_file2)
print(df_try.text.iloc[3])

Sky has attracted takeover interest from some of the world’s biggest media companies. PHOTO: OLI SCARFF/AGENCE FRANCE-PRESSE/GETTY IMAGES
By Stu Woo and
Ben Dummett
Updated Sept. 20, 2018 7:27 p.m. ET
LONDON— Comcast Corp. CMCSA 0.11% and 21st Century Fox Inc. FOX 0.22% will settle their takeover battle for Sky SKYAY 0.22% PLC in a weekend auction run by British regulators, setting up a dramatic climax to a 21-month sale process that has pitted some of the world’s biggest media giants against each other.
The U.K. Takeover Panel, which polices deal making in the country, laid out rules for the auction on Thursday. It is a process the regulator hasn’t run many times previously—and never before with such a large company as the prize. London-listed Sky has a market value of some $36 billion.
Such government-mandated auctions of big, publicly traded companies are rare. The Sky auction pits Rupert Murdoch’s 21st Century Fox, which already owns 39% of Sky, against Comcast. Walt Disney Co. DIS

In [24]:
# Inspect the reloaded date column; it comes back with dtype object because
# read_csv was called without parse_dates.
df_try.date

0     2018-08-24 19:18:00
1     2018-09-16 12:00:00
2     2018-09-20 10:24:00
3     2018-09-20 19:27:00
4     2018-09-21 05:30:00
5     2018-09-22 15:45:00
6     2018-09-23 07:37:00
7     2018-09-23 19:27:00
8     2018-09-24 09:23:00
9     2018-09-24 13:00:00
10    2018-09-24 13:18:00
11    2018-09-26 10:57:00
12    2018-09-26 12:59:00
13    2018-10-07 10:00:00
14    2018-10-08 09:24:00
Name: date, dtype: object

In [25]:
# Summarise every scraped file in the working directory: file name, earliest
# date, latest date and number of rows.
# A separate name (not `df`) is used for the loaded frame so the url table
# that the download loop reads from is not overwritten.
for f in sorted(glob.glob('*_scraped.csv')):
    print(f)
    scraped = pd.read_csv(f)
    # Dates are read back as 'YYYY-MM-DD HH:MM:SS' strings, which sort
    # chronologically as text; rows without a date are ignored.
    dates = scraped.date.dropna()
    print(min(dates), max(dates), scraped.shape[0])
    print()

Comcast_20180801_20181010_scraped.csv
2018-08-24 19:18:00 2018-10-08 09:24:00 15

