In [1]:
import joblib
import nltk
import time
import random
import datetime
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from os import path
from dateutil import parser
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver

In [2]:
SP500_list = open("../data/SP500.txt", 'r').read().split('\n')
driver_path = r"./chromedriver.exe"

In [3]:
def scroll_to_bottom():
    old_position = 0
    new_position = None

    while new_position != old_position:
        # Get old scroll position
        old_position = driver.execute_script(
                ("return (window.pageYOffset !== undefined) ?"
                 " window.pageYOffset : (document.documentElement ||"
                 " document.body.parentNode || document.body);"))
        # Sleep and Scroll
        time.sleep(1)
        driver.execute_script((
                "var scrollingElement = (document.scrollingElement ||"
                " document.body);scrollingElement.scrollTop ="
                " scrollingElement.scrollHeight;"))
        time.sleep(2 + random.random())
        # Get new position
        new_position = driver.execute_script(
                ("return (window.pageYOffset !== undefined) ?"
                 " window.pageYOffset : (document.documentElement ||"
                 " document.body.parentNode || document.body);"))

In [11]:
class ReutersCrawlerV1:
    """
    Parameters:
        query: str
        
    Example:
        RC = Reuters_Crawler()
        df = RC.parse_to_dataframe(query="Google")
    """
    def __init__(self):
        self.driver_path = r"./chromedriver.exe"
        self.next_button = '//*[@id="content"]/section[2]/div/div[1]/div[4]/div/div[4]/div[1]'
    
    def parse_to_dataframe(self, query):
        """
        Parameters:
            query: str
        """
        # Open driver
        self.query = query
        self.url = "https://www.reuters.com/search/news?blob={}&dateRange=all".format(query)
        self.driver = webdriver.Chrome(self.driver_path)
        self.driver.get(self.url)
        time.sleep(2)
        # Scroll down page
        self.scroll_to_bottom()
        # Parsing
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        self.driver.quit()
        news_list = soup.find_all(name="div", attrs={"class": "search-result-content"})
        news_list_generator = self.get_news_list(news_list)
        df = pd.DataFrame(list(news_list_generator), columns=["title", "date", "query", "url"])
        df = df.drop_duplicates(subset="title")
        df["date"] = pd.to_datetime(df["date"], utc=True)
        return df
                
    def check_exists_by_xpath(self, xpath):
        try:
            self.driver.find_element_by_xpath(xpath)
        except NoSuchElementException:
            return False
        return True

    def scroll_to_bottom(self):
        old_position = 0
        new_position = None

        while new_position != old_position:
            # Get old scroll position
            old_position = self.driver.execute_script(
                    ("return (window.pageYOffset !== undefined) ?"
                     " window.pageYOffset : (document.documentElement ||"
                     " document.body.parentNode || document.body);"))
            # Sleep and Scroll
            time.sleep(1)
            self.driver.execute_script((
                    "var scrollingElement = (document.scrollingElement ||"
                    " document.body);scrollingElement.scrollTop ="
                    " scrollingElement.scrollHeight;"))
            time.sleep(2 + random.random())
            self.driver.find_element_by_xpath(self.next_button).click()
            time.sleep(2 + random.random())
            # Get new position
            new_position = self.driver.execute_script(
                    ("return (window.pageYOffset !== undefined) ?"
                     " window.pageYOffset : (document.documentElement ||"
                     " document.body.parentNode || document.body);"))
    
    def get_news_list(self, news_list):
        for i in range(len(news_list)):
            title = news_list[i].find(name="a").text
            date = news_list[i].find(name="h5", attrs={"class": "search-result-timestamp"}).text
            date = parser.parse(date, tzinfos={"EDT": "UTC-8", "EST": "UTC-8"})
            url = news_list[i].find(name="a").get("href")
            url = "https://www.reuters.com" + url
            yield [title, date, self.query, url]

In [6]:
for url in tqdm(SP500_list):
    driver = webdriver.Chrome(driver_path)
    driver.get(url)
    time.sleep(1 + random.random())
    # scroll_to_bottom()
    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()
    table = soup.find(name="div", attrs={"class": "FeedScroll-feed-container-106s7"})
    if table is None:
        print(url)
    driver.quit()

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [8]:
company_list = []
for url in tqdm(SP500_list):
    res = requests.get(url, timeout=5)
    soup = BeautifulSoup(res.text, "html.parser")
    company = soup.find("div", attrs={"class": "QuoteRibbon-name-ric-epp2J"}).find("h1").text
    company_list.append(company)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [15]:
df = pd.DataFrame()
for query in tqdm(company_list):
    try: 
        RC = ReutersCrawlerV1()
        df_temp = RC.parse_to_dataframe(query=query)
        df = pd.concat([df, df_temp], axis=0)
    except: 
        print(query)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))



AT&T Inc.
3M Co
Fidelity National Information Servcs Inc
ServiceNow Inc



In [16]:
df

Unnamed: 0,title,date,query,url
0,BRIEF-Apple Inc Says Not Allowing Entertainmen...,2020-03-15 02:16:00+00:00,Apple Inc.,https://www.reuters.com/article/idUSFWN2B61K2
1,Apple signs multi-year deals with major music ...,2020-03-12 21:46:00+00:00,Apple Inc.,https://www.reuters.com/article/idUSKBN20Z33J
2,Apple signs multi-year deals with major music ...,2020-03-12 21:39:00+00:00,Apple Inc.,https://www.reuters.com/article/idUSL4N2B54T2
3,Chinese regulators remove 'Plague Inc' game fr...,2020-02-28 05:16:00+00:00,Apple Inc.,https://www.reuters.com/article/idUSKCN20M043
4,UPDATE 1-Chinese regulators remove 'Plague Inc...,2020-02-28 01:31:00+00:00,Apple Inc.,https://www.reuters.com/article/idUSL3N2AS0OO
...,...,...,...,...
389,Britain mulls replacing corporate CO2 scheme,2012-03-21 23:04:00+00:00,Target Corporation,https://www.reuters.com/article/idUSBRE82K0VK2...
391,Switzerland proposes scrapping some corporate ...,2014-09-22 18:52:00+00:00,Target Corporation,https://www.reuters.com/article/idUSKCN0HH21F2...
392,China tightens oversight of corporate bills-so...,2012-06-07 17:42:00+00:00,Target Corporation,https://www.reuters.com/article/idUSL3E8H774U2...
398,RESEARCH ALERT-Danaher: Bernstein raises price...,2013-11-20 06:15:00+00:00,Target Corporation,https://www.reuters.com/article/idUSWNBB035N62...


In [17]:
joblib.dump(df, "../data/sp500_top100_v1.bin", compress=5)

['../data/sp500_top100_v1.bin']

In [None]:
ce_list = ["AT%26T+Inc.", "3M", "Fidelity National Information", "ServiceNow"]

for query in tqdm(ce_list):
    try: 
        RC = ReutersCrawlerV1()
        df_temp = RC.parse_to_dataframe(query=query)
        df = pd.concat([df, df_temp], axis=0)
    except: 
        print(query)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))