In [6]:
#!/usr/bin/python
import re
from urllib.request import urlopen
import csv
import os
import sys
import time
import datetime

import numpy as np
from bs4 import BeautifulSoup
import os

# Root directory of the project checkout; NewsReuters.__init__ resolves the
# input/ files it opens (tickerList.csv, finished.reuters) against this path.
path = "/home/tomoaki/work/Sentiment-Analysis-in-Event-Driven-Stock-Price-Movement-Prediction"
class NewsReuters:
    """Crawl per-ticker company news from Reuters and append it to a CSV file.

    Constructing an instance opens the ticker list, loads the set of tickers
    that were already processed (so an interrupted task can be resumed) and
    precomputes the list of dates to look back over.

    The intended driver loop (not run by ``__init__``) is::

        for line in obj.fin:
            line = line.strip().split(',')
            ticker, name, exchange, MarketCap = line
            if ticker in obj.filterList:
                continue
            obj.content(ticker, name, line, obj.dateList)
    """

    # Output CSV, resolved against the current working directory.
    NEWS_FILE = './input/news_reuters.csv'

    def __init__(self):
        # Ticker list; deliberately left open so a caller can iterate over it.
        self.fin = open(os.path.join(path, 'input/tickerList.csv'))

        # Tickers finished by a previous run; used when a task is restarted.
        self.filterList = set()
        try:
            with open(os.path.join(path, 'input/finished.reuters')) as finished:
                self.filterList.update(entry.strip() for entry in finished)
        except OSError:
            # No readable progress file: start with nothing filtered out.
            pass

        self.dateList = self.dateGenerator(3000)  # look back on the past X days

    def _fetch(self, url):
        """Download *url* and return it parsed as a BeautifulSoup document."""
        # The context manager closes the HTTP response even if read() fails.
        with urlopen(url) as response:
            data = response.read()
        return BeautifulSoup(data, "lxml")

    @staticmethod
    def _clean(tag):
        """Return the text of *tag* with commas and newlines turned into spaces.

        A missing tag (``None``) yields an empty string so that one malformed
        story does not abort the whole page.
        """
        if tag is None:
            return ''
        return tag.get_text().replace(",", " ").replace("\n", " ")

    def content(self, ticker, name, line, dateList):
        """Crawl the news of one ticker for every date in *dateList*.

        Args:
            ticker: Ticker symbol appended to the Reuters company-news URL.
            name: Company name, only used for progress output.
            line: The split ticker-list row; ``line[1]`` is written to the CSV.
            dateList: Dates to crawl, as ``YYYYMMDD`` strings.

        The ticker's undated news page is fetched first. If it shows no
        stories at all, the ticker is skipped. Otherwise the dates are crawled
        in order until too many consecutive days come back empty.
        """
        url = "http://www.reuters.com/finance/stocks/companyNews?symbol=" + ticker

        # Some companies do not have a single news item; the story count of
        # the undated page decides whether iterating over dates is worthwhile.
        has_Content = 0
        repeat_times = 4
        for _ in range(repeat_times):  # repeat in case of http failure
            try:
                time.sleep(np.random.poisson(3))  # randomized politeness delay
                soup = self._fetch(url)
                has_Content = len(soup.find_all("div", {'class': ['topStory', 'feature']}))
                break
            except Exception:
                # Best effort: any download/parse failure just triggers a
                # retry, while Ctrl-C (KeyboardInterrupt) still propagates.
                continue

        if has_Content <= 0:
            print(ticker, "has no single news")
            return

        missing_days = 0
        print(ticker, name)
        for timestamp in dateList:
            hasNews = self.repeatDownload(ticker, line, url, timestamp)
            if hasNews:
                missing_days = 0  # got news, reset the counter
            else:
                missing_days += 1
            # 2 stories: give up after 30 empty days; 10 stories: after 70.
            if missing_days > has_Content * 5 + 20:
                break  # no news in X consecutive days, stop crawling
            if missing_days > 0 and missing_days % 20 == 0:  # print the progress
                print(ticker, "has no news for ", missing_days, " days")

    def repeatDownload(self, ticker, line, url, timestamp):
        """Download and store the news of *ticker* for a single day.

        Args:
            ticker: Ticker symbol, forwarded to :meth:`parser`.
            line: The split ticker-list row, forwarded to :meth:`parser`.
            url: Company-news URL of the ticker, without the date parameter.
            timestamp: Day to fetch, as a ``YYYYMMDD`` string.

        Returns:
            1 if at least one story was found and written, otherwise 0.
        """
        # Reuters expects MMDDYYYY, e.g. 20151231 becomes 12312015.
        new_time = timestamp[4:] + timestamp[:4]
        repeat_times = 3  # repeat downloading in case of http error
        for _ in range(repeat_times):
            try:
                time.sleep(np.random.poisson(3))  # randomized politeness delay
                soup = self._fetch(url + "&date=" + new_time)
                hasNews = self.parser(soup, line, ticker, timestamp)
                if hasNews:
                    return 1  # return if we get the news
                break  # stop looping if the content is empty (no error)
            except Exception:
                # Retry on any failure, but let KeyboardInterrupt through.
                continue
        return 0

    def parser(self, soup, line, ticker, timestamp):
        """Extract the stories from a parsed page and append them to the CSV.

        Args:
            soup: Parsed Reuters company-news page.
            line: The split ticker-list row; ``line[1]`` becomes the second
                CSV column.
            ticker: Ticker symbol, first CSV column.
            timestamp: ``YYYYMMDD`` date, third CSV column.

        Returns:
            0 if the page contains no stories, otherwise 1.

        Each row is ``ticker,line[1],timestamp,title,body,news_type``. Commas
        and newlines inside title and body are replaced by spaces so that they
        cannot break the column layout.
        """
        content = soup.find_all("div", {'class': ['topStory', 'feature']})
        if not content:
            return 0

        # Only the first story can be the top story, and only if the page has one.
        has_top_story = len(soup.find_all("div", class_="topStory")) > 0

        # Build every row before touching the file, so a failure while parsing
        # never leaves a partial write that a retry would then duplicate.
        rows = []
        for i, story in enumerate(content):
            title = self._clean(story.h2)
            body = self._clean(story.p)
            news_type = 'topStory' if i == 0 and has_top_story else 'normal'

            print(ticker, timestamp, title, news_type)
            rows.append(','.join([ticker, line[1], timestamp, title, body, news_type]) + '\n')

        # Text mode with an explicit encoding replaces the original
        # bytes + str concatenation, which raises TypeError on Python 3.
        with open(self.NEWS_FILE, 'a', encoding='utf-8') as fout:
            fout.writelines(rows)
        return 1

    def dateGenerator(self, numdays):  # generate N days until now
        """Return the last *numdays* days as ``YYYYMMDD`` strings, newest first.

        The first entry is today; ``numdays <= 0`` yields an empty list.
        """
        base = datetime.datetime.today()
        return [
            (base - datetime.timedelta(days=offset)).strftime("%Y%m%d")
            for offset in range(numdays)
        ]

In [7]:
# Build the crawler: opens the ticker list, loads the finished-ticker set
# and precomputes the date list. No crawling is started here.
obj = NewsReuters()

In [10]:
# Inspect the tickers loaded from input/finished.reuters.
obj.filterList

set()

In [11]:
# Inspect the generated YYYYMMDD date strings (today first, then going back).
obj.dateList

['20170928',
 '20170927',
 '20170926',
 '20170925',
 '20170924',
 '20170923',
 '20170922',
 '20170921',
 '20170920',
 '20170919',
 '20170918',
 '20170917',
 '20170916',
 '20170915',
 '20170914',
 '20170913',
 '20170912',
 '20170911',
 '20170910',
 '20170909',
 '20170908',
 '20170907',
 '20170906',
 '20170905',
 '20170904',
 '20170903',
 '20170902',
 '20170901',
 '20170831',
 '20170830',
 '20170829',
 '20170828',
 '20170827',
 '20170826',
 '20170825',
 '20170824',
 '20170823',
 '20170822',
 '20170821',
 '20170820',
 '20170819',
 '20170818',
 '20170817',
 '20170816',
 '20170815',
 '20170814',
 '20170813',
 '20170812',
 '20170811',
 '20170810',
 '20170809',
 '20170808',
 '20170807',
 '20170806',
 '20170805',
 '20170804',
 '20170803',
 '20170802',
 '20170801',
 '20170731',
 '20170730',
 '20170729',
 '20170728',
 '20170727',
 '20170726',
 '20170725',
 '20170724',
 '20170723',
 '20170722',
 '20170721',
 '20170720',
 '20170719',
 '20170718',
 '20170717',
 '20170716',
 '20170715',
 '20170714',