In [None]:
from pystock.helper_functions import load_news_df, mergeFiles2df
from pystock import constants
import pandas as pd
import pickle
import requests
from tqdm import tqdm
import glob

# Get historical news data

In [2]:
def get_historical_news(ticker, date_from, date_to, outdir, website='finnhub', timeout=30):
    """Download company news for ``ticker`` and pickle the result into ``outdir``.

    Parameters
    ----------
    ticker : str
        Symbol sent to the API as ``symbol``.
    date_from, date_to : str
        Bounds of the requested window, sent as ``from`` / ``to``.
    outdir : str
        Existing directory that receives the pickle file.
    website : str, default 'finnhub'
        News provider. Only ``'finnhub'`` is implemented.
    timeout : float, default 30
        Seconds passed to ``requests.get`` so a stalled connection cannot
        block the notebook forever.

    Returns
    -------
    pandas.DataFrame
        The articles indexed by ``datetime`` (parsed from epoch seconds), with
        the ``category``, ``id``, ``image`` and ``url`` columns removed.

    Raises
    ------
    ValueError
        If ``website`` is not supported or the API returned no articles.
    requests.HTTPError
        If the API answered with an error status.

    Notes
    -----
    The file is named ``{ticker}_{actual_from}_{date_to}_{website}.pkl`` where
    ``actual_from`` is the earliest date actually present in the response,
    which can be later than the requested ``date_from``.
    """
    # Validate the provider up front: previously any other value skipped the
    # request and failed later on an unbound ``response`` variable.
    if website != 'finnhub':
        raise ValueError(f"Unsupported news website: {website!r} (only 'finnhub' is implemented)")

    data = {"symbol": ticker,
            "from": date_from,
            "to": date_to,
            "token": constants.FINNHUB_KEY}

    response = requests.get('https://finnhub.io/api/v1/company-news', params=data, timeout=timeout)
    # Surface HTTP failures here instead of as an obscure DataFrame parsing error.
    response.raise_for_status()

    articles = response.json()
    if not articles:
        # An empty payload has no columns to drop and no earliest date to name the file after.
        raise ValueError(f'No news returned for {ticker} between {date_from} and {date_to}')

    df = pd.DataFrame.from_dict(articles).drop(['category', 'id', 'image', 'url'], axis=1)
    df['datetime'] = pd.to_datetime(df['datetime'], unit='s')
    df = df.set_index('datetime')

    # actual date_from (rate limited by 500 responses/ API call)
    date_from = str(df.index.min().date())

    # Context manager guarantees the handle is closed even if pickling fails.
    with open(f'{outdir}/{ticker}_{date_from}_{date_to}_{website}.pkl', 'wb') as file:
        pickle.dump(df, file)
    return df

In [5]:
def get_historical_news_recursive(ticker, dates, outdir, max_calls=60):
    """Page backwards through the news history of ``ticker``.

    The first request covers ``dates[0]`` .. ``dates[-1]``. Every following
    request keeps ``dates[0]`` as the start and moves the end of the window
    back to the earliest date seen in the previous batch. Each batch is saved
    by ``get_historical_news``; nothing is returned, the earliest date reached
    is printed.

    Parameters
    ----------
    ticker : str
        Symbol to download.
    dates : sequence of str
        Only the first and last elements are used, as the overall window bounds.
    outdir : str
        Directory handed to ``get_historical_news`` for the per-batch files.
    max_calls : int, default 60
        Upper bound on the number of API calls (the original hard-coded 60 to
        stay within the rate limit).

    Raises
    ------
    ValueError
        If ``max_calls`` is smaller than 1.
    """
    if max_calls < 1:
        # With zero iterations there would be no batch to report on.
        raise ValueError('max_calls must be at least 1')

    date_to = dates[-1]
    last_date = None  # earliest date of the previous batch; None before the first call
    for _ in tqdm(range(max_calls)):
        df = get_historical_news(ticker, date_from=dates[0], date_to=date_to, outdir=outdir)
        earliest = str(df.index.min().date())

        # Stop once the previous batch already reached the start of the window,
        # or when this batch made no progress past the previous earliest date.
        if last_date == dates[0] or earliest == last_date:
            break
        # Next request ends where this batch began.
        last_date = date_to = earliest
    print(earliest)
    

In [8]:
# Download roughly one year of CCIV news; each batch is pickled into outdir.
ticker = 'CCIV'
outdir = 'data/parsed_data/'
# One 'YYYY-MM-DD' string per calendar day in the window.
dates = [str(day.date()) for day in pd.date_range(start='2020-03-20', end='2021-03-17')]
get_historical_news_recursive(ticker, dates, outdir)

 18%|███████████████                                                                   | 11/60 [00:15<01:07,  1.39s/it]


KeyboardInterrupt: 

# Merge news data

In [13]:
# Collect the pickled news batches saved for this ticker.
ticker = 'CCIV'
# Match only the .pkl batches: the merged CSV export is written to this same
# folder with the same ticker prefix and would otherwise be picked up on a re-run.
indir = f'data/parsed_data/{ticker}*.pkl'
# glob returns matches in arbitrary order; sort so the merge order is reproducible.
files = sorted(glob.glob(indir))

In [10]:
# Combine every per-batch file into a single DataFrame.
# NOTE(review): presumably mergeFiles2df applies load_news_df to each path and
# concatenates the results — confirm in pystock.helper_functions.
df_merge = mergeFiles2df(files, load_news_df)

Creating dataset:

1/2
data/parsed_data\CCIV_2020-09-24_2020-09-24_finnhub.pkl
_____________
2/2
data/parsed_data\CCIV_2020-09-24_2021-03-17_finnhub.pkl
_____________
Dataset created!


In [11]:
# Remove repeated rows (presumably articles fetched twice by overlapping batches).
# Rebinding instead of inplace=True avoids mutating the frame behind earlier cells.
# NOTE(review): drop_duplicates compares column values only and ignores the
# datetime index, so identical articles with different timestamps collapse to one.
df_merge = df_merge.drop_duplicates()

In [12]:
# Preview the merged, datetime-indexed news table (pandas elides the middle rows).
df_merge

Unnamed: 0_level_0,headline,ticker,source,summary
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-09-24 14:38:00,Churchill Capital Launches Fifth SPAC,CCIV,Benzinga,"Michael Klein, one of the prominent names in t..."
2020-09-30 00:00:00,"Kevin O'Leary Shares SPAC Picks, Impressions",CCIV,benzinga,"Entrepreneur and ""Shark Tank"" judge Kevin O’Le..."
2020-10-02 16:29:48,"After Hours Most Active for Oct 2, 2020 : SBH...",CCIV,Nasdaq,The NASDAQ 100 After Hours Indicator is up 12....
2020-10-02 19:59:59,"After Hours Most Active for Oct 2, 2020 : SBH...",CCIV,Nasdaq,The NASDAQ 100 After Hours Indicator is down -...
2020-10-12 00:00:00,"Q3 The Busiest Since 2014 For IPOs, Led By Uni...",CCIV,benzinga,The third quarter saw 86 companies go public v...
...,...,...,...,...
2021-03-11 11:59:28,Jaws Spitfire Acquisition Corporation: SpaceX ...,CCIV,seekingalpha.com,
2021-03-11 12:06:42,Buckle In for a Wild Ride With Churchill Capit...,CCIV,Nasdaq,"InvestorPlace - Stock Market News, Stock Advic..."
2021-03-12 01:00:49,"While You Slept, Tesla Ushered In The Golden A...",CCIV,seekingalpha.com,Tesla's rapid ascent has helped usher in a new...
2021-03-15 07:04:14,Wall Street Breakfast: Most Valuable Startup,CCIV,seekingalpha.com,Listen on the go! A daily podcast of Wall Stre...


In [14]:
# Export the merged table as CSV, named after the ticker and the earliest/latest
# article dates found in the index.
# NOTE(review): the file lands in data/parsed_data/ with the ticker prefix, so a
# `{ticker}*` glob over that folder will also match it.
df_merge.to_csv(f'data/parsed_data/{ticker}_{df_merge.index.min().date()}_{df_merge.index.max().date()}_finnhub.csv')

# News and price analysis