In [1]:
import sys
import joblib
import logging
import warnings
import requests
import multiprocessing
import pandas as pd
import numpy as np
import yfinance as yf

from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from zero_shot_learner import extend_df_with_cos_sim
from preprocessor import NewsPreprocessor
from preprocessor import transform_df
from contractions import contractions_dict
# Suppress every Python warning for the rest of the session (presumably to keep
# cell output readable; note this also hides deprecation and pandas warnings).
warnings.filterwarnings("ignore")
# Configure the root logger at ERROR level, then fetch it and mark it disabled.
# NOTE(review): disabling the root logger object may not silence records that
# propagate from named library loggers; confirm this has the intended effect.
logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger()
logger.disabled = True

In [2]:
def progressbar(iter, prefix="", size=60, file=sys.stdout):
    """Yield the items of ``iter`` while drawing a text progress bar on ``file``.

    Adapted from https://stackoverflow.com/questions/3160699/python-progress-bar

    The bar is redrawn in place (each refresh ends with a carriage return) once
    before the first item and once after every item has been consumed by the
    caller; a newline is written when iteration finishes.

    :param iter: sized collection to iterate over; ``len()`` is called on it up
        front. The name shadows the ``iter`` builtin but is kept so existing
        keyword callers keep working.
    :param prefix: text written before the bar on every refresh
    :param size: width of the bar in characters
    :param file: writable text stream providing ``write`` and ``flush``
    :return: generator yielding each item of ``iter`` unchanged
    """
    count = len(iter)

    def show(done):
        if count:
            filled = int(size * done / count)
            percent = int(100 * done / count)
        else:
            # An empty collection has no work left: draw it as complete rather
            # than dividing by zero.
            filled, percent = size, 100
        file.write("{}[{}{}] {}%\r".format(prefix, "#" * filled, "." * (size - filled), percent))
        file.flush()

    show(0)
    for done, item in enumerate(iter, start=1):
        yield item
        # Refresh only after the caller has finished with the item.
        show(done)
    file.write("\n")
    file.flush()

In [3]:
def load_stock(ticker_name, start_date="2012-01-01"):
    """Download the price history of one ticker and attach a binary label.

    :param ticker_name: str, symbol handed to ``yf.Ticker``
    :param start_date: str, forwarded as ``start`` to ``Ticker.history``
        (``period="max"`` is passed alongside it)
    :return: pandas dataframe sorted by ``date`` (plain ``datetime.date``
        values) with the downloaded columns plus ``ticker`` and ``label``.
        ``label`` is 1 when ``Close`` is greater than or equal to the previous
        row's ``Close`` and 0 otherwise. Rows containing a missing value in any
        column are dropped, which always includes the first row because it has
        no previous close; the index is not renumbered afterwards.
    """
    prices = yf.Ticker(ticker_name).history(period="max", start=start_date)
    prices.index = prices.index.set_names(["date"])
    prices = prices.reset_index()

    # Convert to UTC first, then keep only the calendar day of each timestamp.
    prices["date"] = pd.to_datetime(prices["date"], utc=True).map(lambda stamp: stamp.date())
    prices = prices.sort_values(by="date").reset_index(drop=True)

    prices["ticker"] = ticker_name
    # Day-over-day change of the close; binarised after incomplete rows are dropped.
    prices["label"] = prices["Close"].diff(periods=1)
    prices = prices.dropna().assign(
        label=lambda frame: frame["label"].map(lambda delta: int(float(delta) >= 0))
    )
    return prices

def load_news(df, labels, sort_by, k):
    """
    De-duplicate, clean, score and rank news headlines.

    The input frame is never modified: de-duplication and the ``clean_title``
    column are applied to a private copy, so it is safe to pass a slice of a
    larger frame.

    :param df: pandas dataframe with at least a "title" column (the scoring and
        transform helpers may require further columns)
    :param labels: list of str (for zero-shot learner)
    :param sort_by: str (str in labels)
    :param k: int (top k news)
    :return: pandas dataframe produced by ``transform_df`` with a fresh 0..n-1 index
    """
    # Non-inplace drop_duplicates + copy() detaches the working frame from the
    # caller's object, avoiding silent mutation and chained-assignment warnings.
    news = df.drop_duplicates(subset="title").copy()
    preprocessor = NewsPreprocessor(contractions_dict=contractions_dict)
    news["clean_title"] = news["title"].apply(preprocessor.ultimate_clean)
    news = extend_df_with_cos_sim(df=news, col="clean_title", labels=labels, sort_by=sort_by)
    news = transform_df(df=news, sort_by=sort_by, k=k)
    return news.reset_index(drop=True)

In [4]:
# from jupyterthemes import jtplot

# # currently installed theme will be used to
# # set plot style if no arguments provided
# jtplot.style(theme="oceans16")

In [5]:
def merge_function(df, ticker, return_dict):
    """Build the joined news/price frame for one ticker and store it in ``return_dict``.

    :param df: pandas dataframe of news with a "ticker" column
    :param ticker: symbol to process; compared and downloaded as ``str(ticker)``
    :param return_dict: mutable mapping that receives the result under the key ``ticker``
    :return: None (the result is delivered through ``return_dict``)
    """
    symbol = str(ticker)
    ticker_news = load_news(df[df["ticker"] == symbol], labels=["stock"], sort_by="stock", k=5)
    ticker_prices = load_stock(symbol, start_date="2012-01-01")
    # Inner join on the calendar day, then index the merged rows by that day.
    merged = pd.merge(ticker_news, ticker_prices, on=["date"])
    return_dict[ticker] = merged.set_index("date")

In [6]:
# Inclusive calendar boundaries (ISO dates) of the three chronological splits.
TRAIN_START_DATE = "2012-01-01"
TRAIN_END_DATE = "2015-12-31"
VALID_START_DATE = "2016-01-01"
VALID_END_DATE = "2016-12-31"
TEST_START_DATE = "2017-01-01"
TEST_END_DATE = "2020-07-01"

# NOTE: joblib.load unpickles the file; only load archives from a trusted source.
df_merge = joblib.load("../data/sp500_top100_v1.bin")

# Parse the boundaries once instead of on every loop iteration. The merged
# frames are indexed by datetime.date, so the slice bounds use the same type.
split_bounds = {
    "train": (pd.to_datetime(TRAIN_START_DATE).date(), pd.to_datetime(TRAIN_END_DATE).date()),
    "valid": (pd.to_datetime(VALID_START_DATE).date(), pd.to_datetime(VALID_END_DATE).date()),
    "test": (pd.to_datetime(TEST_START_DATE).date(), pd.to_datetime(TEST_END_DATE).date()),
}
# Per-ticker pieces are collected here and concatenated once at the end;
# concatenating inside the loop re-copies all accumulated rows every iteration.
split_parts = {split_name: [] for split_name in split_bounds}

for ticker in tqdm(df_merge["ticker"].unique()):
    symbol = str(ticker)
    # copy() so downstream processing never writes into a view of df_merge.
    news = df_merge[df_merge["ticker"] == symbol].copy()
    news = load_news(news, labels=["finance"], sort_by="finance", k=10)
    stock = load_stock(symbol, start_date=TRAIN_START_DATE)
    news_and_stock = pd.merge(news, stock, on=["date"]).set_index("date")

    for split_name, (first_day, last_day) in split_bounds.items():
        # Label-based .loc slicing includes both end points.
        split_parts[split_name].append(news_and_stock.loc[first_day:last_day])

# pd.concat raises on an empty list, so fall back to an empty frame when the
# source contained no tickers at all.
train = pd.concat(split_parts["train"], axis=0) if split_parts["train"] else pd.DataFrame()
valid = pd.concat(split_parts["valid"], axis=0) if split_parts["valid"] else pd.DataFrame()
test = pd.concat(split_parts["test"], axis=0) if split_parts["test"] else pd.DataFrame()

HBox(children=(FloatProgress(value=0.0, max=99.0), HTML(value='')))

Start zero-shot learner...
Done!
Spend 187.830 s
Start transforming dataframe...
Done!
Spend 1.130 s
Start zero-shot learner...
Done!
Spend 73.137 s
Start transforming dataframe...
Done!
Spend 0.456 s
Start zero-shot learner...
Done!
Spend 182.288 s
Start transforming dataframe...
Done!
Spend 0.871 s
Start zero-shot learner...
Done!
Spend 155.209 s
Start transforming dataframe...
Done!
Spend 0.977 s
Start zero-shot learner...
Done!
Spend 71.748 s
Start transforming dataframe...
Done!
Spend 0.353 s
Start zero-shot learner...
Done!
Spend 200.412 s
Start transforming dataframe...
Done!
Spend 1.072 s
Start zero-shot learner...
Done!
Spend 78.710 s
Start transforming dataframe...
Done!
Spend 0.413 s
Start zero-shot learner...
Done!
Spend 48.259 s
Start transforming dataframe...
Done!
Spend 0.310 s
Start zero-shot learner...
Done!
Spend 17.518 s
Start transforming dataframe...
Done!
Spend 0.110 s
Start zero-shot learner...
Done!
Spend 193.444 s
Start transforming dataframe...
Done!
Spend 1.0

Start zero-shot learner...
Done!
Spend 10.292 s
Start transforming dataframe...
Done!
Spend 0.033 s
Start zero-shot learner...
Done!
Spend 8.693 s
Start transforming dataframe...
Done!
Spend 0.019 s
Start zero-shot learner...
Done!
Spend 8.127 s
Start transforming dataframe...
Done!
Spend 0.013 s
Start zero-shot learner...
Done!
Spend 9.418 s
Start transforming dataframe...
Done!
Spend 0.022 s
Start zero-shot learner...
Done!
Spend 14.225 s
Start transforming dataframe...
Done!
Spend 0.073 s
Start zero-shot learner...
Done!
Spend 10.856 s
Start transforming dataframe...
Done!
Spend 0.037 s
Start zero-shot learner...
Done!
Spend 9.254 s
Start transforming dataframe...
Done!
Spend 0.019 s
Start zero-shot learner...
Done!
Spend 9.985 s
Start transforming dataframe...
Done!
Spend 0.032 s
Start zero-shot learner...
Done!
Spend 8.947 s
Start transforming dataframe...
Done!
Spend 0.019 s
Start zero-shot learner...
Done!
Spend 8.534 s
Start transforming dataframe...
Done!
Spend 0.019 s
Start z

In [7]:
# Persist each split as a compressed (level 5) joblib archive. The last call is
# the cell's final expression, so its return value (the list of written file
# names) is what the notebook displays.
joblib.dump(train, "../data/train_top10_v2.bin", compress=5)
joblib.dump(valid, "../data/valid_top10_v2.bin", compress=5)
joblib.dump(test, "../data/test_top10_v2.bin", compress=5)

['../data/test_top10_v2.bin']

In [8]:
# Display the training split (rows dated 2012-01-01 through 2015-12-31).
train

Unnamed: 0_level_0,Top 1 News,Top 2 News,Top 3 News,Top 4 News,Top 5 News,Top 6 News,Top 7 News,Top 8 News,Top 9 News,Top 10 News,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2012-01-04,apple hires adobe officer to lead iad: report,,,,,,,,,,50.71,51.28,50.62,51.13,65005500.0,0.0,0.0,AAPL,1
2012-01-05,refile-taiwan is elan says apple to pay $5 mln...,taiwan is elan says apple to pay $5 million in...,update 1-apple to pay elan $5 mln to settle pa...,apple to pay $5 million to settle patent lawsu...,,,,,,,51.32,51.76,51.04,51.70,67817400.0,0.0,0.0,AAPL,1
2012-01-09,apple is cook got rich stock award worth $376 ...,apple is siri puts voice-enabled search in spo...,chinese authors sue apple for copyright infrin...,,,,,,,,52.62,52.90,52.11,52.16,98506100.0,0.0,0.0,AAPL,0
2012-01-10,"kodak sues apple, htc over digital image patents",,,,,,,,,,52.67,52.68,52.13,52.34,64549100.0,0.0,0.0,AAPL,1
2012-01-11,strong apple contingent expected at ces,corrected-ces-strong apple contingent expected...,apple plans january 19 education event in new ...,,,,,,,,52.27,52.29,51.86,52.26,53771200.0,0.0,0.0,AAPL,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014-02-24,fitch upgrades fidelity national information s...,,,,,,,,,,49.69,50.60,49.69,50.21,1004200.0,0.0,0.0,FIS,1
2014-01-29,servicenow is revenue jumps 67 pct,servicenow is revenue jumps 67 percent,,,,,,,,,58.19,58.22,56.44,57.60,1998900.0,0.0,0.0,NOW,0
2014-04-24,corrected-servicenow 1st-quarter revenue beats...,"servicenow sees second-quarter, 2014 revenue a...","update 1-servicenow sees q2, 2014 revenue abov...",,,,,,,,54.70,54.70,45.07,49.88,10440700.0,0.0,0.0,NOW,0
2015-03-10,four hewlett-packard patents invalidated in ca...,four hewlett-packard patents invalidated in se...,,,,,,,,,73.60,74.07,72.61,73.59,873500.0,0.0,0.0,NOW,0


In [9]:
# Display the validation split (rows dated 2016-01-01 through 2016-12-31).
valid

Unnamed: 0_level_0,Top 1 News,Top 2 News,Top 3 News,Top 4 News,Top 5 News,Top 6 News,Top 7 News,Top 8 News,Top 9 News,Top 10 News,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2016-01-04,apple shares off but weather u.s. market selloff,,,,,,,,,,95.40,97.97,94.83,97.95,67649400.0,0.0,0.0,AAPL,1
2016-01-05,"apple expected to cut iphone 6s, 6s plus produ...","update 2-apple expected to cut iphone 6s, 6s p...","apple expected to cut iphone 6s, 6s plus produ...",,,,,,,,98.32,98.41,95.22,95.49,55791000.0,0.0,0.0,AAPL,0
2016-01-06,update 1-apple paid ceo tim cook $10.3 mln in ...,apple paid ceo tim cook $10.3 mln in 2015,apple paid ceo tim cook $10.3 million in 2015,apple reports over $1.1 bln in app store sales...,apple reports over $1.1 billion in app store s...,top apple supplier plans rare holiday as outpu...,update 7-top apple supplier plans rare holiday...,update 2-apple shares drop below $100 for firs...,corrected-apple shares drop below $100 for fir...,apple shares drop below $100 for first time si...,93.49,95.18,92.85,93.63,68457400.0,0.0,0.0,AAPL,0
2016-01-07,apple buys artificial intelligence startup emo...,update 1-apple suppliers cut revenue estimates...,,,,,,,,,91.75,93.10,89.66,89.67,81094400.0,0.0,0.0,AAPL,0
2016-01-08,"apple registers automobile domain names, inclu...",update 1-major apple supplier hon hai of taiwa...,major apple supplier hon hai of taiwan posts 2...,exclusive: nfl to live-stream all london games...,,,,,,,91.63,92.15,89.96,90.15,70798000.0,0.0,0.0,AAPL,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-04-21,brief-servicenow q1 non-gaap earnings per shar...,,,,,,,,,,74.77,76.84,73.21,74.27,8395500.0,0.0,0.0,NOW,1
2016-06-01,brief-servicenow acquires brightpoint security,,,,,,,,,,71.38,73.50,71.06,73.17,1889700.0,0.0,0.0,NOW,1
2016-07-28,brief-servicenow q2 earnings per share view $0...,,,,,,,,,,75.40,78.77,74.15,76.38,3991900.0,0.0,0.0,NOW,1
2016-10-27,brief-servicenow q3 gaap loss per share $0.22,,,,,,,,,,86.82,89.79,84.56,84.96,11112300.0,0.0,0.0,NOW,1


In [10]:
# Display the test split (rows dated 2017-01-01 through 2020-07-01).
test

Unnamed: 0_level_0,Top 1 News,Top 2 News,Top 3 News,Top 4 News,Top 5 News,Top 6 News,Top 7 News,Top 8 News,Top 9 News,Top 10 News,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-01-04,apple confirms $1 bln investment in softbank t...,update 2-apple confirms $1 bln investment in s...,apple confirms $1 billion investment in softba...,apple pulls new york times app from itunes sto...,,,,,,,110.11,110.73,110.01,110.27,21118100.0,0.0,0.0,AAPL,0
2017-01-05,brief-apple says app store generated over $20 ...,update 1-apple is app store generated $20 bln ...,apple is app store generated $20 billion for d...,rpt-update 3-apple pulls new york times apps i...,update 3-apple pulls new york times apps in ch...,apple pulls new york times apps in china after...,india reluctant to give special tax incentives...,"apple plans first retail store in s.korea, pos...",,,110.17,111.07,110.07,110.83,22193600.0,0.0,0.0,AAPL,1
2017-01-06,brief-apple inc is ceo tim cook is total 2016 ...,canada is competition watchdog closes two-year...,,,,,,,,,110.99,112.30,110.70,112.07,31751900.0,0.0,0.0,AAPL,1
2017-01-10,china is wechat seeks slice of apple is app st...,china is wechat seeks slice of apple is app st...,tesla taps apple engineer for autopilot software,,,,,,,,112.88,113.46,112.44,113.21,24462100.0,0.0,0.0,AAPL,1
2017-01-12,u.s. appeals court revives antitrust lawsuit a...,,,,,,,,,,113.01,113.39,112.35,113.34,27086200.0,0.0,0.0,AAPL,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-03-17,brief-servicenow releases four emergency respo...,,,,,,,,,,271.41,302.98,270.02,287.42,3908100.0,0.0,0.0,NOW,1
2020-04-21,brief-lincoln financial announces steps to sup...,,,,,,,,,,298.96,303.70,281.57,288.77,2892800.0,0.0,0.0,NOW,0
2020-04-30,"u.s. research roundup- labcorp, servicenow, ze...",,,,,,,,,,327.84,357.78,327.49,351.54,5387700.0,0.0,0.0,NOW,1
2020-05-05,"servicenow, adobe pair their customer service ...",,,,,,,,,,360.00,376.18,356.07,370.64,2977200.0,0.0,0.0,NOW,1
