Loading Data

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import nltk
from datetime import datetime as dt
import pytz
import pandas_market_calendars as mcal
from datetime import timedelta
from transformers import AutoTokenizer

Preprocessing Data

In [2]:
# Raw headline dataset; the preview below shows one row per article with its
# title, publication timestamp and stock symbol.
ARTICLES_CSV = 'Data/articles.csv'
articles = pd.read_csv(ARTICLES_CSV)
articles.head()

Unnamed: 0.1,Unnamed: 0,title,date,stock
0,0.0,Stocks That Hit 52-Week Highs On Friday,2020-06-05 10:30:00-04:00,A
1,1.0,Stocks That Hit 52-Week Highs On Wednesday,2020-06-03 10:45:00-04:00,A
2,2.0,71 Biggest Movers From Friday,2020-05-26 04:30:00-04:00,A
3,3.0,46 Stocks Moving In Friday's Mid-Day Session,2020-05-22 12:45:00-04:00,A
4,4.0,B of A Securities Maintains Neutral on Agilent...,2020-05-22 11:38:00-04:00,A


In [3]:
def convert_to_datetime(date_string):
    """Parse a ``YYYY-MM-DD HH:MM:SS[±HH:MM]`` string into a naive datetime.

    The UTC offset, when present, is validated and then discarded (not
    applied), so the result keeps the wall-clock time written in the string.

    Parameters
    ----------
    date_string : str
        Timestamp such as ``'2020-06-05 10:30:00-04:00'``. Positive offsets
        and strings without any offset are accepted as well.

    Returns
    -------
    datetime.datetime
        Timezone-naive datetime carrying the original wall-clock time.

    Raises
    ------
    ValueError
        If the string matches neither supported layout.
    """
    text = date_string.strip()
    try:
        # %z understands '-04:00', '+01:00' and '-0400'. Splitting on the last
        # '-' (the previous approach) only worked for negative offsets.
        parsed = dt.strptime(text, '%Y-%m-%d %H:%M:%S%z')
    except ValueError:
        # No offset in the string: it is already a plain wall-clock timestamp.
        parsed = dt.strptime(text, '%Y-%m-%d %H:%M:%S')
    return parsed.replace(tzinfo=None)

def preprocess(df, sample_size=None, random_state=None):
    """Clean the raw articles frame and parse its publication timestamps.

    Steps, in order: drop rows containing any missing value, optionally draw
    a random sample, drop the first column (a leftover index written by an
    earlier CSV export), rename ``stock`` to ``ticker`` and convert ``date``
    with ``convert_to_datetime``. Headlines are left exactly as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw articles; must contain ``date`` and ``stock`` columns. It is not
        modified.
    sample_size : int, optional
        Number of rows to sample after dropping missing values. ``None`` (or
        0) keeps every row.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so a sample can be reproduced.
        ``None`` keeps the previous behaviour of a fresh random draw.

    Returns
    -------
    pandas.DataFrame
        New frame; the original row labels are preserved.
    """
    cleaned = df.dropna()
    if sample_size:
        cleaned = cleaned.sample(sample_size, random_state=random_state)

    # NOTE(review): only the first column is removed, by position. The raw
    # preview shows two leftover index columns ('Unnamed: 0.1', 'Unnamed: 0'),
    # so the second one stays in the result.
    cleaned = (
        cleaned
        .drop(columns=cleaned.columns[0])
        .rename(columns={'stock': 'ticker'})
    )
    cleaned['date'] = cleaned['date'].apply(convert_to_datetime)
    return cleaned

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yiann\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Grab Stock Returns

Based on when each article was published, we retrieve two adjusted close prices for its stock and compute the return between them.

If the article was published before 4:00 P.M. (non-inclusive), then:
1. The 'before' price is the adjusted close of the most recent trading day before the publication date
2. The 'after' price is the adjusted close of the next upcoming trading day

If the article was published after 4:00 P.M., then:
1. The 'before' price is the same day's adjusted close price
2. The 'after' price is the next day's adjusted close price

In [4]:
def getValidTradingCloseDate(date, forward=True):
    """Return an NYSE session date at a fixed offset from ``date``.

    The NYSE calendar is queried for its valid days in a 15-calendar-day
    window anchored at ``date`` and one entry is picked by position:

    * ``forward=True``  -> window ``[date, date + 15 days]``, entry at index 2
    * ``forward=False`` -> window ``[date - 15 days, date]``, entry at index -2

    When ``date`` is itself a session this gives the second session after it
    (forward) or the session immediately before it (backward).

    NOTE(review): when ``date`` is not a session the same indices land one
    session further away (third session after / second-most-recent session).
    This assumes ``valid_days`` includes both endpoints - confirm.

    Parameters
    ----------
    date : datetime.date
        Anchor date.
    forward : bool, default True
        Search after ``date`` when true, before it when false.

    Returns
    -------
    datetime.date
    """
    window = timedelta(days=15)
    if forward:
        first_day, last_day = date, date + window
    else:
        first_day, last_day = date - window, date

    sessions = mcal.get_calendar('NYSE').valid_days(start_date=first_day, end_date=last_day).date
    position = 2 if forward else -2
    return sessions[position]


In [5]:
#Get all the yfinance data we need based on date.
import yfinance as yf

def retrieve_yfinance_data(row):
    """Compute the close-to-close return that brackets an article's release.

    The 'after' close is the first NYSE close that happens once the article
    is public; the 'before' close is the session immediately preceding it:

    * published at or before 16:00 on a trading day -> previous session to
      that day
    * published after 16:00 on a trading day -> that day to the next session
    * published on a weekend or holiday -> last session before it to the
      first session after it (previously this case used a window that did
      not contain the publication time at all)

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'date'`` (a datetime-like with ``.date()`` and
        ``.time()``) and ``'ticker'``. The timestamp is assumed to be in
        exchange-local time - TODO confirm against the data source.

    Returns
    -------
    float or None
        ``(after - before) / before`` using adjusted closes, or ``None`` when
        Yahoo Finance returns fewer than two rows for the window. A single
        row used to produce a misleading return of 0.
    """
    published = row['date']
    publish_day = published.date()
    market_close = dt.strptime('16:00:00', '%H:%M:%S').time()

    # Fetch the sessions on both sides of the article; 15 calendar days is
    # wide enough to reach a session across long weekends and holidays.
    sessions = mcal.get_calendar('NYSE').valid_days(
        start_date=publish_day - timedelta(days=15),
        end_date=publish_day + timedelta(days=15),
    ).date

    # A headline released after the bell cannot move that same day's close,
    # so the same-day session only counts as 'after' up to 16:00.
    if published.time() > market_close:
        upcoming = [session for session in sessions if session > publish_day]
    else:
        upcoming = [session for session in sessions if session >= publish_day]
    after_day = upcoming[0]
    before_day = [session for session in sessions if session < after_day][-1]

    # yfinance treats `end` as exclusive, hence the extra day.
    data = yf.download(row['ticker'], start=before_day, end=after_day + timedelta(days=1), progress=False, show_errors=False)

    if len(data) < 2:
        return None

    closes = data['Adj Close']
    # .iloc: positional access by plain integer on a date-indexed Series is deprecated.
    return (closes.iloc[-1] - closes.iloc[0]) / closes.iloc[0]

In [6]:
from tqdm import tqdm

def get_returns(df):
    """Attach a ``returns`` column computed one article at a time.

    Each row is passed to ``retrieve_yfinance_data`` and the result is
    written back under that row's index label. ``df`` is modified in place
    and the same object is returned. A tqdm progress bar tracks the loop.
    """
    progress = tqdm(df.iterrows(), total=len(df))
    for label, article in progress:
        df.loc[label, 'returns'] = retrieve_yfinance_data(article)
    return df

In [7]:
# Smoke-test the pipeline on a random sample of 100 headlines and cache the
# result, since every row triggers its own Yahoo Finance download.
small_test_set = get_returns(preprocess(articles, sample_size=100))
pd.to_pickle(small_test_set, 'small_test_set.pkl')


# Larger run over 100,000 headlines; uncomment to execute.
# test_set = preprocess(articles, sample_size=100000)
# test_set = get_returns(test_set)
# pd.to_pickle(test_set, 'test_set.pkl')

100%|██████████| 100/100 [00:28<00:00,  3.53it/s]


In [12]:
# Display the sampled headlines together with their computed returns.
small_test_set

Unnamed: 0,title,date,ticker,returns
219534,"Benzinga’s Top Upgrades (SBNY, CCL, MPW, MA, F...",2010-05-21 08:27:00,CCL,0.013952
912659,"Top Performing Industries For August 9, 2016",2016-08-09 10:54:00,NTL,-0.010309
165581,Bridgeline DIgital Reports Q3 Loss $0.26 Vs Es...,2015-08-14 08:01:00,BLIN,-0.048276
1355251,"Earnings Scheduled For March 24, 2015",2015-03-24 04:04:00,WSCI,0.001757
362331,Puts Purchased on Dick's Sporting Goods (DKS),2011-01-06 12:40:00,DKS,-0.045442
...,...,...,...,...
1236358,Standpoint Research Downgrades Tempur-pedic In...,2013-10-01 11:17:00,TPX,0.023658
1370405,"Sector Update: Utilities Leading, Consumer Goo...",2011-08-24 10:36:00,XLF,0.026678
90206,Aramark Acquires On-Demand Food Delivery Servi...,2019-08-06 06:44:00,ARMK,0.047018
1342675,Wheeler Real Estate Investment Trust Responds ...,2018-03-16 04:19:00,WHLR,0.064220


Merge Two Datasets into 1 Dataset