# Import Libraries and Market Data

In [1]:
!pip install -q -U yfinance numpy pandas_datareader pandas matplotlib seaborn scikit-learn



In [2]:
#Import Python Libraries
import numpy as np
import pandas as pd
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

import pandas_datareader.data as pdr
import yfinance as yf
import matplotlib.pyplot as plt
plt.style.use('seaborn')

In [3]:
#Import data
start = datetime(2021, 1, 1)
end = datetime(2021, 7, 18)

stock = yf.Ticker('AAPL').history(start=start, end=end) #Apple Inc. stock
market = yf.Ticker('SPY').history(start=start, end=end) #S&P 500 index
vix = yf.Ticker('^VIX').history(start=start, end=end)   #Volatility index
dxy = yf.Ticker('UUP').history(start=start, end=end)    #Dollar index
junk = yf.Ticker('JNK').history(start=start, end=end)   #Junk bond index

#Design Model

In [4]:
#Create target dataframe
target = pd.DataFrame()
target['return'] = (stock['Open']-stock['Close'].shift(1))/stock['Close'].shift(1) #Returns based on buying on the close the day before and selling on the open the day after
target = target.dropna() #get rid of any NaNs
target['direction'] = np.where(target['return'] > 0, 1, -1) #Overnight direction of the stock
target.tail()

Unnamed: 0_level_0,return,direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-07-12,0.00758,1
2021-07-13,-0.003253,-1
2021-07-14,0.016891,1
2021-07-15,0.000603,1
2021-07-16,-0.000135,-1


In [5]:
#Create features dataframe
features = pd.DataFrame()
features['market'] = market['Close'].pct_change(1)*100
features['vix'] = vix['Close'].diff() #Since VIX is measured in percentage terms
features['dxy'] = dxy['Close'].pct_change(1)*100
features['junk'] = junk['Close'].pct_change(1)*100
features = features.dropna()
features.tail()

Unnamed: 0_level_0,market,vix,dxy,junk
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-07-12,0.358192,-0.01,0.121109,-0.00909
2021-07-13,-0.340896,0.950001,0.604845,-0.263781
2021-07-14,0.149221,-0.790001,-0.480965,0.109435
2021-07-15,-0.341553,0.68,0.281916,-0.018217
2021-07-16,-0.78436,1.440001,0.120485,-0.164009


In [6]:
lastknown = features[-1:] #Values of features from the last trading session
features = features[:-1] #Subtracts last row from the features matrix so that it aligns with labels vector
features.tail()

Unnamed: 0_level_0,market,vix,dxy,junk
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-07-09,1.067478,-2.82,-0.241641,0.154853
2021-07-12,0.358192,-0.01,0.121109,-0.00909
2021-07-13,-0.340896,0.950001,0.604845,-0.263781
2021-07-14,0.149221,-0.790001,-0.480965,0.109435
2021-07-15,-0.341553,0.68,0.281916,-0.018217


#Gradient Boosting Classifier for Overnight Direction

In [7]:
#Get rid of return column for classifiers
targetclass = target.drop(axis=1, columns='return')
targetclass = targetclass[1:] #Removes the first row of labels since we are correlating today's features with tomorrow's opening values
targetclass.head()                                                                        

Unnamed: 0_level_0,direction
Date,Unnamed: 1_level_1
2021-01-06,-1
2021-01-07,1
2021-01-08,1
2021-01-11,-1
2021-01-12,-1


In [8]:
#Get rid of direction column for regressors
targetvalue = target.drop(axis=1, columns='direction')
targetvalue = targetvalue[1:] #Removes the first row of labels since we are correlating today's features with tomorrow's opening values
targetvalue.head()                                                                        


Unnamed: 0_level_0,return
Date,Unnamed: 1_level_1
2021-01-06,-0.025113
2021-01-07,0.013902
2021-01-08,0.011534
2021-01-11,-0.021658
2021-01-12,-0.003722


#Train, Test and Regularize Gradient Boosting Classifier

In [9]:
#Train and test classifier using Gini impurity performance metric
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

classifier = GradientBoostingClassifier(random_state=1, learning_rate=0.01, max_depth=3)
targetclass = np.ravel(targetclass) 
features_train, features_test, targetclass_train, targetclass_test = train_test_split(features, targetclass, test_size = 0.25, random_state=0)
classifier.fit(features_train, targetclass_train)
print("Training score:", classifier.score(features_train, targetclass_train))
print("Testing score:", classifier.score(features_test, targetclass_test))

Training score: 0.8383838383838383
Testing score: 0.5


In [10]:
print("Tomorrow's direction:", classifier.predict(lastknown))
print("Probability of change", classifier.predict_proba(lastknown))

Tomorrow's direction: [-1]
Probability of change [[0.67811359 0.32188641]]


In [11]:
#Inferring the importance of each feature
print(features.columns)
print(classifier.feature_importances_)

Index(['market', 'vix', 'dxy', 'junk'], dtype='object')
[0.10880958 0.27942415 0.19184185 0.41992443]


#Gradient Boosting Regressor for Overnight Value Changes

#Train and Test GBRT Model

In [12]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

regressor = GradientBoostingRegressor(random_state=1)
targetvalue = np.ravel(targetvalue) #Need to covert column vector into a 1-d array
features_train, features_test, targetvalue_train, targetvalue_test = train_test_split(features, targetvalue, test_size = 0.25, random_state=0)
regressor.fit(features_train, targetvalue_train)
print("Training score:", regressor.score(features_train, targetvalue_train))
print("Testing score:", regressor.score(features_test, targetvalue_test)) #Note that scikit-learn library makes the default mean squared error(mse) test score negative so that it is maximized instead of minimized
print("Tomorrow's value change:", regressor.predict(lastknown))

Training score: 0.936484599820489
Testing score: -0.2500402357807179
Tomorrow's value change: [-0.0046213]


#Regularize and Test GBRT Model

In [13]:
regressor_mae = GradientBoostingRegressor(criterion="mae", random_state=1, learning_rate=0.4, max_depth=5, n_estimators=200, min_samples_split=5, min_samples_leaf=10) #Reduce learning rate (between 0 and 1) to avoid overfitting
features_train, features_test, targetvalue_train, targetvalue_test = train_test_split(features, targetvalue, test_size = 0.25, random_state=1)
regressor_mae.fit(features_train, targetvalue_train)
print("Training score:", regressor_mae.score(features_train, targetvalue_train))
print("Testing score:", regressor_mae.score(features_test, targetvalue_test)) #Note that scikit-learn library makes the mean absolute error(mae) test score negative so that it is maximized instead of minimized
print("Tomorrow's value change:", regressor_mae.predict(lastknown))

Training score: 0.6375765723082714
Testing score: -0.8857183867105682
Tomorrow's value change: [-0.01983395]


In [14]:
#Inferring the importance of each feature
print(features.columns)
print(regressor.feature_importances_)
print(regressor_mae.feature_importances_)

Index(['market', 'vix', 'dxy', 'junk'], dtype='object')
[0.14255143 0.28752113 0.31202949 0.25789795]
[0.28661642 0.3689544  0.18059208 0.16383711]
