# Import Libraries and Market Data

In [1]:
!pip install -q -U yfinance numpy pandas_datareader pandas matplotlib seaborn scikit-learn



In [2]:
#Import Python Libraries
import numpy as np
import pandas as pd
from datetime import datetime

import warnings
warnings.filterwarnings('ignore') #Silences every warning raised for the rest of the session

import yfinance as yf
import pandas_datareader.data as pdr
import matplotlib.pyplot as plt
#Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and 3.8 removed
#the old names, so use whichever name the installed version actually provides
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [3]:
#Import data
start = datetime(2021, 1, 1)
end = datetime(2021, 7, 18)

def load_history(symbol):
    """Download the price history of `symbol` between `start` and `end`.

    yfinance can return a timezone-aware index expressed in the time zone of
    the instrument's exchange, in which case the same trading session carries
    a different timestamp for tickers listed on different exchanges. The
    features below are joined on this index, so it is reduced to
    timezone-naive calendar dates to make every series align by trading day.

    Returns the DataFrame produced by `Ticker.history`, with its index
    normalized when it is a DatetimeIndex (left untouched otherwise, e.g. for
    an empty download).
    """
    history = yf.Ticker(symbol).history(start=start, end=end)
    if isinstance(history.index, pd.DatetimeIndex):
        if history.index.tz is not None:
            #Drop the zone but keep the local wall-clock date of the session
            history.index = history.index.tz_localize(None)
        history.index = history.index.normalize()
    return history

stock = load_history('AAPL')  #Apple Inc. stock
market = load_history('SPY')  #S&P 500 index
vix = load_history('^VIX')    #Volatility index
dxy = load_history('UUP')     #Dollar index
junk = load_history('JNK')    #Junk bond index

# Design Model

In [4]:
#Build the target dataframe: overnight return of the stock and its sign
#The closing prices are meant to be adjusted closes, which account for corporate actions such as dividends, splits and mergers
previous_close = stock['Close'].shift(1)
overnight_return = (stock['Open'] - previous_close) / previous_close #Buy on the previous close, sell on the following open
target = overnight_return.dropna().to_frame('return') #The first session has no previous close, so its NaN is dropped
target = target.assign(direction=np.where(target['return'] > 0, 1, -1)) #1 for an overnight gain, -1 otherwise
target.tail()

Unnamed: 0_level_0,return,direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-07-12,0.00758,1
2021-07-13,-0.003253,-1
2021-07-14,0.016891,1
2021-07-15,0.000603,1
2021-07-16,-0.000135,-1


In [5]:
#Assemble the features dataframe from day-over-day moves in closing prices
features = (
    pd.DataFrame()
    .assign(
        market=market['Close'].pct_change(1).mul(100),
        vix=vix['Close'].diff(), #VIX is already quoted in annualized percentage terms, so a plain difference is used
        dxy=dxy['Close'].pct_change(1).mul(100),
        junk=junk['Close'].pct_change(1).mul(100),
    )
    .dropna() #Discard any date for which at least one feature is missing
)
features.tail()

Unnamed: 0_level_0,market,vix,dxy,junk
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-07-12,0.358192,-0.01,0.121109,-0.00909
2021-07-13,-0.340896,0.950001,0.604845,-0.263781
2021-07-14,0.149221,-0.790001,-0.480965,0.109435
2021-07-15,-0.341553,0.68,0.281916,-0.018217
2021-07-16,-0.78436,1.440001,0.120485,-0.164009


In [6]:
#NOTE: this cell rebinds `features`, so running it again trims one more row
lastknown = features.iloc[-1:] #Feature values from the last trading session, kept as a one-row frame
features = features.iloc[:-1] #Drop that last row so the features matrix lines up with the labels vector
features.tail()

Unnamed: 0_level_0,market,vix,dxy,junk
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-07-09,1.067478,-2.82,-0.241641,0.154853
2021-07-12,0.358192,-0.01,0.121109,-0.00909
2021-07-13,-0.340896,0.950001,0.604845,-0.263781
2021-07-14,0.149221,-0.790001,-0.480965,0.109435
2021-07-15,-0.341553,0.68,0.281916,-0.018217


# Random Forest Classification for Overnight Direction

In [7]:
#Classifiers only need the direction labels, so the return column is left out.
#The first label is skipped because today's features are paired with tomorrow's opening move.
targetclass = target[['direction']].iloc[1:]
targetclass.head()

Unnamed: 0_level_0,direction
Date,Unnamed: 1_level_1
2021-01-06,-1
2021-01-07,1
2021-01-08,1
2021-01-11,-1
2021-01-12,-1


# Train and Test Random Forest Classifier

In [8]:
#Fit a Random Forest classifier with the Gini impurity criterion and report its accuracy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

targetclass = np.asarray(targetclass).ravel() #Flatten the single-column labels into a 1-d Numpy array
(features_train, features_test,
 targetclass_train, targetclass_test) = train_test_split(
    features, targetclass, test_size=0.25, random_state=0)

classifier = RandomForestClassifier(oob_score=True, random_state=1) #oob_score=True keeps an out-of-bag estimate for validation
classifier.fit(features_train, targetclass_train)

train_accuracy = classifier.score(features_train, targetclass_train)
print("Training score:", train_accuracy)
print("Out-of-bag score:", classifier.oob_score_)
test_accuracy = classifier.score(features_test, targetclass_test)
print("Testing score:", test_accuracy)

Training score: 1.0
Out-of-bag score: 0.41414141414141414
Testing score: 0.47058823529411764


# Regularize Random Forest Classifier

In [9]:
#Fit a second classifier that splits on entropy and is constrained by regularizing hyperparameters
classifier_entropy = RandomForestClassifier(
    criterion='entropy',
    n_estimators=100,
    max_depth=5, #Capping the tree depth limits overfitting to the training data
    min_samples_leaf=20,
    min_samples_split=30,
    oob_score=True,
    random_state=1,
)
#Same arguments and seed as the split in the previous cell
(features_train, features_test,
 targetclass_train, targetclass_test) = train_test_split(
    features, targetclass, test_size=0.25, random_state=0)
classifier_entropy.fit(features_train, targetclass_train)

train_accuracy = classifier_entropy.score(features_train, targetclass_train)
print("Training score:", train_accuracy)
print("Out-of-bag score:", classifier_entropy.oob_score_)
test_accuracy = classifier_entropy.score(features_test, targetclass_test)
print("Testing score:", test_accuracy)

Training score: 0.6060606060606061
Out-of-bag score: 0.3838383838383838
Testing score: 0.6176470588235294


In [10]:
#Forecast the next overnight direction from the held-out last row of features
predicted_direction = classifier_entropy.predict(lastknown)
direction_probabilities = classifier_entropy.predict_proba(lastknown) #Columns follow the order of classifier_entropy.classes_
print("Tomorrow's change:", predicted_direction)
print("Probability of change", direction_probabilities)

Tomorrow's change: [1]
Probability of change [[0.46139601 0.53860399]]


In [11]:
#Compare how much each feature contributes to the default and the regularized classifier
#Importances are printed in the same order as the column names on the first line
print(features.columns)
for fitted_classifier in (classifier, classifier_entropy):
    print(fitted_classifier.feature_importances_)

Index(['market', 'vix', 'dxy', 'junk'], dtype='object')
[0.23056644 0.25802237 0.25574205 0.25566914]
[0.28539187 0.38719871 0.1961629  0.13124652]


# Random Forest Regression for Overnight Value Changes

# Design Model

In [12]:
#Regressors predict the size of the move, so only the return column is kept.
#The first label is skipped because today's features are paired with tomorrow's opening move.
targetvalue = target[['return']].iloc[1:]
targetvalue.head()

Unnamed: 0_level_0,return
Date,Unnamed: 1_level_1
2021-01-06,-0.025113
2021-01-07,0.013902
2021-01-08,0.011534
2021-01-11,-0.021658
2021-01-12,-0.003722


# Train and Test Random Forest Model

In [13]:
#Fit a Random Forest regressor on the overnight returns and report its scores
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

targetvalue = np.asarray(targetvalue).ravel() #Flatten the single-column returns into a 1-d array
(features_train, features_test,
 targetvalue_train, targetvalue_test) = train_test_split(
    features, targetvalue, test_size=0.25, random_state=0)

regressor = RandomForestRegressor(oob_score=True, random_state=1) #Default split criterion is mean square error
regressor.fit(features_train, targetvalue_train)

train_score = regressor.score(features_train, targetvalue_train)
print("Training score:", train_score)
print("Out-of-bag score:", regressor.oob_score_)
test_score = regressor.score(features_test, targetvalue_test)
print("Testing score:", test_score)
predicted_return = regressor.predict(lastknown) #Forecast from the held-out last row of features
print("Tomorrow's value change:", predicted_return)

Training score: 0.8500950651369038
Out-of-bag score: -0.1575840063139653
Testing score: 0.022366160313928796
Tomorrow's value change: [-0.00410875]


# Regularize Random Forest Model

In [14]:
#Train and test a regressor that splits on mean absolute error, with a depth cap and more trees
#scikit-learn 1.0 renamed the "mae" criterion to "absolute_error" and 1.2 removed the old name,
#so the current spelling is required once scikit-learn has been upgraded by the install cell
regressor_mae = RandomForestRegressor(criterion="absolute_error", random_state=1, oob_score=True, max_depth=10, n_estimators=500)
#Same arguments and seed as the split used for the default regressor
features_train, features_test, targetvalue_train, targetvalue_test = train_test_split(features, targetvalue, test_size = 0.25, random_state=0)
regressor_mae.fit(features_train, targetvalue_train)
print("Training score:", regressor_mae.score(features_train, targetvalue_train))
print("Out-of-bag score:", regressor_mae.oob_score_)
print("Testing score:", regressor_mae.score(features_test, targetvalue_test))
print("Tomorrow's value change:", regressor_mae.predict(lastknown)) #Forecast from the held-out last row of features

Training score: 0.7828133897067663
Out-of-bag score: -0.11314211040296085
Testing score: 0.08099593803385141
Tomorrow's value change: [-0.00339385]


In [15]:
#Compare how much each feature contributes to the default and the MAE-based regressor
#Importances are printed in the same order as the column names on the first line
print(features.columns)
for fitted_regressor in (regressor, regressor_mae):
    print(fitted_regressor.feature_importances_)

Index(['market', 'vix', 'dxy', 'junk'], dtype='object')
[0.16748602 0.26271163 0.29313468 0.27666766]
[0.21189361 0.26314634 0.26761882 0.25734123]
