In [1]:
# libraries for data manipultion
import pandas as pd
import numpy as np

# libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#ml libraries
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score
from sklearn.feature_selection import f_regression

#stats lib
import statsmodels.api as sm

In [2]:
tr_eikon_df = pd.read_csv('tr_eikon_eod_data.csv')
tr_eikon_df.index = pd.to_datetime(tr_eikon_df['Date'])
tr = tr_eikon_df[['AAPL.O', 'MSFT.O', 'INTC.O', 'AMZN.O', 'GS.N']].dropna()
store = tr.copy(deep = True)

#covert returns to PERCENTAGE CHANGE
for col in tr.columns:
    tr['Returns' + " " + col] = tr[col].pct_change() * 100

tr = tr.dropna()
tr

Unnamed: 0_level_0,AAPL.O,MSFT.O,INTC.O,AMZN.O,GS.N,Returns AAPL.O,Returns MSFT.O,Returns INTC.O,Returns AMZN.O,Returns GS.N
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2010-01-05,30.625684,30.960,20.87,134.690,176.14,0.172889,0.032310,-0.047893,0.589993,1.767969
2010-01-06,30.138541,30.770,20.80,132.250,174.26,-1.590633,-0.613695,-0.335410,-1.811567,-1.067333
2010-01-07,30.082827,30.452,20.60,130.000,177.67,-0.184860,-1.033474,-0.961538,-1.701323,1.956846
2010-01-08,30.282827,30.660,20.83,133.520,174.31,0.664830,0.683042,1.116505,2.707692,-1.891147
2010-01-11,30.015684,30.270,20.95,130.308,171.56,-0.882159,-1.272016,0.576092,-2.405632,-1.577649
...,...,...,...,...,...,...,...,...,...,...
2018-06-25,182.170000,98.390,50.71,1663.150,221.54,-1.487130,-2.011752,-3.409524,-3.061195,-1.982125
2018-06-26,184.430000,99.080,49.67,1691.090,221.58,1.240599,0.701291,-2.050878,1.679945,0.018055
2018-06-27,184.160000,97.540,48.76,1660.510,220.18,-0.146397,-1.554300,-1.832092,-1.808301,-0.631826
2018-06-28,185.500000,98.630,49.25,1701.450,223.42,0.727628,1.117490,1.004922,2.465508,1.471523


In [3]:
#read the fama data collected from the website

#website for fama french data: https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html

#data collected under fama french 3 factors [daily]

#original data was manually edited as there was parsing issues

fama = pd.read_csv('fama.csv')
fama['Date'] = fama['Date'].apply(lambda x: pd.to_datetime(str(x), format='%Y%m%d'))
fama.index = pd.to_datetime(fama['Date'])

In [4]:
data = pd.merge(tr, fama, left_index=True, right_index=True)
data

Unnamed: 0_level_0,AAPL.O,MSFT.O,INTC.O,AMZN.O,GS.N,Returns AAPL.O,Returns MSFT.O,Returns INTC.O,Returns AMZN.O,Returns GS.N,Date,Mkt-RF,SMB,HML,RF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2010-01-05,30.625684,30.960,20.87,134.690,176.14,0.172889,0.032310,-0.047893,0.589993,1.767969,2010-01-05,0.31,-0.64,1.24,0.000
2010-01-06,30.138541,30.770,20.80,132.250,174.26,-1.590633,-0.613695,-0.335410,-1.811567,-1.067333,2010-01-06,0.13,-0.22,0.57,0.000
2010-01-07,30.082827,30.452,20.60,130.000,177.67,-0.184860,-1.033474,-0.961538,-1.701323,1.956846,2010-01-07,0.40,0.09,0.98,0.000
2010-01-08,30.282827,30.660,20.83,133.520,174.31,0.664830,0.683042,1.116505,2.707692,-1.891147,2010-01-08,0.33,0.37,0.01,0.000
2010-01-11,30.015684,30.270,20.95,130.308,171.56,-0.882159,-1.272016,0.576092,-2.405632,-1.577649,2010-01-11,0.13,-0.13,-0.25,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-06-25,182.170000,98.390,50.71,1663.150,221.54,-1.487130,-2.011752,-3.409524,-3.061195,-1.982125,2018-06-25,-1.48,-0.54,0.57,0.006
2018-06-26,184.430000,99.080,49.67,1691.090,221.58,1.240599,0.701291,-2.050878,1.679945,0.018055,2018-06-26,0.27,0.63,-0.24,0.006
2018-06-27,184.160000,97.540,48.76,1660.510,220.18,-0.146397,-1.554300,-1.832092,-1.808301,-0.631826,2018-06-27,-1.02,-1.00,0.36,0.006
2018-06-28,185.500000,98.630,49.25,1701.450,223.42,0.727628,1.117490,1.004922,2.465508,1.471523,2018-06-28,0.58,-0.18,-0.51,0.006


In [5]:
#generate directions for each stock
for col in store.columns:
    data[col + ' direction'] = np.sign(data['Returns' + ' ' + col])
    
data

Unnamed: 0_level_0,AAPL.O,MSFT.O,INTC.O,AMZN.O,GS.N,Returns AAPL.O,Returns MSFT.O,Returns INTC.O,Returns AMZN.O,Returns GS.N,Date,Mkt-RF,SMB,HML,RF,AAPL.O direction,MSFT.O direction,INTC.O direction,AMZN.O direction,GS.N direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2010-01-05,30.625684,30.960,20.87,134.690,176.14,0.172889,0.032310,-0.047893,0.589993,1.767969,2010-01-05,0.31,-0.64,1.24,0.000,1.0,1.0,-1.0,1.0,1.0
2010-01-06,30.138541,30.770,20.80,132.250,174.26,-1.590633,-0.613695,-0.335410,-1.811567,-1.067333,2010-01-06,0.13,-0.22,0.57,0.000,-1.0,-1.0,-1.0,-1.0,-1.0
2010-01-07,30.082827,30.452,20.60,130.000,177.67,-0.184860,-1.033474,-0.961538,-1.701323,1.956846,2010-01-07,0.40,0.09,0.98,0.000,-1.0,-1.0,-1.0,-1.0,1.0
2010-01-08,30.282827,30.660,20.83,133.520,174.31,0.664830,0.683042,1.116505,2.707692,-1.891147,2010-01-08,0.33,0.37,0.01,0.000,1.0,1.0,1.0,1.0,-1.0
2010-01-11,30.015684,30.270,20.95,130.308,171.56,-0.882159,-1.272016,0.576092,-2.405632,-1.577649,2010-01-11,0.13,-0.13,-0.25,0.000,-1.0,-1.0,1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-06-25,182.170000,98.390,50.71,1663.150,221.54,-1.487130,-2.011752,-3.409524,-3.061195,-1.982125,2018-06-25,-1.48,-0.54,0.57,0.006,-1.0,-1.0,-1.0,-1.0,-1.0
2018-06-26,184.430000,99.080,49.67,1691.090,221.58,1.240599,0.701291,-2.050878,1.679945,0.018055,2018-06-26,0.27,0.63,-0.24,0.006,1.0,1.0,-1.0,1.0,1.0
2018-06-27,184.160000,97.540,48.76,1660.510,220.18,-0.146397,-1.554300,-1.832092,-1.808301,-0.631826,2018-06-27,-1.02,-1.00,0.36,0.006,-1.0,-1.0,-1.0,-1.0,-1.0
2018-06-28,185.500000,98.630,49.25,1701.450,223.42,0.727628,1.117490,1.004922,2.465508,1.471523,2018-06-28,0.58,-0.18,-0.51,0.006,1.0,1.0,1.0,1.0,1.0


In [6]:
data.drop(columns = ['Date'])

Unnamed: 0_level_0,AAPL.O,MSFT.O,INTC.O,AMZN.O,GS.N,Returns AAPL.O,Returns MSFT.O,Returns INTC.O,Returns AMZN.O,Returns GS.N,Mkt-RF,SMB,HML,RF,AAPL.O direction,MSFT.O direction,INTC.O direction,AMZN.O direction,GS.N direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2010-01-05,30.625684,30.960,20.87,134.690,176.14,0.172889,0.032310,-0.047893,0.589993,1.767969,0.31,-0.64,1.24,0.000,1.0,1.0,-1.0,1.0,1.0
2010-01-06,30.138541,30.770,20.80,132.250,174.26,-1.590633,-0.613695,-0.335410,-1.811567,-1.067333,0.13,-0.22,0.57,0.000,-1.0,-1.0,-1.0,-1.0,-1.0
2010-01-07,30.082827,30.452,20.60,130.000,177.67,-0.184860,-1.033474,-0.961538,-1.701323,1.956846,0.40,0.09,0.98,0.000,-1.0,-1.0,-1.0,-1.0,1.0
2010-01-08,30.282827,30.660,20.83,133.520,174.31,0.664830,0.683042,1.116505,2.707692,-1.891147,0.33,0.37,0.01,0.000,1.0,1.0,1.0,1.0,-1.0
2010-01-11,30.015684,30.270,20.95,130.308,171.56,-0.882159,-1.272016,0.576092,-2.405632,-1.577649,0.13,-0.13,-0.25,0.000,-1.0,-1.0,1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-06-25,182.170000,98.390,50.71,1663.150,221.54,-1.487130,-2.011752,-3.409524,-3.061195,-1.982125,-1.48,-0.54,0.57,0.006,-1.0,-1.0,-1.0,-1.0,-1.0
2018-06-26,184.430000,99.080,49.67,1691.090,221.58,1.240599,0.701291,-2.050878,1.679945,0.018055,0.27,0.63,-0.24,0.006,1.0,1.0,-1.0,1.0,1.0
2018-06-27,184.160000,97.540,48.76,1660.510,220.18,-0.146397,-1.554300,-1.832092,-1.808301,-0.631826,-1.02,-1.00,0.36,0.006,-1.0,-1.0,-1.0,-1.0,-1.0
2018-06-28,185.500000,98.630,49.25,1701.450,223.42,0.727628,1.117490,1.004922,2.465508,1.471523,0.58,-0.18,-0.51,0.006,1.0,1.0,1.0,1.0,1.0


In [7]:
#get relevant data for ml
preddat = data.iloc[:, 5:]

preddat

Unnamed: 0_level_0,Returns AAPL.O,Returns MSFT.O,Returns INTC.O,Returns AMZN.O,Returns GS.N,Date,Mkt-RF,SMB,HML,RF,AAPL.O direction,MSFT.O direction,INTC.O direction,AMZN.O direction,GS.N direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2010-01-05,0.172889,0.032310,-0.047893,0.589993,1.767969,2010-01-05,0.31,-0.64,1.24,0.000,1.0,1.0,-1.0,1.0,1.0
2010-01-06,-1.590633,-0.613695,-0.335410,-1.811567,-1.067333,2010-01-06,0.13,-0.22,0.57,0.000,-1.0,-1.0,-1.0,-1.0,-1.0
2010-01-07,-0.184860,-1.033474,-0.961538,-1.701323,1.956846,2010-01-07,0.40,0.09,0.98,0.000,-1.0,-1.0,-1.0,-1.0,1.0
2010-01-08,0.664830,0.683042,1.116505,2.707692,-1.891147,2010-01-08,0.33,0.37,0.01,0.000,1.0,1.0,1.0,1.0,-1.0
2010-01-11,-0.882159,-1.272016,0.576092,-2.405632,-1.577649,2010-01-11,0.13,-0.13,-0.25,0.000,-1.0,-1.0,1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-06-25,-1.487130,-2.011752,-3.409524,-3.061195,-1.982125,2018-06-25,-1.48,-0.54,0.57,0.006,-1.0,-1.0,-1.0,-1.0,-1.0
2018-06-26,1.240599,0.701291,-2.050878,1.679945,0.018055,2018-06-26,0.27,0.63,-0.24,0.006,1.0,1.0,-1.0,1.0,1.0
2018-06-27,-0.146397,-1.554300,-1.832092,-1.808301,-0.631826,2018-06-27,-1.02,-1.00,0.36,0.006,-1.0,-1.0,-1.0,-1.0,-1.0
2018-06-28,0.727628,1.117490,1.004922,2.465508,1.471523,2018-06-28,0.58,-0.18,-0.51,0.006,1.0,1.0,1.0,1.0,1.0


In [8]:
#train test split 80/20
data_train, data_test = train_test_split(preddat, 
                                         test_size = 0.2,
                                         shuffle = True,
                                         random_state = 0)

Possible strategies:

1) Buy when Predicted returns > Actual returns
2) Buy when predicted direction is up

In [9]:
model = LinearRegression()

#train, predict, generate signals
accuracy = []
for col in store.columns:
    pred_returns = model.fit(data_train[['Mkt-RF','SMB','HML']], data_train['Returns' + ' ' + col]).predict(data_test[['Mkt-RF','SMB','HML']])
    pred_direction = model.fit(data_train[['Mkt-RF','SMB','HML']], data_train[col + ' direction']).predict(data_test[['Mkt-RF','SMB','HML']])
    data_test['pred_returns ' + col] = pred_returns
    data_test['pred_direc ' + col] = np.sign(pred_direction)
    data_test['Buy ' + col] = np.where((data_test['pred_returns ' + col] > data_test['Returns' + ' ' + col]),1,-1)


In [10]:
data_test

Unnamed: 0_level_0,Returns AAPL.O,Returns MSFT.O,Returns INTC.O,Returns AMZN.O,Returns GS.N,Date,Mkt-RF,SMB,HML,RF,...,Buy MSFT.O,pred_returns INTC.O,pred_direc INTC.O,Buy INTC.O,pred_returns AMZN.O,pred_direc AMZN.O,Buy AMZN.O,pred_returns GS.N,pred_direc GS.N,Buy GS.N
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-03-26,-0.527201,-0.017756,2.907801,1.675650,0.294299,2013-03-26,0.76,-0.39,-0.21,0.000,...,1,0.991479,1.0,-1,1.281958,1.0,-1,0.647107,1.0,1
2017-02-07,0.951723,-0.329981,0.220568,0.601753,-0.150013,2017-02-07,-0.01,-0.30,-0.52,0.002,...,1,0.215112,1.0,-1,0.602961,1.0,1,-0.546712,-1.0,-1
2018-05-21,0.708497,1.286841,1.532710,0.704409,0.295359,2018-05-21,0.72,-0.19,0.44,0.006,...,-1,0.717553,1.0,-1,0.596240,1.0,-1,1.248793,1.0,1
2018-05-08,0.480665,-0.426107,0.562535,-0.484333,-0.084317,2018-05-08,0.07,0.49,0.24,0.006,...,1,-0.121086,-1.0,-1,-0.140308,-1.0,1,0.251465,1.0,1
2016-07-08,0.771315,1.790580,2.409639,1.254463,2.299320,2016-07-08,1.60,0.92,0.32,0.001,...,-1,1.398223,1.0,-1,1.597418,1.0,1,2.022851,1.0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014-10-06,0.000000,0.000000,0.235087,-0.167317,-0.313713,2014-10-06,-0.26,-0.78,0.37,0.000,...,-1,-0.163762,-1.0,-1,-0.444096,-1.0,-1,0.122405,1.0,1
2015-05-06,-0.627981,-2.773109,-1.286765,-0.496213,-0.527143,2015-05-06,-0.31,0.63,-0.13,0.000,...,1,-0.472045,-1.0,1,-0.291511,-1.0,1,-0.567789,-1.0,-1
2017-12-28,0.281360,0.011667,0.238560,0.324802,0.214886,2017-12-28,0.22,0.13,0.05,0.004,...,1,0.192179,1.0,-1,0.283822,1.0,-1,0.257969,1.0,1
2018-06-15,-1.027254,-1.271938,-0.774217,-0.457694,-0.740424,2018-06-15,-0.08,0.05,-0.25,0.006,...,1,-0.030207,1.0,1,0.205288,1.0,1,-0.380286,-1.0,1


In [11]:
#Might be useful for analysis
#Statistical summary of regression of training data against returns
X = sm.add_constant(data_train[["Mkt-RF", "SMB", "HML"]])

for col in store.columns:
    y = data_train['Returns' + ' ' + col]
    model = sm.OLS(y, X).fit()
    print(model.summary())

                            OLS Regression Results                            
Dep. Variable:         Returns AAPL.O   R-squared:                       0.355
Model:                            OLS   Adj. R-squared:                  0.354
Method:                 Least Squares   F-statistic:                     313.3
Date:                Mon, 01 Apr 2024   Prob (F-statistic):          5.23e-162
Time:                        17:08:16   Log-Likelihood:                -2852.3
No. Observations:                1709   AIC:                             5713.
Df Residuals:                    1705   BIC:                             5734.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0519      0.031      1.663      0.0

F-statistics for GS.N is questionable