In [1]:
import os

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
from pandas import DataFrame
from sklearn.linear_model import LinearRegression

# Directory the notebook runs from; the data cell reads 'six_components.csv'
# relative to it. Set the FIN566_DATA_DIR environment variable to run on
# another machine; the default is the original author's local path.
DATA_DIR = os.environ.get(
    'FIN566_DATA_DIR',
    '/Users/khavya/Box/MS - Financial Engineering/Fall 2020/FIN566 - Algorithmic Market Microstructure',
)
os.chdir(DATA_DIR)

# Show every column when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [2]:
# Loading the data (MSFT, SPY, AMZN, AAPL, INTL, JPM)
wholedf = pd.read_csv('six_components.csv')

# sort_values returns a new frame, so the result must be assigned back for the
# ordering to take effect.
wholedf = wholedf.sort_values(['ticker', 'Collection_time'])

# Dropping the unwanted columns generated
# (presumably index columns written out by earlier to_csv calls - confirm)
wholedf = wholedf.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

# Finding the unique tickers in the data
unique_tickers = wholedf.ticker.unique()
print(unique_tickers)

['AAPL' 'AMZN' 'INTL' 'JPM' 'MSFT' 'SPY']


In [12]:
####### Source: Yong Xie's Code ########

# Setting up the scene for regression

# Horizon used for the forward (target) returns computed in the next cell.
# NOTE(review): this literal is one microsecond - confirm it matches the
# intended horizon and the resolution of Collection_time.
window = pd.Timedelta('0 days 00:00:00.000001')

# Outer-join the SPY and AAPL rows on their timestamp so every observation of
# either instrument gets a row; columns are suffixed by instrument.
spy_rows = wholedf[wholedf['ticker'] == 'SPY']
aapl_rows = wholedf[wholedf['ticker'] == 'AAPL']
regression_data = pd.merge(spy_rows, aapl_rows,
                           on='Collection_time', how='outer', suffixes=['_SPY', '_AAPL'])

# Parse the timestamps before sorting so the order is chronological rather
# than the lexicographic order of the raw strings.
regression_data['Collection_time'] = pd.to_datetime(regression_data['Collection_time'])
regression_data = regression_data.sort_values('Collection_time')

# Carry the last known values of each instrument forward, then back-fill the
# leading rows that have no earlier observation. (ffill/bfill replace the
# deprecated fillna(method=...) spelling.)
regression_data = regression_data.ffill()
combine = regression_data.bfill()

In [13]:
# Calculating future returns
def _forward_return(times, prices, horizon):
    """Return, for every row i, prices[j] / prices[i] - 1 for the first later row j.

    Row j is the first row with times[j] - times[i] >= horizon. Rows with no
    such j get NaN.

    Parameters
    ----------
    times : 1-D datetime64 array, sorted in non-decreasing order.
    prices : 1-D float array aligned with ``times``.
    horizon : positive timedelta64.
    """
    # With sorted times and a positive horizon, the first index whose time is
    # >= times[i] + horizon is always after i, so a binary search picks the
    # same row as scanning forward from i + 1.
    nxt = np.searchsorted(times, times + horizon, side='left')
    found = nxt < len(times)
    # Index 0 is only a placeholder for rows without a match; np.where masks them.
    future = prices[np.where(found, nxt, 0)]
    return np.where(found, future / prices - 1, np.nan)


combine_values = combine.values
n = combine_values.shape[0]

# Column 0 of `combine` holds the timestamps.
times = combine.iloc[:, 0].to_numpy(dtype='datetime64[ns]')
if n > 1 and (np.diff(times) < np.timedelta64(0, 'ns')).any():
    raise ValueError('combine must be sorted by Collection_time before computing returns')

horizon = window.to_timedelta64()

# Columns 5 and 12 are the series whose returns are stored as Return_SPY and
# Return_AAPL - presumably the SPY and AAPL price columns; verify against the CSV schema.
# Kept as (n, 1) column vectors, the shape these notebook variables have always had.
etf_return = _forward_return(times, combine_values[:, 5].astype(float), horizon).reshape(-1, 1)
component_return = _forward_return(times, combine_values[:, 12].astype(float), horizon).reshape(-1, 1)

combine['Return_SPY'] = etf_return
combine['Return_AAPL'] = component_return

In [14]:
## calculate features
def _lagged_return(times, prices, lookback):
    """Return, for every row i, prices[i] / prices[j] - 1 for the last earlier row j.

    Row j is the last row with times[i] - times[j] >= lookback. Rows with no
    such j get NaN.

    Parameters
    ----------
    times : 1-D datetime64 array, sorted in non-decreasing order.
    prices : 1-D float array aligned with ``times``.
    lookback : positive timedelta64.
    """
    # With sorted times and a positive lookback, the last index whose time is
    # <= times[i] - lookback is always before i, so a binary search picks the
    # same row as scanning backward from i - 1.
    prev = np.searchsorted(times, times - lookback, side='right') - 1
    found = prev >= 0
    # Index 0 is only a placeholder for rows without a match; np.where masks them.
    past = prices[np.where(found, prev, 0)]
    return np.where(found, prices / past - 1, np.nan)


# Look-back horizons keyed by the suffix used in the feature column names.
# NOTE(review): the Timedelta literals are 1, 5, 10, 30, 60 and 90
# microseconds - confirm that is the intended unit.
windows = {'1': pd.Timedelta('0 days 00:00:00.000001'),
           '5': pd.Timedelta('0 days 00:00:00.000005'),
           '10': pd.Timedelta('0 days 00:00:00.000010'),
           '30': pd.Timedelta('0 days 00:00:00.000030'),
           '60': pd.Timedelta('0 days 00:00:00.000060'),
           '90': pd.Timedelta('0 days 00:00:00.000090')}

combine_values = combine.values
n = combine_values.shape[0]

# Column 0 of `combine` holds the timestamps.
times = combine.iloc[:, 0].to_numpy(dtype='datetime64[ns]')
if n > 1 and (np.diff(times) < np.timedelta64(0, 'ns')).any():
    raise ValueError('combine must be sorted by Collection_time before computing returns')

# Columns 5 and 12 are the series whose returns are stored under the
# Return_SPY_* and Return_AAPL_* names - presumably the SPY and AAPL price
# columns; verify against the CSV schema.
spy_prices = combine_values[:, 5].astype(float)
aapl_prices = combine_values[:, 12].astype(float)

# The loop variable is deliberately not called `window`: that name holds the
# forward-return horizon, and overwriting it here would change the target
# whenever the forward-return cell is re-run.
for k, lookback in windows.items():
    lookback_td = lookback.to_timedelta64()
    # Kept as (n, 1) column vectors, the shape these notebook variables have always had.
    etf_return = _lagged_return(times, spy_prices, lookback_td).reshape(-1, 1)
    component_return = _lagged_return(times, aapl_prices, lookback_td).reshape(-1, 1)

    combine['Return_SPY_'+k] = etf_return
    combine['Return_AAPL_'+k] = component_return

In [15]:
# SPY - Dependent variable
# Keep only rows where every column is populated.
combine_valid = combine.dropna(how='any', axis=0)

# Lagged-return feature names, grouped by the instrument they describe.
# (These two lists previously held each other's column names.)
x_cols_spy = ['Return_SPY_' + k for k in windows]
x_cols_aapl = ['Return_AAPL_' + k for k in windows]
# SPY features first, then AAPL: the same regressor order as before.
x_cols = x_cols_spy + x_cols_aapl

y = combine_valid.Return_SPY.to_list()
x = combine_valid[x_cols]
x = sm.add_constant(x)  # add the intercept column

# Ordinary least squares of the forward SPY return on the lagged returns.
model = sm.OLS(y, x).fit()
predictions = model.predict(x)  # in-sample fitted values
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.042
Model:                            OLS   Adj. R-squared:                  0.042
Method:                 Least Squares   F-statistic:                     924.5
Date:                Tue, 15 Dec 2020   Prob (F-statistic):               0.00
Time:                        22:51:06   Log-Likelihood:             2.0674e+06
No. Observations:              252684   AIC:                        -4.135e+06
Df Residuals:                  252671   BIC:                        -4.135e+06
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const           6.482e-08   1.35e-07      0.

In [16]:
# AAPL - Dependent variable
# Keep only rows where every column is populated.
combine_valid = combine.dropna(axis=0, how='any')

# Regressors: the columns listed in x_cols plus an intercept.
x = sm.add_constant(combine_valid[x_cols])
# Target: the forward AAPL return.
y = combine_valid.Return_AAPL.to_list()

# Fit by ordinary least squares and report the summary table.
model = sm.OLS(y, x).fit()
predictions = model.predict(x)  # in-sample fitted values
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     117.0
Date:                Tue, 15 Dec 2020   Prob (F-statistic):          1.11e-292
Time:                        22:51:06   Log-Likelihood:             1.8979e+06
No. Observations:              252684   AIC:                        -3.796e+06
Df Residuals:                  252671   BIC:                        -3.796e+06
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const           4.948e-07   2.63e-07      1.