In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import os
os.chdir('/Users/khavya/Box/MS - Financial Engineering/Fall 2020/FIN566 - Algorithmic Market Microstructure')
import matplotlib.pyplot as plt
import seaborn as sns

import collections
from collections import defaultdict
import scipy.stats as st
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

plt.style.use('seaborn-ticks')
pd.set_option('display.max_columns', None)

from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

In [2]:
# Loading the data (MSFT, SPY, AMZN, AAPL, INTL, JPM) 
wholedf = pd.read_csv('SPY_6_components.csv')
wholedf.sort_values(['ticker','Collection_time'])

# Deleting the unwanted rows generated
del wholedf['Unnamed: 0']

# Finding the unique tickers in the data
unique_tickers = wholedf.ticker.unique()
print(unique_tickers)

['AAPL' 'AMZN' 'JPM' 'MSFT' 'SPY' 'INTC']


In [3]:
####### Source: Yong Xie's Code ########

# Setting up the scene for regression
window = pd.Timedelta('0 days 00:00:00.000001')
regression_data = pd.merge(wholedf[wholedf['ticker']=='SPY'], wholedf[wholedf['ticker']=='AAPL'], 
                           on = 'Collection_time', how = 'outer', suffixes = ['_SPY', '_AAPL'])
regression_data = regression_data.sort_values('Collection_time')
regression_data = regression_data.fillna(method = 'pad')
combine = regression_data.fillna(method = 'bfill')
combine['Collection_time'] = pd.to_datetime(combine['Collection_time'])

In [4]:
# Calculating future returns
combine_values = combine.values

component_return = np.zeros((combine_values.shape[0], 1))
etf_return = np.zeros((combine_values.shape[0], 1))

n = combine_values.shape[0]
for i in range(n):
    j = i + 1
    while j < n and combine_values[j,0] - combine_values[i,0] < window:
        j += 1
    if j >= n:
        etf_return[i, 0] = np.nan
        component_return[i, 0]= np.nan
    else:
        etf_return[i, 0] = combine_values[j, 5]/combine_values[i, 5] - 1
        component_return[i, 0] = combine_values[j, 12]/combine_values[i, 12] - 1
        
combine['Return_SPY'] = etf_return
combine['Return_AAPL'] = component_return

In [7]:
## calculate features
windows = {'1': pd.Timedelta('0 days 00:00:00.000001'),
           '5': pd.Timedelta('0 days 00:00:00.000005'),
           '10': pd.Timedelta('0 days 00:00:00.000010'),
           '30': pd.Timedelta('0 days 00:00:00.000030'),
           '60': pd.Timedelta('0 days 00:00:00.000060'),
           '90': pd.Timedelta('0 days 00:00:00.000090')}

combine_values = combine.values
n = combine_values.shape[0]

for k, window in windows.items():
    component_return = np.zeros((combine_values.shape[0], 1))
    etf_return = np.zeros((combine_values.shape[0], 1))
    
    for i in range(n):
        j = i - 1
        while j >=0 and combine_values[i,0] - combine_values[j,0] < window:
            j -= 1
        if j < 0:
            etf_return[i, 0] = np.nan
            component_return[i, 0]= np.nan
        else:
            etf_return[i, 0] = combine_values[i, 5]/combine_values[j, 5] - 1
            component_return[i, 0] = combine_values[i, 12]/combine_values[j, 12] - 1

    combine['Return_SPY_'+k] = etf_return
    combine['Return_AAPL_'+k] = component_return

In [30]:
# Filtering the unchanged price instances
# SPY as the dependent variable
combine_valid = combine.dropna(how='any', axis=0)
combine_validnew = combine_valid[['Return_SPY','Return_AAPL_1','Return_AAPL_5','Return_AAPL_10',
                                  'Return_AAPL_30', 'Return_AAPL_60','Return_AAPL_90']]
df = combine_validnew
df = df[~(df == 0).any(axis = 1)]

y = df.Return_SPY.to_list()
x = df[['Return_AAPL_1','Return_AAPL_5','Return_AAPL_10', 'Return_AAPL_30', 'Return_AAPL_60','Return_AAPL_90']]
x = sm.add_constant(x)

model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
print_model = model.summary()
printmd('**SPY as the dependent variable - Microseconds interval with price filtering**')
print(print_model)

**SPY as the dependent variable - Microseconds interval with price filtering**

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.038
Model:                            OLS   Adj. R-squared:                  0.038
Method:                 Least Squares   F-statistic:                     175.6
Date:                Sun, 20 Dec 2020   Prob (F-statistic):          7.29e-184
Time:                        17:50:16   Log-Likelihood:             1.6755e+05
No. Observations:               22348   AIC:                        -3.351e+05
Df Residuals:                   22342   BIC:                        -3.350e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const          -1.752e-07   8.98e-07     -0.

In [32]:
# AAPL as the dependent variable
combine_valid = combine.dropna(how='any', axis=0)
combine_validnew = combine_valid[['Return_AAPL','Return_SPY_1','Return_SPY_5','Return_SPY_10',
                                  'Return_SPY_30', 'Return_SPY_60','Return_SPY_90']]
df = combine_validnew
df = df[~(df == 0).any(axis = 1)]

y = df.Return_AAPL.to_list()
x = df[['Return_SPY_1','Return_SPY_5','Return_SPY_10', 'Return_SPY_30', 'Return_SPY_60','Return_SPY_90']]
x = sm.add_constant(x)

model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
print_model = model.summary()
printmd('**AAPL as the dependent variable - Microseconds interval with price filtering**')
print(print_model)

**AAPL as the dependent variable - Microseconds interval with price filtering**

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.038
Model:                            OLS   Adj. R-squared:                  0.037
Method:                 Least Squares   F-statistic:                     164.1
Date:                Sun, 20 Dec 2020   Prob (F-statistic):          8.93e-172
Time:                        17:52:25   Log-Likelihood:             1.3916e+05
No. Observations:               21054   AIC:                        -2.783e+05
Df Residuals:                   21048   BIC:                        -2.783e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const          3.714e-06   2.25e-06      1.653