In [20]:
import pandas as pd
import numpy as np 
import warnings
import matplotlib.pyplot as plt
import statsmodels.api as sm
from datetime import datetime
from statsmodels.tsa.stattools import adfuller as adf
from statsmodels.graphics.gofplots import qqplot
from pandas.plotting import register_matplotlib_converters
from pandas.plotting import autocorrelation_plot
from pandas_datareader import data
from scipy import stats

In [21]:
%matplotlib inline
register_matplotlib_converters()
warnings.filterwarnings('ignore')

In [22]:
df = pd.read_csv('data/SPY_data_5min.csv')

# Convert 'time' column to datetime format
df['time'] = pd.to_datetime(df['time'])

# Filter trades within the time window from 09:30 to 16:00
data = df[(df['time'].dt.time >= pd.to_datetime('09:30').time()) & 
                 (df['time'].dt.time <= pd.to_datetime('16:00').time())]

data.drop(['Unnamed: 0','money','open','high','low'], axis=1, inplace=True)
data['time'] = pd.to_datetime(data['time']).dt.strftime('%H:%M')


In [23]:
'''
Calculation of  Barndorff-Nielsen (2010) estimators that capture the variation only due to negative or 
positive returns using the realized semivariance estimators
'''
'''
Ovo je deo gde racunamo returns na osnovu Andersen (2001) koji kaze da  RV konvergira prema ri = pi-p(i-1) --> [p,p]
'''

data['returns'] = np.log(data['close']) - np.log(data['close'].shift(1))
data.loc[data['date'] != data['date'].shift(1), 'returns'] = None
data['returns**2'] = data['returns']**2
rv = data.groupby('date')['returns**2'].sum().rename('RV')
data = data.join(rv,on='date')
data['sum_for_BV'] = data['returns'].abs() * data['returns'].abs().shift(1)
coef =  np.sqrt(2/np.pi) ** (-2)
bv = coef * data.groupby('date')['sum_for_BV'].sum().rename('BV')
data['positive_returns'] = (data['returns**2']) * (data['returns'] > 0)
data['negative_returns'] = (data['returns**2'] * (data['returns'] < 0))
rs_p = data.groupby('date')['positive_returns'].sum().rename('RV+')
rs_n = data.groupby('date')['negative_returns'].sum().rename('RV-')
data = data.join(rs_p, on= 'date')
data = data.join(rs_n, on = 'date')
data = data.join(bv,on= 'date')
data.head()





Unnamed: 0,date,time,close,volume,returns,returns**2,RV,sum_for_BV,positive_returns,negative_returns,RV+,RV-,BV
17,2004-01-02,09:30,111.75,33000,,,4.8e-05,,,,2.2e-05,2.6e-05,3.6e-05
18,2004-01-02,09:35,111.86,3337800,0.000984,9.679724e-07,4.8e-05,,9.679724e-07,0.0,2.2e-05,2.6e-05,3.6e-05
19,2004-01-02,09:40,111.9,1950700,0.000358,1.278248e-07,4.8e-05,3.51754e-07,1.278248e-07,0.0,2.2e-05,2.6e-05,3.6e-05
20,2004-01-02,09:45,111.83,680700,-0.000626,3.915684e-07,4.8e-05,2.237234e-07,0.0,3.915684e-07,2.2e-05,2.6e-05,3.6e-05
21,2004-01-02,09:50,111.82,517000,-8.9e-05,7.99691e-09,4.8e-05,5.595835e-08,0.0,7.99691e-09,2.2e-05,2.6e-05,3.6e-05


In [24]:
rv = pd.DataFrame(rv)
rv =rv.join(rs_p,on ='date')
rv = rv.join(rs_n,on = 'date')
rv = rv.join(bv,on='date')

podaci = rv
podaci.head()

Unnamed: 0_level_0,RV,RV+,RV-,BV
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-01-02,4.8e-05,2.2e-05,2.6e-05,3.6e-05
2004-01-05,2.3e-05,1.4e-05,9e-06,2.6e-05
2004-01-06,2.1e-05,1.1e-05,9e-06,1.8e-05
2004-01-07,2.9e-05,1.7e-05,1.2e-05,2.5e-05
2004-01-08,3.3e-05,1.5e-05,1.8e-05,3.4e-05


In [25]:
data[:80]

Unnamed: 0,date,time,close,volume,returns,returns**2,RV,sum_for_BV,positive_returns,negative_returns,RV+,RV-,BV
17,2004-01-02,09:30,111.75,33000,,,0.000048,,,,0.000022,0.000026,0.000036
18,2004-01-02,09:35,111.86,3337800,0.000984,9.679724e-07,0.000048,,9.679724e-07,0.000000e+00,0.000022,0.000026,0.000036
19,2004-01-02,09:40,111.90,1950700,0.000358,1.278248e-07,0.000048,3.517540e-07,1.278248e-07,0.000000e+00,0.000022,0.000026,0.000036
20,2004-01-02,09:45,111.83,680700,-0.000626,3.915684e-07,0.000048,2.237234e-07,0.000000e+00,3.915684e-07,0.000022,0.000026,0.000036
21,2004-01-02,09:50,111.82,517000,-0.000089,7.996910e-09,0.000048,5.595835e-08,0.000000e+00,7.996910e-09,0.000022,0.000026,0.000036
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,2004-01-02,15:45,111.02,302000,0.000000,0.000000e+00,0.000048,0.000000e+00,0.000000e+00,0.000000e+00,0.000022,0.000026,0.000036
93,2004-01-02,15:50,111.04,153400,0.000180,3.244736e-08,0.000048,0.000000e+00,3.244736e-08,0.000000e+00,0.000022,0.000026,0.000036
94,2004-01-02,15:55,110.91,282100,-0.001171,1.372260e-06,0.000048,2.110124e-07,0.000000e+00,1.372260e-06,0.000022,0.000026,0.000036
95,2004-01-02,16:00,111.00,597700,0.000811,6.579476e-07,0.000048,9.501975e-07,6.579476e-07,0.000000e+00,0.000022,0.000026,0.000036


In [26]:
data.describe()

Unnamed: 0,close,volume,returns,returns**2,RV,sum_for_BV,positive_returns,negative_returns,RV+,RV-,BV
count,332058.0,332058.0,327840.0,327840.0,332058.0,323622.0,327840.0,327840.0,332058.0,332058.0,332058.0
mean,174.68265,1479655.0,1e-06,1.12025e-06,8.721074e-05,6.829775e-07,5.677877e-07,5.524621e-07,4.419706e-05,4.301368e-05,8.2e-05
std,65.81581,1678869.0,0.001058,7.225206e-06,0.0002442418,4.148786e-06,5.704628e-06,4.50424e-06,0.0001367557,0.0001141687,0.000236
min,67.17,0.0,-0.029285,0.0,9.972536e-07,0.0,0.0,0.0,5.357478e-07,2.133019e-07,1e-06
25%,122.55,485434.5,-0.000388,2.863691e-08,1.841439e-05,3.311868e-08,0.0,0.0,9.031428e-06,8.222706e-06,1.7e-05
50%,146.12,934679.5,0.0,1.532048e-07,3.267161e-05,1.255583e-07,0.0,0.0,1.624641e-05,1.564398e-05,3e-05
75%,213.44,1831441.0,0.000395,6.22777e-07,7.014798e-05,4.236448e-07,1.557557e-07,1.503129e-07,3.351727e-05,3.631619e-05,6.5e-05
max,358.7,40322800.0,0.037564,0.001411067,0.005996921,0.0008863521,0.001411067,0.0008575841,0.004014671,0.00198225,0.005158


In [None]:
plt.figure(figsize=(12,6))
plt.plot(podaci['RV'],color = 'blue')
plt.grid(True)
plt.show()

In [None]:
## Corsi fusari and La Vecchia 2013

rolling_std = podaci['RV'].rolling(window = 200).sum()
threshold = 4 * rolling_std

data_fitered = podaci[podaci['RV'] <= threshold]
print(podaci.shape)
print(data_fitered.shape)

print((podaci.shape[0]- data_fitered.shape[0]) / podaci.shape[0] * 100)

(4218, 4)
(4019, 4)
4.71787577050735


In [None]:
data.head()

Unnamed: 0,date,time,close,volume,returns,returns**2,RV,sum_for_BV,positive_returns,negative_returns,RV+,RV-,BV
17,2004-01-02,09:30,111.75,33000,,,4.8e-05,,,,2.2e-05,2.6e-05,3.6e-05
18,2004-01-02,09:35,111.86,3337800,0.000984,9.679724e-07,4.8e-05,,9.679724e-07,0.0,2.2e-05,2.6e-05,3.6e-05
19,2004-01-02,09:40,111.9,1950700,0.000358,1.278248e-07,4.8e-05,3.51754e-07,1.278248e-07,0.0,2.2e-05,2.6e-05,3.6e-05
20,2004-01-02,09:45,111.83,680700,-0.000626,3.915684e-07,4.8e-05,2.237234e-07,0.0,3.915684e-07,2.2e-05,2.6e-05,3.6e-05
21,2004-01-02,09:50,111.82,517000,-8.9e-05,7.99691e-09,4.8e-05,5.595835e-08,0.0,7.99691e-09,2.2e-05,2.6e-05,3.6e-05


In [None]:
podaci.head()

Unnamed: 0_level_0,RV,RV+,RV-,BV
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-01-02,4.8e-05,2.2e-05,2.6e-05,3.6e-05
2004-01-05,2.3e-05,1.4e-05,9e-06,2.6e-05
2004-01-06,2.1e-05,1.1e-05,9e-06,1.8e-05
2004-01-07,2.9e-05,1.7e-05,1.2e-05,2.5e-05
2004-01-08,3.3e-05,1.5e-05,1.8e-05,3.4e-05


In [None]:
podaci.tail()

Unnamed: 0_level_0,RV,RV+,RV-,BV
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-09-28,5.9e-05,2.9e-05,3e-05,6.1e-05
2020-09-29,5.7e-05,2.3e-05,3.4e-05,5.2e-05
2020-09-30,0.000137,6.5e-05,7.2e-05,0.000114
2020-10-01,7.1e-05,3.5e-05,3.6e-05,6.1e-05
2020-10-02,0.000164,0.0001,6.3e-05,0.000176


In [None]:
podaci.index

Index(['2004-01-02', '2004-01-05', '2004-01-06', '2004-01-07', '2004-01-08',
       '2004-01-09', '2004-01-12', '2004-01-13', '2004-01-14', '2004-01-15',
       ...
       '2020-09-21', '2020-09-22', '2020-09-23', '2020-09-24', '2020-09-25',
       '2020-09-28', '2020-09-29', '2020-09-30', '2020-10-01', '2020-10-02'],
      dtype='object', name='date', length=4218)

In [None]:
# training /test/validation  split


training_start = '2004-01-02'
training_end = '2017-12-31'
validation_start = '2018-01-01'
validation_end = '2018-12-31'
test_start = '2019-01-01'
test_end = '2020-10-02'

training_data = podaci.loc[podaci.index <= training_end]
validation_data = podaci.loc[(podaci.index >= validation_start) & (podaci.index <= validation_end)]
test_data =  podaci.loc[podaci.index >= test_start]

print(training_data.shape)
print(validation_data.shape)
print(test_data.shape)

(3524, 4)
(251, 4)
(443, 4)


In [None]:
test_data.head()

Unnamed: 0_level_0,RV,RV+,RV-,BV
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-02,0.000164,0.000106,5.9e-05,0.000142
2019-01-03,0.000261,0.000113,0.000147,0.00023
2019-01-04,0.000164,0.000115,5e-05,0.000163
2019-01-07,7.9e-05,4.7e-05,3.2e-05,7.5e-05
2019-01-08,9.4e-05,3.5e-05,5.9e-05,9.8e-05


In [None]:

import statsmodels.api as sm

In [1]:
from HAR_model import *