In [39]:
import pandas as pd
import numpy as np 
import warnings
import matplotlib.pyplot as plt
import statsmodels.api as sm
from datetime import datetime
from statsmodels.tsa.stattools import adfuller as adf
from statsmodels.graphics.gofplots import qqplot
from pandas.plotting import register_matplotlib_converters
from pandas.plotting import autocorrelation_plot
from pandas_datareader import data
from scipy import stats
from HAR_model import *
pd.options.mode.chained_assignment = None

In [40]:
# Load the 5-minute SPY bars (path is relative to the working directory).
df = pd.read_csv('data/SPY_data_5min.csv')

# Parse 'time' so the .dt accessor can be used for the session filter below.
df['time'] = pd.to_datetime(df['time'])

# Keep only bars stamped between 09:30 and 16:00, both bounds inclusive.
session_open = pd.to_datetime('09:30').time()
session_close = pd.to_datetime('16:00').time()
bar_clock = df['time'].dt.time  # computed once instead of once per comparison
in_session = (bar_clock >= session_open) & (bar_clock <= session_close)

# Build the model input as a fresh frame rather than mutating a filtered slice
# in place: the original relied on the globally silenced SettingWithCopyWarning.
# The result is the same frame: unused columns dropped, 'time' reduced to 'HH:MM'.
# NOTE: `data` shadows `from pandas_datareader import data` from the import cell;
# the name is kept because the HARModel cell reads it.
data = (
    df.loc[in_session]
    .drop(columns=['Unnamed: 0', 'money', 'open', 'high', 'low'])
    .assign(time=lambda bars: bars['time'].dt.strftime('%H:%M'))
)

In [41]:
# Configure the HAR model on the "RV" feature with lags [4, 20], a `future`
# setting of 20, jump detection enabled, and no semi-variance or log transform.
# Each period is a two-element [start, end] list of Timestamps; the training
# window ends 2009-12-31 and the test window starts 2011-01-01, so calendar
# year 2010 falls in neither.
model1 = HARModel(
    raw_data=data,
    future=20,
    lags=[4, 20],
    feature="RV",
    semi_variance=False,
    jump_detect=True,
    log_transformation=False,
    period_train=[
        pd.to_datetime(day, format="%Y%m%d") for day in ("20030910", "20091231")
    ],
    period_test=[
        pd.to_datetime(day, format="%Y%m%d") for day in ("20110101", "20111231")
    ],
)

In [42]:
# Invoke the model's data_transformation() step (implemented outside this notebook).
# NOTE(review): presumably this derives the daily frame exposed as model1.data
# from the raw bars — confirm in HAR_model.
model1.data_transformation()

In [43]:
# Run the model's jump_detection() step, then inspect both frames.
model1.jump_detection()
# Only the final expression of a cell is echoed, so the jump-filtered frame
# needs an explicit display(); as a bare statement its value was discarded.
# `display` is the builtin provided by the IPython/Jupyter kernel.
# The attribute name is spelled exactly as in the original cell.
display(model1.data_filltered_on_jump)
model1.data

Unnamed: 0_level_0,RV,RVp,RVn,BV
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-01-02,0.000048,0.000022,0.000026,0.000036
2004-01-05,0.000023,0.000014,0.000009,0.000026
2004-01-06,0.000021,0.000011,0.000009,0.000018
2004-01-07,0.000029,0.000017,0.000012,0.000025
2004-01-08,0.000033,0.000015,0.000018,0.000034
...,...,...,...,...
2020-09-28,0.000059,0.000029,0.000030,0.000061
2020-09-29,0.000057,0.000023,0.000034,0.000052
2020-09-30,0.000137,0.000065,0.000072,0.000114
2020-10-01,0.000071,0.000035,0.000036,0.000061


In [44]:
# Store the lag-averaged frame under its own name. Rebinding `data` here would
# overwrite the raw 5-minute bars that the HARModel(...) cell passes as
# raw_data, so re-running that cell after this one would feed it the wrong frame.
lag_features = model1.lag_average()
lag_features

Unnamed: 0,date,RV_t,RV_w,RV_m
0,2004-02-02,0.000088,0.000065,0.000035
1,2004-02-03,0.000036,0.000080,0.000034
2,2004-02-04,0.000038,0.000065,0.000039
3,2004-02-05,0.000043,0.000053,0.000043
4,2004-02-06,0.000044,0.000051,0.000044
...,...,...,...,...
4069,2020-09-28,0.000059,0.000139,0.000155
4070,2020-09-29,0.000057,0.000132,0.000159
4071,2020-09-30,0.000137,0.000114,0.000164
4072,2020-10-01,0.000071,0.000094,0.000176


In [45]:
# Inspect output_dataset before generate_dataset() is called in a later cell.
# Nothing is echoed here, and the print() in the next cell shows the value is
# None at this point.
model1.output_dataset

In [46]:
# Plain-text dump of the daily measures from positional row 20 onward,
# followed by the current value of output_dataset.
print(model1.data.iloc[20:])
print(model1.output_dataset)

                  RV       RVp       RVn        BV
date                                              
2004-02-02  0.000088  0.000039  0.000049  0.000077
2004-02-03  0.000036  0.000017  0.000020  0.000035
2004-02-04  0.000038  0.000020  0.000017  0.000030
2004-02-05  0.000043  0.000022  0.000021  0.000042
2004-02-06  0.000044  0.000035  0.000009  0.000033
...              ...       ...       ...       ...
2020-09-28  0.000059  0.000029  0.000030  0.000061
2020-09-29  0.000057  0.000023  0.000034  0.000052
2020-09-30  0.000137  0.000065  0.000072  0.000114
2020-10-01  0.000071  0.000035  0.000036  0.000061
2020-10-02  0.000164  0.000100  0.000063  0.000176

[4074 rows x 4 columns]
None


In [47]:
# Call generate_dataset().
# NOTE(review): presumably this populates model1.output_dataset, which printed
# as None before this call — confirm in HAR_model.
model1.generate_dataset()

In [48]:
# Call generate_training_test_split().
# NOTE(review): presumably this fills model1.training_set (and a matching test
# set) from the period_train / period_test windows given at construction — confirm.
model1.generate_training_test_split()

In [49]:
# Show the training set; the echoed frame has columns RV_t, RV_w, RV_m and Target.
model1.training_set

Unnamed: 0,RV_t,RV_w,RV_m,Target
0,0.000088,0.000065,0.000035,0.000035
1,0.000036,0.000080,0.000034,0.000035
2,0.000038,0.000065,0.000039,0.000034
3,0.000043,0.000053,0.000043,0.000036
4,0.000044,0.000051,0.000044,0.000035
...,...,...,...,...
1428,0.000010,0.000026,0.000049,0.000042
1429,0.000010,0.000021,0.000050,0.000047
1430,0.000010,0.000015,0.000047,0.000055
1431,0.000016,0.000011,0.000045,0.000063


In [50]:
# Call estimate_model().
# NOTE(review): presumably fits the regression on training_set — confirm in HAR_model.
model1.estimate_model()
# Re-display training_set after the call; the echoed rows match those shown
# before it.
model1.training_set

Unnamed: 0,RV_t,RV_w,RV_m,Target
0,0.000088,0.000065,0.000035,0.000035
1,0.000036,0.000080,0.000034,0.000035
2,0.000038,0.000065,0.000039,0.000034
3,0.000043,0.000053,0.000043,0.000036
4,0.000044,0.000051,0.000044,0.000035
...,...,...,...,...
1428,0.000010,0.000026,0.000049,0.000042
1429,0.000010,0.000021,0.000050,0.000047
1430,0.000010,0.000015,0.000047,0.000055
1431,0.000016,0.000011,0.000045,0.000063


In [51]:
# Call predict_values().
# NOTE(review): presumably stores the model's predictions on model1 — confirm
# where they are kept, since nothing is returned or displayed here.
model1.predict_values()
# Re-display training_set after the call; the echoed rows are again the same
# as in the earlier cells.
model1.training_set

Unnamed: 0,RV_t,RV_w,RV_m,Target
0,0.000088,0.000065,0.000035,0.000035
1,0.000036,0.000080,0.000034,0.000035
2,0.000038,0.000065,0.000039,0.000034
3,0.000043,0.000053,0.000043,0.000036
4,0.000044,0.000051,0.000044,0.000035
...,...,...,...,...
1428,0.000010,0.000026,0.000049,0.000042
1429,0.000010,0.000021,0.000050,0.000047
1430,0.000010,0.000015,0.000047,0.000055
1431,0.000016,0.000011,0.000045,0.000063


In [52]:
# Call make_accurate_measures().
# NOTE(review): presumably computes the test_accuracy / train_accuracy dicts
# inspected in the following cells — confirm in HAR_model.
model1.make_accurate_measures()

In [53]:
# Show test_accuracy: the echoed value is a dict with 'MSE', 'MAE' and
# 'RSquared' keys.
model1.test_accuracy

{'MSE': 2.3741967329022253e-09,
 'MAE': 3.3263273879849765e-05,
 'RSquared': 0.4884595917007555}

In [54]:
# Show train_accuracy: the echoed value is a dict with 'MSE', 'MAE' and
# 'RSquared' keys. In the recorded output the training MSE (~1.29e-08) is
# larger than the test MSE (~2.37e-09), while the training RSquared (~0.64) is
# higher than the test RSquared (~0.49).
model1.train_accuracy

{'MSE': 1.2885165808009794e-08,
 'MAE': 4.704553539503511e-05,
 'RSquared': 0.6449163036750358}