In [1]:
import pandas as pd
import numpy as np 
import warnings
import matplotlib.pyplot as plt
import statsmodels.api as sm
from datetime import datetime
from statsmodels.tsa.stattools import adfuller as adf
from statsmodels.graphics.gofplots import qqplot
from pandas.plotting import register_matplotlib_converters
from pandas.plotting import autocorrelation_plot
from pandas_datareader import data
from scipy import stats
from HAR_model import *
# Globally disable pandas' chained-assignment (SettingWithCopyWarning) check
# for the whole kernel session.
# NOTE(review): this hides warnings from every later cell and from imported
# modules too; prefer fixing the offending assignments and removing this line.
pd.options.mode.chained_assignment = None

In [2]:
# Load the 5-minute SPY bars and parse the 'time' column into datetimes.
df = pd.read_csv('data/SPY_data_5min.csv')
df['time'] = pd.to_datetime(df['time'])

# Keep only bars whose time-of-day lies within 09:30-16:00 (both inclusive).
session_open = pd.to_datetime('09:30').time()
session_close = pd.to_datetime('16:00').time()
time_of_day = df['time'].dt.time
in_session = (time_of_day >= session_open) & (time_of_day <= session_close)

# Build the model input as a new, independent frame instead of mutating a
# filtered slice of `df` in place (the pattern SettingWithCopyWarning flags):
#   * drop the columns that are not passed on to the model,
#   * replace 'time' with its 'HH:MM' string form (column position is kept).
# NOTE(review): `data` rebinds the name imported from pandas_datareader in the
# import cell; the name is kept because the next cell passes it as `raw_data`.
data = (
    df.loc[in_session]
    .drop(columns=['Unnamed: 0', 'money', 'open', 'high', 'low'])
    .assign(time=lambda frame: frame['time'].dt.strftime('%H:%M'))
)

In [3]:
# [start, end] timestamps handed to HARModel as the training and test periods.
train_window = [
    pd.to_datetime("20030910", format="%Y%m%d"),
    pd.to_datetime("20091231", format="%Y%m%d"),
]
test_window = [
    pd.to_datetime("20100101", format="%Y%m%d"),
    pd.to_datetime("20101231", format="%Y%m%d"),
]

# Configure the HAR model on the intraday frame prepared above.
# NOTE(review): lags [4, 20] presumably map to the weekly/monthly averages
# (RV_w / RV_m) shown later — confirm against HAR_model.
model1 = HARModel(
    raw_data=data,
    future=1,
    lags=[4, 20],
    feature="RV",
    semi_variance=False,
    jump_detect=True,
    log_transformation=False,
    period_train=train_window,
    period_test=test_window,
)

In [4]:
# First pipeline step on the model object.
# NOTE(review): presumably converts the raw intraday bars into the daily
# series used downstream — confirm in HAR_model. The method name is spelled
# `data_transfomation` in that module, so the call must keep this spelling.
model1.data_transfomation()

In [5]:
# Run the model's jump-detection step, then display the attribute it is
# expected to populate.
# NOTE(review): the recorded output of this cell is a single float; confirm
# whether that is the attribute's value or something printed by
# jump_detection().
model1.jump_detection()
model1.data_filltered_on_jump

2.939781887150308

In [6]:
# Compute the lagged feature table; the recorded output has columns
# RV, RV_t, RV_w and RV_m indexed by date.
# NOTE(review): this rebinds `data`, which previously held the raw intraday
# frame passed to HARModel. Re-running the constructor cell after this one
# would feed it the lagged table instead; consider a distinct name once all
# later uses of `data` are known.
data = model1.lag_average()
data

Unnamed: 0_level_0,RV,RV_t,RV_w,RV_m
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-01-02,0.000048,,,
2004-01-05,0.000023,0.000048,,
2004-01-06,0.000021,0.000023,,
2004-01-07,0.000029,0.000021,0.000030,
2004-01-08,0.000033,0.000029,0.000026,
...,...,...,...,...
2020-09-28,0.000059,0.000121,0.000132,0.000159
2020-09-29,0.000057,0.000059,0.000114,0.000164
2020-09-30,0.000137,0.000057,0.000094,0.000176
2020-10-01,0.000071,0.000137,0.000081,0.000181


In [7]:
# Build the modelling dataset inside the model object.
# NOTE(review): presumably attaches the Target column seen in
# `training_set` later — confirm in HAR_model.
model1.generate_dataset()

In [8]:
# Split the dataset into training and test parts.
# NOTE(review): presumably uses the period_train / period_test ranges given
# at construction — confirm in HAR_model.
model1.generate_training_test_split()

In [9]:
# Fit the model (presumably on the training split — confirm in HAR_model).
model1.estimate_model()

In [10]:
# Generate predictions from the fitted model; results are kept on the model
# object (nothing is displayed by this cell).
model1.predict_values()

In [11]:
# Compute the accuracy metrics that are inspected in the next cells via
# `test_accuracy` and `train_accuracy`.
model1.make_accurate_measures()

In [12]:
# Display the test-period metrics (recorded output: a dict with MSE, MAE and
# RSquared).
model1.test_accuracy

{'MSE': 1.1428715449826422e-09,
 'MAE': 2.2420725841545826e-05,
 'RSquared': 0.7265511646566034}

In [13]:
# Display the training-period metrics for comparison with the test metrics
# (same MSE / MAE / RSquared keys).
model1.train_accuracy

{'MSE': 9.59079025551135e-09,
 'MAE': 3.0666862740908845e-05,
 'RSquared': 0.8199760830519365}

In [14]:
# Show which feature the model was configured with ("RV" was passed at
# construction).
model1.feature

'RV'

In [15]:
# Inspect the training frame (columns RV, RV_t, RV_w, RV_m, Target).
# NOTE(review): in the recorded output Target equals RV in every visible row;
# confirm that RV itself is not used as a regressor, otherwise the target
# leaks into the features.
model1.training_set

Unnamed: 0,RV,RV_t,RV_w,RV_m,Target
0,0.000049,0.000086,0.000065,0.000035,0.000049
1,0.000088,0.000049,0.000080,0.000034,0.000088
2,0.000036,0.000088,0.000065,0.000039,0.000036
3,0.000038,0.000036,0.000053,0.000043,0.000038
4,0.000043,0.000038,0.000051,0.000044,0.000043
...,...,...,...,...,...
1429,0.000010,0.000015,0.000021,0.000050,0.000010
1430,0.000010,0.000010,0.000015,0.000047,0.000010
1431,0.000010,0.000010,0.000011,0.000045,0.000010
1432,0.000016,0.000010,0.000011,0.000043,0.000016
