# Realized Volatility Prediction
## 3 (pt.2) Training Data Development

### Table of Contents
5. Baseline Model
6. Train/Test Split
7. Inspect Features

In [1]:
# Standard imports and libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the interim feature table from disk.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like
# a previously saved index -- confirm whether it should be read with index_col=0.
features1 = pd.read_csv(filepath_or_buffer='../data/interim/features1.csv')
# Preview the first five rows.
features1.head(n=5)

Unnamed: 0,stock_id,time_id,target_value,real_vol_trade,real_vol_1,real_vol_2,mean_bid_price1,mean_ask_price1,mean_bid_price2,mean_ask_price2,...,ld_std_bid_size1,ld_std_ask_size1,ld_std_bid_size2,ld_std_ask_size2,ld_std_bid_ask_1,ld_std_bid_ask_2,ld_std_wap_1,ld_std_wap_2,ld_std_volume_overall,ld_std_volume_difference
0,0,5,0.004136,0.002006,0.004499,0.006999,1.003314,1.004169,1.003139,1.00432,...,1.464797,1.718864,1.852007,1.442717,0.172775,0.162195,0.00026,0.000404,0.43244,1.257587
1,0,11,0.001445,0.000901,0.001204,0.002476,1.000011,1.000406,0.99987,1.000541,...,1.246478,1.120756,1.004334,1.128109,0.189273,0.141589,8.6e-05,0.000176,0.366436,0.989984
2,0,16,0.002168,0.001961,0.002369,0.004801,0.999204,0.999929,0.999007,1.000127,...,1.074147,0.823276,1.101306,1.444008,0.121048,0.182168,0.000173,0.000352,0.310853,0.926486
3,0,31,0.002195,0.001561,0.002574,0.003637,0.998445,0.999304,0.998255,0.999413,...,1.171561,0.899833,1.104095,0.984974,0.140659,0.154967,0.000236,0.000334,0.319151,1.783597
4,0,62,0.001747,0.000871,0.001894,0.003257,0.999407,0.999804,0.999216,0.999913,...,1.802104,1.316963,1.53147,1.548352,0.264692,0.150934,0.000144,0.000247,0.499696,


## 3.5 Baseline Model

The baseline model does not involve any machine learning. It assumes that volatility is autocorrelated: the model predicts that the realized volatility of the next 10-minute window equals the realized volatility of the given 10-minute window. Our performance metric is the root mean square percentage error (RMSPE), $\mathrm{RMSPE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\frac{y_i - \hat{y}_i}{y_i}\right)^2}$, which we define and calculate below.

In [3]:
def RMSPE(y_true, y_pred):
    """Return the root mean square percentage error of a set of predictions.

    Computes ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.

    Parameters
    ----------
    y_true : array-like
        Observed target values. Used as the denominator of the relative
        error; zeros are not guarded against.
    y_pred : array-like
        Predicted values, element-wise comparable with ``y_true``.

    Returns
    -------
    float
        The RMSPE as a fraction (e.g. 0.1 means 10%).

    Notes
    -----
    Plain Python lists and tuples are converted to float arrays so that
    element-wise arithmetic works on them. NumPy arrays and pandas objects
    are passed through unchanged, so their existing behaviour is preserved.
    """
    # list/tuple operands do not support element-wise subtraction or division,
    # so promote them to float arrays; every other input type is left as-is.
    if isinstance(y_true, (list, tuple)):
        y_true = np.asarray(y_true, dtype=float)
    if isinstance(y_pred, (list, tuple)):
        y_pred = np.asarray(y_pred, dtype=float)

    relative_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))

In [4]:
# Baseline score: the `real_vol_1` column is used directly as the prediction
# for `target_value`, and the pair is scored with RMSPE.
y_true, y_pred = features1['target_value'], features1['real_vol_1']
print(RMSPE(y_true=y_true, y_pred=y_pred))

0.3413544901880096


## 3.6 Train/Test Split

In [5]:
# Hold out a random 20% of the rows as the test set; the fixed random_state
# makes the draw repeatable across runs.
test = features1.sample(random_state=57, frac=0.2)
# Preview the first five sampled rows.
test.head(n=5)

Unnamed: 0,stock_id,time_id,target_value,real_vol_trade,real_vol_1,real_vol_2,mean_bid_price1,mean_ask_price1,mean_bid_price2,mean_ask_price2,...,ld_std_bid_size1,ld_std_ask_size1,ld_std_bid_size2,ld_std_ask_size2,ld_std_bid_ask_1,ld_std_bid_ask_2,ld_std_wap_1,ld_std_wap_2,ld_std_volume_overall,ld_std_volume_difference
386444,113,29515,0.005317,0.004254,0.006474,0.008231,1.002776,1.003482,1.00267,1.003561,...,1.567101,1.52862,1.819983,1.726748,0.236483,0.209522,0.000304,0.000387,0.983408,
353498,104,9885,0.006053,0.003215,0.005559,0.007867,0.998749,0.999655,0.998499,0.999824,...,1.678065,1.147619,1.310169,0.773019,0.251995,0.220244,0.000278,0.000393,0.308519,
239617,70,17971,0.001829,0.001547,0.001652,0.002224,1.000342,1.000697,1.000195,1.000845,...,0.812572,1.061667,0.729032,0.975573,0.201137,0.11458,0.000103,0.000139,0.21235,0.926712
77592,21,8256,0.003989,0.006974,0.007669,0.008836,1.010516,1.011096,1.010174,1.011439,...,0.808932,1.027513,0.557968,0.645378,0.376694,0.169418,0.000343,0.000396,0.339714,
237993,70,4543,0.000931,0.000887,0.001203,0.001487,1.00087,1.001045,1.000706,1.001208,...,0.964638,0.930109,0.269083,0.356964,0.172748,0.0717,6.7e-05,8.3e-05,0.161347,


In [6]:
# The training set is every row whose index label was not drawn into `test`.
train = features1.drop(index=test.index)
# Preview the first five remaining rows.
train.head(n=5)

Unnamed: 0,stock_id,time_id,target_value,real_vol_trade,real_vol_1,real_vol_2,mean_bid_price1,mean_ask_price1,mean_bid_price2,mean_ask_price2,...,ld_std_bid_size1,ld_std_ask_size1,ld_std_bid_size2,ld_std_ask_size2,ld_std_bid_ask_1,ld_std_bid_ask_2,ld_std_wap_1,ld_std_wap_2,ld_std_volume_overall,ld_std_volume_difference
1,0,11,0.001445,0.000901,0.001204,0.002476,1.000011,1.000406,0.99987,1.000541,...,1.246478,1.120756,1.004334,1.128109,0.189273,0.141589,8.6e-05,0.000176,0.366436,0.989984
2,0,16,0.002168,0.001961,0.002369,0.004801,0.999204,0.999929,0.999007,1.000127,...,1.074147,0.823276,1.101306,1.444008,0.121048,0.182168,0.000173,0.000352,0.310853,0.926486
3,0,31,0.002195,0.001561,0.002574,0.003637,0.998445,0.999304,0.998255,0.999413,...,1.171561,0.899833,1.104095,0.984974,0.140659,0.154967,0.000236,0.000334,0.319151,1.783597
7,0,103,0.00412,0.002102,0.005331,0.006557,0.999436,1.000475,0.999303,1.000721,...,1.74175,1.347152,1.48164,1.108572,0.211328,0.126178,0.000311,0.000383,0.461899,
8,0,109,0.002182,0.001266,0.001797,0.003536,1.00126,1.001706,1.001137,1.00186,...,1.464672,1.457715,1.862705,1.675391,0.247753,0.221657,0.000117,0.000231,0.37828,


In [7]:
# Persist both splits in the interim data directory.
# NOTE(review): the target names carry no '.csv' extension and the index is
# written out as an extra column -- confirm downstream readers expect this
# before changing either.
test.to_csv(path_or_buf='../data/interim/features1_test')
train.to_csv(path_or_buf='../data/interim/features1_train')

## 3.7 Inspect Features

In [8]:
# Optional profiling step, currently disabled (commented out).
# Uncomment to import ProfileReport from the ydata_profiling package.
#from ydata_profiling import ProfileReport

In [9]:
# Disabled: would build a ProfileReport over the `train` frame titled
# "Profiling Report". Needs the ProfileReport import to be enabled first.
#profile = ProfileReport(train, title="Profiling Report")

In [10]:
# Disabled: would write the profiling report to an HTML file under ../reports/.
# Needs a `profile` object to exist before it can be enabled.
#profile.to_file('../reports/features1_report.html')