# Optiver Realized Volatility Prediction for 1 stock

This competition's main objective is to predict short-term volatility for 112 stocks. In this file, I'll be attempting to do so for one stock, and then replicate that process for the rest.

In [112]:
# Core libraries: pandas/numpy for data handling, glob for listing the per-stock files
import pandas as pd
import numpy as np
import glob

In [113]:
# Load the order-book snapshots of one stock and keep a single time bucket.
stock_id = 0
book_example = pd.read_parquet(f'input/book_train.parquet/stock_id={stock_id}')
# .copy() turns the filtered slice into an independent frame, so the later cells
# that add the 'wap' and 'log_return' columns do not raise SettingWithCopyWarning.
book_example = book_example[book_example['time_id'] == 5].copy()
book_example.head()

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2
0,5,0,1.001422,1.002301,1.00137,1.002353,3,226,2,100
1,5,1,1.001422,1.002301,1.00137,1.002353,3,100,2,100
2,5,5,1.001422,1.002301,1.00137,1.002405,3,100,2,100
3,5,6,1.001422,1.002301,1.00137,1.002405,3,126,2,100
4,5,7,1.001422,1.002301,1.00137,1.002405,3,126,2,100


In [114]:
# Load the trades of stock 0 and keep only the rows belonging to time bucket 5.
trade_example = (
    pd.read_parquet('input/trade_train.parquet/stock_id=0')
    .loc[lambda trades: trades['time_id'] == 5]
)
trade_example.head()

Unnamed: 0,time_id,seconds_in_bucket,price,size,order_count
0,5,21,1.002301,326,12
1,5,46,1.002778,128,4
2,5,50,1.002818,55,1
3,5,57,1.003155,121,5
4,5,68,1.003646,4,1


In [115]:
def calculate_wap(bid_price1, ask_price1, bid_size1, ask_size1):
    """Return the weighted average price (WAP) of the top book level.

    Each price is weighted by the size quoted on the *opposite* side, and the
    sum is divided by the combined size of both sides. Works element-wise on
    anything that supports ``*``, ``+`` and ``/`` (scalars, arrays, Series).
    """
    cross_weighted_prices = bid_price1 * ask_size1 + ask_price1 * bid_size1
    combined_size = bid_size1 + ask_size1
    return cross_weighted_prices / combined_size

In [116]:
# Reuse calculate_wap (defined in the previous cell) instead of repeating the
# formula inline, so the example and the per-stock pipeline share one definition.
book_example['wap'] = calculate_wap(
    book_example['bid_price1'],
    book_example['ask_price1'],
    book_example['bid_size1'],
    book_example['ask_size1'],
)
book_example.head()

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,wap
0,5,0,1.001422,1.002301,1.00137,1.002353,3,226,2,100,1.001434
1,5,1,1.001422,1.002301,1.00137,1.002353,3,100,2,100,1.001448
2,5,5,1.001422,1.002301,1.00137,1.002405,3,100,2,100,1.001448
3,5,6,1.001422,1.002301,1.00137,1.002405,3,126,2,100,1.001443
4,5,7,1.001422,1.002301,1.00137,1.002405,3,126,2,100,1.001443


In [117]:
import plotly.express as px

In [118]:
# Line chart of the WAP over the seconds of the example bucket.
fig = px.line(
    book_example,
    x="seconds_in_bucket",
    y="wap",
    title='WAP of stock_id_0, time_id_5',
)
fig.show()

In [119]:
def log_returns(list_stock_prices):
    """Return the log returns between consecutive prices.

    The natural log of every price is taken first; ``.diff()`` then subtracts
    each entry's predecessor (the previous row by default), so the first entry
    of the result is NaN. The input must yield an object with a ``.diff()``
    method after ``np.log`` (e.g. a pandas Series).
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

In [120]:
# Preview the log returns of the example WAP series; the first value is NaN
# because .diff() has no earlier row to compare the first price with.
log_returns(book_example['wap'])

0           NaN
1      0.000014
2      0.000000
3     -0.000005
4      0.000000
         ...   
297    0.000245
298    0.000000
299    0.000000
300    0.000000
301    0.000000
Name: wap, Length: 302, dtype: float64

In [121]:
# Store the log returns next to the prices, then keep only the rows that
# actually have one (the leading NaN produced by .diff() is dropped).
book_example.loc[:, 'log_return'] = log_returns(book_example['wap'])
book_example = book_example[book_example['log_return'].notnull()]

In [122]:
# Line chart of the log returns over the seconds of the example bucket.
fig = px.line(
    book_example,
    x="seconds_in_bucket",
    y="log_return",
    title='Log return of stock_id_0, time_id_5',
)
fig.show()

In [123]:
def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    return np.sqrt(np.sum(squared_returns))
# Realized volatility of the example bucket, computed from the 'log_return'
# column that an earlier cell added to book_example.
realized_vol = realized_volatility(book_example['log_return'])
print(f"Realized volatility for stock_id 0 on time_id 5 is {realized_vol}")

Realized volatility for stock_id 0 on time_id 5 is 0.004499364172786558


# Naive Prediction
We know that volatility tends to be autocorrelated. Using this property, we can implement a naive model that predicts the realized volatility by using whatever the realized volatility was in the initial 10 minutes.

Let's calculate the past realized volatility across the training set to see how predictive a single naive signal can be.

In [124]:
# Collect one path per stock partition of the training order book.
# NOTE: the list is used in glob's returned order by the next cells and is only
# sorted by stock_id in a later cell, so index 0 is not necessarily stock 0.
list_order_book_file_train = glob.glob('input/book_train.parquet/*')
len(list_order_book_file_train)

112

The output above confirms that there are 112 stocks whose short-term volatility we want to predict. The data for this competition is partitioned by stock_id so that Kagglers can manage memory more easily. We will therefore calculate realized volatility stock by stock and then combine the results into one submission file.

In [125]:
def realized_volatility_per_time_id(file_path, prediction_column_name):
    """Compute the realized volatility of every time_id in one stock's book file.

    Parameters
    ----------
    file_path : str
        Path of the stock's parquet partition. The stock id is the text after
        the last '=' in the path (e.g. ``.../stock_id=0`` gives ``'0'``).
    prediction_column_name : str
        Name given to the realized-volatility column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``row_id`` (formatted ``"<stock_id>-<time_id>"``) and
        ``prediction_column_name``; one row per time_id that has at least one
        non-null log return.

    Raises
    ------
    IndexError
        If ``file_path`` contains no '='.
    """
    df_book_data = pd.read_parquet(file_path)
    df_book_data['wap'] = calculate_wap(
        df_book_data['bid_price1'],
        df_book_data['ask_price1'],
        df_book_data['bid_size1'],
        df_book_data['ask_size1'],
    )
    # Log returns are taken within each time_id so they never span two buckets.
    df_book_data['log_return'] = df_book_data.groupby(['time_id'])['wap'].transform(log_returns)
    # The first row of every time_id has no previous price, hence a NaN return.
    df_book_data = df_book_data[~df_book_data['log_return'].isnull()]
    df_realized_vol_per_stock = (
        df_book_data.groupby(['time_id'])['log_return']
        .agg(realized_volatility)
        .reset_index()
        .rename(columns={'log_return': prediction_column_name})
    )
    # Split on the last '=' so an '=' in a parent directory cannot be mistaken
    # for the stock_id separator.
    stock_id = file_path.rsplit('=', 1)[1]
    df_realized_vol_per_stock['row_id'] = df_realized_vol_per_stock['time_id'].apply(lambda x: f'{stock_id}-{x}')
    return df_realized_vol_per_stock[['row_id', prediction_column_name]]

In [126]:
def past_realized_volatility_per_stock(list_file, prediction_column_name):
    """Compute per-time_id realized volatility for several stock files.

    Parameters
    ----------
    list_file : iterable of str
        Paths of the per-stock book files, processed in the given order.
    prediction_column_name : str
        Name of the realized-volatility column in the result.

    Returns
    -------
    pandas.DataFrame
        The per-stock results stacked in input order, each keeping its own
        index. An empty DataFrame is returned when ``list_file`` is empty.
    """
    # Build every per-stock frame first and concatenate once; calling concat
    # inside the loop re-copies all accumulated rows on every iteration.
    per_stock_frames = [
        realized_volatility_per_time_id(file, prediction_column_name)
        for file in list_file
    ]
    if not per_stock_frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(per_stock_frames)

First handle a single stock, then loop through all 112 stocks to handle the rest.

STOCK ID: 17

In [133]:
# Run the per-stock computation on the first path of the (not yet sorted) file
# list; the row_id prefix in the output shows which stock that file holds.
file_path = list_order_book_file_train[0]
dfpast_realized_train = realized_volatility_per_time_id(file_path, prediction_column_name='pred')
dfpast_realized_train

Unnamed: 0,row_id,pred
0,17-5,0.004091
1,17-11,0.002155
2,17-16,0.002566
3,17-31,0.002221
4,17-62,0.002155
...,...,...
3825,17-32751,0.003227
3826,17-32753,0.002969
3827,17-32758,0.002157
3828,17-32763,0.003487


STOCK ID: 28

In [134]:
# Same computation for the second path of the (not yet sorted) file list.
# This overwrites both file_path and dfpast_realized_train from the cell above.
file_path = list_order_book_file_train[1]
dfpast_realized_train = realized_volatility_per_time_id(file_path, prediction_column_name='pred')
dfpast_realized_train

Unnamed: 0,row_id,pred
0,28-5,0.003506
1,28-11,0.002301
2,28-16,0.001943
3,28-31,0.002660
4,28-62,0.001169
...,...,...
3825,28-32751,0.002121
3826,28-32753,0.001796
3827,28-32758,0.002111
3828,28-32763,0.002089


Sort the list of file paths based on stock_id int value

In [137]:
# Order the paths numerically by the stock id found after '=' in each path;
# converting to int makes e.g. stock 2 sort before stock 10.
list_order_book_file_train = sorted(list_order_book_file_train, key = lambda x: int(x.split('=')[1]))

Compute past realized volatility for all stocks

In [138]:
# Compute the realized volatility for every stock file, in sorted stock_id
# order, and stack the per-stock results into one frame.
dfpast_realized_train = past_realized_volatility_per_stock(list_order_book_file_train, prediction_column_name='pred')
dfpast_realized_train

Unnamed: 0,row_id,pred
0,0-5,0.004499
1,0-11,0.001204
2,0-16,0.002369
3,0-31,0.002574
4,0-62,0.001894
...,...,...
3825,126-32751,0.003691
3826,126-32753,0.004104
3827,126-32758,0.003117
3828,126-32763,0.003661


We were able to predict volatility for 112 stocks using naive prediction. We must now use 2 evaluation metrics to evaluate this result: RMSPE and R Squared

RMSPE: 