In [None]:
import glob
import os
from functools import lru_cache
from time import time
from typing import List, Iterator, Callable

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.base import BaseEstimator
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from tqdm import tqdm

# Load the training targets and keep only the join key and the target.
# row_id is "<stock_id>-<time_id>", the key used to merge with book features.
train = (
    pd.read_csv('../data/train.csv')
    .assign(row_id=lambda raw: raw['stock_id'].astype(str) + '-' + raw['time_id'].astype(str))
    [['row_id', 'target']]
)

# Useful Functions Given in Example

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    The input must expose ``.diff()`` after ``np.log`` is applied to it
    (e.g. a pandas Series); the first element of the result is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    return np.sqrt(np.sum(squared_returns))

def realized_volatility_per_time_id(file_path, prediction_column_name):
    """Compute the realized volatility of one stock's WAP for every time_id.

    Parameters
    ----------
    file_path : str
        Path of one book parquet partition. The text after the last ``=`` is
        used as the stock id (e.g. ``.../stock_id=0`` -> ``"0"``).
    prediction_column_name : str
        Name given to the realized-volatility column of the result.

    Returns
    -------
    pd.DataFrame
        Two columns: ``row_id`` (``"<stock_id>-<time_id>"``) and
        ``prediction_column_name``.

    Raises
    ------
    IndexError
        If ``file_path`` contains no ``=``.
    """
    df_book_data = pd.read_parquet(file_path)

    # Size-weighted price built from the first-level bid/ask columns.
    df_book_data['wap'] = (
        df_book_data['bid_price1'] * df_book_data['ask_size1']
        + df_book_data['ask_price1'] * df_book_data['bid_size1']
    ) / (df_book_data['bid_size1'] + df_book_data['ask_size1'])

    # transform() returns a result aligned with the original row index.
    # groupby(...).apply() prepends the group key to the index on pandas >= 2.0,
    # which makes the column assignment fail with an incompatible-index error.
    df_book_data['log_return'] = df_book_data.groupby('time_id')['wap'].transform(log_return)

    # The first row of every time_id has no previous price, hence a NaN return.
    df_book_data = df_book_data[df_book_data['log_return'].notnull()]

    df_realized_vol_per_stock = (
        df_book_data.groupby('time_id')['log_return']
        .agg(realized_volatility)
        .reset_index()
        .rename(columns={'log_return': prediction_column_name})
    )

    # Split on the LAST '=' so an '=' in a parent directory name is harmless.
    stock_id = file_path.rsplit('=', 1)[1]
    df_realized_vol_per_stock['row_id'] = df_realized_vol_per_stock['time_id'].apply(
        lambda x: f'{stock_id}-{x}'
    )
    return df_realized_vol_per_stock[['row_id', prediction_column_name]]

def past_realized_volatility_per_stock(list_file, prediction_column_name):
    """Stack the per-time_id realized volatility of several book files.

    Parameters
    ----------
    list_file : iterable of str
        Book parquet paths, one per stock.
    prediction_column_name : str
        Name of the realized-volatility column in the result.

    Returns
    -------
    pd.DataFrame
        The per-file results concatenated row-wise (each file keeps its own
        index); an empty DataFrame when ``list_file`` is empty.
    """
    # Collect first and concatenate once: calling pd.concat inside the loop
    # re-copies every accumulated row on each iteration (quadratic cost).
    per_stock_frames = [
        realized_volatility_per_time_id(file, prediction_column_name)
        for file in list_file
    ]
    if not per_stock_frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(per_stock_frames)


def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared_relative_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_relative_error)

# Useful Functions for the Linear Model (LM)

In [None]:
# helpers to extract features from book
def calc_realized_volatility(log_returns: pd.Series) -> float:
    """Realized volatility: square root of the summed squared log returns."""
    sum_of_squares = np.sum(log_returns ** 2)
    return np.sqrt(sum_of_squares)


def aggregate_book_for_stock_and_time_id(book_time_slice: pd.DataFrame, tail_rows: int = 100) -> pd.Series:
    """Summarise one (stock, time_id) book slice into realized-volatility features.

    Parameters
    ----------
    book_time_slice : pd.DataFrame
        Book rows of a single time_id; must contain ``WAP1``, ``WAP2``, ``WAP3``.
    tail_rows : int, default 100
        Number of trailing rows used for the ``volatility_tail_*`` features.

    Returns
    -------
    pd.Series
        ``volatility_i`` (all rows) and ``volatility_tail_i`` (last
        ``tail_rows`` rows) for i in 1..3, in that interleaved order.
    """
    volatilities = dict()
    # The trailing window does not depend on the WAP column, so slice it once.
    recent_slice = book_time_slice.tail(tail_rows)
    for i in range(1, 4):
        wap_column = f"WAP{i}"

        full_log_returns = np.log(book_time_slice[wap_column]).diff()
        volatilities[f"volatility_{i}"] = calc_realized_volatility(full_log_returns)

        tail_log_returns = np.log(recent_slice[wap_column]).diff()
        volatilities[f"volatility_tail_{i}"] = calc_realized_volatility(tail_log_returns)

    return pd.Series(volatilities)
  
    

def aggregate_book_for_stock(df: pd.DataFrame) -> pd.DataFrame:
    """Compute per-time_id volatility features for one stock's order book.

    Three weighted average prices are derived from the book columns:
    ``WAP1`` from level 1, ``WAP2`` from level 2, and ``WAP3`` from both
    levels pooled. The rows are then grouped by ``time_id`` and each group is
    summarised by ``aggregate_book_for_stock_and_time_id``.

    Parameters
    ----------
    df : pd.DataFrame
        Book data with ``time_id`` and the ``bid_price{1,2}``,
        ``ask_price{1,2}``, ``bid_size{1,2}``, ``ask_size{1,2}`` columns.
        The frame is not modified; the WAP columns are added to a copy.

    Returns
    -------
    pd.DataFrame
        The result of the grouped apply (one row per ``time_id``).
    """
    # Each price is weighted by the opposite side's size.
    level1_value = df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    level1_size = df['bid_size1'] + df['ask_size1']
    level2_value = df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']
    level2_size = df['bid_size2'] + df['ask_size2']

    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    book_with_wap = df.assign(
        WAP1=level1_value / level1_size,
        WAP2=level2_value / level2_size,
        WAP3=(level1_value + level2_value) / (level1_size + level2_size),
    )

    grouped = book_with_wap.groupby(["time_id"], as_index=False)
    return grouped.apply(aggregate_book_for_stock_and_time_id)

def get_book_features_for_all_stocks(file_lists):
    """Build the book feature table for every stock file in ``file_lists``.

    Parameters
    ----------
    file_lists : sequence of str
        Book parquet paths; the text after the last ``=`` of each path is
        used as the stock id (kept as a string).

    Returns
    -------
    pd.DataFrame
        Per-stock feature frames concatenated row-wise, with ``stock_id``
        inserted as the first column.

    Raises
    ------
    ValueError
        If ``file_lists`` is empty, or a path contains no ``=``.
    """
    if len(file_lists) == 0:
        # Fail early with an actionable message instead of pd.concat's
        # generic "No objects to concatenate".
        raise ValueError(
            "No book files supplied; check the path/glob pattern used to build file_lists."
        )

    print("Getting book features for {} stocks.".format(len(file_lists)))
    per_stock_features = list()
    for file_path in tqdm(file_lists):
        # Split on the LAST '=' so an '=' in a parent directory is harmless.
        _, separator, stock_id = file_path.rpartition('=')
        if not separator:
            raise ValueError(
                f"Cannot parse a stock id from {file_path!r}: expected '...=<stock_id>'."
            )
        book_df = pd.read_parquet(file_path)
        features_this_book = aggregate_book_for_stock(book_df)
        features_this_book.insert(0, "stock_id", stock_id)
        per_stock_features.append(features_this_book)

    return pd.concat(per_stock_features)

def make_X(feature_df):
    """Build the design matrix: ``volatility_1`` followed by ``stock_id`` dummies.

    ``pd.get_dummies`` only one-hot encodes object/string/category columns,
    so ``stock_id`` is expected to hold strings here.
    """
    volatility_part = feature_df[["volatility_1"]]
    stock_part = pd.get_dummies(feature_df[["stock_id"]])
    return pd.concat([volatility_part, stock_part], axis=1)

def score_model_on_rmspe(model, X: pd.DataFrame, y_true: pd.Series) -> float:
    """Scorer with an ``(estimator, X, y)`` signature: RMSPE of ``model.predict(X)``."""
    return rmspe(y_true, model.predict(X))

In [None]:
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the row order of book_features_train reproducible across runs and machines.
# This cell is expensive: the recorded run took about 18 minutes for 112 stocks.
list_order_book_file_train = sorted(glob.glob('../data/book_train.parquet/*'))
book_features_train = get_book_features_for_all_stocks(list_order_book_file_train)
book_features_train

Getting book features for 112 stocks.


100%|█████████████████████████████████████████| 112/112 [18:16<00:00,  9.79s/it]


Unnamed: 0,stock_id,time_id,volatility_1,volatility_tail_1,volatility_2,volatility_tail_2,volatility_3,volatility_tail_3
0,17,5,0.004091,0.001867,0.005673,0.002357,0.003545,0.001583
1,17,11,0.002155,0.001482,0.003741,0.002560,0.002009,0.001404
2,17,16,0.002566,0.001517,0.003324,0.001866,0.002301,0.001382
3,17,31,0.002221,0.001162,0.003017,0.001624,0.001950,0.001113
4,17,62,0.002155,0.001388,0.003315,0.001540,0.002197,0.001272
...,...,...,...,...,...,...,...,...
3825,98,32751,0.002572,0.001503,0.005053,0.003147,0.002248,0.001506
3826,98,32753,0.001231,0.001139,0.001961,0.001445,0.001026,0.000907
3827,98,32758,0.002129,0.002091,0.004702,0.004599,0.002312,0.002255
3828,98,32763,0.004120,0.001950,0.005901,0.002660,0.003699,0.001464


In [None]:
# stock_id is sliced out of the partition path, so it is stored as text;
# comparing it with the integer 0 matches no rows and plots nothing.
# NOTE(review): `px` is not imported in the notebook's imports cell. This cell
# presumably needs `import plotly.express as px` there; confirm and add it.
fig = px.line(
    book_features_train.loc[book_features_train['stock_id'].astype(str) == '0'],
    x="time_id",
    y="volatility_1",
)
fig.show()

In [None]:
# Key every feature row by "<stock_id>-<time_id>" so it can be merged with `train`.
book_features_train['row_id'] = (
    book_features_train['stock_id'].astype(str)
    .add('-')
    .add(book_features_train['time_id'].astype(str))
)
book_features_train

In [None]:
# Attach the book features to the targets; rows without a match on either
# side are dropped by the inner join. stock_id is kept as text.
train_enriched = (
    pd.merge(train, book_features_train, how="inner", on="row_id")
    .assign(stock_id=lambda merged: merged['stock_id'].astype(str))
)
train_enriched.head()

Unnamed: 0,row_id,target,stock_id,time_id,volatility_1,volatility_tail_1,volatility_2,volatility_tail_2,volatility_3,volatility_tail_3
0,0-5,0.004136,0,5,0.004499,0.002396,0.006999,0.004591,0.004106,0.002458
1,0-11,0.001445,0,11,0.001204,0.000976,0.002476,0.001981,0.001507,0.001163
2,0-16,0.002168,0,16,0.002369,0.001905,0.004801,0.003851,0.002469,0.001863
3,0-31,0.002195,0,31,0.002574,0.002533,0.003637,0.003518,0.002709,0.002604
4,0-62,0.001747,0,62,0.001894,0.00156,0.003257,0.002414,0.001932,0.001441


In [None]:
# Regression target and design matrix for the linear model.
y = train_enriched.loc[:, "target"]
X_train = make_X(train_enriched)

In [None]:
# No intercept: X_train already carries one dummy column per stock.
model = LinearRegression(fit_intercept=False)

n_splits = 10
# An integer `cv` produces unshuffled, contiguous folds for a regressor.
# The rows of train_enriched appear to be grouped by stock_id (see its head()),
# so contiguous folds hold out entire stocks whose dummy columns were never
# seen during fitting. Shuffling with a fixed seed mixes all stocks into every
# fold and keeps the split reproducible.
cv_folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
cv_results = cross_val_score(model, X_train, y, scoring=score_model_on_rmspe, cv=cv_folds)
print("RMSPE from cross-validation:")
# The +/- figure is 1.96 x the standard deviation of the per-fold scores.
print(round(cv_results.mean(), 3),  " +/-", round(1.96 * cv_results.std(), 2))

RMSPE from cross-validation:
0.324  +/- 0.06


In [None]:
# Refit on every training row, then store the in-sample predictions.
train_enriched['pred'] = model.fit(X_train, y).predict(X_train)

In [None]:
# Score the fitted model on the same rows it was trained on (in-sample).
df_joined = train.merge(train_enriched[['row_id','pred']], on = ['row_id'], how = 'inner')
R2 = round(r2_score(y_true = df_joined['target'], y_pred = df_joined['pred']),3)
RMSPE = round(rmspe(y_true = df_joined['target'], y_pred = df_joined['pred']),3)
# `pred` comes from the fitted linear model, not from a naive baseline, so the
# message names what is actually being measured.
print(f'In-sample performance of the linear model: R2 score: {R2}, RMSPE: {RMSPE}')

Performance of the naive prediction: R2 score: 0.757, RMSPE: 0.325


In [None]:
# Example: the trades of one stock for a single time_id.
# The stock id is defined once and reused for both the path and the column.
stock_id = '0'
trade_example = (
    pd.read_parquet(f'../data/trade_train.parquet/stock_id={stock_id}')
    .loc[lambda trades: trades['time_id'] == 5]
    # assign() builds a new frame; writing through .loc on the filtered slice
    # is a chained assignment and triggers SettingWithCopyWarning.
    .assign(stock_id=stock_id)
)

trade_example

Unnamed: 0,time_id,seconds_in_bucket,price,size,order_count,stock_id
0,5,21,1.002301,326,12,0
1,5,46,1.002778,128,4,0
2,5,50,1.002818,55,1,0
3,5,57,1.003155,121,5,0
4,5,68,1.003646,4,1,0
5,5,78,1.003762,134,5,0
6,5,122,1.004207,102,3,0
7,5,127,1.004577,1,1,0
8,5,144,1.00437,6,1,0
9,5,147,1.003964,233,4,0
