In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint
import numpy as np

import random
#import memory_profiler
import altair as alt
from tqdm import tqdm
import datetime

# Run-wide switches.
# USE_ALL_STOCK_IDS: when False, the stock list is cut down to a random subset
# right after the training stock ids are loaded.
USE_ALL_STOCK_IDS = True
# NOTE(review): TEST_SIZE is not referenced anywhere else in the visible notebook — confirm it is still needed.
TEST_SIZE = 0.25

# ENV_HOME is True only on the author's machine (detected through the USER
# environment variable); any other environment takes the `else` branch below.
ENV_HOME = True
if os.environ.get('USER') == 'ian':
    # Local-only extension; it appears to be the source of the per-cell
    # "used ... MiB RAM" lines in the cell outputs.
    import ipython_memory_usage
    %ipython_memory_usage_start
    USE_TEST_LOCAL_6_ITEMS = True # robust local testing at home
else:
    # Not at home: use a subset of stocks and the real test CSV/folder.
    USE_ALL_STOCK_IDS = False
    ENV_HOME = False
    USE_TEST_LOCAL_6_ITEMS = False
    # kaggle notes:
    # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
    # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1] used 0.0000 MiB RAM in 0.10s, peaked 0.00 MiB above current, total RAM usage 134.02 MiB


In [2]:
# OR PASTE IN UTILITY CODE HERE FOR KAGGLE
from utility import make_unique_time_ids, get_data, get_training_stock_ids
from utility import ROOT, TEST_CSV, TRAIN_CSV


Utility says ROOT is /home/ian/data/kaggle/optiver_volatility/
In [2] used 0.2422 MiB RAM in 0.13s, peaked 0.00 MiB above current, total RAM usage 134.26 MiB


## Load train set

In [3]:
# Ids of the stocks to process (presumably one per partition of the training
# book folder — verify against the utility helper).
stock_ids = get_training_stock_ids('book_train.parquet') # all stocks by default
if not USE_ALL_STOCK_IDS:
    # Choose a random subset. A dedicated, seeded generator keeps the subset
    # identical across "Restart & Run All" runs and leaves the global `random`
    # state untouched.
    SUBSET_SIZE = 30
    SUBSET_SEED = 42
    print("Using a subset")
    random.Random(SUBSET_SEED).shuffle(stock_ids)
    # slicing tolerates fewer than SUBSET_SIZE available stocks
    stock_ids = stock_ids[:SUBSET_SIZE]

In [3] used 0.0000 MiB RAM in 0.11s, peaked 0.00 MiB above current, total RAM usage 134.26 MiB


In [4]:
# Read the training targets and key them by (stock_id, time_id) so they can be
# joined to the per-stock book features on the index.
df_train_all = pd.read_csv(TRAIN_CSV).set_index(['stock_id', 'time_id'])
print(df_train_all.shape)
# Number of target rows for stock 0; the feature loop uses it as a lower bound
# when sanity-checking how many book rows each stock has.
rows_for_stock_id_0 = len(df_train_all.query('stock_id == 0'))
rows_for_stock_id_0

(428932, 1)


3830

In [4] used 23.1875 MiB RAM in 0.30s, peaked 4.80 MiB above current, total RAM usage 157.45 MiB


In [5]:
# Every distinct time_id in the training targets goes to the training side;
# the test side is left empty, so the model is later fitted on all rows.
time_ids_test = []
time_ids_train = list(
    df_train_all.reset_index()['time_id'].unique()
)

In [5] used -6.3750 MiB RAM in 0.12s, peaked 6.59 MiB above current, total RAM usage 151.07 MiB


In [6]:
# make feature columns
def make_features1(df_book, agg_type, cols):
    """Aggregate book columns per (stock_id, time_id) group.

    Parameters
    ----------
    df_book : DataFrame
        Book rows; `stock_id` and `time_id` must be resolvable by
        `groupby` (as columns or as index levels).
    agg_type : str
        Aggregation passed to `.agg`, e.g. 'var' or 'mean'; it is also used
        as the suffix of the output column names.
    cols : list
        Columns of `df_book` to aggregate.

    Returns
    -------
    DataFrame
        One row per (stock_id, time_id), one column per entry of `cols`,
        named ``<col>_<agg_type>``.
    """
    per_group = df_book.groupby(['stock_id', 'time_id'])[cols]
    aggregated = per_group.agg(agg_type)
    # label each aggregated column with the statistic it holds
    aggregated.columns = [f"{name}_{agg_type}" for name in cols]
    return aggregated

In [6] used 0.0000 MiB RAM in 0.11s, peaked 0.00 MiB above current, total RAM usage 151.07 MiB


In [7]:
print(f'Iterating over {len(stock_ids)} stocks:')
# Book columns to aggregate. Defined once, before the loop, so the name exists
# even when stock_ids is empty; the prediction cell further down reuses it.
cols = ['bid_price1', 'ask_price1', 'bid_price2', 'ask_price2', 'bid_size1', 'ask_size1']
all_train_merged = []  # one merged (targets + features) frame per stock
for stock_id in tqdm(stock_ids):
    assert isinstance(stock_id, int)
    df_book_train_stock_X = pd.read_parquet(
        os.path.join(ROOT, f"book_train.parquet/stock_id={stock_id}")
    )
    # a single partition is read, so stock_id is added back explicitly
    df_book_train_stock_X["stock_id"] = stock_id
    df_book_train_stock_X = df_book_train_stock_X.set_index(['stock_id', 'time_id'])
    # sanity check: the book frame must have more rows than stock 0 has targets
    assert df_book_train_stock_X.shape[0] > rows_for_stock_id_0, (df_book_train_stock_X.shape[0], rows_for_stock_id_0)

    features_var1 = make_features1(df_book_train_stock_X, 'var', cols)
    #print(f"{features_var1.memory_usage().sum() / 1_000_000:0.1f} MB")
    features_mean1 = make_features1(df_book_train_stock_X, 'mean', cols)

    # inner join, so we just get the subset of targets belonging to this stock
    train_merged = pd.merge(df_train_all, features_var1, left_index=True, right_index=True, how='inner')
    #assert train_merged.shape[0] == rows_for_stock_id_0 # can be 3830, 3829, not sure what a good test is here
    # Check the inner join: one merged row per distinct time_id in the book data.
    # Counting on the index level avoids positional `Series[0]` lookup on a
    # label-indexed Series, which newer pandas versions deprecate.
    n_time_ids = df_book_train_stock_X.index.get_level_values('time_id').nunique()
    assert train_merged.shape[0] == n_time_ids, (stock_id, train_merged.shape[0], n_time_ids)
    train_merged = pd.merge(train_merged, features_mean1, left_index=True, right_index=True)
    all_train_merged.append(train_merged)

Iterating over 112 stocks:


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 112/112 [01:27<00:00,  1.28it/s]

In [7] used 495.7344 MiB RAM in 87.85s, peaked 223.16 MiB above current, total RAM usage 646.81 MiB





In [8]:
# Stack the per-stock frames row-wise into a single training frame.
train_merged = pd.concat(all_train_merged, axis=0)
train_merged.shape

(428932, 13)

In [8] used -46.6719 MiB RAM in 0.17s, peaked 46.67 MiB above current, total RAM usage 600.14 MiB


In [9]:
# Names of the engineered feature columns: every column except the target.
features = train_merged.columns.drop('target')
features

Index(['bid_price1_var', 'ask_price1_var', 'bid_price2_var', 'ask_price2_var',
       'bid_size1_var', 'ask_size1_var', 'bid_price1_mean', 'ask_price1_mean',
       'bid_price2_mean', 'ask_price2_mean', 'bid_size1_mean',
       'ask_size1_mean'],
      dtype='object')

In [9] used 1.3477 MiB RAM in 0.12s, peaked 0.00 MiB above current, total RAM usage 601.48 MiB


# Features

In [10]:
#time_ids_train, time_ids_test
def train_test_split(df, target_col, time_ids_train, time_ids_test):
    """Split `df` into train/test features and targets by `time_id` membership.

    Note: this is a local helper and is unrelated to
    `sklearn.model_selection.train_test_split`, despite sharing its name.

    Parameters
    ----------
    df : DataFrame
        Must contain a `time_id` column and the `target_col` column.
    target_col : str
        Name of the target column.
    time_ids_train, time_ids_test : list-like
        time_id values selecting the train and test rows respectively.
        An empty collection yields an empty split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the X frames have
        `target_col` and `time_id` removed, the y objects are the
        `target_col` Series of the same rows.
    """
    def _select(time_ids):
        # Filter once and derive features and target from the same rows, so
        # X and y cannot drift apart and the frame is scanned only once.
        subset = df[df['time_id'].isin(time_ids)]
        return subset.drop(columns=[target_col, 'time_id']), subset[target_col]

    X_train, y_train = _select(time_ids_train)
    X_test, y_test = _select(time_ids_test)
    return X_train, X_test, y_train, y_test

# Model inputs: every engineered feature plus the stock identifier.
feature_cols = [*features, 'stock_id']
# reset_index() turns stock_id/time_id into ordinary columns so the split can
# filter on time_id and keep stock_id as a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df=train_merged.reset_index()[feature_cols + ['time_id', 'target']],
    target_col='target',
    time_ids_train=time_ids_train,
    time_ids_test=time_ids_test,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((428932, 13), (0, 13), (428932,), (0,))

In [10] used -25.7734 MiB RAM in 0.23s, peaked 25.77 MiB above current, total RAM usage 575.71 MiB


In [11]:
# Preview the first rows of the training feature matrix.
X_train.head(3)

Unnamed: 0,bid_price1_var,ask_price1_var,bid_price2_var,ask_price2_var,bid_size1_var,ask_size1_var,bid_price1_mean,ask_price1_mean,bid_price2_mean,ask_price2_mean,bid_size1_mean,ask_size1_mean,stock_id
0,1.539389e-07,1.544176e-07,1.349132e-07,1.465483e-07,11855.387055,6892.936853,1.000904,1.001389,1.000779,1.001524,127.993569,111.192926,59
1,3.412355e-07,4.039657e-07,3.29811e-07,4.177058e-07,22438.854139,32757.977261,1.000577,1.001177,1.000439,1.001341,141.639594,199.111675,59
2,5.515992e-07,5.405377e-07,5.369475e-07,5.369036e-07,10481.927458,12315.572839,0.999024,0.999396,0.998933,0.99948,150.673077,168.711538,59


In [11] used 0.2578 MiB RAM in 0.13s, peaked 0.00 MiB above current, total RAM usage 575.97 MiB


In [12]:
# Shapes of the split; while time_ids_test is empty both test parts have zero rows.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((428932, 13), (0, 13), (428932,), (0,))

In [12] used -5.4922 MiB RAM in 0.11s, peaked 5.49 MiB above current, total RAM usage 570.48 MiB


# ML on a train/test split

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [13] used 20.6758 MiB RAM in 0.60s, peaked 0.00 MiB above current, total RAM usage 591.15 MiB


In [14]:
#est = LinearRegression()
# random_state pins the bootstrap samples and feature draws, so the fitted
# forest, its importances and the predictions are the same on every run.
est = RandomForestRegressor(n_estimators=10, random_state=0) # default n_estimators==100

In [14] used 0.0000 MiB RAM in 0.10s, peaked 0.00 MiB above current, total RAM usage 591.15 MiB


In [15]:
# Fit on the whole training split (nothing is held out while time_ids_test is empty).
est.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10)

In [15] used 334.4414 MiB RAM in 102.82s, peaked 0.00 MiB above current, total RAM usage 925.59 MiB


In [16]:
# Pick the per-feature weight attribute the fitted estimator exposes.
# coef_ keeps precedence over feature_importances_ when both exist.
if hasattr(est, 'coef_'):
    feature_col = 'coef_'
elif hasattr(est, 'feature_importances_'):
    feature_col = 'feature_importances_'
else:
    # Fail loudly rather than hit a NameError or reuse a stale feature_col
    # left over from an earlier run of this cell.
    raise AttributeError(
        f"{type(est).__name__} exposes neither 'coef_' nor 'feature_importances_' (is it fitted?)"
    )
# One row per model input, indexed by its weight so sort_index ranks them.
df_features = pd.DataFrame(zip(getattr(est, feature_col), feature_cols), columns=['importance', 'feature']).set_index('importance')
df_features.sort_index(ascending=False)

Unnamed: 0_level_0,feature
importance,Unnamed: 1_level_1
0.271529,ask_price2_mean
0.260803,bid_price2_mean
0.090943,bid_size1_mean
0.053607,ask_price2_var
0.045695,bid_price2_var
0.041194,ask_size1_mean
0.03936,bid_price1_mean
0.037611,ask_price1_mean
0.036861,bid_price1_var
0.035397,ask_price1_var


In [16] used 0.0703 MiB RAM in 0.20s, peaked 0.00 MiB above current, total RAM usage 925.66 MiB


# Make predictions

In [17]:
len(stock_ids) # expecting 112 when USE_ALL_STOCK_IDS is True; at most 30 when the random subset is used

112

In [17] used 0.0000 MiB RAM in 0.11s, peaked 0.00 MiB above current, total RAM usage 925.66 MiB


In [18]:
# Choose the test targets file and the matching book folder.
if not USE_TEST_LOCAL_6_ITEMS:
    df_test_all = pd.read_csv(TEST_CSV)
    if len(df_test_all) == 3: # kaggle test data
        # cut out 2 rows so predictions work
        df_test_all = df_test_all.iloc[:1]
    TEST_FOLDER = 'book_test.parquet'
else: # True if debugging
    # book train as a substitute; its 'target' column is renamed so it cannot
    # clash with the predicted 'target' column added later
    df_test_all = pd.read_csv(os.path.join(ROOT, 'test_local.csv')).rename(
        columns={'target': 'train_target'}
    )
    TEST_FOLDER = 'book_test_local.parquet'
    assert ENV_HOME
print(ROOT, TEST_FOLDER)
df_test_all = df_test_all.set_index(['stock_id', 'time_id'])

/home/ian/data/kaggle/optiver_volatility/ book_test_local.parquet
In [18] used 1.3359 MiB RAM in 0.14s, peaked 0.00 MiB above current, total RAM usage 927.00 MiB


In [19]:
stock_ids_test = get_training_stock_ids(TEST_FOLDER) # all stocks by default
# Same model inputs, in the same order, as the estimator was fitted on.
predict_cols = list(features) + ['stock_id']

# One frame of predictions per stock; concatenated once after the loop rather
# than growing a DataFrame on every iteration (which re-copies all prior rows).
test_set_predictions = []
for stock_id in tqdm(stock_ids_test):
    parquet_filename = os.path.join(ROOT, f'{TEST_FOLDER}/stock_id={stock_id}')
    #print(f"reading {parquet_filename}")
    df_book_test_stock_X = pd.read_parquet(parquet_filename)
    # a single partition is read, so stock_id is added back explicitly
    df_book_test_stock_X['stock_id'] = stock_id
    features_var1_test = make_features1(df_book_test_stock_X, 'var', cols)
    features_mean1_test = make_features1(df_book_test_stock_X, 'mean', cols)

    df_test_all_X = df_test_all.query('stock_id==@stock_id').copy()
    # default (inner) joins on the (stock_id, time_id) index
    test_merged = pd.merge(df_test_all_X, features_var1_test, left_index=True, right_index=True)
    test_merged = pd.merge(test_merged, features_mean1_test, left_index=True, right_index=True)
    if test_merged.shape[0] != df_test_all_X.shape[0]:
        # The inner joins dropped rows, so some requested rows cannot be predicted.
        raise ValueError(
            f"stock_id={stock_id}: {df_test_all_X.shape[0]} rows requested but "
            f"only {test_merged.shape[0]} have book features"
        )
    test_set_predictions_X = est.predict(test_merged.reset_index()[predict_cols])
    # Assign by index label rather than by position, so the result does not
    # depend on the row order the joins happen to produce.
    df_test_all_X['target'] = pd.Series(test_set_predictions_X, index=test_merged.index)
    test_set_predictions.append(df_test_all_X)

# pd.concat rejects an empty list, so fall back to an empty frame
df_test_predictions = pd.concat(test_set_predictions) if test_set_predictions else pd.DataFrame()
assert df_test_all.shape[0] == df_test_predictions.shape[0], "Expecting all rows to be predicted"

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:03<00:00,  1.77it/s]

In [19] used 324.2188 MiB RAM in 3.51s, peaked 334.82 MiB above current, total RAM usage 1251.22 MiB





In [20]:
# Naive UTC timestamp. datetime.utcnow() is deprecated from Python 3.12;
# dropping tzinfo keeps the printed text in the same format as before.
written_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
print(f"Writing {df_test_predictions.shape[0]} rows to submission.csv on {written_at}")
# Only row_id and the predicted target are written.
df_test_predictions.reset_index()[['row_id', 'target']].to_csv('submission.csv', index=False)

Writing 22980 rows to submission.csv on 2021-07-30 14:26:48.334773
In [20] used -5.6914 MiB RAM in 0.20s, peaked 5.69 MiB above current, total RAM usage 1245.53 MiB
