# Optiver Competition Submission
## By: Karim Zakir

Below is my submission for the [Optiver Competition](https://www.kaggle.com/c/optiver-realized-volatility-prediction/overview) on Kaggle. The submission had a score of 0.27818 (the lower, the better). This placed me 3,151st out of 3,965 teams. While this is a low placement, the competition provided me with a great learning opportunity, as well as an interesting challenge! I am looking forward to learning from my mistakes, studying new concepts, and scoring better in my next competition.

In [24]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
from xgboost import XGBRegressor
import glob
import pickle
import multiprocessing
from multiprocessing import Process, Manager

# glob.glob returns entries in arbitrary (filesystem-dependent) order, but the
# book and trade lists are later paired element-by-element. Sorting both lists
# makes the i-th book path and the i-th trade path refer to the same
# "stock_id=N" partition, provided both directories hold the same stocks.
book_train_files = sorted(glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*'))
book_test_files = sorted(glob.glob("/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*"))

trade_train_files = sorted(glob.glob("/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/*"))
trade_test_files = sorted(glob.glob("/kaggle/input/optiver-realized-volatility-prediction/trade_test.parquet/*"))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [23]:
# Small three-stock subset (stock ids 0, 1 and 10) of the training partitions,
# handy for quick experiments without processing every stock.
book_sample_list = [
    f"../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id={sample_stock}"
    for sample_stock in (0, 1, 10)
]
trade_sample_list = [
    f"../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id={sample_stock}"
    for sample_stock in (0, 1, 10)
]

In [22]:
def prepare_submission(book_train_files, book_test_files, trade_train_files, trade_test_files):
    """Train the model, persist it, and write ``submission.csv``.

    Steps:
      1. Fit a model on the training book/trade files via ``train_model``.
      2. Pickle the fitted model to ``model.pickle`` in the working directory.
      3. Build features for the test book/trade files.
      4. Predict and write a two-column CSV (``row_id``, ``target``).

    Args:
        book_train_files: Paths of the training order-book partitions.
        book_test_files: Paths of the test order-book partitions.
        trade_train_files: Paths of the training trade partitions.
        trade_test_files: Paths of the test trade partitions.
    """
    model = train_model(book_train_files, trade_train_files)

    # The context manager closes the file even if pickling raises.
    with open("model.pickle", "wb") as model_file:
        pickle.dump(model, model_file)

    test_data = prepare_features_multi(book_test_files, trade_test_files, 30)

    # "row_id" is an identifier, not a feature, so it is excluded from the
    # model input and re-attached as the index of the predictions.
    predictions = model.predict(test_data.drop(["row_id"], axis=1))

    submission = pd.DataFrame(predictions, index=test_data["row_id"], columns=["target"]).reset_index()

    submission.to_csv("submission.csv", index=False)
    
def train_model(book_train_files, trade_train_files):
    """Fit and return an ``XGBRegressor`` built with its default arguments.

    The design matrix and target come from ``prepare_train_data`` applied to
    the given book and trade file lists.
    """
    regressor = XGBRegressor()
    design_matrix, target = prepare_train_data(book_train_files, trade_train_files)
    regressor.fit(design_matrix, target)
    return regressor

def prepare_train_data(book_data, trade_data):
    """Return ``(X, y)`` for model fitting.

    Engineered features are joined to the competition's ``train.csv`` on a
    ``row_id`` key of the form ``"<stock_id>-<time_id>"``. ``X`` holds every
    remaining column once the identifiers and the target are removed; ``y`` is
    the ``target`` column.
    """
    engineered = prepare_features_multi(book_data, trade_data, 30)

    labels = pd.read_csv("/kaggle/input/optiver-realized-volatility-prediction/train.csv")
    stock_part = labels['stock_id'].astype(str)
    time_part = labels['time_id'].astype(str)
    labels['row_id'] = stock_part + '-' + time_part

    # Default (inner) merge: only rows present on both sides survive.
    merged = labels.merge(engineered, on="row_id")

    non_feature_columns = ["stock_id", "time_id", "row_id", "target"]
    predictors = merged.drop(columns=non_feature_columns)
    target = merged["target"]
    return predictors, target

In [34]:
def prepare_features_multi(book_data, trade_data, modular):
    """Build features for many stocks, one worker process per stock.

    The inputs are split into ``len(book_data) // 4 + 1`` batches. Within a
    batch, each (book file, trade file) pair is handled by its own process
    running ``prepare_features``; the batch is joined before the next starts.
    A running count of processed files is printed after each batch.

    Args:
        book_data: Sequence of order-book file paths.
        trade_data: Sequence of trade file paths, paired with ``book_data``
            by position (the i-th entries must describe the same stock).
        modular: Forwarded unchanged to every ``prepare_features`` call.

    Returns:
        A DataFrame concatenating every worker's result (empty if there were
        no input files). Row order follows worker completion order.

    Raises:
        ValueError: If the two input sequences differ in length.
        RuntimeError: If any worker process exits with a non-zero code.
    """
    # Pairing is positional, so a length mismatch would silently mis-pair or
    # drop files once the inputs are split and zipped.
    if len(book_data) != len(trade_data):
        raise ValueError(
            f"book_data has {len(book_data)} entries but trade_data has {len(trade_data)}"
        )

    n_batches = len(book_data) // 4 + 1
    book_batches = np.array_split(book_data, n_batches)
    trade_batches = np.array_split(trade_data, n_batches)

    # Collect all frames and concatenate once; concatenating inside the loop
    # re-copies everything accumulated so far on each batch.
    frames = []
    count = 0
    for book_split, trade_split in zip(book_batches, trade_batches):
        with Manager() as manager:
            result_list = manager.list()
            workers = []
            for book_file, trade_file in zip(book_split, trade_split):
                p = Process(target=prepare_features, args=(book_file, trade_file, result_list, modular))
                p.start()
                workers.append((p, book_file))
            for p, _ in workers:
                p.join()

            # A crashed worker appends nothing; without this check its stock
            # would silently vanish from the output.
            failed = [str(book_file) for p, book_file in workers if p.exitcode != 0]
            if failed:
                raise RuntimeError(f"feature worker(s) failed for: {', '.join(failed)}")

            # Copy out of the proxy before the manager shuts down.
            frames.extend(list(result_list))
        count += len(book_split)
        print(count)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [5]:
# def prepare_feature_multi(book_data, trade_data, modular):
#     with Manager() as manager:
#         result_list = manager.list()
#         pool = multiprocessing.Pool(processes=5)
#         pool.starmap(prepare_features, book_data, trade_data, result_list, modular)
#         result_list = list(result_list)
#         df_with_features = pd.concat(result_list)
#         return df_with_features

In [26]:
def prepare_features(book_file, trade_file, result_list, modular):
    """Compute one stock's book and trade features and append them to ``result_list``.

    The two feature tables are outer-joined on ``row_id``, so a row that
    appears in only one of them is kept. ``modular`` is accepted but not used
    by this function.
    """
    from_book = prepare_book_features(book_file)
    from_trades = prepare_trade_features(trade_file)
    result_list.append(from_book.merge(from_trades, on=["row_id"], how="outer"))

def prepare_book_features(file_path):
    """Build per-``time_id`` order-book features for one stock.

    Args:
        file_path: Path of a parquet partition whose name contains
            ``=<stock_id>``; the text after the first ``=`` is used as the
            stock id in ``row_id``.

    Returns:
        A DataFrame with one row per ``time_id``, aggregated feature columns
        named ``<column>_<aggregation>``, and a ``row_id`` column of the form
        ``"<stock_id>-<time_id>"`` (``time_id`` itself is dropped).
    """
    stock_id = file_path.split('=')[1]

    book_data = pd.read_parquet(file_path)

    # Weighted average price per book level: each side's price is weighted by
    # the opposite side's size.
    book_data["wap1"] = (
        book_data["bid_price1"] * book_data["ask_size1"] + book_data["ask_price1"] * book_data["bid_size1"]
    ) / (book_data["bid_size1"] + book_data["ask_size1"])
    book_data["wap2"] = (
        book_data["bid_price2"] * book_data["ask_size2"] + book_data["ask_price2"] * book_data["bid_size2"]
    ) / (book_data["bid_size2"] + book_data["ask_size2"])

    # transform() returns a result aligned to the original row index. With
    # apply(), pandas 2.x prepends the group key to the index, which no longer
    # aligns for column assignment.
    book_data["log_return1"] = book_data.groupby("time_id")["wap1"].transform(log_return)
    book_data["log_return2"] = book_data.groupby("time_id")["wap2"].transform(log_return)

    # Each time_id's first row has no previous price, so its log return is
    # NaN. copy() avoids assigning new columns onto a filtered slice.
    book_data = book_data.dropna(subset=["log_return1", "log_return2"]).copy()

    book_data["bid_ask_spread1"] = book_data["ask_price1"] / book_data["bid_price1"] - 1
    book_data["bid_ask_spread2"] = book_data["ask_price2"] / book_data["bid_price2"] - 1
    book_data["bid_ask_ratio"] = book_data["bid_size1"] / book_data["ask_size1"]

    # Row-wise standard deviation across the four quotes / four sizes.
    book_data["std_price"] = book_data[["bid_price1", "ask_price1", "bid_price2", "ask_price2"]].std(axis=1)
    book_data["std_size"] = book_data[["bid_size1", "ask_size1", "bid_size2", "ask_size2"]].std(axis=1)

    # Interactions with the position inside the time bucket.
    book_data["std_price_seconds"] = book_data["std_price"] * book_data["seconds_in_bucket"]
    book_data["log_return_seconds"] = book_data["log_return1"] * book_data["seconds_in_bucket"]
    book_data["bid_ask_spread1_seconds"] = book_data["bid_ask_spread1"] * book_data["seconds_in_bucket"]

    book_data["bid_1_price_size"] = book_data["bid_price1"] * book_data["bid_size1"]
    book_data["bid_2_price_size"] = book_data["bid_price2"] * book_data["bid_size2"]
    book_data["ask_1_price_size"] = book_data["ask_price1"] * book_data["ask_size1"]
    book_data["ask_2_price_size"] = book_data["ask_price2"] * book_data["ask_size2"]
    # Fix: this used to take the std of the raw prices again (duplicating
    # "std_price") instead of the price*size products computed just above.
    book_data["prices_size_std"] = book_data[
        ["bid_1_price_size", "ask_1_price_size", "bid_2_price_size", "ask_2_price_size"]
    ].std(axis=1)

    groupby_dict = {
        "log_return1": ["mean", "std", numeric_range, realized_volatility, time_range],
        "log_return2": ["mean", "std", numeric_range, realized_volatility, time_range],
        "bid_ask_spread1": ["mean", "std"],
        "bid_ask_spread2": ["mean", "std"],
        "bid_ask_ratio": ["mean", "std"],
        "std_price": ["mean"],
        "std_size": ["mean"],
        "std_price_seconds": ["mean"],
        "log_return_seconds": ["mean", "std", time_range],
        "bid_ask_spread1_seconds": [time_range],
        "prices_size_std": ["mean"]
    }

    result_df = book_data.groupby("time_id", as_index=False).agg(groupby_dict)

    # Flatten the two-level column index into "<column>_<aggregation>"; the
    # grouping key has an empty second level, hence the strip.
    result_df.columns = ['_'.join(col).strip('_') for col in result_df.columns]

    result_df["row_id"] = result_df["time_id"].apply(lambda time_id: f"{stock_id}-{time_id}")

    result_df = result_df.drop(columns="time_id")

    return result_df


def prepare_trade_features(file_path):
    """Build per-``time_id`` trade features for one stock.

    Args:
        file_path: Path of a parquet partition whose name contains
            ``=<stock_id>``; the text after the first ``=`` is used as the
            stock id in ``row_id``.

    Returns:
        A DataFrame with one row per ``time_id``, aggregated feature columns
        named ``<column>_<aggregation>`` with missing values replaced by 0,
        and a ``row_id`` column of the form ``"<stock_id>-<time_id>"``
        (``time_id`` itself is dropped).
    """
    stock_id = file_path.split("=")[1]

    trade_data = pd.read_parquet(file_path)

    # transform() returns a result aligned to the original row index. With
    # apply(), pandas 2.x prepends the group key to the index, which no longer
    # aligns for column assignment.
    trade_data["trade_log_return"] = trade_data.groupby("time_id")["price"].transform(log_return)

    # The first trade of each time_id has no previous price (NaN log return).
    # copy() avoids assigning new columns onto a filtered slice.
    trade_data = trade_data.dropna(subset=["trade_log_return"]).copy()

    trade_data["trade_log_return_seconds_interaction"] = trade_data["trade_log_return"] * trade_data["seconds_in_bucket"]
    trade_data["trade_log_return_size_interaction"] = trade_data["trade_log_return"] * trade_data["size"]
    trade_data["size_per_order"] = trade_data["size"] / trade_data["order_count"]
    trade_data["size_order_interaction"] = trade_data["size"] * trade_data["order_count"]

    groupby_dict = {
        "price": ["std", time_range, numeric_range, "count"],
        "size": ["mean", "std", numeric_range, "sum"],
        "trade_log_return": ["mean", "std", numeric_range, time_range, realized_volatility],
        "trade_log_return_seconds_interaction": ["std"],
        "trade_log_return_size_interaction": ["std", "mean"],
        "size_per_order": ["mean"],
        "size_order_interaction": ["mean"],
    }

    result_df = trade_data.groupby("time_id", as_index=False).agg(groupby_dict)
    # For example, "std" is NaN for a group with a single row.
    result_df = result_df.fillna(0)

    # Flatten the two-level column index into "<column>_<aggregation>"; the
    # grouping key has an empty second level, hence the strip.
    result_df.columns = ['_'.join(col).strip('_') for col in result_df.columns]

    result_df["row_id"] = result_df["time_id"].apply(lambda time_id: f"{stock_id}-{time_id}")

    result_df = result_df.drop(columns="time_id")

    return result_df

def log_return(list_stock_prices):
    """Return consecutive differences of the natural log of the prices.

    ``.diff()`` is called on the result of ``np.log``, so the input must be
    an object for which that result has a ``diff`` method (e.g. a pandas
    Series). The first element of the output is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)

def time_range(interaction_var):
    """Return the last value minus the first value, selected by position."""
    first_value = interaction_var.iloc[0]
    last_value = interaction_var.iloc[-1]
    return last_value - first_value

def numeric_range(prices):
    """Return the spread between the largest and smallest value in ``prices``."""
    highest = max(prices)
    lowest = min(prices)
    return highest - lowest

In [44]:
# prepare_submission(book_sample_list, book_test_files, trade_sample_list, trade_test_files)

In [35]:
# Run the full pipeline on every train/test partition found above: fit the
# model, save it, and write submission.csv.
prepare_submission(
    book_train_files=book_train_files,
    book_test_files=book_test_files,
    trade_train_files=trade_train_files,
    trade_test_files=trade_test_files,
)

4
8
12
16
20
24
28
32
36
40
44
48
52
56
60
64
68
72
76
80
84
88
92
96
100
103
106
109
112
1
