This notebook is used to generate the datasets used to train and test the model in a different notebook. We generate a torch tensor of shape [428932, 600, 7], where 428932 is the number of training examples, 600 is the length of each time series, and 7 is the number of channels in each entry of the time series.

In [1]:
# Mount Google Drive at /content/drive; the data paths used below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
# Seed torch's global random number generator.
torch.manual_seed(5)
import pyarrow.parquet as pq
import time
# Remove pandas' column/row display limits so frames are rendered in full.
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
# NOTE(review): presumably the largest time_id in the dataset; not referenced in the visible cells - confirm.
MAX_TIME_ID = int(3.276700e+04)
# Largest stock id (inclusive) probed when loading the per-stock parquet partitions.
MAX_STOCK_ID = 126

In [6]:
def log_return(list_stock_prices):
    """Return the element-wise log returns of a price series.

    The natural log of the prices is taken and then differenced with `.diff()`,
    so the first entry of the result is NaN (it has no predecessor).
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

In [7]:
# Load the training set: one row per (stock, time) pair together with its target volatility.
train = pd.read_csv("/content/drive/MyDrive/optiver-realized-volatility-prediction/train.csv")
train = train.astype({"stock_id": "int", "time_id": "int"})

# (n_rows, 2) integer array of [stock_id, time_id], in the same row order as `train`.
train_ids = np.column_stack((train["stock_id"].values, train["time_id"].values)).astype("int")


In [10]:
# Display the first rows of the training index as a quick sanity check.
train.head()

Unnamed: 0,stock_id,time_id,target
0,0,5,0.004136
1,0,11,0.001445
2,0,16,0.002168
3,0,31,0.002195
4,0,62,0.001747


In [15]:
# The snapshot cells below read `stock_id` and `time_id` to pick which bucket to display.
stock_id, time_id = train_ids[0] # taking the first training case

In [8]:
# Here we load all the order book data, organised by stock id, into a dictionary for faster
# lookup times. We calculate the additional features "WAP" and "BidAskSpread".
# Keys are the stock id formatted as a string.
order_books = {}
for book_stock_id in range(MAX_STOCK_ID+1):
    # A dedicated loop variable is used so the notebook-level `stock_id` (the example case
    # inspected in the snapshot cells) is not overwritten by this loop.
    try:
        book = pq.read_table("/content/drive/MyDrive/optiver-realized-volatility-prediction/book_train.parquet/stock_id={}".format(book_stock_id)).to_pandas()
    except Exception:
        # Not every id in the range has a partition on disk: report the id and move on.
        # `Exception` (rather than a bare except) keeps the cell interruptible.
        print(book_stock_id)
        continue

    book["time_id"] = book["time_id"].astype("int")
    # Weighted average price of the first book level: each side's price is weighted by the
    # size quoted on the opposite side.
    level1_size = book["bid_size1"] + book["ask_size1"]
    book["WAP"] = (book["bid_price1"] * book["ask_size1"] + book["ask_price1"] * book["bid_size1"]) / level1_size
    # Relative spread between the best ask and the best bid.
    book["BidAskSpread"] = book["ask_price1"] / book["bid_price1"] - 1
    # Store only fully processed books, so the dict never holds a half-built entry.
    order_books["{}".format(book_stock_id)] = book


       

12
24
25
45
49
54
57
65
71
79
91
92
106
117
121


In [16]:
# Below is a snapshot of the order book for a specific time id (the notebook-level `stock_id` and `time_id`).
# From this order book we will construct a time series of our desired features.
orders = order_books[f"{stock_id}"]
orders[orders["time_id"]==time_id].head()

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,WAP,BidAskSpread
0,5,0,1.001422,1.002301,1.00137,1.002353,3,226,2,100,1.001434,0.000878
1,5,1,1.001422,1.002301,1.00137,1.002353,3,100,2,100,1.001448,0.000878
2,5,5,1.001422,1.002301,1.00137,1.002405,3,100,2,100,1.001448,0.000878
3,5,6,1.001422,1.002301,1.00137,1.002405,3,126,2,100,1.001443,0.000878
4,5,7,1.001422,1.002301,1.00137,1.002405,3,126,2,100,1.001443,0.000878


In [9]:
# Here we load the trade book data into a dictionary keyed by the stock id formatted as a string.
trade_books = {}

for trade_stock_id in range(MAX_STOCK_ID+1):
    # A dedicated loop variable is used so the notebook-level `stock_id` (the example case
    # inspected in the snapshot cells) is not overwritten by this loop.
    try:
        trade_book = pq.read_table("/content/drive/MyDrive/optiver-realized-volatility-prediction/trade_train.parquet/stock_id={}".format(trade_stock_id)).to_pandas()
    except Exception:
        # Not every id in the range has a partition on disk: report the id and move on.
        # `Exception` (rather than a bare except) keeps the cell interruptible.
        print(trade_stock_id)
        continue

    trade_book["time_id"] = trade_book["time_id"].astype("int")
    trade_books["{}".format(trade_stock_id)] = trade_book


12
24
25
45
49
54
57
65
71
79
91
92
106
117
121


In [17]:
# Below is a snapshot of the trade book for a specific time id (the notebook-level `stock_id` and `time_id`).
trades = trade_books[f"{stock_id}"]
trades[trades["time_id"]==time_id].head()

Unnamed: 0,time_id,seconds_in_bucket,price,size,order_count
0,5,21,1.002301,326,12
1,5,46,1.002778,128,4
2,5,50,1.002818,55,1
3,5,57,1.003155,121,5
4,5,68,1.003646,4,1


In [None]:
# to_arr takes in the trade book and order book for a given time, concatenates them according to the time index, fills in the NaN values and then
# takes out our desired features. It also calculates the volatility of the 10 minute period to be used in order to forecast the volatility of the
# next period in the model. The output is a numpy array of shape (num_entries, num_channels), returned together with that volatility.

def to_arr(orders, trades, features):
    """Merge one bucket's order book and trade book into a feature matrix.

    Parameters
    ----------
    orders : pandas.DataFrame
        Order-book rows; must contain the columns "seconds_in_bucket" and "WAP".
    trades : pandas.DataFrame
        Trade rows; must contain "seconds_in_bucket", "size" and "order_count".
    features : list of str
        Names of the columns to keep, in output order.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The feature matrix of shape (num_entries, len(features)) and a 0-d array
        holding the last value of the running "vol" column (0.0 when the merged
        frame has no rows).
    """
    orders = orders.set_index("seconds_in_bucket")
    orders["log return"] = log_return(orders["WAP"])
    # Running volatility: square root of the cumulative sum of squared log returns.
    orders["vol"] = np.sqrt((orders["log return"]**2).cumsum())

    trades = trades.set_index("seconds_in_bucket")
    # Trade size and order count are turned into running totals.
    trades["size"] = trades["size"].cumsum()
    trades["order_count"] = trades["order_count"].cumsum()

    # Align both books on seconds_in_bucket; rows present in only one book get NaNs.
    df = pd.concat([orders, trades], axis=1)
    df["seconds_in_bucket"] = df.index.values
    # Carry the last known value forward, then zero-fill whatever precedes the first
    # observation. `.ffill()` replaces the deprecated `fillna(method="ffill")`.
    df = df.ffill().fillna(0)

    # Read the final volatility before selecting `features`, so it is available even
    # when "vol" is not one of the requested features.
    rv = df["vol"].values[-1] if len(df) else 0.0

    return df[features].values, np.array(rv)

In [None]:
# from the output of to_arr, we generate a time series using array slicing and pad the array using the final values such that each time series is of the same length
# returns an array of shape (600, num_channels)
def create_time_series(arr, max_len=600):
    """Truncate or pad a (num_entries, num_channels) array to a fixed length.

    Parameters
    ----------
    arr : numpy.ndarray
        2-D array of shape (num_entries, num_channels).
    max_len : int
        Number of time steps in the output.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, max_len, num_channels). The first
        min(num_entries, max_len) steps are copied from `arr`; any remaining
        steps repeat the last copied row. An empty `arr` yields all zeros.
    """
    num_rows, num_features = arr.shape

    X = np.zeros((1, max_len, num_features))

    # Slice assignment replaces the per-step try/except loop: same result, but an
    # unexpected error is no longer silently turned into padding.
    num_copied = min(num_rows, max_len)
    X[0, :num_copied, :] = arr[:num_copied, :]

    if 0 < num_copied < max_len:
        # Pad the tail with the final observed row.
        X[0, num_copied:, :] = arr[num_copied - 1, :]

    return X
    
    

In [None]:
# returns the time series for a given stock and time id, as well as the RV calculated in to_arr, and the categorical stock and time entry

def generate_training_data(stock_id, time_id, orders, trades, features):
    """Build the model inputs for a single (stock_id, time_id) bucket.

    Returns a tuple of:
      * the padded time series produced by `create_time_series` with max_len=600,
      * the categorical ids as an integer array of shape (1, 2): [stock_id, time_id],
      * the volatility returned by `to_arr`, reshaped to (1, 1).
    """
    feature_rows, realized_vol = to_arr(orders, trades, features)
    series = create_time_series(feature_rows, max_len=600)

    ids = np.array([int(stock_id), int(time_id)])
    ids = ids.reshape(-1, 2)

    realized_vol = realized_vol.reshape(-1, 1)
    return series, ids, realized_vol

In [None]:
def get_data(features, size = len(train)):
    """Assemble the dataset tensors for the first `size` training cases and save them.

    Parameters
    ----------
    features : list of str
        Feature columns forwarded to `generate_training_data`.
    size : int
        Number of training cases to process. The default is evaluated once, when
        this function is defined, from the `train` frame.

    Returns
    -------
    torch.Tensor
        `X`, of shape (size, 600, len(features)).

    Side effects
    ------------
    Saves `X`, `X_cat` (stock_id, time_id), `RV` (volatility from `to_arr`) and
    `Y` (target) under /content/drive/MyDrive with `torch.save`.
    """
    X = torch.zeros((size, 600, len(features)))
    X_cat = torch.zeros((size, 2))
    RV = torch.zeros((size, 1))
    Y = torch.zeros((size, 1))

    # Assumes `train_ids` is row-aligned with `train` (it is filled column-by-column
    # from `train` in the loading cell), so the target of case k is targets[k]. This
    # avoids scanning the whole `train` frame once per case.
    targets = train["target"].to_numpy()

    # Report progress roughly every 1%; never 0, so small `size` values do not
    # trigger a modulo-by-zero.
    report_every = max(1, size // 100)

    s = time.time()

    for done, (i, j) in enumerate(train_ids[:size], start=1):
        row = done - 1

        orders = order_books[f"{i}"]
        trades = trade_books[f"{i}"]
        orders = orders[orders["time_id"]==j]
        trades = trades[trades["time_id"]==j]
        x, x_cat, rv = generate_training_data(stock_id=i, time_id=j, orders=orders, trades=trades, features=features)

        X[row,:,:] = torch.from_numpy(x).float()
        X_cat[row,:] = torch.from_numpy(x_cat).long()
        Y[row, 0] = float(targets[row])
        RV[row,:] = torch.from_numpy(rv).float()

        if done % report_every == 0:
            d = time.time()-s
            print("duration to complete {}/{} = {} seconds".format(done, size, d))

    # NOTE: the file name below embeds the string form of the `features` list; it is
    # left unchanged so existing consumers of the saved file keep working.
    torch.save(X, f"/content/drive/MyDrive/{features}.pt") # saves training data to drive
    torch.save(X_cat, "/content/drive/MyDrive/X_cat.pt")
    torch.save(RV, "/content/drive/MyDrive/RV.pt")
    torch.save(Y, "/content/drive/MyDrive/Y.pt")
    print("save complete")

    return X
    

In [None]:
############### HYPERPARAMETERS #################

# Here we specify the features we want in our training set and how many training cases we want to consider (mainly for memory purposes).
# `features` fixes both the selection and the order of the channels; `size = len(train)` processes every training case.

features = ["seconds_in_bucket","vol", "WAP", "BidAskSpread", "log return", "order_count", "size"]
size = len(train)



In [None]:
# Build the tensors for the chosen features/size; get_data also saves them to Drive.
X = get_data(size=size, features = features) 

Below are various transformations that were experimented with.

In [None]:
def shorten(X, num):
    """Keep, for each series, the last `num` rows that precede its padded tail.

    Channel 0 is used as the timestamp: the first position whose timestamp equals
    the previous one marks the start of the padded tail. A series with no such
    repeat is treated as having no padding (previously such a series was left as
    all zeros).

    Parameters
    ----------
    X : torch.Tensor
        Tensor of shape (num_series, series_len, num_channels).
    num : int
        Length of the shortened series.

    Returns
    -------
    torch.Tensor
        Tensor of shape (num_series, num, num_channels). When a series has fewer
        than `num` distinct timestamps, its rows are placed first and the rest is
        filled with the series' last row.
    """
    print(f"---------------------- num = {num} ---------------------------")

    new = torch.zeros(X.shape[0], num, X.shape[2])

    # Report roughly every 10%; never 0, so fewer than 10 series do not trigger a
    # modulo-by-zero.
    report_every = max(1, X.shape[0] // 10)

    s = time.time()

    for i in range(X.shape[0]):
        timestamps = X[i, :, 0]

        # Index of the first row that repeats the previous timestamp; if there is
        # none, every row is real data.
        repeated = torch.nonzero(timestamps[1:] == timestamps[:-1])
        j = int(repeated[0, 0]) + 1 if repeated.numel() else X.shape[1]

        num_unique = len(timestamps.unique())

        if num_unique < num:
            # Too few real rows: start from `num` copies of the last row and
            # overwrite the head with the real rows.
            aux = X[i, -1, :] * torch.ones(1, num, X.shape[2])
            aux[0, :num_unique, :] = X[i, :j, :]
            new[i, :, :] = aux[0]
        else:
            new[i, :, :] = X[i, j - num:j, :]

        if (i + 1) % report_every == 0:
            d = time.time() - s
            print("duration to complete {}/{} = {} seconds".format(i+1, X.shape[0], d))

    return new

In [None]:
import copy
import math
from scipy.ndimage import shift
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import as_float_array

class AddTime(BaseEstimator, TransformerMixin):
    """Prepend an evenly spaced time channel to every path.

    Parameters
    ----------
    init_time : float
        Time assigned to the first point of each path.
    total_time : float
        Duration covered by each path; the last point gets
        `init_time + total_time`.
    """

    def __init__(self, init_time=0., total_time=1.):
        self.init_time = init_time
        self.total_time = total_time

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform_instance(self, X):
        """Return one path with the time channel added as its first column."""
        # `total_time` was previously ignored (the end point was hard-coded to
        # init_time + 1); the default total_time=1. gives the same result as before.
        t = np.linspace(self.init_time, self.init_time + self.total_time, len(X))
        return np.c_[t, X]

    def transform(self, X, y=None):
        """Apply `transform_instance` to each path and return the results as a list."""
        return [self.transform_instance(x) for x in X]

class LeadLag(BaseEstimator, TransformerMixin):
    """Lead-lag transform applied path by path.

    For a path x0, x1, ..., xn the lag stream is x0, x0, x1, x1, ..., xn and the
    lead stream is x0, x1, x1, x2, ..., xn; both are returned side by side.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform_instance(self, X):
        """Return the lag and lead streams of one path, joined column-wise with `np.c_`."""
        lag, lead = [], []

        for current, upcoming in zip(X[:-1], X[1:]):
            # Each step contributes two rows: (current, current) then (current, upcoming).
            lag.extend((current, current))
            lead.extend((current, upcoming))

        # Both streams end on the final point of the path.
        final = X[-1]
        lag.append(final)
        lead.append(final)

        return np.c_[lag, lead]

    def transform(self, X, y=None):
        """Apply `transform_instance` to each path and return the results as a list."""
        return list(map(self.transform_instance, X))

def transform(paths, at=False, ll=False, scale=1.):
    """Scale a collection of paths and optionally apply the path transforms.

    The paths are multiplied by `scale`; when `ll` is true the `LeadLag`
    transform is applied, and when `at` is true the `AddTime` transform is
    applied afterwards. The result is returned as a numpy array.
    """
    result = scale*paths

    # Order matters: lead-lag first, then the time channel.
    steps = []
    if ll:
        steps.append(LeadLag)
    if at:
        steps.append(AddTime)

    for step in steps:
        result = step().fit_transform(result)

    return np.array(result)


In [None]:
def lead_lag(x):
    """Apply the lead-lag transform to every series of a batch.

    Parameters
    ----------
    x : torch.Tensor
        Tensor of shape (num_series, series_len, num_channels).

    Returns
    -------
    torch.Tensor
        Tensor of shape (num_series, 2*series_len - 1, 2*num_channels - 1): each
        channel is expanded into its lag/lead pair and the first resulting
        column is dropped.
    """
    num_series, series_len, num_channels = x.shape
    out_len = 2*series_len - 1

    new = torch.zeros(num_series, out_len, 2*num_channels - 1)

    # Report roughly every 0.2%; never 0, so fewer than 500 series do not trigger
    # a modulo-by-zero.
    report_every = max(1, num_series // 500)

    s = time.time()

    for i in range(num_series):
        # Transform series i (previously series 0 was transformed on every
        # iteration, so all outputs were copies of the first sample).
        # `transform` treats each row as one path, hence the transpose.
        per_channel = transform(torch.transpose(x[i], dim0=1, dim1=0), ll=True)

        # Put time on the first axis and flatten the per-channel lag/lead pairs.
        flattened = np.transpose(per_channel, (1, 0, 2)).reshape(out_len, -1)

        new[i, :, :] = torch.from_numpy(flattened[:, 1:])

        if (i + 1) % report_every == 0:
            d = time.time() - s
            print("duration to complete {}/{} = {} seconds".format(i+1, num_series, d))

    return new