In [None]:
# Import the libraries required to estimate the index returns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spd_matrix as spd
from sklearn.decomposition import PCA

# Load the training dataset provided by Optiver
all_data = pd.read_csv('train.csv')

# Load the PCA-estimated index returns and discard the 'Unnamed: 0' column
# (presumably the row index that was written out with the CSV -- confirm
# against whatever produced idx_rets.csv)
idx_rets = pd.read_csv('idx_rets.csv').drop(columns='Unnamed: 0')

In [None]:
## We will generate the features we want to feed into the prediction model later
## These will be the EWM versions of the active returns for each stock

# Keep only the columns needed to build the return features. Take an explicit
# copy so that adding 'stock_rets' below writes to an independent frame rather
# than to a selection of all_data (avoids pandas' SettingWithCopyWarning).
stock_data = all_data[['date_id', 'seconds_in_bucket', 'stock_id', 'wap']].copy()

# Calculate the returns from the WAP provided per stock. Grouping by
# (date_id, stock_id) means the first row of each day/stock has no previous
# price and therefore gets NaN instead of a return across days.
stock_data['stock_rets'] = stock_data.groupby(['date_id', 'stock_id'])['wap'].pct_change()

# Remove all time zero data as this is irrelevant in return space
stock_data = stock_data[stock_data['seconds_in_bucket'] > 0]

# Remove the prices now that we only need return information
stock_rets = stock_data.drop(columns=['wap'])

# Add in index returns to the stock return dataframe.
# NOTE(review): no `on=` is given, so pandas joins on every column name the two
# frames share. The schema of idx_rets is not visible here -- presumably the
# shared keys are date_id and seconds_in_bucket; confirm and pass `on=` explicitly.
all_rets = pd.merge(left=stock_rets, right=idx_rets, how='left')

# Calculate active returns per stock and timestep
all_rets['active_rets'] = all_rets['stock_rets'] - all_rets['idx_rets']

# Calculate the EWM active returns for half lives of 10s, 20s, 30s and 60s.
# Keys are the half life in seconds (used in the column name); values are the
# half life in rows passed to `ewm`.
# NOTE(review): this assumes consecutive rows of a stock within a day are 10
# seconds apart, as the original column labels imply -- confirm.
EWM_HALFLIFE_ROWS_BY_SECONDS = {10: 1, 20: 2, 30: 3, 60: 6}

# Group by day as well as stock, matching the return calculation above, so the
# smoothing restarts every day instead of carrying the end of one day into the
# start of the next.
active_rets_by_day_and_stock = all_rets.groupby(['date_id', 'stock_id'])['active_rets']

for halflife_seconds, halflife_rows in EWM_HALFLIFE_ROWS_BY_SECONDS.items():
    all_rets[f'active_rets_ewm_{halflife_seconds}'] = active_rets_by_day_and_stock.transform(
        lambda x, halflife=halflife_rows: x.ewm(halflife=halflife).mean()
    )