In [13]:
# Import the libraries required to estimate the index returns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Optiver's training set, loaded from a CSV file in the working directory
all_data = pd.read_csv(filepath_or_buffer='train.csv')

# Index data previously estimated by PCA, loaded from a parquet file
idx_rets = pd.read_parquet(path='idx_rets.gzip')

In [15]:
## Build the features that will be fed into the prediction model later.
## These are exponentially weighted means (EWM) of each stock's active return,
## i.e. the stock return minus the index return at the same timestep.

# Keep only the columns needed here. The explicit .copy() makes stock_data an
# independent frame, so adding the 'stock_rets' column below no longer raises
# pandas' SettingWithCopyWarning.
stock_data = all_data[['date_id', 'seconds_in_bucket', 'stock_id', 'wap']].copy()
target_data = all_data[['date_id', 'seconds_in_bucket', 'stock_id', 'target']]

# Simple returns from the WAP, computed within each (date_id, stock_id) group so
# that a return never spans two days or two stocks. The first row of every
# group has no predecessor and is therefore NaN.
stock_data['stock_rets'] = stock_data.groupby(['date_id', 'stock_id'])['wap'].pct_change()

# Remove all time zero rows, as they are irrelevant in return space
# (presumably these are the first rows of each group, whose return is NaN).
stock_data = stock_data[stock_data['seconds_in_bucket'] > 0]

# Prices are no longer needed once the returns exist
stock_rets = stock_data.drop(columns=['wap'])

# Attach the index returns for the same date and timestep to every stock row.
# A left join keeps every stock row even when no index row matches.
all_rets = pd.merge(left=stock_rets, right=idx_rets, how='left', on=['date_id', 'seconds_in_bucket'])

# Active return = stock return minus index return, per stock and timestep
all_rets['active_rets'] = all_rets['stock_rets'] - all_rets['idx_rets']

# EWM of the active returns for half lives of 10s, 20s, 30s, 60s and 120s.
# The half life is divided by 10 to express it in observations rather than
# seconds -- assumes consecutive rows are 10 seconds apart; TODO confirm.
ewm_half_life = [10, 20, 30, 60, 120]

# NOTE(review): the grouping is by stock_id only, so the EWM state carries over
# from one date_id to the next. If each day should start fresh, group by
# ['date_id', 'stock_id'] instead -- confirm the intended behaviour.
for half_life in ewm_half_life:
    all_rets[f'active_rets_ewm_{half_life}'] = all_rets.groupby(['stock_id'])['active_rets'].transform(
        lambda x: x.ewm(halflife=half_life / 10).mean()
    )

# Join the targets back on and keep only date_id, the active-return features and
# the target. drop() returns a new frame, so no in-place mutation is needed.
model_data = pd.merge(
    left=all_rets,
    right=target_data,
    how='left',
    on=['date_id', 'seconds_in_bucket', 'stock_id'],
).drop(columns=['seconds_in_bucket', 'stock_id', 'stock_rets', 'idx_rets'])

# Persist the modelling dataset as a gzip-compressed parquet file
model_data.to_parquet('model_data.gzip', compression='gzip')

  stock_data['stock_rets'] = stock_data.groupby(['date_id', 'stock_id'])['wap'].pct_change()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_data['stock_rets'] = stock_data.groupby(['date_id', 'stock_id'])['wap'].pct_change()
