In [None]:
# Import the libraries required to estimate the index returns
import pandas as pd
import numpy as np
import spd_matrix as spd
from sklearn.decomposition import PCA

# Read in the training dataset provided by Optiver
all_data = pd.read_csv('train.csv')

In [4]:
## This is an attempt to estimate the index time series by performing PCA on the stock data
## It used the weekly index to estimate the covariance matrix - this ensures we remove market noise
## The factor loadings then seem to work as expected and can be thought of as index weights per stock
## Clearly the assumption here is that the index weights remain stable throughout the dataset
## This assumption is not true, so these weights can be thought of as average index weights over the period

# For each week, calculate the stock returns and estimate the covariance matrix - use this covariance matrix in the PCA 
# to then estimate factor loadings of the first factor (likely the market factor) to each of the stocks
end_of_day_data = all_data[all_data['seconds_in_bucket'] == 540]
end_of_week_data = end_of_day_data[end_of_day_data['date_id'] % 5 == 0]
weekly_ts_prices = end_of_week_data.pivot(index='date_id', columns='stock_id', values = 'wap')
weekly_ts_returns = weekly_ts_prices.pct_change(1)
weekly_ts_returns = weekly_ts_returns.iloc[1:]

# Once we have the returns we need to z-score the returns per stock to ensure the PCA does not "reward" more volatile ones
weekly_ts_returns_zcore = (weekly_ts_returns - weekly_ts_returns.mean()) / weekly_ts_returns.std(ddof=0)

# Some of the stocks have missing data for some dates/weeks - we will use the pd.corr() function
# This function ignores the missing data and estimates the pairwise correlation which is helpful
cov_matrix = weekly_ts_returns.cov()

# Given the corr() function estimates pairwise correlation ignoring missing data, it might not be PD
# Use the function below to get the nearest PD matrix to the original one before any PCA is done
cov_matrix_spd = spd.nearestPD(cov_matrix)

# Perform the PCA on this correlation matrix to get the factor loadings of the first factor
# This first factor should be the market factor and then we can normalise loadings to get index weights
num_assets = len(weekly_ts_returns.columns)
pca = PCA(n_components=num_assets, svd_solver='full')
pca.fit(cov_matrix_spd)
factor_loadings_cov = pca.components_.T * np.sqrt(pca.explained_variance_)
factor_loadings_mkt_cov = factor_loadings_cov[:, 0]
factor_loadings_mkt_cov = factor_loadings_mkt_cov / sum(factor_loadings_mkt_cov)

  weekly_ts_returns = weekly_ts_prices.pct_change(1)


In [6]:
## The covariance approach using weekly data provided the best result in terms of index weights (all positive)
## We will now use these weights to estimate the index returns for each timestep in the original dataset

# Initialise empty variable to assign index returns to later
max_dates = int(max(all_data['date_id']) + 1)
timesteps_per_day = int(max(all_data['seconds_in_bucket']) / 10)
idx_returns = np.empty((max_dates * timesteps_per_day, 3,))
idx_returns[:, 1] = list(range(10, max(all_data['seconds_in_bucket']) + 1, 10)) * max_dates

# Loop through each date to calculate index returns - we do this due to missing stock data, etc.
for date_id in range(max(all_data['date_id']) + 1):

    date_data = all_data[all_data['date_id'] == date_id]
    date_ts_prices = date_data.pivot(index='time_id', columns='stock_id', values='wap')
    date_ts_returns = date_ts_prices.pct_change(1)
    date_ts_returns = date_ts_returns.iloc[1:]

    # In case of any missing data, remove all data for that stock
    date_ts_returns.dropna(axis=1,how='any',inplace=True)
    valid_stock_ids = sorted(date_ts_returns.columns)

    # Only get the factor loadings for the stocks with data in this timestep
    valid_factor_loadings = factor_loadings_mkt_cov[valid_stock_ids]

    # Normalise the factor loadings for the stocks with data
    norm_factor_loadings = valid_factor_loadings / sum(valid_factor_loadings)

    # Calculate the index returns based on stock returns and factor loadings
    date_idx_contr = date_ts_returns * norm_factor_loadings.T
    date_idx_returns = date_idx_contr.sum(axis=1)

    # Assign output to the index return variable 
    idx_returns[timesteps_per_day * date_id : timesteps_per_day * (date_id + 1), 0] = date_id
    idx_returns[timesteps_per_day * date_id : timesteps_per_day * (date_id + 1), 2] = date_idx_returns

idx_rets_df = pd.DataFrame(idx_returns)
idx_rets_df.rename(columns={0: 'date_id', 1: 'seconds_in_bucket', 2: 'idx_rets'},inplace=True)

idx_rets_df.to_parquet('idx_rets.gzip', compression='gzip')

  date_ts_returns = date_ts_prices.pct_change(1)
  date_ts_returns = date_ts_prices.pct_change(1)
  date_ts_returns = date_ts_prices.pct_change(1)
  date_ts_returns = date_ts_prices.pct_change(1)


In [None]:
## The cells below this are ones which were used to test other ways of estimating the index weights
## They did not work as well as the covariance approach using weekly stock data

In [None]:
## This is an attempt to estimate the index time series by performing PCA on the stock data
## It used the weekly index to estimate the correlation matrix - this ensures we remove market noise
## The factor loadings then seem to work as expected and can be thought of as index weights per stock
## Clearly the assumption here is that the index weights remain stable throughout the dataset
## This assumption is not true, so these weights can be thought of as average index weights over the period

# For each week, calculate the stock returns and estimate the correlation matrix - use this correlation matrix in the PCA 
# to then estimate factor loadings of the first factor (likely the market factor) to each of the stocks
end_of_day_data = all_data[all_data['seconds_in_bucket'] == max(all_data['seconds_in_bucket'])]
end_of_week_data = end_of_day_data[end_of_day_data['date_id'] % 5 == 0]
weekly_ts_prices = end_of_week_data.pivot(index='date_id', columns='stock_id', values = 'wap')
weekly_ts_returns = weekly_ts_prices.pct_change(1)
weekly_ts_returns = weekly_ts_returns.iloc[1:]

# Some of the stocks have missing data for some dates/weeks - we will use the pd.corr() function
# This function ignores the missing data and estimates the pairwise correlation which is helpful
corr_matrix = weekly_ts_returns.corr()

# Given the corr() function estimates pairwise correlation ignoring missing data, it might not be PD
# Use the function below to get the nearest PD matrix to the original one before any PCA is done
corr_matrix_spd = spd.nearestPD(corr_matrix)

# Perform the PCA on this correlation matrix to get the factor loadings of the first factor
# This first factor should be the market factor and then we can normalise loadings to get index weights
num_assets = len(weekly_ts_returns.columns)
pca = PCA(n_components=num_assets, svd_solver='full')
pca.fit(corr_matrix_spd)
factor_loadings_corr = pca.components_.T * np.sqrt(pca.explained_variance_)
factor_loadings_mkt_corr = factor_loadings_corr[:, 0]
factor_loadings_mkt_corr = factor_loadings_mkt_corr / sum(factor_loadings_mkt_corr)

In [None]:
## This was an attempt to estimate the index time series by performing PCA on the stock data
## It used the 10s tick data to estimate the correlation matrix - this tick data will have a lot of noise
## The factor loadings did not work as expected due to this noise

# For each date, calculate the stock returns and estimate the correlation matrix - use this correlation matrix in the PCA 
# to then estimate factor loadings of the first factor (likely the market factor) to each of the stocks
max_dates = max(all_data['date_id']) + 1
max_assets = max(all_data['stock_id']) + 1
daily_loadings_mkt = np.empty((max_dates, max_assets,))
daily_loadings_mkt[:] = np.nan

for date_id in range(max(all_data['date_id']) + 1):
    date_data = all_data[all_data['date_id'] == date_id]
    # Pivot the dataframe to create a wap time series for all the stocks
    date_ts_prices = date_data.pivot(index='time_id', columns='stock_id', values = 'wap')
    # Check for any empty columns and remove them
    date_ts_prices.dropna(how='all', axis=1, inplace=True)   
    stock_ids = sorted(date_ts_prices.columns.unique())
    num_assets = len(date_ts_prices.columns)
    date_ts_returns = date_ts_prices.pct_change(1)
    date_ts_returns = date_ts_returns.iloc[1:]
    corr_matrix = date_ts_returns.corr()
    pca = PCA(n_components=num_assets, svd_solver='full')
    pca.fit(corr_matrix)
    factor_loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
    factor_loadings_mkt = factor_loadings[:, 0]
    factor_loadings_mkt = factor_loadings_mkt / sum(factor_loadings_mkt)
    daily_loadings_mkt[date_id, stock_ids] = factor_loadings_mkt