### Target 
According to the paper, "[t]he five-day moving average predicts log dollar volume with an R2 of 93.68%, higher than the one-day lag (92.53%), moving average of 22 days (92.60%), or 252 days (86.12%)". <br>
To validate my model's effectiveness, I compute the R² of the same moving-average predictors (1, 5, 22 and 252 days) on my own data as an additional benchmark.

### Result: r-square
Computed on my test date range (see the table further down):

- ma1 = 0.912950
- ma5 = 0.923016
- ma22 = 0.912761
- ma252 = 0.885814


In [1]:
import numpy as np
import pandas as pd
import datetime
import pyarrow.parquet as pq
import os
import matplotlib.pyplot as plt

# 1. read data file names
folder_path = "F:/predictors_v2"
# Keep only the parquet files found in the data folder.
filenames = [
    entry for entry in os.listdir(folder_path) if entry.endswith(".parquet")
]

filenames

['tech.parquet',
 'mkt_cap.parquet',
 'style_factors.parquet',
 'calendar_date.parquet',
 'release_schedule.parquet',
 'industry_factors.parquet',
 'final_dataset.parquet',
 'pre_train_dataset.parquet',
 'agg_industry_factors.parquet']

In [2]:
# 2. load data and write data functions
import sys
sys.path.append("..")

from src.model_training_utils import read_data
# using pyarrow.parquet.read_table to load data
# columns and filters are useful to save memory


# Step 1: Load data
# Only the keys, the target and the lagged moving-average predictors are read.
benchmark_columns = [
    "date",
    "isin",
    "lag_logvol_ma1",
    "lag_logvol_ma5",
    "lag_logvol_ma22",
    "lag_logvol_ma252",
    "log_adj_volume",
]
final_dataset = read_data(
    filename="final_dataset", columns=benchmark_columns, folder_path=folder_path
).sort_values(by=["date", "isin"], ignore_index=True)

# Step 2: Split train, validation (create cross validation splitter) and test datasets
dates_list = final_dataset["date"].unique()
dates_list.sort()
num_of_days = len(dates_list)

step = 60  # step in window movement
h = 60  # time horizon for validation dataset
# 60% of the dates are used for training and validation
trainval_test_threshold = int(num_of_days * 0.6)
# the window size of the 1st train dataset
initial_threshold = int(trainval_test_threshold / 3)

# Update the split threshold of train_validation and test: after the first
# train window plus one validation horizon, keep only a whole number of steps.
first_fold_end = initial_threshold + h
whole_steps = (trainval_test_threshold - first_fold_end) // step
trainval_test_threshold = whole_steps * step + h + initial_threshold

# test_dates are the dates used for testing (out-of-sample datasets)
test_dates = dates_list[trainval_test_threshold:]
# Create the test_filter, an input for model training
# (both end points of the test range are included).
test_filter = final_dataset["date"].between(test_dates[0], test_dates[-1])

In [3]:
# 1. Compute the metrics over the time range of my test dataset
# test time range (first and last out-of-sample date):
first_test_date, last_test_date = test_dates[0], test_dates[-1]
first_test_date, last_test_date

(datetime.date(2022, 3, 25), datetime.date(2023, 12, 29))

In [4]:
# metrics of test dataset: keep only the rows inside the test date range
test_dataset = final_dataset.loc[test_filter.to_numpy()]

def compute_metrics(test_dataset, days):
    """Score the ``days``-day lagged moving average as a predictor of log volume.

    Parameters
    ----------
    test_dataset : pandas.DataFrame
        Must contain the prediction column ``lag_logvol_ma{days}`` and the
        target column ``log_adj_volume``.
    days : int
        Moving-average window; selects the prediction column by name.

    Returns
    -------
    tuple
        ``(mse, r2)`` where ``mse`` is the mean squared prediction error and
        ``r2 = 1 - mse / mean((target - mean(target)) ** 2)``.

    Notes
    -----
    Both terms are computed on the same rows: those where the prediction and
    the target are both present. pandas' ``mean`` skips NaN, so without this
    alignment the MSE and the target variance would be averaged over
    different samples whenever the prediction column contains NaN.
    """
    prediction = test_dataset[f"lag_logvol_ma{days}"]
    target = test_dataset["log_adj_volume"]

    # Restrict both series to the rows that can actually be scored.
    scored = prediction.notna() & target.notna()
    prediction = prediction[scored]
    target = target[scored]

    mse = ((prediction - target) ** 2).mean()
    target_variance = ((target - target.mean()) ** 2).mean()
    r2 = 1 - mse / target_variance
    return mse, r2

# One row per moving-average window, scored on the test dataset.
ma_windows = [1, 5, 22, 252]
pd.DataFrame(
    [compute_metrics(test_dataset, window) for window in ma_windows],
    columns=["MSE", "R2"],
    index=[f"ma{window}" for window in ma_windows],
)

Unnamed: 0,MSE,R2
ma1,0.229697,0.91295
ma5,0.203137,0.923016
ma22,0.230197,0.912761
ma252,0.301302,0.885814


In [5]:
# 2. Compute the metrics using the same time range as the test dataset in their paper
# the values can be different because the universes are different.
window_start = datetime.date(2021, 1, 1)
window_end = datetime.date(2023, 1, 1)
# Both bounds are exclusive.
in_window = (dates_list > window_start) & (dates_list < window_end)
dates_21 = dates_list[in_window]
# test time range:
dates_21[0], dates_21[-1]

(datetime.date(2021, 1, 4), datetime.date(2022, 12, 30))

In [6]:
# compute metrics on the rows whose date falls inside the dates_21 range
# (both end points included)
dates_21_filter = final_dataset["date"].between(dates_21[0], dates_21[-1])
dates_21_dataset = final_dataset.loc[dates_21_filter.to_numpy()]

# One row per moving-average window.
paper_windows = [1, 5, 22, 252]
pd.DataFrame(
    [compute_metrics(dates_21_dataset, window) for window in paper_windows],
    columns=["MSE", "R2"],
    index=[f"ma{window}" for window in paper_windows],
)

Unnamed: 0,MSE,R2
ma1,0.23031,0.914587
ma5,0.204476,0.924168
ma22,0.229573,0.91486
ma252,0.304959,0.886903


In [7]:
# Compute daily R^2 statistics for the 5-day moving average:
# score each date's rows separately.
daily_metrics = dates_21_dataset.groupby(["date"]).apply(
    lambda day_rows: compute_metrics(day_rows, 5)
)
# compute_metrics returns (mse, r2); keep the R^2 element for every date.
daily_r2 = daily_metrics.apply(lambda mse_and_r2: mse_and_r2[1])
daily_r2.mean()

0.9207606425230866

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-date R^2 values.
daily_r2.describe()

count    502.000000
mean       0.920761
std        0.066062
min        0.135005
25%        0.922528
50%        0.934215
75%        0.941083
max        0.959644
dtype: float64