### Target 
Build a single dataset that combines all the predictors and the dependent variables.

### Result
Saved as 'final_dataset' (written to `final_dataset.parquet`).

In [1]:
import numpy as np
import pandas as pd
import datetime
import pyarrow.parquet as pq
import os
import matplotlib.pyplot as plt

# 1. Collect the names of the parquet files found in the predictors folder
folder_path = "F:/predictors"
save_folder_path = "F:/predictors"
filenames = [
    entry for entry in os.listdir(folder_path) if entry.endswith(".parquet")
]

filenames

['final_universe.parquet',
 'tech.parquet',
 'mkt_cap.parquet',
 'style_factors.parquet',
 'industry_factors.parquet',
 'calendar_date.parquet',
 'release_schedule.parquet',
 'final_dataset.parquet']

In [2]:
# 2. load data and write data functions
from model_training_utils import read_data
# using pyarrow.parquet.read_table to load data
# columns and filters are useful to save memory
    
def write_data(df, filename):
    """Write ``df`` to ``<save_folder_path>/<filename>.parquet`` with the pyarrow engine.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    filename : str
        File name without the ``.parquet`` extension.

    Returns
    -------
    Whatever ``DataFrame.to_parquet`` returns for a path target.

    Notes
    -----
    ``save_folder_path`` is read from the notebook's global scope.
    """
    target_path = f"{save_folder_path}/{filename}.parquet"
    return df.to_parquet(target_path, engine='pyarrow')

In [3]:
# 3. Load data
# Every table is read from `folder_path` through read_data, in the order listed.
(
    final_universe,    # russell 3000
    calendar_date,     # calendar_date
    release_schedule,
    industry_factors,  # barra
    style_factors,
    mkt_cap,           # pricing and volumes
    tech,
) = (
    read_data(filename=table_name, folder_path=folder_path)
    for table_name in (
        "final_universe",
        "calendar_date",
        "release_schedule",
        "industry_factors",
        "style_factors",
        "mkt_cap",
        "tech",
    )
)

In [None]:
# 4. Shift values by 1 to create lag 1 version of these predictors
def create_lag(df, col_name_list):
    """Replace the given columns with their one-row lag within each ``isin``.

    The frame is first sorted by ``["isin", "date"]`` and given a fresh
    0..n-1 index. Each column in ``col_name_list`` is then shifted down by one
    row inside its ``isin`` group, stored as ``lag_<name>``, and the original
    column is dropped. The first row of every ``isin`` group therefore has NaN
    in all ``lag_*`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``isin``, ``date`` and every column in ``col_name_list``.
        It is not modified.
    col_name_list : list
        Names of the columns to lag.

    Returns
    -------
    pandas.DataFrame
        Sorted copy holding the untouched columns in their original order,
        followed by the ``lag_*`` columns in ``col_name_list`` order.

    Raises
    ------
    KeyError
        If ``isin``, ``date`` or a requested column is missing.

    Notes
    -----
    The lag is by row position within the group, not by calendar distance:
    a gap in an isin's dates still produces a one-row shift.
    """
    lag_columns = list(col_name_list)
    ordered = df.sort_values(["isin", "date"], ignore_index=True)
    if not lag_columns:
        # Nothing to lag: return the sorted copy unchanged.
        return ordered
    # Shift all requested columns in a single grouped pass instead of
    # rebuilding the groupby and inserting one column per loop iteration.
    lagged = ordered.groupby("isin")[lag_columns].shift(1).add_prefix("lag_")
    return pd.concat([ordered.drop(columns=lag_columns), lagged], axis=1)

# market cap are not in the same scale as others. we need to adjust it
# Only strictly positive caps are logged: np.log(0) is -inf, which
# DataFrame.dropna() does not treat as missing, so it would survive the later
# dropna() on the merged predictors. Non-positive caps become NaN instead.
# The loaded frame is no longer mutated in place; `mkt_cap` is still rebound to
# the lagged frame, so re-running this cell requires re-loading the data first.
positive_mkt_cap = mkt_cap["mkt_cap"].where(mkt_cap["mkt_cap"] > 0)
mkt_cap = create_lag(
    mkt_cap[["date", "isin"]].assign(log_mkt_cap=np.log(positive_mkt_cap)),
    ["log_mkt_cap"],
)

# Lag the four calendar flag columns by one row per isin.
calendar_date = create_lag(
    df=calendar_date,
    col_name_list=[
        'early_close',
        'triple_witch',
        'double_witch',
        'russell_rebalance',
    ],
)

# Lag the listed style-factor columns by one row per isin.
# EFMUSATRD_CARBONEFF is not in the list, so it stays unlagged in the frame.
style_factors = create_lag(
    df=style_factors,
    col_name_list=[
        'EFMUSATRD_BETA',
        'EFMUSATRD_CROWD',
        'EFMUSATRD_DIVYILD',
        'EFMUSATRD_EARNQLTY',
        'EFMUSATRD_EARNVAR',
        'EFMUSATRD_EARNYILD',
        'EFMUSATRD_ESG',
        'EFMUSATRD_GROWTH',
        'EFMUSATRD_INDMOM',
        'EFMUSATRD_INVSQLTY',
        'EFMUSATRD_LEVERAGE',
        'EFMUSATRD_LIQUIDTY',
        'EFMUSATRD_LTREVRSL',
        'EFMUSATRD_MIDCAP',
        'EFMUSATRD_MLFAC',
        'EFMUSATRD_MOMENTUM',
        'EFMUSATRD_PROFIT',
        'EFMUSATRD_RESVOL',
        'EFMUSATRD_SEASON',
        'EFMUSATRD_SENTMT',
        'EFMUSATRD_SHORTINT',
        'EFMUSATRD_SIZE',
        'EFMUSATRD_STREVRSL',
        'EFMUSATRD_VALUE',
    ],
)

# Lag the listed industry-factor columns by one row per isin.
industry_factors = create_lag(
    df=industry_factors,
    col_name_list=[
        'EFMUSATRD_AERODEF', 'EFMUSATRD_AIRLINES',
        'EFMUSATRD_ALUMSTEL', 'EFMUSATRD_APPAREL',
        'EFMUSATRD_AUTO', 'EFMUSATRD_BANKS',
        'EFMUSATRD_BEVTOB', 'EFMUSATRD_BIOLIFE',
        'EFMUSATRD_BLDGPROD', 'EFMUSATRD_CHEM',
        'EFMUSATRD_CNSTENG', 'EFMUSATRD_CNSTMACH',
        'EFMUSATRD_CNSTMATLPAPER', 'EFMUSATRD_COMMEQP',
        'EFMUSATRD_COMPELEC', 'EFMUSATRD_COMSVCS',
        'EFMUSATRD_CONGLOM', 'EFMUSATRD_CONTAINR',
        'EFMUSATRD_DISTRIB', 'EFMUSATRD_DIVFIN',
        'EFMUSATRD_ELECEQP', 'EFMUSATRD_ELECUTIL',
        'EFMUSATRD_ENERGYEQ', 'EFMUSATRD_FOODPROD',
        'EFMUSATRD_FOODRET', 'EFMUSATRD_GASUTIL',
        'EFMUSATRD_HLTHEQP', 'EFMUSATRD_HLTHSVCS',
        'EFMUSATRD_HOMEBLDG', 'EFMUSATRD_HOUSEDUR',
        'EFMUSATRD_INDMACH', 'EFMUSATRD_INSURNCE',
        'EFMUSATRD_INTERNET', 'EFMUSATRD_LEISPROD',
        'EFMUSATRD_LEISSVCS', 'EFMUSATRD_LIFEINS',
        'EFMUSATRD_MEDIA', 'EFMUSATRD_MGDHLTH',
        'EFMUSATRD_MULTUTIL', 'EFMUSATRD_NETRET',
        'EFMUSATRD_OILGSCON', 'EFMUSATRD_OILGSEXP',
        'EFMUSATRD_PHARMA', 'EFMUSATRD_PRECMTLS',
        'EFMUSATRD_PSNLPROD', 'EFMUSATRD_REALEST',
        'EFMUSATRD_RESTAUR', 'EFMUSATRD_RLESTMNG',
        'EFMUSATRD_ROADRAIL', 'EFMUSATRD_SEMICOND',
        'EFMUSATRD_SEMIEQP', 'EFMUSATRD_SOFTWARE',
        'EFMUSATRD_SPLTYRET', 'EFMUSATRD_SPTYCHEM',
        'EFMUSATRD_SPTYSTOR', 'EFMUSATRD_TELECOM',
        'EFMUSATRD_TRADECO', 'EFMUSATRD_TRANSPRT',
    ],
)

# Lag every release-schedule bucket column by one row per isin.
release_schedule = create_lag(
    df=release_schedule,
    col_name_list=[
        '-1', '-2', '-3',
        '0',
        '1', '2', '3', '4',
        '≤-4', '≥5',
    ],
)

In [6]:
# Lag the rolling return / log-volume features by one row per isin.
# create_lag returns a frame sorted by (isin, date) with a fresh 0..n-1 index and
# keeps every column that is not lagged, so `date`, `isin` and the same-day
# `log_adj_volume` are already present and row-aligned with the lag_* columns.
# They must NOT be copied back from `tech`: assigning a DataFrame to columns
# aligns on index labels, which pairs rows of the unsorted `tech` with rows of
# the re-sorted frame and scrambles them unless `tech` was already in
# (isin, date) order (in which case the copy was a no-op).
tech2 = create_lag(
    tech,
    [
        'rtn_ma1', 'rtn_ma5', 'rtn_ma22', 'rtn_ma252',
        'logvol_ma1', 'logvol_ma5', 'logvol_ma22', 'logvol_ma252',
    ],
)
# create eta = v_t - ma5_(t-1...t-5)
tech2["eta"] = tech2["log_adj_volume"] - tech2["lag_logvol_ma5"]

In [26]:
# Build the predictor table: inner-join every source on (isin, date) so only the
# common universe remains, drop rows with any missing value, remove the
# `return` column, then order the rows by date and isin.
predictors = (
    tech2.drop(columns=["barrid"])
    .merge(mkt_cap, on=["isin", "date"], how="inner")
    .merge(calendar_date.drop(columns=["barrid"]), on=["isin", "date"], how="inner")
    .merge(
        style_factors.drop(columns=["EFMUSATRD_CARBONEFF"]),
        on=["isin", "date"],
        how="inner",
    )
    .merge(industry_factors, on=["isin", "date"], how="inner")
    .merge(release_schedule, on=["isin", "date"], how="inner")
    .dropna()
    .drop(columns=["return"])
    .sort_values(by=["date", "isin"], ignore_index=True)
)

In [None]:
# Preview the first rows of the merged predictor table.
predictors.head()

# # This is a synthetic example following the structure of technical feature data.
# # No proprietary data is included.

# | date       | isin         | log_adj_volume | is_adj_date | lag_rtn_ma1 | lag_rtn_ma5 | lag_rtn_ma22 | lag_rtn_ma252 | lag_logvol_ma1 | lag_logvol_ma5 | ... | lag_-1 | lag_-2 | lag_-3 | lag_0 | lag_1 | lag_2 | lag_3 | lag_4 | lag_≤-4 | lag_≥5 |
# |------------|--------------|----------------|-------------|-------------|-------------|--------------|---------------|----------------|----------------|-----|--------|--------|--------|-------|-------|-------|-------|-------|----------|---------|
# | 2020-01-03 | XX00000001   | 16.093388      | False       | -0.000828   | -0.002382   | 0.005502     | 0.000745      | 15.978206      | 15.929897      | ... | 0.0    | 0.0    | 0.0    | 0.0   | 0.0   | 0.0   | 0.0   | 0.0   | 0.0      | 1.0     |
# | 2020-01-03 | XX00000002   | 13.006063      | False       | 0.000316    | 0.000253    | 0.000045     | 0.002744      | 13.048581      | 12.502088      | ... | 0.0    | 0.0    | 0.0    | 0.0   | 0.0   | 0.0   | 0.0   | 0.0   | 0.0      | 1.0     |
# | 2020-01-03 | XX00000003   | 13.573252      | False       | 0.011768    | 0.006528    | 0.001566     | 0.002065      | 14.135469      | 13.561239      | ... | 0.0    | 0.0    | 0.0    | 0.0   | 0.0   | 0.0   | 0.0   | 0.0   | 0.0      | 1.0     |
# | 2020-01-03 | XX00000004   | 11.772593      | False       | 0.018416    | 0.002253    | 0.000922     | 0.000154      | 11.678728      | 11.518813      | ... | 0.0    | 0.0    | 0.0    | 0.0   | 0.0   | 0.0   | 0.0   | 0.0   | 0.0      | 1.0     |
# | 2020-01-03 | XX00000005   | 12.630683      | False       | 0.001345    | -0.000354   | -0.000507    | 0.001097      | 12.799066      | 12.804587      | ... | 0.0    | 0.0    | 0.0    | 0.0   | 0.0   | 0.0   | 0.0   | 0.0   | 1.0      | 0.0     |


In [29]:
# Earliest and latest date present in the merged predictor table.
predictors.date.min(), predictors.date.max()

(datetime.date(2020, 1, 3), datetime.date(2023, 12, 29))

In [30]:
# Save all predictors into one table
# Note: save_folder_path is the same folder as folder_path, so the resulting
# final_dataset.parquet is written next to the input files and is picked up by
# the step-1 file listing on later runs.
write_data(predictors, "final_dataset")

In [31]:
# Print the column names of the final predictor table as a plain Python list.
print(predictors.columns.tolist())

['date', 'isin', 'log_adj_volume', 'is_adj_date', 'lag_rtn_ma1', 'lag_rtn_ma5', 'lag_rtn_ma22', 'lag_rtn_ma252', 'lag_logvol_ma1', 'lag_logvol_ma5', 'lag_logvol_ma22', 'lag_logvol_ma252', 'eta', 'lag_log_mkt_cap', 'lag_early_close', 'lag_triple_witch', 'lag_double_witch', 'lag_russell_rebalance', 'lag_EFMUSATRD_BETA', 'lag_EFMUSATRD_CROWD', 'lag_EFMUSATRD_DIVYILD', 'lag_EFMUSATRD_EARNQLTY', 'lag_EFMUSATRD_EARNVAR', 'lag_EFMUSATRD_EARNYILD', 'lag_EFMUSATRD_ESG', 'lag_EFMUSATRD_GROWTH', 'lag_EFMUSATRD_INDMOM', 'lag_EFMUSATRD_INVSQLTY', 'lag_EFMUSATRD_LEVERAGE', 'lag_EFMUSATRD_LIQUIDTY', 'lag_EFMUSATRD_LTREVRSL', 'lag_EFMUSATRD_MIDCAP', 'lag_EFMUSATRD_MLFAC', 'lag_EFMUSATRD_MOMENTUM', 'lag_EFMUSATRD_PROFIT', 'lag_EFMUSATRD_RESVOL', 'lag_EFMUSATRD_SEASON', 'lag_EFMUSATRD_SENTMT', 'lag_EFMUSATRD_SHORTINT', 'lag_EFMUSATRD_SIZE', 'lag_EFMUSATRD_STREVRSL', 'lag_EFMUSATRD_VALUE', 'lag_EFMUSATRD_AERODEF', 'lag_EFMUSATRD_AIRLINES', 'lag_EFMUSATRD_ALUMSTEL', 'lag_EFMUSATRD_APPAREL', 'lag_EFMUSATRD