# 0. Basic configuration
This notebook is to build the final features panel using the original OHLCV clean panel and the useful univariate and mutivariate features.

In [78]:
import sys
from pathlib import Path
PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))

In [80]:
import numpy as np
import pandas as pd
from src.utils.config import load_config

cfg = load_config()
PANEL_PATH = PROJECT_ROOT / "data" / "processed" / "prices_ohlcv_panel.parquet"
FACTORS_PATH = PROJECT_ROOT / "data" / "processed" / "pca_factors.parquet"
TICKER_TARGET = "AMD"  

DATA_DIR = PROJECT_ROOT / "data"
PROC_DATA_DIR = DATA_DIR / "processed"
PROC_DATA_DIR.mkdir(parents=True, exist_ok=True)

---
# 1. Input data
Loading previous notebooks outputs

In [None]:
panel = pd.read_parquet(PANEL_PATH).sort_index()

pca_factors = pd.read_parquet(FACTORS_PATH).sort_index()

In [60]:
# now we should create a log-returns basis matric (wide)
prices = panel.xs("close", axis=1, level="field").sort_index()

# daily log-returns
log_ret = np.log(prices / prices.shift(1))
log_ret.head()

ticker,AAPL,MSFT,GOOGL,META,AMZN,NVDA,TSLA,ORCL,AVGO,AMD,INTC,^GSPC,^NDX,^VIX
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2012-01-03 23:00:00,,,,,,,,,,,,,,
2012-01-04 23:00:00,0.01104,0.010167,-0.013969,,0.000563,0.035286,-0.021522,0.022054,0.001405,0.0,0.011483,0.002939,0.008237,-0.033871
2012-01-05 23:00:00,0.0104,0.015415,-0.013736,,0.027763,-0.011624,-0.007773,0.012706,-0.006338,-0.00551,-0.005923,-0.00254,0.003056,-0.040376
2012-01-08 23:00:00,-0.001587,-0.01325,-0.043324,,-0.022428,0.0,0.012556,0.005937,0.065289,0.02904,0.008675,0.002259,-0.002346,0.021104
2012-01-09 23:00:00,0.003574,0.003598,0.001092,,0.004359,-0.004135,0.013486,-0.002222,0.014129,0.02124,0.004701,0.008847,0.007092,-0.0182


---
# 2. Features: returns and lags

In [61]:
features = log_ret.copy()
features.columns = [f"{c}_ret1" for c in features.columns]

# returns lags
for lag in [2,5]:
    lagged = log_ret.shift(lag)
    lagged.columns = [f"{c}_ret{lag}" for c in lagged.columns]
    features = pd.concat([features, lagged], axis=1)

---
# 3. Features: Rolling volatility

In [62]:
vol20 = log_ret.rolling(20).std()
vol20.columns = [f"{c}_vol20" for c in vol20.columns]

features = pd.concat([features, vol20], axis=1)

---
# 4. Features: Normalized volume

In [63]:
vol_wide = panel.xs("volume", axis=1, level="field").sort_index()

vol_norm = (vol_wide-vol_wide.mean()) / vol_wide.std()
vol_norm.columns = [f"{c}_volnorm" for c in vol_norm.columns]

vol_norm = vol_norm.drop(columns=['^VIX_volnorm'])

features = pd.concat([features, vol_norm], axis=1)

---
# 5. Features: PCA factors

In [64]:
# we only keep the three fisrt PCA factors
pca_used = pca_factors[["PC1", "PC2", "PC3"]]

# sort by date index
features = features.join(pca_used, how='inner')
features.head()

Unnamed: 0_level_0,AAPL_ret1,MSFT_ret1,GOOGL_ret1,META_ret1,AMZN_ret1,NVDA_ret1,TSLA_ret1,ORCL_ret1,AVGO_ret1,AMD_ret1,...,TSLA_volnorm,ORCL_volnorm,AVGO_volnorm,AMD_volnorm,INTC_volnorm,^GSPC_volnorm,^NDX_volnorm,PC1,PC2,PC3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-05-21 22:00:00,-0.007708,0.000336,-0.021912,-0.093255,-0.012828,-0.012281,0.068181,0.003801,0.01158,-0.022473,...,-0.910871,0.762827,0.055306,-1.033913,0.594606,0.268355,-0.741753,-1.524794,2.447392,-0.141137
2012-05-22 22:00:00,0.024107,-0.022083,0.014311,0.031749,0.009015,0.024412,0.007118,0.012067,0.069483,-0.013072,...,-1.134256,1.872121,0.985327,-0.83728,1.391915,0.251643,-0.701444,1.50641,-0.202839,0.479184
2012-05-23 22:00:00,-0.009227,-0.001375,-0.009562,0.03168,-0.009433,-0.026886,-0.024145,-0.021213,-0.030594,-0.009917,...,-1.162486,1.977705,0.007598,-1.113901,0.148781,0.065842,-0.80809,-1.310321,-1.07927,0.010144
2012-05-24 22:00:00,-0.005374,-0.000344,-0.020299,-0.034497,-0.010978,0.023665,-0.015644,0.000765,0.019495,0.032683,...,-1.224601,0.4137,-0.573303,-1.10522,-0.752377,-1.093657,-1.086141,-0.570934,2.196727,-0.348755
2012-05-28 22:00:00,0.017593,0.017059,0.004739,-0.101156,0.008699,0.025479,0.061158,0.012168,0.038466,0.037859,...,-1.050501,0.965196,-0.394349,-0.912486,0.044868,-0.474196,-1.717185,2.054451,3.1672,-0.327307


---
# 6. Final formating


### NaNs cleaning

In [71]:
features = features.sort_index()
features = features.dropna()
features.head()

Unnamed: 0_level_0,AAPL_ret1,MSFT_ret1,GOOGL_ret1,META_ret1,AMZN_ret1,NVDA_ret1,TSLA_ret1,ORCL_ret1,AVGO_ret1,AMD_ret1,...,TSLA_volnorm,ORCL_volnorm,AVGO_volnorm,AMD_volnorm,INTC_volnorm,^GSPC_volnorm,^NDX_volnorm,PC1,PC2,PC3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-06-18 22:00:00,0.002778,0.028413,0.018536,0.015793,0.006134,0.065546,0.007821,0.030504,-0.001728,-0.020444,...,-1.194557,6.674678,-0.39488,-0.794408,-0.204986,-0.06733,-0.757434,2.509512,-0.481049,-0.82441
2012-06-19 22:00:00,-0.002847,0.007464,-0.006937,-0.009762,-0.004519,0.015736,0.051325,0.018778,0.016293,0.042129,...,-0.704954,1.481057,0.09355,-0.736915,-0.299836,-0.197596,-0.919409,1.038386,1.735394,0.252142
2012-06-20 22:00:00,-0.013873,-0.025873,-0.021528,0.007566,-0.011046,-0.046414,-0.048213,-0.024157,-0.022653,-0.057741,...,-1.00334,1.826201,-0.200715,-0.906193,0.671344,0.236554,-0.76533,-5.216659,-1.482487,0.477586
2012-06-21 22:00:00,0.007639,0.01841,0.011032,0.037298,0.007183,0.012384,0.048509,0.006809,0.009238,0.001747,...,-0.77822,0.76686,1.361247,-0.94201,3.313192,1.518001,0.258498,2.47095,-0.615675,0.457072
2012-06-24 22:00:00,-0.019656,-0.027408,-0.019043,-0.030412,-0.009452,-0.032047,-0.02033,-0.013664,-0.017099,-0.057466,...,-1.080038,1.043341,-0.180483,-0.899039,0.38192,-0.408677,-0.948337,-4.651565,-0.427498,0.201326


### Binary target
Here we can choose a target ticker, for example AMD. Later can be generalized.

In [72]:
target_series = (log_ret[TICKER_TARGET] > 0).astype(int)
target_series.name = "target"

dataset = features.join(target_series, how='inner')
dataset = dataset.dropna()

- 'dataset' is what the models will use (rows=date; cols=features+target)

### Temporal splits train/valid/test

In [75]:
dataset = dataset.sort_index()

train = dataset.loc["2012-01-01":"2018-12-31"] # TODO: read from config and automate from actual date
valid = dataset.loc["2019-01-01":"2021-12-31"] # TODO: read from config and automate from actual date
test = dataset.loc["2022-01-01":] # TODO: read from config and automate from actual date

len(train), len(valid), len(test)
len(train)/len(dataset), len(valid)/len(dataset), len(test)/len(dataset)

(0.5214081826831589, 0.24008880431335236, 0.23850301300348875)

### Saving

In [81]:
dataset.to_parquet(PROC_DATA_DIR / "feature_panel.parquet")
train.to_parquet(PROC_DATA_DIR / "train.parquet")
valid.to_parquet(PROC_DATA_DIR / "valid.parquet")
test.to_parquet(PROC_DATA_DIR / "test.parquet")