# 0. Basic configuration
This notebook builds the final feature panel from the clean OHLCV panel and the most useful univariate and multivariate features.

In [3]:
import sys
from pathlib import Path


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import norm, skew, kurtosis
from statsmodels.graphics.tsaplots import plot_acf

# The project root is taken to be the parent of the current working directory
# (assumes the kernel is started inside notebooks/ -- TODO confirm). It is put
# on sys.path so the local `src` package resolves.
PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))
from src.utils.config import load_config

# Project configuration and the locations of the upstream notebook outputs.
cfg = load_config()
PANEL_PATH = PROJECT_ROOT.joinpath("data", "processed", "prices_ohlcv_panel.parquet")
FACTORS_PATH = PROJECT_ROOT.joinpath("data", "processed", "pca_factors.parquet")
# Presumably the ticker the binary target is built for -- verify against the
# later sections of the notebook.
TICKER_TARGET = "AMD"


---
# 1. Input data
Load the outputs of the previous notebooks.

In [4]:
# Read both upstream artefacts and order each one by its index.
panel = pd.read_parquet(PANEL_PATH).sort_index()
pca_factors = pd.read_parquet(FACTORS_PATH).sort_index()

In [None]:
# Build the wide log-returns base matrix.
# Start from the "close" field of the panel, ordered by index.
prices = panel.xs("close", axis=1, level="field").sort_index()

# Daily log-returns: log(P_t / P_{t-1}). The first row has no predecessor,
# so it is NaN for every column.
price_ratio = prices.div(prices.shift(1))
log_ret = np.log(price_ratio)

log_ret.head()

ticker,AAPL,MSFT,GOOGL,META,AMZN,NVDA,TSLA,ORCL,AVGO,AMD,INTC,^GSPC,^NDX,^VIX
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2012-01-03 23:00:00,,,,,,,,,,,,,,
2012-01-04 23:00:00,0.01104,0.010167,-0.013969,,0.000563,0.035286,-0.021522,0.022054,0.001405,0.0,0.011483,0.002939,0.008237,-0.033871
2012-01-05 23:00:00,0.0104,0.015415,-0.013736,,0.027763,-0.011624,-0.007773,0.012706,-0.006338,-0.00551,-0.005923,-0.00254,0.003056,-0.040376
2012-01-08 23:00:00,-0.001587,-0.01325,-0.043324,,-0.022428,0.0,0.012556,0.005937,0.065289,0.02904,0.008675,0.002259,-0.002346,0.021104
2012-01-09 23:00:00,0.003574,0.003598,0.001092,,0.004359,-0.004135,0.013486,-0.002222,0.014129,0.02124,0.004701,0.008847,0.007092,-0.0182


---
# 2. Features: returns and lags

In [9]:
# Lags, in rows, at which the 1-day log-return is repeated as a feature.
RET_LAGS = (2, 5)

# Same-row 1-day log-returns.
ret1 = log_ret.copy()
ret1.columns = [f"{c}_ret1" for c in log_ret.columns]
return_blocks = [ret1]

# NOTE: `<ticker>_ret{lag}` is the 1-day log-return shifted down by `lag`
# rows, not a cumulative `lag`-day return.
for lag in RET_LAGS:
    lagged = log_ret.shift(lag)
    lagged.columns = [f"{c}_ret{lag}" for c in lagged.columns]
    return_blocks.append(lagged)

# Concatenate once instead of growing the frame inside the loop.
features = pd.concat(return_blocks, axis=1)

features.head()

Unnamed: 0_level_0,AAPL_ret1,MSFT_ret1,GOOGL_ret1,META_ret1,AMZN_ret1,NVDA_ret1,TSLA_ret1,ORCL_ret1,AVGO_ret1,AMD_ret1,...,AMZN_ret5,NVDA_ret5,TSLA_ret5,ORCL_ret5,AVGO_ret5,AMD_ret5,INTC_ret5,^GSPC_ret5,^NDX_ret5,^VIX_ret5
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-03 23:00:00,,,,,,,,,,,...,,,,,,,,,,
2012-01-04 23:00:00,0.01104,0.010167,-0.013969,,0.000563,0.035286,-0.021522,0.022054,0.001405,0.0,...,,,,,,,,,,
2012-01-05 23:00:00,0.0104,0.015415,-0.013736,,0.027763,-0.011624,-0.007773,0.012706,-0.006338,-0.00551,...,,,,,,,,,,
2012-01-08 23:00:00,-0.001587,-0.01325,-0.043324,,-0.022428,0.0,0.012556,0.005937,0.065289,0.02904,...,,,,,,,,,,
2012-01-09 23:00:00,0.003574,0.003598,0.001092,,0.004359,-0.004135,0.013486,-0.002222,0.014129,0.02124,...,,,,,,,,,,


---
# 3. Features: Rolling volatility

In [10]:
# 20-row rolling standard deviation of the daily log-returns. Rows without a
# full window of observations are NaN.
vol20 = log_ret.rolling(20).std()
vol20.columns = [f"{c}_vol20" for c in vol20.columns]

# Drop any *_vol20 columns left by a previous run of this cell, so that
# re-executing it replaces them instead of appending duplicate columns.
features = pd.concat(
    [features.drop(columns=vol20.columns, errors="ignore"), vol20],
    axis=1,
)
features.head()

Unnamed: 0_level_0,AAPL_ret1,MSFT_ret1,GOOGL_ret1,META_ret1,AMZN_ret1,NVDA_ret1,TSLA_ret1,ORCL_ret1,AVGO_ret1,AMD_ret1,...,AMZN_vol20,NVDA_vol20,TSLA_vol20,ORCL_vol20,AVGO_vol20,AMD_vol20,INTC_vol20,^GSPC_vol20,^NDX_vol20,^VIX_vol20
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-03 23:00:00,,,,,,,,,,,...,,,,,,,,,,
2012-01-04 23:00:00,0.01104,0.010167,-0.013969,,0.000563,0.035286,-0.021522,0.022054,0.001405,0.0,...,,,,,,,,,,
2012-01-05 23:00:00,0.0104,0.015415,-0.013736,,0.027763,-0.011624,-0.007773,0.012706,-0.006338,-0.00551,...,,,,,,,,,,
2012-01-08 23:00:00,-0.001587,-0.01325,-0.043324,,-0.022428,0.0,0.012556,0.005937,0.065289,0.02904,...,,,,,,,,,,
2012-01-09 23:00:00,0.003574,0.003598,0.001092,,0.004359,-0.004135,0.013486,-0.002222,0.014129,0.02124,...,,,,,,,,,,


---
# 4. Features: Normalized volume

In [None]:
# Trailing window, in rows, over which each column's volume is standardised.
# Tunable: a longer window adapts to changes in volume level more slowly.
VOLNORM_WINDOW = 60

vol_wide = panel.xs("volume", axis=1, level="field").sort_index()

# Z-score against trailing statistics only. A full-sample mean/std would let
# every row depend on future volumes and leak information across the
# temporal train/valid/test split.
trailing = vol_wide.rolling(VOLNORM_WINDOW)
trailing_std = trailing.std()
# A window with zero spread is masked so the ratio is NaN rather than +/-inf.
vol_norm = (vol_wide - trailing.mean()) / trailing_std.where(trailing_std > 0)
vol_norm.columns = [f"{c}_volnorm" for c in vol_norm.columns]

# Drop any *_volnorm columns left by a previous run of this cell, so that
# re-executing it replaces them instead of appending duplicate columns.
features = pd.concat(
    [features.drop(columns=vol_norm.columns, errors="ignore"), vol_norm],
    axis=1,
)
features.head()

---
# 5. Features: PCA factors

---
# 6. Final formatting


### NaN cleaning

### Binary target

### Temporal splits train/valid/test

### Saving