# Feature Engineering

This notebook constructs an analytical dataset by combining price data with sentiment scores. It derives log returns and rolling volatility, and labels periods of market stress (risk events).

In [None]:
import pandas as pd
from pathlib import Path
from src.feature_engineering import create_analytical_dataset, save_analytical_dataset

# Load raw data
# Prices: the first CSV column becomes the index and is parsed as dates.
price_df = pd.read_csv('data/raw/spy_prices.csv', index_col=0, parse_dates=True)
# Sentiments: read as a header-less DataFrame and use its first column below.
# NOTE: the former `squeeze=True` argument was removed in pandas 2.0, and on a
# single-column file it returned a Series, on which `.iloc[:, 0]` fails.
sent_scores = pd.read_csv('data/raw/sample_sentiments.csv', header=None)
# For demonstration, attach every sentiment score to the first date in the
# price index (the same timestamp repeated once per score).
first_price_date = price_df.index[0]
sent_series = pd.Series(
    sent_scores.iloc[:, 0].values,
    index=[first_price_date] * len(sent_scores),
)

# Create analytical dataset
analytical_df = create_analytical_dataset(price_df, sentiment_scores=sent_series)
print(analytical_df.head())

# Save processed data
save_analytical_dataset(analytical_df, Path('data/processed/analytical_data.csv'))
# Draw a fixed-seed random sample of at most 100 rows, then tag every column
# header with its role: Source, Derived, or Target.
import numpy as np

sample_size = min(100, len(analytical_df))
sample_df = analytical_df.sample(n=sample_size, random_state=42).copy()

# Manual role assignment (for demonstration). Any column that is not listed
# here falls back to 'Derived'.
known_roles = {
    **dict.fromkeys(['Adj Close', 'Volume', 'Open', 'High', 'Low', 'Close'], 'Source'),
    **dict.fromkeys(['Log_Return', 'Rolling_Vol', 'Next_Return', 'Sentiment'], 'Derived'),
    'Risk_Flag': 'Target',
}
labels = {name: known_roles.get(name, 'Derived') for name in sample_df.columns}

# Rewrite each header as "<column> (<role>)".
sample_df.columns = [f"{name} ({labels[name]})" for name in sample_df.columns]

# Persist the labelled sample without the index column and preview it.
sample_df.to_csv('data/processed/analytical_sample.csv', index=False)
print(sample_df.head())