# Feature Engineering for Stock Prediction

This notebook handles feature engineering and analysis for stock price prediction, including technical indicators and correlation analysis across target and peer assets.

In [1]:
# Standard library imports
import sys
import os
import logging
from datetime import datetime, timedelta

# Configure logging once for the whole notebook; later cells reuse `logger`.
# basicConfig does nothing when the root logger already has handlers, so
# re-running this cell does not stack duplicate handlers.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Make the parent directory importable so the `src` package imports resolve
# (assumes the notebook is run from a direct subdirectory of the project root).
# Guarded so that re-running the cell does not append the same entry again.
PROJECT_ROOT = '../'
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Third-party imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml

# Local imports from src
from src.data_loader import StockDataLoader
from src.features import FeatureEngineer

# Plot defaults for every figure in this notebook: apply the style sheet
# first, then override individual figure settings on top of it.
plt.style.use('fivethirtyeight')
for rc_key, rc_value in {
    'figure.figsize': [15, 8],
    'figure.dpi': 100,
}.items():
    plt.rcParams[rc_key] = rc_value

logger.info("Imports and configurations loaded successfully")

2025-06-05 16:53:23,504 - INFO - Imports and configurations loaded successfully


## 1. Data Loading

### Load stock data and engineer features

In [2]:
# Logging is configured in the imports cell, which also creates `logger`;
# it is reused here rather than re-importing and re-configuring logging
# (a second basicConfig call has no effect once handlers are installed).

# Single source of truth for the config location, shared by the YAML load
# and the data loader below.
CONFIG_PATH = '../config.yaml'

# Load configuration
with open(CONFIG_PATH, 'r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

# Initialize components
data_loader = StockDataLoader(CONFIG_PATH)   # receives the config file path
feature_engineer = FeatureEngineer(config)   # receives the parsed config

# Read every required setting in one place so that any missing key is
# logged before the KeyError propagates.
try:
    training_period = config['periods']['training']
    strategy_period = config['periods']['strategy']

    # Use training start and strategy end for complete historical data
    start_date = pd.to_datetime(training_period['start'])
    end_date = pd.to_datetime(strategy_period['end'])

    # The target is excluded from its own peer list.
    target_symbol = config['target_symbol']
    peer_symbols = [symbol for symbol in config['peer_symbols'] if symbol != target_symbol]
except KeyError as e:
    logger.error(f"Missing config parameter: {e}")
    raise

logger.info("Data collection period:")
logger.info(f"Start: {start_date.strftime('%Y-%m-%d')} (Training start)")
logger.info(f"End: {end_date.strftime('%Y-%m-%d')} (Strategy end)")

# Log the filtered symbols
logger.info(f"Target Symbol: {target_symbol}")
logger.info(f"Filtered Peer Symbols: {', '.join(peer_symbols)}")

# Every fetch below uses the same date bounds, so format them once.
start_str = start_date.strftime('%Y-%m-%d')
end_str = end_date.strftime('%Y-%m-%d')

# Fetch data for the target symbol over the configured date range
target_asset = data_loader.fetch_stock_data(target_symbol, start_str, end_str)

# Fetch data for each peer over the same range, keyed by symbol
peer_assets = {
    peer: data_loader.fetch_stock_data(peer, start_str, end_str)
    for peer in peer_symbols
}

def _prefixed_features(engineer, data, symbol):
    """Engineer features for one asset and prefix every column with its symbol.

    Parameters
    ----------
    engineer : object exposing ``engineer_features(data)``.
    data : the fetched data for a single symbol.
    symbol : str used as the ``<symbol>_`` column prefix.

    Returns
    -------
    The frame returned by ``engineer.engineer_features(data)``, with its
    columns renamed to ``f'{symbol}_{col}'``.
    """
    features = engineer.engineer_features(data)
    features.columns = [f'{symbol}_{col}' for col in features.columns]
    return features


# Calculate features for target asset
target_features = _prefixed_features(feature_engineer, target_asset, target_symbol)

# Calculate features for each peer asset; the symbol prefix keeps columns
# distinguishable once everything is concatenated side by side.
peer_features_list = [
    _prefixed_features(feature_engineer, data, symbol)
    for symbol, data in peer_assets.items()
]

# Combine all peer features into a single DataFrame.
# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame on the target's index when no peers are configured.
if peer_features_list:
    peer_features = pd.concat(peer_features_list, axis=1)
else:
    logger.warning("No peer symbols configured; continuing with target features only")
    peer_features = pd.DataFrame(index=target_features.index)

# Combine target and peer features
all_features = pd.concat([target_features, peer_features], axis=1)

# A bare `df.head()` only renders when it is the last expression of a cell;
# mid-cell it is silently discarded. `display` (provided by the IPython
# kernel) renders each preview explicitly.
display(target_features.head())
display(peer_features.head())
display(all_features.head())

# Persist the combined feature set. The directory and file path are defined
# once so the location that is created, written to, and reported cannot drift.
PROCESSED_DIR = '../data/processed'
FEATURES_PATH = f'{PROCESSED_DIR}/all_features.pkl'

# Create data/processed directory if it doesn't exist
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Save features to pickle file
# NOTE: pickle files execute code on load; only unpickle this file from a
# trusted location.
all_features.to_pickle(FEATURES_PATH)
print(f"\nFeatures saved to '{FEATURES_PATH}'")

2025-06-05 16:53:23,535 - INFO - Data collection period:
2025-06-05 16:53:23,536 - INFO - Start: 2020-01-01 (Training start)
2025-06-05 16:53:23,537 - INFO - End: 2025-06-01 (Strategy end)
2025-06-05 16:53:23,538 - INFO - Target Symbol: MSFT
2025-06-05 16:53:23,540 - INFO - Filtered Peer Symbols: AAPL, AMZN, GOOGL, META, ORCL, IBM, CRM, ADBE, NVDA, INTC, SPY, QQQ, XLK, VTI
2025-06-05 16:53:26,340 - INFO - Added rsi features
2025-06-05 16:53:26,344 - INFO - Added macd features
2025-06-05 16:53:26,351 - INFO - Added bollinger features
2025-06-05 16:53:26,787 - INFO - Added cci features
2025-06-05 16:53:26,790 - INFO - Added stochastic features
2025-06-05 16:53:26,794 - INFO - Added atr features
2025-06-05 16:53:26,848 - INFO - Added obv features
2025-06-05 16:53:26,852 - INFO - Added ichimoku features
2025-06-05 16:53:26,860 - INFO - Added sma features
2025-06-05 16:53:26,902 - INFO - Added lagged features for 5 columns with 6 lag periods
2025-06-05 16:53:26,906 - INFO - Added rsi featur


Features saved to '../data/processed/all_features.pkl'
