# Feature Engineering for Stock Prediction

This notebook handles feature engineering and analysis for stock price prediction, including technical indicators and correlation analysis across target and peer assets.

In [12]:
# Standard library imports
import sys
import os
import logging
from datetime import datetime, timedelta

# Configure logging once for the notebook: INFO level, timestamped records.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Add project root to path so the `src` imports below resolve.
# The path is relative, so it is interpreted against the kernel's working
# directory. The membership check keeps the cell idempotent: re-running it
# no longer appends a duplicate '../' entry each time.
if '../' not in sys.path:
    sys.path.append('../')

# Third-party imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml

# Local imports from src
from src.data_loader import StockDataLoader
from src.features import FeatureEngineer

# Figure defaults applied to every plot produced later in the notebook.
plt.style.use('fivethirtyeight')
plt.rcParams.update({
    'figure.figsize': [15, 8],
    'figure.dpi': 100,
})

logger.info("Imports and configurations loaded successfully")

2025-06-01 09:54:52,661 - INFO - Imports and configurations loaded successfully


## 1. Data Loading

### Load price data and engineer features for the target and peer assets

In [13]:
# Logging is configured in the imports cell; a second `logging.basicConfig`
# call is a no-op once the root logger has handlers, so it is not repeated.

# Locations and formats used by this cell.
CONFIG_PATH = '../config.yaml'
PROCESSED_DIR = '../data/processed'
FEATURES_PATH = f'{PROCESSED_DIR}/all_features.pkl'
DATE_FORMAT = '%Y-%m-%d'

# Load configuration
with open(CONFIG_PATH, 'r') as file:
    config = yaml.safe_load(file)

# Initialize components (the loader takes the config path, the engineer the parsed config)
data_loader = StockDataLoader(CONFIG_PATH)
feature_engineer = FeatureEngineer(config)

# Date range: fixed start, end = "now".
# NOTE: because the end date is `datetime.now()`, the requested window (and
# therefore the saved features) changes from run to run.
end_date = datetime.now()
start_date = datetime(2024, 1, 1)  # Start from beginning of 2024
start_str = start_date.strftime(DATE_FORMAT)
end_str = end_date.strftime(DATE_FORMAT)

# Get symbols from config and filter out the target from the peers
target_symbol = config['target_symbol']
peer_symbols = [symbol for symbol in config['peer_symbols'] if symbol != target_symbol]

logger.info(f"Target Symbol: {target_symbol}")
logger.info(f"Filtered Peer Symbols: {', '.join(peer_symbols)}")

# Load raw data for the target and for every peer over the same date range
target_asset = data_loader.fetch_stock_data(target_symbol, start_str, end_str)
peer_assets = {
    symbol: data_loader.fetch_stock_data(symbol, start_str, end_str)
    for symbol in peer_symbols
}


def _prefixed_features(symbol, prices):
    """Engineer features for one asset and prefix every column with its symbol.

    Args:
        symbol: Ticker used as the column prefix (``f'{symbol}_{col}'``).
        prices: Data returned by ``data_loader.fetch_stock_data`` for that ticker.

    Returns:
        The object returned by ``feature_engineer.engineer_features`` with its
        columns renamed in place.
    """
    features = feature_engineer.engineer_features(prices)
    features.columns = [f'{symbol}_{col}' for col in features.columns]
    return features


# Calculate features for the target asset and for each peer
target_features = _prefixed_features(target_symbol, target_asset)
peer_features_list = [
    _prefixed_features(symbol, prices) for symbol, prices in peer_assets.items()
]

# Combine all peer features into a single DataFrame.
# `pd.concat` raises ValueError on an empty list, which happens when the config
# lists no peers other than the target; fall back to an empty frame on the
# target's index so the rest of the notebook still runs.
if peer_features_list:
    peer_features = pd.concat(peer_features_list, axis=1)
else:
    logger.warning("No peer symbols configured; continuing with target features only")
    peer_features = pd.DataFrame(index=target_features.index)

# Combine target and peer features (aligned on the index)
all_features = pd.concat([target_features, peer_features], axis=1)

# Show the first rows of each frame. A bare `.head()` is rendered only when it
# is the last expression of a cell, so `display` (provided by the IPython
# kernel) is called explicitly here.
display(target_features.head())
display(peer_features.head())
display(all_features.head())

# Create data/processed directory if it doesn't exist, then persist the features
os.makedirs(PROCESSED_DIR, exist_ok=True)
all_features.to_pickle(FEATURES_PATH)
print(f"\nFeatures saved to '{FEATURES_PATH}'")

2025-06-01 09:54:56,258 - INFO - Target Symbol: MSFT
2025-06-01 09:54:56,260 - INFO - Filtered Peer Symbols: AAPL, AMZN, GOOGL, META, ORCL, IBM, CRM, ADBE, NVDA, INTC, SPY, QQQ, XLK, VTI



Features saved to '../data/processed/all_features.pkl'


## 2. Technical Indicators (Optional)

In [None]:
# Technical indicators for the target asset: RSI, then MACD.

# List the RSI columns present so the column name used below can be verified
rsi_columns = [name for name in all_features.columns if 'RSI' in name]
print("Available columns:")
print(rsi_columns)

feature_dates = all_features.index

# RSI with the 70 / 30 reference lines
fig_rsi, ax_rsi = plt.subplots(figsize=(15, 6))
ax_rsi.plot(feature_dates, all_features[f'{target_symbol}_RSI'])
ax_rsi.axhline(y=70, color='r', linestyle='--', label='Overbought (70)')
ax_rsi.axhline(y=30, color='g', linestyle='--', label='Oversold (30)')
ax_rsi.set_title(f'RSI Over Time - {target_symbol}')
ax_rsi.legend()
plt.show()

# MACD line, signal line and histogram bars on a single axes
fig_macd, ax_macd = plt.subplots(figsize=(15, 6))
ax_macd.plot(feature_dates, all_features[f'{target_symbol}_MACD'], label='MACD')
ax_macd.plot(feature_dates, all_features[f'{target_symbol}_Signal'], label='Signal')
ax_macd.bar(feature_dates, all_features[f'{target_symbol}_Histogram'], label='Histogram')
ax_macd.set_title(f'MACD Analysis - {target_symbol}')
ax_macd.legend()
plt.show()

In [None]:
# Overlay MACD, signal and histogram for the target and every peer on one axes
fig_cmp, ax_cmp = plt.subplots(figsize=(15, 6))
comparison_dates = all_features.index

for ticker in [target_symbol, *peer_symbols]:
    macd_col = f'{ticker}_MACD'
    signal_col = f'{ticker}_Signal'
    hist_col = f'{ticker}_Histogram'
    ax_cmp.plot(comparison_dates, all_features[macd_col], label=macd_col)
    ax_cmp.plot(comparison_dates, all_features[signal_col], label=signal_col)
    # Semi-transparent bars so overlapping histograms stay visible
    ax_cmp.bar(comparison_dates, all_features[hist_col], label=hist_col, alpha=0.3)

ax_cmp.set_title('MACD Analysis Comparison')
# Legend is anchored outside the plotting area, to the right of the axes
ax_cmp.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

## 3. Feature Correlation Analysis

In [None]:
# Pairwise correlation between every engineered feature column
correlation_matrix = all_features.corr()

# Heatmap appearance: annotated cells, diverging palette centred on zero
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    fmt='.2f',
    square=True,
    linewidths=0.5,
)

# Large canvas for readability
fig_corr, ax_corr = plt.subplots(figsize=(20, 16))
sns.heatmap(correlation_matrix, ax=ax_corr, **heatmap_options)

ax_corr.set_title('Feature Correlation Matrix Across All Assets')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Summarise strongly correlated feature pairs (|correlation| above the threshold).
# The correlation matrix is symmetric, so only the strict upper triangle (k=1)
# is scanned: this drops the diagonal and reports each pair once instead of
# listing both (A, B) and (B, A).
CORR_THRESHOLD = 0.7

strong_mask = np.abs(correlation_matrix.to_numpy()) > CORR_THRESHOLD
row_idx, col_idx = np.where(np.triu(strong_mask, k=1))

high_corr = [
    (correlation_matrix.index[r], correlation_matrix.columns[c], correlation_matrix.iloc[r, c])
    for r, c in zip(row_idx, col_idx)
]

print(f"\nHighly correlated features (|correlation| > {CORR_THRESHOLD}):")
# Strongest relationships first, regardless of sign
for feat1, feat2, corr in sorted(high_corr, key=lambda pair: abs(pair[2]), reverse=True):
    print(f"{feat1} <-> {feat2}: {corr:.3f}")