---
title: "Week 02 — Data Wrangling & Feature Engineering for Finance"
week: 2
author: "Praveen Kumar"
date: 2025-10-07
duration: "3 hours"
prerequisites: ["Week 1: Financial Modelling & ML Basics"]
tags: ["feature-engineering","data-preprocessing","finance","PCA"]
version: v1.0
---

# Week 02 — Feature Engineering & Selection for Financial Data

## Student Notebook: Complete Feature Engineering Pipeline

This notebook demonstrates the end-to-end process of cleaning financial data, engineering meaningful features, and selecting the most predictive variables for machine learning models.

### Learning Goals:
- Master financial data preprocessing techniques
- Create technical indicators and derived features
- Apply feature selection methods (correlation, PCA, importance)
- Build a reproducible feature engineering pipeline

In [None]:
# Notebook-wide parameters
# Reproducibility and run-size switches
SEED = 42
SAMPLE_MODE = True  # True for quick runs, False for comprehensive analysis

# Data source settings
DATA_PATH = "data/synthetic/stock_data.csv"
TICKER = "AAPL"  # Default ticker for download
START_DATE = "2022-01-01"
END_DATE = "2024-01-01"

# Echo the active configuration, one setting per line
print(
    f"Configuration:",
    f"SEED: {SEED}",
    f"SAMPLE_MODE: {SAMPLE_MODE}",
    f"DATA_PATH: {DATA_PATH}",
    f"TICKER: {TICKER}",
    f"Date Range: {START_DATE} to {END_DATE}",
    sep="\n",
)

In [None]:
# Check optional dependencies and install yfinance when it is missing
import sys
import subprocess

# Probe for TA-Lib. Nothing is installed here: when the import fails the
# notebook only reports it and relies on the manual indicator calculations.
try:
    import talib
    print("TA-Lib available")
except ImportError:
    print("TA-Lib not available, will use manual calculations")

# Install yfinance if needed
try:
    import yfinance
    print("yfinance already installed")
except ImportError:
    print("Installing yfinance...")
    # Use the running interpreter's pip so the package lands in this
    # environment; check_call raises CalledProcessError if pip fails.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "yfinance", "-q"])
    print("yfinance installed successfully")

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
from datetime import datetime

# Machine Learning Libraries
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Financial Data
try:
    import yfinance as yf
    YF_AVAILABLE = True
except ImportError:
    YF_AVAILABLE = False
    print("Warning: yfinance not available. Will use synthetic data only.")

# Set random seed and configure
# Seeds NumPy's global RNG so random draws are repeatable across runs.
np.random.seed(SEED)
# NOTE: this silences every warning for the rest of the session, including
# deprecation warnings raised by pandas, matplotlib and scikit-learn.
warnings.filterwarnings('ignore')
# Default figure size and seaborn theme for all plots that follow.
plt.rcParams['figure.figsize'] = (12, 8)
sns.set_style("whitegrid")

# Report library versions to make runs easier to reproduce.
print("Setup Complete!")
print(f"Pandas: {pd.__version__}")
print(f"NumPy: {np.__version__}")
print(f"Matplotlib: {plt.matplotlib.__version__}")
print(f"Seaborn: {sns.__version__}")
print(f"YFinance available: {YF_AVAILABLE}")

## Section 1: Data Loading and Initial Assessment

### Data Loading Pipeline

```mermaid
graph LR
    A[Check Local Data] --> B{File Exists?}
    B -->|Yes| C[Load CSV]
    B -->|No| D[Download from yfinance]
    C --> E[Data Quality Check]
    D --> E
    E --> F[Clean Dataset]
```

We'll implement a robust data loading strategy that tries a local CSV file first, then a live yfinance download, and finally falls back to generated synthetic data when neither source is available.

In [None]:
# Data Loading Function with Fallback Logic
def load_financial_data():
    """Load daily price data, falling back through three sources.

    The sources are tried in this order:

    1. The CSV file at ``DATA_PATH`` (first column parsed as a date index).
    2. A yfinance download of ``TICKER`` (only when ``YF_AVAILABLE`` is true).
    3. A seeded synthetic random-walk series on weekdays of 2022-2023.

    A failure in source 1 or 2 is printed and the next source is tried, so
    the function always returns data.

    Returns:
        tuple: ``(data, source)`` where ``data`` is a DataFrame indexed by
        date and ``source`` is ``"local"``, ``"yfinance"`` or ``"synthetic"``.
    """

    # Strategy 1: Try local synthetic data
    if os.path.exists(DATA_PATH):
        try:
            print(f"Loading data from {DATA_PATH}")
            data = pd.read_csv(DATA_PATH, index_col=0, parse_dates=True)
            print(f"Successfully loaded local data: {data.shape}")
            return data, "local"
        except Exception as e:
            # Deliberate best effort: report and fall through to the next source.
            print(f"Failed to load local data: {e}")

    # Strategy 2: Download from yfinance
    if YF_AVAILABLE:
        try:
            print(f"Downloading {TICKER} data from Yahoo Finance...")
            if SAMPLE_MODE:
                # Fixed two-year window for quick processing
                data = yf.download(TICKER, start="2022-01-01", end="2023-12-31", progress=False)
            else:
                # Configured date range
                data = yf.download(TICKER, start=START_DATE, end=END_DATE, progress=False)

            # Some yfinance versions return two-level columns even for one
            # ticker; the rest of the notebook indexes columns by plain field
            # name. NOTE(review): assumes level 0 holds the field names, which
            # matches yfinance's default column grouping - confirm for the
            # installed version.
            if isinstance(data.columns, pd.MultiIndex):
                data.columns = data.columns.get_level_values(0)

            data = data.dropna()
            # A failed or empty download must not be reported as success;
            # raising here routes execution to the synthetic fallback below.
            if data.empty:
                raise ValueError(f"no rows returned for {TICKER}")

            print(f"Successfully downloaded data: {data.shape}")
            return data, "yfinance"
        except Exception as e:
            print(f"Failed to download data: {e}")

    # Strategy 3: Generate synthetic data
    print("Generating synthetic financial data...")
    dates = pd.date_range(start="2022-01-01", end="2023-12-31", freq='D')
    dates = dates[dates.weekday < 5]  # Remove weekends

    # Re-seed NumPy's global RNG so the synthetic series is repeatable.
    np.random.seed(SEED)
    n_days = len(dates)

    # Generate a price series from normally distributed daily log returns
    initial_price = 150.0
    returns = np.random.normal(0.0005, 0.02, n_days)  # Daily returns
    returns[0] = 0  # First return is zero

    # Scale each day's return by a randomly drawn volatility multiplier
    volatility_regimes = np.random.choice([1, 1.5, 0.7], n_days, p=[0.7, 0.15, 0.15])
    returns = returns * volatility_regimes

    # Cumulate log returns into a price path
    prices = initial_price * np.exp(np.cumsum(returns))

    # Generate OHLC data as small random perturbations around the close
    noise_factor = 0.003
    data = pd.DataFrame({
        'Open': prices * (1 + np.random.normal(0, noise_factor, n_days)),
        'High': prices * (1 + np.abs(np.random.normal(0, noise_factor * 1.5, n_days))),
        'Low': prices * (1 - np.abs(np.random.normal(0, noise_factor * 1.5, n_days))),
        'Close': prices,
        'Volume': np.random.randint(50000000, 200000000, n_days),
        'Adj Close': prices
    }, index=dates)

    # Ensure High >= Low and both contain Open/Close
    data['High'] = np.maximum(data['High'], np.maximum(data['Open'], data['Close']))
    data['Low'] = np.minimum(data['Low'], np.minimum(data['Open'], data['Close']))

    print(f"Generated synthetic data: {data.shape}")
    return data, "synthetic"

# Load the data
# data_source records which fallback produced the data
# ("local", "yfinance" or "synthetic"); the export step reads it later.
stock_data, data_source = load_financial_data()

print(f"\nDataset Information:")
print(f"Source: {data_source}")
print(f"Shape: {stock_data.shape}")
# .date() requires a datetime-like index, which every loader branch produces
# when the local CSV's first column parses as dates.
print(f"Date range: {stock_data.index.min().date()} to {stock_data.index.max().date()}")
print(f"Columns: {list(stock_data.columns)}")

# Display basic statistics
print(f"\nBasic Statistics:")
# Last expression of the cell: the notebook renders the summary table.
stock_data.describe()

## Section 2: Data Cleaning and Preprocessing

### Data Quality Assessment

Let's examine our data for common issues:
- Missing values
- Outliers and anomalies  
- Data consistency checks

In [None]:
# Data Quality Assessment and Cleaning
def assess_data_quality(data):
    """Print a data-quality report and return the missing-value summary.

    The report covers four checks: missing values per column, duplicated
    index entries, non-positive values in the price columns, and IQR-based
    outlier counts for the price columns. Only price columns that exist in
    ``data`` are inspected.

    Args:
        data: DataFrame of market data.

    Returns:
        DataFrame indexed by column name with ``'Missing Count'`` and
        ``'Missing %'`` columns (every column of ``data`` is included).
    """
    print("=== DATA QUALITY ASSESSMENT ===")

    # 1. Missing values: only columns that actually have gaps are printed
    print("\n1. Missing Values:")
    null_counts = data.isnull().sum()
    missing_info = pd.DataFrame({
        'Missing Count': null_counts,
        'Missing %': (null_counts / len(data)) * 100
    })
    has_gaps = missing_info['Missing Count'] > 0
    print(missing_info[has_gaps])

    # 2. Duplicated index entries
    print(f"\n2. Duplicate Dates: {data.index.duplicated().sum()}")

    # Price columns present in this dataset
    present_price_cols = [
        name
        for name in ('Open', 'High', 'Low', 'Close', 'Adj Close')
        if name in data.columns
    ]

    # 3. Non-positive prices
    print(f"\n3. Zero/Negative Prices:")
    for col in present_price_cols:
        zero_count = (data[col] <= 0).sum()
        if zero_count > 0:
            print(f"   {col}: {zero_count} zero/negative values")

    # 4. Values outside 1.5 * IQR beyond the quartiles
    print(f"\n4. Outlier Analysis (IQR Method):")
    for col in present_price_cols:
        series = data[col]
        first_quartile = series.quantile(0.25)
        third_quartile = series.quantile(0.75)
        fence = 1.5 * (third_quartile - first_quartile)
        too_low = series < first_quartile - fence
        too_high = series > third_quartile + fence
        outliers = (too_low | too_high).sum()
        print(f"   {col}: {outliers} outliers ({outliers/len(data)*100:.1f}%)")

    return missing_info

def clean_financial_data(data):
    """Return a cleaned copy of ``data``; the input is not modified.

    Cleaning steps, in order:

    1. Drop rows whose index value repeats an earlier one (first kept).
    2. Fill missing values forward, then backward for any leading gaps.
    3. Winsorize each available price column at its 1st/99th percentile.

    Args:
        data: DataFrame of market data indexed by date.

    Returns:
        The cleaned DataFrame.
    """
    print("\n=== DATA CLEANING ===")
    cleaned_data = data.copy()

    # Remove any duplicate dates
    duplicate_rows = cleaned_data.index.duplicated(keep='first')
    duplicate_count = duplicate_rows.sum()
    if duplicate_count > 0:
        print(f"Removing {duplicate_count} duplicate dates")
        cleaned_data = cleaned_data[~duplicate_rows]

    # Handle missing values. DataFrame.ffill()/bfill() replace the deprecated
    # fillna(method=...) spelling and produce the same result.
    missing_before = cleaned_data.isnull().sum().sum()
    if missing_before > 0:
        print(f"Filling {missing_before} missing values using forward fill")
        cleaned_data = cleaned_data.ffill().bfill()

    # Winsorize outliers (cap at 1st and 99th percentiles)
    # NOTE(review): the caps are quantiles of the whole column of price
    # levels, so they use information from the full sample and will clip the
    # extremes of a trending series - confirm this is the intended treatment.
    price_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close']
    available_cols = [col for col in price_columns if col in cleaned_data.columns]

    outliers_treated = 0
    for col in available_cols:
        lower_cap = cleaned_data[col].quantile(0.01)
        upper_cap = cleaned_data[col].quantile(0.99)

        outliers_count = ((cleaned_data[col] < lower_cap) | (cleaned_data[col] > upper_cap)).sum()
        if outliers_count > 0:
            cleaned_data[col] = cleaned_data[col].clip(lower=lower_cap, upper=upper_cap)
            outliers_treated += outliers_count
            print(f"Winsorized {outliers_count} outliers in {col}")

    print(f"Total outliers treated: {outliers_treated}")
    print(f"Cleaned dataset shape: {cleaned_data.shape}")

    return cleaned_data

# Assess and clean the data
# The assessment runs on the raw data; cleaning returns a new DataFrame and
# leaves stock_data untouched.
quality_info = assess_data_quality(stock_data)
cleaned_data = clean_financial_data(stock_data)

# Visualize the cleaned data in a 2x2 grid of diagnostic plots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Price series
axes[0, 0].plot(cleaned_data.index, cleaned_data['Close'], linewidth=1)
axes[0, 0].set_title('Stock Price Over Time')
axes[0, 0].set_ylabel('Price ($)')
axes[0, 0].grid(True, alpha=0.3)

# Volume series (panel stays empty when the dataset has no Volume column)
if 'Volume' in cleaned_data.columns:
    axes[0, 1].plot(cleaned_data.index, cleaned_data['Volume'], color='orange', linewidth=1)
    axes[0, 1].set_title('Trading Volume Over Time')
    axes[0, 1].set_ylabel('Volume')
    axes[0, 1].grid(True, alpha=0.3)

# Daily price change distribution (first row is NaN and is dropped)
price_changes = cleaned_data['Close'].pct_change().dropna()
axes[1, 0].hist(price_changes, bins=50, alpha=0.7, color='green')
axes[1, 0].set_title('Daily Return Distribution')
axes[1, 0].set_xlabel('Daily Return')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].grid(True, alpha=0.3)

# Box plot of price columns (only those present in the dataset)
price_cols = [col for col in ['Open', 'High', 'Low', 'Close'] if col in cleaned_data.columns]
if price_cols:
    cleaned_data[price_cols].boxplot(ax=axes[1, 1])
    axes[1, 1].set_title('Price Distribution (OHLC)')
    axes[1, 1].set_ylabel('Price ($)')
    axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nData cleaning completed. Ready for feature engineering!")

## Section 3: Feature Engineering Pipeline

### Feature Engineering Workflow

```mermaid
graph TD
    A[Clean Data] --> B[Basic Features]
    B --> C[Technical Indicators]
    C --> D[Volatility Features]
    D --> E[Momentum Features]
    E --> F[Lag Features]
    F --> G[Feature Matrix]
    
    B --> B1[Returns, Log Returns]
    C --> C1[MA, RSI, MACD]
    D --> D1[Rolling Vol, Daily Range, Overnight Gap]
    E --> E1[Price Momentum, ROC]
    F --> F1[Lagged Returns, Prices]
```

Let's systematically create a comprehensive set of financial features.

In [None]:
# Comprehensive Feature Engineering Functions

def calculate_returns(data, price_col='Close'):
    """Build simple and log return features from one price column.

    Args:
        data: DataFrame containing ``price_col``.
        price_col: Name of the price column to use.

    Returns:
        DataFrame on ``data``'s index with 1-period ``simple_return`` and
        ``log_return`` plus ``return_{p}d`` / ``log_return_{p}d`` for
        p in 5, 10, 20. The first ``p`` rows of each p-period column are NaN.
    """
    close = data[price_col]
    out = pd.DataFrame(index=data.index)

    # One-period returns
    out['simple_return'] = close.pct_change()
    out['log_return'] = np.log(close / close.shift(1))

    # Multi-period returns over 5, 10 and 20 rows
    for period in (5, 10, 20):
        earlier = close.shift(period)
        out[f'return_{period}d'] = close.pct_change(period)
        out[f'log_return_{period}d'] = np.log(close / earlier)

    return out

def calculate_moving_averages(data, price_col='Close'):
    """Build moving-average features from one price column.

    Args:
        data: DataFrame containing ``price_col``.
        price_col: Name of the price column to use.

    Returns:
        DataFrame on ``data``'s index with, for each simple window
        (10, 20, 50) and each exponential span (10, 20), the average itself
        and the ratio of the price to that average.
    """
    close = data[price_col]
    out = pd.DataFrame(index=data.index)

    # Simple moving averages: NaN until a full window is available
    for window in (10, 20, 50):
        simple_avg = close.rolling(window=window).mean()
        out[f'SMA_{window}'] = simple_avg
        out[f'price_to_SMA_{window}'] = close / simple_avg

    # Exponential moving averages (pandas default weighting for `span`)
    for window in (10, 20):
        exp_avg = close.ewm(span=window).mean()
        out[f'EMA_{window}'] = exp_avg
        out[f'price_to_EMA_{window}'] = close / exp_avg

    return out

def calculate_volatility_features(data, price_col='Close'):
    """Build volatility and price-range features.

    Args:
        data: DataFrame containing ``price_col`` and, optionally, ``High``,
            ``Low`` and ``Open`` columns.
        price_col: Name of the price column to use.

    Returns:
        DataFrame on ``data``'s index with:

        * ``volatility_{w}d`` for w in 5, 10, 20: rolling standard deviation
          of 1-period returns, scaled by sqrt(252).
        * ``volatility_{w}d_norm``: that value divided by its own 60-row
          rolling mean.
        * ``daily_range`` (only when ``High`` and ``Low`` exist).
        * ``overnight_gap`` (only when ``Open`` exists).
    """
    features = pd.DataFrame(index=data.index)

    # Returns for volatility calculation
    returns = data[price_col].pct_change()

    # Rolling volatility (different windows)
    for window in [5, 10, 20]:
        features[f'volatility_{window}d'] = returns.rolling(window=window).std() * np.sqrt(252)
        features[f'volatility_{window}d_norm'] = features[f'volatility_{window}d'] / features[f'volatility_{window}d'].rolling(60).mean()

    # High-low range as a fraction of the price
    if all(col in data.columns for col in ['High', 'Low']):
        features['daily_range'] = (data['High'] - data['Low']) / data[price_col]

    # The gap needs Open, not High/Low, so it is guarded separately; this
    # avoids a KeyError on datasets that carry High/Low but no Open column.
    if 'Open' in data.columns:
        previous_price = data[price_col].shift(1)
        features['overnight_gap'] = (data['Open'] - previous_price) / previous_price

    return features

def calculate_rsi(prices, window=14):
    """Calculate the Relative Strength Index with simple rolling averages.

    Average gain and average loss are plain rolling means of the positive
    and negative price changes over ``window`` changes (no exponential
    smoothing is applied).

    Args:
        prices: Series of prices.
        window: Number of price changes in each average.

    Returns:
        Series of RSI values on the index of ``prices``. The first ``window``
        entries are NaN because ``window`` real price changes need
        ``window + 1`` prices. The value is 100 when the window contains no
        losses, and NaN when it contains neither gains nor losses (0 / 0).
    """
    delta = prices.diff()
    # clip() leaves the leading NaN from diff() in place, so the first window
    # is not padded with a fake zero change; a NaN anywhere in a window makes
    # that window's RSI NaN.
    gain = delta.clip(lower=0).rolling(window=window).mean()
    loss = (-delta).clip(lower=0).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

def calculate_macd(prices, fast=12, slow=26, signal=9):
    """Calculate MACD line, signal line and histogram.

    Args:
        prices: Series of prices.
        fast: Span of the fast exponential moving average.
        slow: Span of the slow exponential moving average.
        signal: Span of the moving average applied to the MACD line.

    Returns:
        tuple: ``(macd_line, signal_line, histogram)`` where the MACD line
        is fast EMA minus slow EMA, the signal line is an EMA of the MACD
        line, and the histogram is their difference.
    """
    def _ema(series, span):
        # pandas exponentially weighted mean with default settings
        return series.ewm(span=span).mean()

    macd_line = _ema(prices, fast) - _ema(prices, slow)
    signal_line = _ema(macd_line, signal)

    return macd_line, signal_line, macd_line - signal_line

def calculate_bollinger_bands(prices, window=20, num_std=2):
    """Calculate Bollinger Bands around a simple moving average.

    Args:
        prices: Series of prices.
        window: Rolling window length for the mean and standard deviation.
        num_std: Band distance from the mean, in rolling standard deviations.

    Returns:
        tuple: ``(upper_band, lower_band, middle_band)``; all are NaN until
        a full window is available.
    """
    rolling_prices = prices.rolling(window=window)
    middle_band = rolling_prices.mean()
    band_offset = rolling_prices.std() * num_std

    return middle_band + band_offset, middle_band - band_offset, middle_band

def calculate_technical_indicators(data, price_col='Close'):
    """Build RSI, MACD, Bollinger Band, momentum and rate-of-change features.

    Args:
        data: DataFrame containing ``price_col``.
        price_col: Name of the price column to use.

    Returns:
        DataFrame on ``data``'s index holding the indicator values plus
        0/1 flag columns (RSI above 70 / below 30, MACD above its signal).
    """
    prices = data[price_col]
    features = pd.DataFrame(index=data.index)

    # RSI and its threshold flags (a NaN RSI yields 0 for both flags)
    rsi = calculate_rsi(prices)
    features['RSI'] = rsi
    features['RSI_overbought'] = (rsi > 70).astype(int)
    features['RSI_oversold'] = (rsi < 30).astype(int)

    # MACD line, signal line, histogram and a crossover flag
    features['MACD'], features['MACD_signal'], features['MACD_histogram'] = calculate_macd(prices)
    features['MACD_bullish'] = (features['MACD'] > features['MACD_signal']).astype(int)

    # Bollinger Bands, relative band width, and price position inside the band
    band_high, band_low, band_mid = calculate_bollinger_bands(prices)
    band_span = band_high - band_low
    features['BB_upper'] = band_high
    features['BB_lower'] = band_low
    features['BB_middle'] = band_mid
    features['BB_width'] = band_span / band_mid
    features['BB_position'] = (prices - band_low) / band_span

    # Price momentum: fractional change over the period
    for period in (5, 10, 20):
        features[f'momentum_{period}'] = prices / prices.shift(period) - 1

    # Rate of Change: percentage change over the period
    for period in (5, 10):
        earlier = prices.shift(period)
        features[f'ROC_{period}'] = ((prices - earlier) / earlier) * 100

    return features

def calculate_lag_features(data, price_col='Close', max_lags=5):
    """Build lagged return and lagged relative-price features.

    Args:
        data: DataFrame containing ``price_col``.
        price_col: Name of the price column to use.
        max_lags: Largest lag (in rows) to generate; lags run 1..max_lags.

    Returns:
        DataFrame on ``data``'s index with ``return_lag_{k}`` (the 1-period
        return from k rows earlier) followed by ``price_lag_{k}`` (the price
        from k rows earlier divided by the current price).
    """
    close = data[price_col]
    daily_returns = close.pct_change()
    lag_steps = range(1, max_lags + 1)

    out = pd.DataFrame(index=data.index)

    # Lagged returns first, so the column order is all returns, then prices
    for lag in lag_steps:
        out[f'return_lag_{lag}'] = daily_returns.shift(lag)

    # Lagged prices, normalized by the current price
    for lag in lag_steps:
        out[f'price_lag_{lag}'] = close.shift(lag) / close

    return out

# Apply all feature engineering steps
# Each step reads cleaned_data and returns a DataFrame on the same index, so
# the results can be concatenated column-wise below.
print("Starting comprehensive feature engineering...")

# Step 1: Basic return features
print("1. Calculating returns...")
return_features = calculate_returns(cleaned_data)

# Step 2: Moving averages
print("2. Calculating moving averages...")
ma_features = calculate_moving_averages(cleaned_data)

# Step 3: Volatility features
print("3. Calculating volatility features...")
vol_features = calculate_volatility_features(cleaned_data)

# Step 4: Technical indicators
print("4. Calculating technical indicators...")
tech_features = calculate_technical_indicators(cleaned_data)

# Step 5: Lag features
print("5. Calculating lag features...")
lag_features = calculate_lag_features(cleaned_data)

# Combine all features
feature_matrix = pd.concat([
    return_features,
    ma_features, 
    vol_features,
    tech_features,
    lag_features
], axis=1)

# Add target variable (next day return)
# shift(-1) pulls the following row's return onto the current row, so the
# last row has no target and is removed by dropna() below.
feature_matrix['target'] = return_features['simple_return'].shift(-1)

# Remove rows with missing values
# This drops the warm-up rows of every rolling/lagged feature as well as the
# final row without a target.
initial_shape = feature_matrix.shape
feature_matrix = feature_matrix.dropna()
final_shape = feature_matrix.shape

print(f"\nFeature Engineering Complete!")
print(f"Initial shape: {initial_shape}")
print(f"Final shape: {final_shape}")
print(f"Total features created: {final_shape[1] - 1}")  # Excluding target
print(f"Date range: {feature_matrix.index.min().date()} to {feature_matrix.index.max().date()}")

# Display feature summary
print(f"\nFeature Categories:")
feature_names = feature_matrix.columns.tolist()
feature_names.remove('target')

# Categories are assigned by substring match on the column name; a column can
# match more than one category (e.g. 'volatility_5d_norm' only matches
# Volatility, while a name containing both 'return' and 'lag' is counted under
# Lag Features only).
categories = {
    'Returns': [f for f in feature_names if 'return' in f and 'lag' not in f],
    'Moving Averages': [f for f in feature_names if any(x in f for x in ['SMA', 'EMA', 'price_to'])],
    'Volatility': [f for f in feature_names if 'volatility' in f or 'range' in f or 'gap' in f],
    'Technical Indicators': [f for f in feature_names if any(x in f for x in ['RSI', 'MACD', 'BB', 'momentum', 'ROC'])],
    'Lag Features': [f for f in feature_names if 'lag' in f]
}

for category, features in categories.items():
    print(f"  {category}: {len(features)} features")

# Last expression of the cell: the notebook renders the first rows.
feature_matrix.head()

## Section 4: Feature Correlation Analysis

Understanding relationships between features is crucial for model performance and interpretability.

In [None]:
# Feature Correlation Analysis
def analyze_feature_correlations(data, target_col='target', threshold=0.8):
    """Compute feature-feature and feature-target correlations.

    Args:
        data: DataFrame holding the feature columns and ``target_col``.
        target_col: Name of the target column.
        threshold: Absolute correlation above which a feature pair is
            reported as highly correlated.

    Returns:
        tuple: ``(corr_matrix, high_corr_pairs, target_corrs)`` where
        ``corr_matrix`` is the feature correlation matrix,
        ``high_corr_pairs`` is a list of dicts with keys ``'Feature1'``,
        ``'Feature2'`` and ``'Correlation'`` (signed value) in column-scan
        order, and ``target_corrs`` is the absolute correlation of every
        feature with the target, sorted in descending order.
    """
    target = data[target_col]
    features = data.drop(columns=[target_col])

    # Pairwise correlations between features
    corr_matrix = features.corr()
    names = corr_matrix.columns

    # Walk the strict upper triangle row by row so each pair is visited once
    row_idx, col_idx = np.triu_indices(len(names), k=1)
    high_corr_pairs = [
        {
            'Feature1': names[i],
            'Feature2': names[j],
            'Correlation': corr_matrix.iloc[i, j]
        }
        for i, j in zip(row_idx, col_idx)
        if abs(corr_matrix.iloc[i, j]) > threshold
    ]

    # Absolute correlation of each feature with the target, strongest first
    target_corrs = features.corrwith(target).abs().sort_values(ascending=False)

    return corr_matrix, high_corr_pairs, target_corrs

# Perform correlation analysis
print("Analyzing feature correlations...")
corr_matrix, high_corr_pairs, target_correlations = analyze_feature_correlations(feature_matrix)

# Display high correlation pairs
print(f"\nHigh Correlation Pairs (|correlation| > 0.8):")
if high_corr_pairs:
    # First 10 pairs in column-scan order; the list is not ranked by strength
    for pair in high_corr_pairs[:10]:
        print(f"  {pair['Feature1']} <-> {pair['Feature2']}: {pair['Correlation']:.3f}")
else:
    print("  No highly correlated pairs found.")

# Display top target correlations (absolute values, strongest first)
print(f"\nTop 10 Features Correlated with Target:")
for feature, corr in target_correlations.head(10).items():
    print(f"  {feature}: {corr:.4f}")

# Create correlation heatmap for selected features
plt.figure(figsize=(16, 12))

# Select top correlated features with target for visualization
top_features = target_correlations.head(20).index.tolist()
selected_data = feature_matrix[top_features + ['target']]
selected_corr = selected_data.corr()

# Create heatmap
# The mask hides the upper triangle including the diagonal, so only the lower
# triangle is drawn.
mask = np.triu(np.ones_like(selected_corr))
sns.heatmap(selected_corr, 
            mask=mask,
            annot=True, 
            cmap='RdBu_r', 
            center=0,
            square=True,
            fmt='.2f',
            cbar_kws={"shrink": .8})

plt.title('Feature Correlation Heatmap (Top 20 Features)', fontsize=14, pad=20)
plt.tight_layout()
plt.show()

# Feature-Target correlation bar plot
plt.figure(figsize=(12, 8))
top_15_corrs = target_correlations.head(15)
# Bar length is the absolute correlation; the colour carries the sign
# (red = negative, blue = non-negative), recomputed from the signed values.
colors = ['red' if x < 0 else 'blue' for x in feature_matrix[top_15_corrs.index].corrwith(feature_matrix['target'])]

bars = plt.barh(range(len(top_15_corrs)), top_15_corrs.values, color=colors, alpha=0.7)
plt.yticks(range(len(top_15_corrs)), top_15_corrs.index)
plt.xlabel('Absolute Correlation with Target')
plt.title('Top 15 Features by Target Correlation')
plt.grid(True, alpha=0.3)

# Add value labels just to the right of each bar
for i, (bar, val) in enumerate(zip(bars, top_15_corrs.values)):
    plt.text(val + 0.001, bar.get_y() + bar.get_height()/2, 
             f'{val:.3f}', va='center', fontsize=9)

plt.tight_layout()
plt.show()

print(f"Correlation analysis completed!")

## Section 5: Principal Component Analysis (PCA)

### Dimensionality Reduction Workflow

```mermaid
graph LR
    A[Feature Matrix] --> B[Standardization]
    B --> C[PCA Fitting]
    C --> D[Component Selection]
    D --> E[Transformation]
    E --> F[Reduced Dataset]
    
    C --> C1[Explained Variance]
    D --> D1[80-95% Threshold]
```

PCA helps us reduce dimensionality while retaining most of the variance in our data.

In [None]:
# Principal Component Analysis
def perform_pca_analysis(data, target_col='target', n_components=None, variance_threshold=0.90):
    """Standardize the features and project them onto principal components.

    Args:
        data: DataFrame holding the feature columns and ``target_col``.
        target_col: Name of the target column (excluded from the PCA).
        n_components: Passed to ``sklearn.decomposition.PCA``. When ``None``
            the smallest number of components whose cumulative explained
            variance reaches ``variance_threshold`` is used.
        variance_threshold: Cumulative explained-variance ratio to reach
            when ``n_components`` is ``None``.

    Returns:
        tuple: ``(pca, pca_df, scaler, X_scaled)`` - the fitted PCA model, a
        DataFrame of component scores (``PC1``, ``PC2``, ...) plus the
        target column, the fitted ``StandardScaler``, and the standardized
        feature array.
    """

    # Separate features and target
    X = data.drop(columns=[target_col])
    y = data[target_col]

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Determine number of components if not specified
    if n_components is None:
        # Find components needed for variance threshold
        pca_temp = PCA()
        pca_temp.fit(X_scaled)
        cumsum_variance = np.cumsum(pca_temp.explained_variance_ratio_)
        reached = cumsum_variance >= variance_threshold
        # argmax of an all-False mask is 0, which would silently select a
        # single component; when the threshold is never reached (e.g. a
        # threshold of 1.0 missed by rounding) keep every component instead.
        if reached.any():
            n_components = int(np.argmax(reached)) + 1
        else:
            n_components = len(cumsum_variance)
        print(f"Selected {n_components} components to explain {variance_threshold*100}% of variance")

    # Fit PCA
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)

    # Create DataFrame with principal components
    # Name columns from the actual output width rather than n_components, so
    # non-integer settings accepted by PCA do not break the naming.
    pca_df = pd.DataFrame(
        X_pca,
        index=data.index,
        columns=[f'PC{i+1}' for i in range(X_pca.shape[1])]
    )
    pca_df[target_col] = y

    return pca, pca_df, scaler, X_scaled

# Perform PCA analysis
# n_components is fixed at 10 here, so the variance-threshold selection inside
# perform_pca_analysis is not used by this call.
print("Performing Principal Component Analysis...")
pca_model, pca_data, feature_scaler, scaled_features = perform_pca_analysis(feature_matrix, n_components=10)

print(f"\nPCA Results:")
print(f"Number of components: {pca_model.n_components_}")
print(f"Total variance explained: {pca_model.explained_variance_ratio_.sum():.4f}")

# Display explained variance by component
variance_explained = pca_model.explained_variance_ratio_
cumulative_variance = np.cumsum(variance_explained)

print(f"\nVariance Explained by Component:")
for i, (var, cum_var) in enumerate(zip(variance_explained, cumulative_variance)):
    print(f"  PC{i+1}: {var:.4f} (Cumulative: {cum_var:.4f})")

# Visualize PCA results
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Explained variance by component
# Labels are attached to the artists themselves: a labels-only legend() call
# pairs labels with lines before bar containers, which would swap the two
# entries here.
axes[0, 0].bar(range(1, len(variance_explained) + 1), variance_explained,
               alpha=0.7, color='blue', label='Individual')
axes[0, 0].plot(range(1, len(cumulative_variance) + 1), cumulative_variance,
                'o-', color='red', label='Cumulative')
axes[0, 0].set_xlabel('Principal Component')
axes[0, 0].set_ylabel('Explained Variance Ratio')
axes[0, 0].set_title('PCA Explained Variance')
axes[0, 0].grid(True, alpha=0.3)
axes[0, 0].legend()

# 2. First two principal components
scatter = axes[0, 1].scatter(pca_data['PC1'], pca_data['PC2'], c=pca_data['target'],
                           cmap='RdBu_r', alpha=0.6, s=20)
axes[0, 1].set_xlabel('First Principal Component')
axes[0, 1].set_ylabel('Second Principal Component')
axes[0, 1].set_title('Data in PC Space (colored by target)')
plt.colorbar(scatter, ax=axes[0, 1])

# 3. Feature loadings for PC1
feature_names = feature_matrix.drop(columns=['target']).columns
# Show at most 10 loadings; fewer when the feature set is smaller than that.
n_top_loadings = min(10, len(feature_names))
pc1_loadings = pca_model.components_[0]
top_features_pc1 = np.argsort(np.abs(pc1_loadings))[-n_top_loadings:]
axes[1, 0].barh(range(n_top_loadings), pc1_loadings[top_features_pc1])
axes[1, 0].set_yticks(range(n_top_loadings))
axes[1, 0].set_yticklabels([feature_names[i] for i in top_features_pc1])
axes[1, 0].set_xlabel('Loading')
axes[1, 0].set_title('Top 10 Feature Loadings for PC1')
axes[1, 0].grid(True, alpha=0.3)

# 4. Feature loadings for PC2
pc2_loadings = pca_model.components_[1]
top_features_pc2 = np.argsort(np.abs(pc2_loadings))[-n_top_loadings:]
axes[1, 1].barh(range(n_top_loadings), pc2_loadings[top_features_pc2])
axes[1, 1].set_yticks(range(n_top_loadings))
axes[1, 1].set_yticklabels([feature_names[i] for i in top_features_pc2])
axes[1, 1].set_xlabel('Loading')
axes[1, 1].set_title('Top 10 Feature Loadings for PC2')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Analyze principal components: five largest absolute loadings, strongest first
print(f"\nPrincipal Component Analysis:")
print(f"PC1 (explains {variance_explained[0]:.1%} of variance):")
pc1_top_features = np.argsort(np.abs(pc1_loadings))[-5:]
for idx in reversed(pc1_top_features):
    print(f"  {feature_names[idx]}: {pc1_loadings[idx]:.3f}")

print(f"\nPC2 (explains {variance_explained[1]:.1%} of variance):")
pc2_top_features = np.argsort(np.abs(pc2_loadings))[-5:]
for idx in reversed(pc2_top_features):
    print(f"  {feature_names[idx]}: {pc2_loadings[idx]:.3f}")

print(f"\nPCA transformation completed!")

## Section 6: Feature Importance Analysis

Using tree-based methods to identify the most predictive features in our dataset.

In [None]:
# Feature Importance Analysis using Random Forest
def analyze_feature_importance(data, target_col='target', n_estimators=100):
    """Rank features by Random Forest importance.

    Rows with a missing feature or target value are excluded before
    fitting. In ``SAMPLE_MODE`` the forest is limited to depth 10 and a
    single job; otherwise depth is unlimited and all cores are used.

    Args:
        data: DataFrame holding the feature columns and ``target_col``.
        target_col: Name of the target column.
        n_estimators: Number of trees in the forest.

    Returns:
        tuple: ``(rf, importance_df, X_clean, y_clean)`` - the fitted
        regressor, a DataFrame with ``'Feature'`` and ``'Importance'``
        columns sorted by descending importance, and the feature frame and
        target series actually used for fitting.
    """
    y = data[target_col]
    X = data.drop(columns=[target_col])

    # Keep only rows that are complete in both the features and the target
    complete_rows = X.notnull().all(axis=1) & y.notnull()
    X_clean = X[complete_rows]
    y_clean = y[complete_rows]

    print(f"Training Random Forest on {X_clean.shape[0]} samples with {X_clean.shape[1]} features...")

    # Lighter settings for quick runs, full settings otherwise
    if SAMPLE_MODE:
        jobs, depth_limit = 1, 10
    else:
        jobs, depth_limit = -1, None

    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=SEED,
        n_jobs=jobs,
        max_depth=depth_limit
    )
    rf.fit(X_clean, y_clean)

    # Pair each column with its importance score, most important first
    importance_df = pd.DataFrame({
        'Feature': X_clean.columns,
        'Importance': rf.feature_importances_
    })
    importance_df = importance_df.sort_values('Importance', ascending=False)

    return rf, importance_df, X_clean, y_clean

# Perform feature importance analysis
print("Analyzing feature importance with Random Forest...")
rf_model, importance_rankings, X_clean, y_clean = analyze_feature_importance(feature_matrix)

print(f"\nRandom Forest Model Performance:")
# In-sample score: the model is evaluated on the same rows it was fitted on,
# so this is not an estimate of out-of-sample performance.
train_score = rf_model.score(X_clean, y_clean)
print(f"Training R²: {train_score:.4f}")

print(f"\nTop 15 Most Important Features:")
for idx, row in importance_rankings.head(15).iterrows():
    print(f"  {row['Feature']}: {row['Importance']:.4f}")

# Visualize feature importance
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Top 20 feature importance bar plot (reversed so the largest bar is on top)
top_20_features = importance_rankings.head(20)
axes[0, 0].barh(range(len(top_20_features)), top_20_features['Importance'][::-1], alpha=0.7)
axes[0, 0].set_yticks(range(len(top_20_features)))
axes[0, 0].set_yticklabels(top_20_features['Feature'][::-1])
axes[0, 0].set_xlabel('Importance Score')
axes[0, 0].set_title('Top 20 Features by Random Forest Importance')
axes[0, 0].grid(True, alpha=0.3)

# 2. Feature importance distribution
axes[0, 1].hist(importance_rankings['Importance'], bins=30, alpha=0.7, color='green')
axes[0, 1].set_xlabel('Importance Score')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Distribution of Feature Importance Scores')
axes[0, 1].grid(True, alpha=0.3)

# 3. Cumulative importance (rankings are already sorted, largest first)
cumulative_importance = np.cumsum(importance_rankings['Importance'])
axes[1, 0].plot(range(1, len(cumulative_importance) + 1), cumulative_importance, 'b-')
axes[1, 0].axhline(y=0.8, color='r', linestyle='--', label='80% Threshold')
axes[1, 0].axhline(y=0.9, color='orange', linestyle='--', label='90% Threshold')
axes[1, 0].set_xlabel('Number of Features')
axes[1, 0].set_ylabel('Cumulative Importance')
axes[1, 0].set_title('Cumulative Feature Importance')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# 4. Feature category importance
# Categories are assigned by substring match on the feature name. 'gap' is
# included under Volatility so that overnight_gap is counted in the pie
# chart, matching the category rules used when the features were built.
categories = {
    'Returns': [f for f in importance_rankings['Feature'] if 'return' in f and 'lag' not in f],
    'Moving Averages': [f for f in importance_rankings['Feature'] if any(x in f for x in ['SMA', 'EMA', 'price_to'])],
    'Volatility': [f for f in importance_rankings['Feature'] if 'volatility' in f or 'range' in f or 'gap' in f],
    'Technical': [f for f in importance_rankings['Feature'] if any(x in f for x in ['RSI', 'MACD', 'BB', 'momentum', 'ROC'])],
    'Lag Features': [f for f in importance_rankings['Feature'] if 'lag' in f]
}

category_importance = {}
for category, features in categories.items():
    category_features = importance_rankings[importance_rankings['Feature'].isin(features)]
    category_importance[category] = category_features['Importance'].sum()

category_names = list(category_importance.keys())
category_scores = list(category_importance.values())

axes[1, 1].pie(category_scores, labels=category_names, autopct='%1.1f%%', startangle=90)
axes[1, 1].set_title('Feature Importance by Category')

plt.tight_layout()
plt.show()

# Find features needed for 80% and 90% importance
# argmax returns the position of the first True, i.e. the first rank at which
# the running total reaches the threshold; +1 converts it to a count.
features_80 = np.argmax(cumulative_importance >= 0.8) + 1
features_90 = np.argmax(cumulative_importance >= 0.9) + 1

print(f"\nFeature Selection Analysis:")
print(f"Features needed for 80% importance: {features_80} ({features_80/len(importance_rankings)*100:.1f}%)")
print(f"Features needed for 90% importance: {features_90} ({features_90/len(importance_rankings)*100:.1f}%)")

# Export top features for later use
top_features_80 = importance_rankings.head(features_80)['Feature'].tolist()
top_features_90 = importance_rankings.head(features_90)['Feature'].tolist()

print(f"\nFeature importance analysis completed!")

In [None]:
# Export Engineered Dataset
def export_engineered_data(feature_data, importance_data, filename='engineered_financial_data.csv'):
    """Export the engineered dataset, a top-features subset, and JSON metadata.

    Parameters:
    - feature_data: DataFrame of engineered features; it must contain a
      'target' column, and its index min/max must provide a .date() method
    - importance_data: DataFrame with a 'Feature' column; its first
      features_80 rows name the columns written to the top-features file
    - filename: Base CSV file name (default 'engineered_financial_data.csv')

    Also reads the module-level names features_80, features_90, data_source
    and SAMPLE_MODE, which must be defined before this function is called.

    Returns:
    - (full_path, top_features_path, metadata_path)
    """
    import json

    # Create output directory (Kaggle kernels write to /kaggle/working)
    output_dir = '/kaggle/working' if '/kaggle' in os.getcwd() else 'output'
    os.makedirs(output_dir, exist_ok=True)

    # The feature counts may be NumPy integers (e.g. the result of
    # np.argmax(...) + 1). json cannot encode those, so default=str would
    # store them as strings ("12" instead of 12). Convert to plain ints first.
    n_features_80 = int(features_80)
    n_features_90 = int(features_90)

    # Full dataset export
    full_path = os.path.join(output_dir, filename)
    feature_data.to_csv(full_path)

    # Top features dataset (80% importance)
    top_features = importance_data.head(n_features_80)['Feature'].tolist() + ['target']
    top_features_data = feature_data[top_features]
    top_features_path = os.path.join(output_dir, 'top_features_' + filename)
    top_features_data.to_csv(top_features_path)

    # Feature metadata
    metadata = {
        'creation_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'data_source': data_source,
        'sample_mode': SAMPLE_MODE,
        'total_features': len(feature_data.columns) - 1,
        'total_samples': len(feature_data),
        'date_range': f"{feature_data.index.min().date()} to {feature_data.index.max().date()}",
        'top_features_80_pct': n_features_80,
        'top_features_90_pct': n_features_90
    }

    metadata_path = os.path.join(output_dir, 'feature_metadata.json')
    # Explicit encoding keeps the file identical across platforms
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, default=str)

    print(f"Exported datasets:")
    print(f"  Full dataset: {full_path} ({feature_data.shape})")
    print(f"  Top features: {top_features_path} ({top_features_data.shape})")
    print(f"  Metadata: {metadata_path}")

    return full_path, top_features_path, metadata_path

# Run the export, then print a short summary of the whole pipeline
export_paths = export_engineered_data(feature_matrix, importance_rankings)

best_feature = importance_rankings.iloc[0]
# One column of the feature matrix is excluded from the feature count
engineered_feature_count = len(feature_matrix.columns) - 1

print("\n🎉 Feature Engineering Pipeline Complete!")
print(f"📊 Created {engineered_feature_count} features from raw financial data")
print(f"📈 Random Forest R²: {train_score:.4f}")
print(f"🔍 Top feature: {best_feature['Feature']} (importance: {best_feature['Importance']:.4f})")
print("💾 Datasets exported and ready for modeling!")

## Section 7: Student Exercises

Complete the following exercises to master advanced feature engineering techniques.

### Exercise 1: Manual MACD Calculation

Implement MACD (Moving Average Convergence Divergence) from scratch and compare with any existing implementation.

**Components:**
- MACD Line = EMA(12) - EMA(26)
- Signal Line = EMA(9) of MACD Line  
- Histogram = MACD Line - Signal Line

In [None]:
# TODO: Exercise 1 - Manual MACD Calculation
# Your task:
# 1. Implement MACD calculation manually using pandas ewm() function
# 2. Create MACD line, signal line, and histogram
# 3. Visualize the MACD components with the price chart
# 4. Identify bullish/bearish crossovers (MACD > Signal vs MACD < Signal)

def calculate_macd_manual(prices, fast_period=12, slow_period=26, signal_period=9):
    """
    Compute MACD (Moving Average Convergence Divergence) with pandas ewm().

    Parameters:
    - prices: pandas Series of price data
    - fast_period: Period for fast EMA (default 12)
    - slow_period: Period for slow EMA (default 26)
    - signal_period: Period for signal line EMA (default 9)

    Returns:
    - macd_line: Fast EMA - Slow EMA
    - signal_line: EMA of MACD line
    - histogram: MACD line - Signal line

    All three results share the index of `prices`.
    """
    # adjust=False gives the recursive EMA: y[t] = (1 - a) * y[t-1] + a * x[t]
    # with a = 2 / (span + 1), seeded with the first observation.
    fast_ema = prices.ewm(span=fast_period, adjust=False).mean()
    slow_ema = prices.ewm(span=slow_period, adjust=False).mean()

    # MACD line: distance between the fast and the slow average
    macd_line = fast_ema - slow_ema

    # Signal line: smoothed MACD line
    signal_line = macd_line.ewm(span=signal_period, adjust=False).mean()

    # Histogram: positive while MACD is above its signal line, negative below
    histogram = macd_line - signal_line

    return macd_line, signal_line, histogram

# TODO: Test your MACD implementation by uncommenting the call below.
# NOTE(review): assumes `cleaned_data` with a 'Close' column exists from an
# earlier cell - confirm the variable name and column casing before running.
# macd, signal, hist = calculate_macd_manual(cleaned_data['Close'])

# TODO: Create visualization comparing price and MACD
# Use three stacked subplots showing:
# 1. Price chart
# 2. MACD line and signal line
# 3. MACD histogram

# Reminder printed whenever this cell is run
print("Exercise 1: Implement the MACD calculation above!")

### Exercise 2: Bollinger Bands Implementation

Create Bollinger Bands and analyze price position relative to the bands.

**Components:**
- Middle Band = 20-period Simple Moving Average
- Upper Band = Middle Band + (2 × 20-period Rolling Standard Deviation)
- Lower Band = Middle Band - (2 × 20-period Rolling Standard Deviation)

In [None]:
# EXERCISE 2: Bollinger Bands Implementation
# Your task:
# 1. Calculate 20-period SMA as middle band
# 2. Calculate rolling standard deviation
# 3. Create upper and lower bands (middle ± 2×std)
# 4. Calculate band width and price position within bands
# 5. Identify when price touches or breaks bands

def calculate_bollinger_bands(prices, window=20, num_std=2):
    """
    Compute Bollinger Bands plus band width and relative price position.

    Parameters:
    - prices: pandas Series of price data
    - window: Period for moving average and std (default 20)
    - num_std: Number of standard deviations (default 2)

    Returns:
    - upper_band: Middle band + num_std * rolling std
    - middle_band: Rolling mean (SMA) of the prices
    - lower_band: Middle band - num_std * rolling std
    - band_width: (Upper - Lower) / Middle
    - price_position: (Price - Lower) / (Upper - Lower)

    The results are NaN until a full window of prices is available.
    """
    # One rolling view supplies both the mean and the standard deviation
    rolling_prices = prices.rolling(window=window)
    sma = rolling_prices.mean()
    deviation = rolling_prices.std()

    # The bands sit symmetrically around the moving average
    band_offset = num_std * deviation
    band_top = sma + band_offset
    band_bottom = sma - band_offset

    # Band span relative to the average acts as a volatility measure
    band_span = band_top - band_bottom
    relative_width = band_span / sma

    # 0 means the price is on the lower band, 1 on the upper band
    relative_position = (prices - band_bottom) / band_span

    return band_top, sma, band_bottom, relative_width, relative_position

# Test your Bollinger Bands implementation
upper, middle, lower, width, position = calculate_bollinger_bands(df['close'])

# Create comprehensive visualization: four stacked panels in one figure
fig, axes = plt.subplots(4, 1, figsize=(15, 16))
price_ax, width_ax, position_ax, hist_ax = axes
dates = df.index

# Threshold markers drawn on both price-position panels
threshold_marks = [
    (0.8, 'red', 'Upper Threshold (80%)'),
    (0.2, 'green', 'Lower Threshold (20%)'),
]

# 1. Price chart with Bollinger Bands
price_ax.plot(dates, df['close'], label='Close Price', color='black', linewidth=2)
for band, band_label, band_color in (
    (upper, 'Upper Band', 'red'),
    (middle, 'Middle Band (SMA)', 'blue'),
    (lower, 'Lower Band', 'green'),
):
    price_ax.plot(dates, band, label=band_label, color=band_color, alpha=0.7)
price_ax.fill_between(dates, upper, lower, alpha=0.1, color='gray')
price_ax.set_title('Bollinger Bands Analysis')
price_ax.set_ylabel('Price')
price_ax.legend()
price_ax.grid(True, alpha=0.3)

# 2. Band width over time (volatility indicator)
mean_width = width.mean()
width_ax.plot(dates, width, color='purple', linewidth=2)
width_ax.set_title('Bollinger Band Width (Volatility Measure)')
width_ax.set_ylabel('Band Width')
width_ax.grid(True, alpha=0.3)
width_ax.axhline(y=mean_width, color='red', linestyle='--', alpha=0.7, label=f'Mean: {mean_width:.3f}')
width_ax.legend()

# 3. Price position within bands
position_ax.plot(dates, position, color='orange', linewidth=2)
position_ax.axhline(y=0.5, color='blue', linestyle='-', alpha=0.5, label='Middle')
for level, mark_color, mark_label in threshold_marks:
    position_ax.axhline(y=level, color=mark_color, linestyle='--', alpha=0.7, label=mark_label)
position_ax.set_title('Price Position Within Bollinger Bands (0=Lower Band, 1=Upper Band)')
position_ax.set_ylabel('Position')
position_ax.set_ylim(-0.1, 1.1)
position_ax.legend()
position_ax.grid(True, alpha=0.3)

# 4. Price position histogram
hist_ax.hist(position.dropna(), bins=50, color='skyblue', alpha=0.7, edgecolor='black')
hist_ax.axvline(x=0.5, color='blue', linestyle='-', alpha=0.7, label='Middle (50%)')
for level, mark_color, mark_label in threshold_marks:
    hist_ax.axvline(x=level, color=mark_color, linestyle='--', alpha=0.7, label=mark_label)
hist_ax.set_title('Distribution of Price Position Within Bands')
hist_ax.set_xlabel('Price Position')
hist_ax.set_ylabel('Frequency')
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Analysis: Band Breakouts and Trading Signals
breakouts_upper = position > 1.0  # Price above upper band
breakouts_lower = position < 0.0  # Price below lower band
squeeze_periods = width < width.quantile(0.2)  # Narrowest 20% of band widths

print("Bollinger Bands Analysis:")
print(f"Upper band breakouts: {breakouts_upper.sum()} occurrences")
print(f"Lower band breakouts: {breakouts_lower.sum()} occurrences")
print(f"Squeeze periods (low volatility): {squeeze_periods.sum()} days")
print(f"Average band width: {mean_width:.4f}")
print("Price position statistics:")
print(position.describe())

# Analyze the relationship between band width and future volatility.
# shift(-5) moves each 5-day rolling std back so it lines up with the day
# that precedes the window it was measured over.
daily_returns = df['close'].pct_change()
future_vol = daily_returns.rolling(5).std().shift(-5)
width_vol_corr = width.corr(future_vol)
print(f"\nCorrelation between band width and 5-day forward volatility: {width_vol_corr:.4f}")

print("\nExercise 2 completed! Bollinger Bands implemented with comprehensive analysis.")

### Exercise 3: PCA vs Feature Importance Comparison

Compare the features that load most heavily on the leading PCA components with the top-ranked features from the Random Forest importance ranking.

**Analysis Points:**
- Which features load heavily on the first few principal components?
- How do PCA loadings correlate with feature importance rankings?
- When might PCA be preferred over feature selection and vice versa?

In [None]:
# EXERCISE 3: PCA vs Feature Importance Comparison
# Compare PCA loadings with Random Forest feature importance rankings

# Engineered feature columns used by both analyses
feature_columns = [
    'returns', 'log_returns',
    'sma_5', 'sma_20',
    'volatility_5', 'volatility_20',
    'rsi', 'macd', 'macd_signal',
    'bb_upper', 'bb_middle', 'bb_lower', 'bb_width', 'bb_position',
]

# Keep only the rows in which every selected feature is available
features_df = df[feature_columns].dropna()
print(f"Feature matrix shape: {features_df.shape}")
print(f"Features used: {features_df.columns.tolist()}")

# Standardize features for PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_df)

# 1. PCA Analysis
section_rule = "=" * 50
print("\n" + section_rule)
print("PCA ANALYSIS")
print(section_rule)

# Fit PCA on the standardized features, keeping all components
pca = PCA()
pca_components = pca.fit_transform(features_scaled)

# Report the share of variance per component together with the running total
cumsum_var = np.cumsum(pca.explained_variance_ratio_)
print("Explained variance by component:")
leading_shares = zip(pca.explained_variance_ratio_[:6], cumsum_var)
for pc_number, (variance_share, running_total) in enumerate(leading_shares, start=1):
    print(f"PC{pc_number}: {variance_share:.3f} ({running_total:.3f} cumulative)")

# Loadings table: one row per feature, one column per leading component
pc_labels = ['PC1', 'PC2', 'PC3']
loadings = pd.DataFrame(
    pca.components_[:len(pc_labels)].T,
    columns=pc_labels,
    index=features_df.columns,
)

print("\nPCA Loadings for first 3 components:")
print(loadings.round(3))

# List the three features with the largest absolute loading on each component
print("\nTop contributing features per component:")
for pc in pc_labels:
    strongest = loadings[pc].abs().sort_values(ascending=False).head(3)
    summary = ', '.join([f'{feat}({val:.3f})' for feat, val in strongest.items()])
    print(f"{pc}: {summary}")

# 2. Random Forest Feature Importance
print("\n" + "="*50)
print("RANDOM FOREST FEATURE IMPORTANCE")
print("="*50)

# Create target variable (next day returns for prediction).
# features_df lost rows to dropna(), so the target must be matched to it by
# index label. Trimming both by position only works when no rows were dropped;
# otherwise the lengths differ and each row is paired with the wrong return.
next_day_returns = df['returns'].shift(-1)
target = next_day_returns.loc[features_df.index].dropna()  # last row has no next-day return
features_for_rf = features_df.loc[target.index]

# Fit Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10)
rf.fit(features_for_rf, target)

# Get feature importance, most important feature first
importance_df = pd.DataFrame({
    'feature': features_for_rf.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print("Random Forest Feature Importance Rankings:")
for feature_name, score in zip(importance_df['feature'], importance_df['importance']):
    print(f"{feature_name}: {score:.4f}")

# 3. Comparison Analysis
print("\n" + "="*50)
print("PCA vs FEATURE IMPORTANCE COMPARISON")
print("="*50)

# Create comparison visualization
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Plot 1: PCA Explained Variance
component_numbers = range(1, len(pca.explained_variance_ratio_) + 1)
axes[0, 0].bar(component_numbers, pca.explained_variance_ratio_,
               alpha=0.7, color='steelblue')
# The format string carries only marker and line style. The colour is given
# once, by keyword, so matplotlib does not warn about a redundant definition.
axes[0, 0].plot(component_numbers, cumsum_var,
                'o-', markersize=6, color='red')
axes[0, 0].set_xlabel('Principal Component')
axes[0, 0].set_ylabel('Explained Variance Ratio')
axes[0, 0].set_title('PCA Explained Variance')
axes[0, 0].grid(True, alpha=0.3)
axes[0, 0].axhline(y=0.95, color='green', linestyle='--', alpha=0.7, label='95% Threshold')
axes[0, 0].legend()

# Plot 2: Feature Importance
top_features = importance_df.head(10)
axes[0, 1].barh(range(len(top_features)), top_features['importance'],
                color='forestgreen', alpha=0.7)
axes[0, 1].set_yticks(range(len(top_features)))
axes[0, 1].set_yticklabels(top_features['feature'])
axes[0, 1].set_xlabel('Importance Score')
axes[0, 1].set_title('Random Forest Feature Importance (Top 10)')
axes[0, 1].grid(True, alpha=0.3)

# Plot 3: PCA Loadings Heatmap
import seaborn as sns
sns.heatmap(loadings.T, annot=True, cmap='RdBu_r', center=0,
            ax=axes[1, 0], cbar_kws={'label': 'Loading'})
axes[1, 0].set_title('PCA Loadings (First 3 Components)')
axes[1, 0].set_xlabel('Features')

# Plot 4: Correlation between PCA loadings and RF importance
# Calculate correlation for PC1 loadings vs RF importance
pc1_loadings_abs = loadings['PC1'].abs()
rf_importance_dict = dict(zip(importance_df['feature'], importance_df['importance']))
# Series.corr aligns both operands on their index labels, so the importance
# scores need the same feature-name index as the loadings. With a default
# integer index the two share no labels and the correlation comes out as NaN.
rf_scores_aligned = pd.Series(
    [rf_importance_dict[feat] for feat in pc1_loadings_abs.index],
    index=pc1_loadings_abs.index,
)

axes[1, 1].scatter(pc1_loadings_abs, rf_scores_aligned, alpha=0.7, s=60, color='purple')
for feature, loading, score in zip(pc1_loadings_abs.index, pc1_loadings_abs, rf_scores_aligned):
    axes[1, 1].annotate(feature, (loading, score),
                        xytext=(5, 5), textcoords='offset points', fontsize=8)

correlation = pc1_loadings_abs.corr(rf_scores_aligned)
axes[1, 1].set_xlabel('|PC1 Loading|')
axes[1, 1].set_ylabel('RF Feature Importance')
axes[1, 1].set_title(f'PC1 Loadings vs RF Importance\n(Correlation: {correlation:.3f})')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 4. Analysis Summary
summary_rule = "=" * 50
print("\n" + summary_rule)
print("ANALYSIS SUMMARY")
print(summary_rule)

# Number of leading components whose cumulative variance first reaches 95%
components_95 = np.argmax(cumsum_var >= 0.95) + 1
print(f"Components needed for 95% variance: {components_95}")

# Five strongest features according to each method
top_rf_features = set(importance_df['feature'].head(5))
pc1_by_strength = loadings['PC1'].abs().sort_values(ascending=False)
top_pc1_features = set(pc1_by_strength.head(5).index)

common_features = top_rf_features & top_pc1_features
print(f"Top 5 RF features: {top_rf_features}")
print(f"Top 5 PC1 features: {top_pc1_features}")
print(f"Common top features: {common_features}")

# When to use each method
recommendation_lines = [
    "\nRECOMMENDATIONS:",
    "• Use PCA when:",
    "  - High multicollinearity (VIF > 10)",
    f"  - Need dimensionality reduction ({components_95} components vs {len(feature_columns)} features)",
    "  - Want orthogonal features",
    "  - Interpretability is less important",
    "• Use Feature Selection when:",
    "  - Interpretability is crucial",
    "  - Want to maintain original feature meanings",
    "  - Low correlation between features",
    "  - Need domain-specific feature insights",
]
for line in recommendation_lines:
    print(line)

print("\nExercise 3 completed! PCA vs Feature Importance analysis finished.")