In [11]:
# Import required libraries
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from tqdm import tqdm
import warnings

# Root logger: INFO and above, rendered as "<timestamp> - <level> - <message>"
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Silence every warning category for the rest of the session
warnings.filterwarnings(action='ignore')

# Sector labels the dataset is expected to contain (compared against the
# values found in the 'Sector' column when the data is loaded)
EXPECTED_SECTORS = {
    'Information Technology', 'Financials',
    'Healthcare', 'Energy',
    'Consumer Discretionary', 'Consumer Staples',
    'Industrials', 'Materials',
    'Utilities', 'Real Estate',
    'Communication Services',
}

# Load validated data
def load_validated_data():
    """Load the validated dataset and run basic sanity checks on it.

    The file ``../data/processed/merged_data.csv`` is preferred; when it does
    not exist, ``../data/enriched/nasdaq_validated.csv`` is used instead.
    After loading, the required columns are verified, ``Date`` is parsed to
    datetime, and sector / ticker coverage is written to the log.

    Returns:
        pandas.DataFrame: the loaded data with ``Date`` converted to datetime.

    Raises:
        Exception: with the message "Cannot proceed without validated data"
            whenever loading or validation fails. The underlying error is
            logged and attached as ``__cause__``.
    """
    candidate_paths = (
        Path('../data/processed/merged_data.csv'),   # preferred location
        Path('../data/enriched/nasdaq_validated.csv'),  # fallback location
    )

    try:
        data_path = next((path for path in candidate_paths if path.exists()), None)
        if data_path is None:
            raise FileNotFoundError("No validated data found. Please run data validation notebook first.")

        df = pd.read_csv(data_path)
        logging.info(f"Loaded data from {data_path}")

        # Check the schema before touching any column, so that a missing
        # 'Date' is reported as a missing required column rather than as a
        # bare KeyError from the datetime conversion below.
        required_cols = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Sector', 'Ticker']
        missing_cols = set(required_cols) - set(df.columns)
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")

        # Convert date to datetime
        df['Date'] = pd.to_datetime(df['Date'])

        # Check sector coverage
        found_sectors = set(df['Sector'].unique())
        missing_sectors = EXPECTED_SECTORS - found_sectors
        if missing_sectors:
            logging.warning(f"Missing expected sectors: {missing_sectors}")

        logging.info(f"Found {len(found_sectors)} sectors: {sorted(found_sectors)}")

        # Log the tickers that belong to each sector
        for sector in sorted(found_sectors):
            tickers = sorted(df[df['Sector'] == sector]['Ticker'].unique())
            logging.info(f"{sector}: {len(tickers)} stocks - {tickers}")

        logging.info(f"Successfully loaded data: {len(df)} rows, {df['Ticker'].nunique()} stocks")
        logging.info(f"Date range: {df['Date'].min():%Y-%m-%d} to {df['Date'].max():%Y-%m-%d}")

        return df

    except Exception as e:
        logging.error(f"Error loading data: {str(e)}")
        # Keep the generic exception type callers already see, but chain the
        # root cause explicitly so the traceback shows what actually failed.
        raise Exception("Cannot proceed without validated data") from e


In [12]:
# Technical indicator calculation functions
def calculate_moving_averages(data, periods):
    """Add simple and exponential moving averages of ``Close``.

    For every window ``p`` in ``periods`` two columns are written to ``data``
    in place: ``MA_p`` (rolling mean over ``p`` rows) and ``EMA_p``
    (exponentially weighted mean with ``span=p`` and ``adjust=False``).

    Returns:
        The same ``data`` object, with the new columns attached.
    """
    close = data['Close']
    for window in periods:
        simple_avg = close.rolling(window=window).mean()
        exp_avg = close.ewm(span=window, adjust=False).mean()
        data[f'MA_{window}'] = simple_avg
        data[f'EMA_{window}'] = exp_avg
    return data

def calculate_rsi(data, period=14):
    """Calculate the Relative Strength Index of ``Close``.

    Gains and losses are the positive and negative parts of the row-to-row
    change in ``Close``; each is averaged with a simple rolling mean over
    ``period`` rows, and ``RSI = 100 - 100 / (1 + avg_gain / avg_loss)``.
    The result is written to ``data['RSI']`` in place.

    Args:
        data: DataFrame with a ``Close`` column, ordered chronologically.
        period: number of price changes in each averaging window.

    Returns:
        The same ``data`` object with the ``RSI`` column added.

    Notes:
        * The first row has no price change. That undefined change is kept as
          NaN instead of being counted as a zero move, so the first RSI value
          appears once ``period`` real changes are available.
        * A window with no losses yields RSI 100; a window with neither gains
          nor losses yields NaN (0 / 0).
    """
    delta = data['Close'].diff()
    # clip() leaves NaN untouched, unlike where(cond, 0) which would turn the
    # undefined first change into a fabricated "no movement" observation.
    gain = delta.clip(lower=0).rolling(window=period).mean()
    # Negate before clipping so rows without a loss are +0.0 rather than -0.0.
    loss = (-delta).clip(lower=0).rolling(window=period).mean()
    rs = gain / loss
    data['RSI'] = 100 - (100 / (1 + rs))
    return data

def calculate_bollinger_bands(data, period=20, num_std=2):
    """Calculate Bollinger Bands around a rolling mean of ``Close``.

    Writes three columns to ``data`` in place:

    * ``BB_Upper`` - rolling mean plus ``num_std`` rolling standard deviations
    * ``BB_Lower`` - rolling mean minus ``num_std`` rolling standard deviations
    * ``BB_Width`` - ``(BB_Upper - BB_Lower)`` divided by the rolling mean

    Args:
        data: DataFrame with a ``Close`` column.
        period: rolling window length for both the mean and the deviation.
        num_std: band half-width in standard deviations. The default of 2
            matches the multiplier that used to be fixed in the code.

    Returns:
        The same ``data`` object with the band columns added.
    """
    rolling_close = data['Close'].rolling(window=period)
    bb_ma = rolling_close.mean()
    band_offset = rolling_close.std() * num_std
    data['BB_Upper'] = bb_ma + band_offset
    data['BB_Lower'] = bb_ma - band_offset
    data['BB_Width'] = (data['BB_Upper'] - data['BB_Lower']) / bb_ma
    return data

def calculate_macd(data, fast=12, slow=26, signal=9):
    """Add MACD line, signal line and histogram computed from ``Close``.

    ``MACD`` is the ``fast``-span EMA minus the ``slow``-span EMA,
    ``MACD_Signal`` is the ``signal``-span EMA of that line, and
    ``MACD_Hist`` is the difference between the two. All EMAs use
    ``adjust=False``. Columns are written to ``data`` in place and the same
    object is returned.
    """
    close = data['Close']
    fast_ema = close.ewm(span=fast, adjust=False).mean()
    slow_ema = close.ewm(span=slow, adjust=False).mean()

    macd_line = fast_ema - slow_ema
    signal_line = macd_line.ewm(span=signal, adjust=False).mean()

    data['MACD'] = macd_line
    data['MACD_Signal'] = signal_line
    data['MACD_Hist'] = macd_line - signal_line
    return data

def calculate_atr(data, period=14):
    """Add the Average True Range as column ``ATR``.

    The true range of a row is the largest of ``High - Low``,
    ``|High - previous Close|`` and ``|Low - previous Close|``; ``ATR`` is its
    rolling mean over ``period`` rows. ``data`` is modified in place and
    returned.
    """
    prev_close = data['Close'].shift()
    range_candidates = pd.concat(
        [
            data['High'] - data['Low'],
            (data['High'] - prev_close).abs(),
            (data['Low'] - prev_close).abs(),
        ],
        axis=1,
    )
    # Row-wise max skips NaN, so the first row (no previous close) falls back
    # to High - Low.
    data['ATR'] = range_candidates.max(axis=1).rolling(window=period).mean()
    return data

def calculate_volume_indicators(data, period=20):
    """Add volume-derived columns ``OBV``, ``Volume_MA`` and ``Volume_Ratio``.

    * ``OBV`` - running total of ``Volume`` signed by the direction of the
      row-to-row change in ``Close`` (rows with no defined change add 0).
    * ``Volume_MA`` - rolling mean of ``Volume`` over ``period`` rows.
    * ``Volume_Ratio`` - ``Volume`` divided by ``Volume_MA``.

    ``data`` is modified in place and returned.
    """
    # On-balance volume: +volume on up moves, -volume on down moves
    direction = np.sign(data['Close'].diff())
    signed_volume = direction * data['Volume']
    data['OBV'] = signed_volume.fillna(0).cumsum()

    # Average volume and the current volume relative to it
    volume_ma = data['Volume'].rolling(window=period).mean()
    data['Volume_MA'] = volume_ma
    data['Volume_Ratio'] = data['Volume'] / volume_ma
    return data

def calculate_momentum_indicators(data, period=14, k_period=None, d_period=3):
    """Calculate Rate of Change and the stochastic oscillator.

    Writes three columns to ``data`` in place:

    * ``ROC`` - percentage change of ``Close`` over ``period`` rows.
    * ``%K``  - position of ``Close`` inside the rolling ``Low``/``High``
      range, scaled to 0-100.
    * ``%D``  - rolling mean of ``%K``.

    Args:
        data: DataFrame with ``Close``, ``High`` and ``Low`` columns.
        period: look-back for ``ROC`` and, unless ``k_period`` is given, for
            the stochastic range as well.
        k_period: optional separate look-back for the ``%K`` high/low range.
            ``None`` (default) reuses ``period``.
        d_period: window of the ``%D`` moving average (default 3).

    Returns:
        The same ``data`` object with the new columns added.

    Notes:
        When the rolling high equals the rolling low the ``%K`` denominator
        is zero and the result is NaN/inf, as produced by pandas division.
    """
    stoch_window = period if k_period is None else k_period

    # Rate of Change
    data['ROC'] = data['Close'].pct_change(periods=period) * 100

    # Stochastic Oscillator
    low_min = data['Low'].rolling(window=stoch_window).min()
    high_max = data['High'].rolling(window=stoch_window).max()
    data['%K'] = ((data['Close'] - low_min) / (high_max - low_min)) * 100
    data['%D'] = data['%K'].rolling(window=d_period).mean()
    return data

def calculate_price_indicators(data):
    """Add return and volatility columns derived from ``Close``.

    * ``Daily_Return`` - row-to-row percentage change.
    * ``Log_Return``   - natural log of ``Close`` over the previous ``Close``.
    * ``Volatility``   - 20-row rolling standard deviation of ``Log_Return``
      multiplied by ``sqrt(252)``.

    ``data`` is modified in place and returned.
    """
    close = data['Close']
    data['Daily_Return'] = close.pct_change()

    log_return = np.log(close / close.shift(1))
    data['Log_Return'] = log_return

    rolling_std = log_return.rolling(window=20).std()
    data['Volatility'] = rolling_std * np.sqrt(252)
    return data

def calculate_sector_indicators(data):
    """Add sector-relative columns when a ``Sector_Close`` column is present.

    With ``Sector_Close`` available, three columns are written in place:
    ``Sector_Return`` (percentage change of ``Sector_Close``),
    ``Relative_Strength`` (``Close`` divided by ``Sector_Close``) and
    ``RS_MA`` (20-row rolling mean of ``Relative_Strength``). Without it,
    ``data`` is returned unchanged.
    """
    if 'Sector_Close' not in data.columns:
        return data

    sector_close = data['Sector_Close']
    data['Sector_Return'] = sector_close.pct_change()

    relative_strength = data['Close'] / sector_close
    data['Relative_Strength'] = relative_strength
    data['RS_MA'] = relative_strength.rolling(window=20).mean()
    return data


In [13]:
# Main feature engineering functions
def calculate_all_features(df, config):
    """Run every indicator calculation on a date-sorted copy of ``df``.

    The input frame is copied and sorted by ``Date``, then passed in turn
    through the moving-average, RSI, Bollinger, MACD, ATR, volume, momentum,
    price and sector indicator functions. Parameters come from ``config``
    keys ``ma_periods``, ``rsi_period``, ``bb_period``, ``macd_params``
    (``fast``/``slow``/``signal``), ``atr_period``, ``obv_period`` and
    ``roc_period``.

    Note: the momentum step receives only ``config['roc_period']``; a
    ``stoch_params`` entry in ``config`` is not read by this function.

    Returns:
        A new DataFrame containing the original columns plus all indicators.

    Raises:
        Any exception from the steps above is logged and re-raised unchanged.
    """
    try:
        # Each .pipe() stage runs to completion before the next stage's
        # arguments are looked up, so config keys are read in call order.
        return (
            df.copy()
            .sort_values('Date')
            .pipe(calculate_moving_averages, config['ma_periods'])
            .pipe(calculate_rsi, config['rsi_period'])
            .pipe(calculate_bollinger_bands, config['bb_period'])
            .pipe(
                calculate_macd,
                fast=config['macd_params']['fast'],
                slow=config['macd_params']['slow'],
                signal=config['macd_params']['signal'],
            )
            .pipe(calculate_atr, config['atr_period'])
            .pipe(calculate_volume_indicators, config['obv_period'])
            .pipe(calculate_momentum_indicators, config['roc_period'])
            .pipe(calculate_price_indicators)
            .pipe(calculate_sector_indicators)
        )
    except Exception as e:
        logging.error(f"Error calculating features: {str(e)}")
        raise

def process_stock_data(df, ticker, config):
    """Compute features and summary statistics for a single ticker.

    Args:
        df: DataFrame holding rows for many tickers (needs ``Ticker``,
            ``Sector``, ``Date``, ``Volume`` plus the price columns consumed
            by ``calculate_all_features``).
        ticker: value of the ``Ticker`` column to select.
        config: feature configuration forwarded to ``calculate_all_features``.

    Returns:
        tuple: ``(stock_df, feature_stats)`` where ``stock_df`` is the
        feature-enriched frame for the ticker and ``feature_stats`` is a dict
        with keys ``ticker``, ``sector``, ``start_date``, ``end_date``,
        ``trading_days``, ``avg_volume``, ``avg_volatility``,
        ``sharpe_ratio`` and ``correlation_with_sector``.
        On any error the problem is logged and ``(None, None)`` is returned.

    Notes:
        * ``correlation_with_sector`` is NaN when no ``Sector_Return`` column
          was produced (that column is only created when the input carries
          ``Sector_Close``); the stock is still processed.
        * ``sharpe_ratio`` is NaN when the standard deviation of daily
          returns is zero or undefined, instead of dividing by zero.
    """
    try:
        # Get stock data
        stock_df = df[df['Ticker'] == ticker].copy()
        if stock_df.empty:
            raise ValueError(f"No data found for ticker {ticker}")

        # Calculate features
        stock_df = calculate_all_features(stock_df, config)

        daily_return = stock_df['Daily_Return']

        # Annualised mean/std ratio of daily returns; guard the denominator.
        return_std = daily_return.std()
        if return_std > 0:
            sharpe_ratio = daily_return.mean() / return_std * np.sqrt(252)
        else:
            sharpe_ratio = np.nan

        # Sector_Return is optional, so do not let its absence discard the
        # whole stock through the except branch below.
        if 'Sector_Return' in stock_df.columns:
            sector_correlation = daily_return.corr(stock_df['Sector_Return'])
        else:
            sector_correlation = np.nan

        # Calculate feature statistics
        feature_stats = {
            'ticker': ticker,
            'sector': stock_df['Sector'].iloc[0],
            'start_date': stock_df['Date'].min(),
            'end_date': stock_df['Date'].max(),
            'trading_days': len(stock_df),
            'avg_volume': stock_df['Volume'].mean(),
            'avg_volatility': stock_df['Volatility'].mean(),
            'sharpe_ratio': sharpe_ratio,
            'correlation_with_sector': sector_correlation
        }

        return stock_df, feature_stats

    except Exception as e:
        logging.error(f"Error processing {ticker}: {str(e)}")
        return None, None


In [14]:
# Load data and prepare for processing
try:
    # Read the validated dataset produced by the earlier notebook
    df = load_validated_data()

    # Parameters consumed by the feature-engineering functions
    FEATURE_CONFIG = {
        'price_features': ['Open', 'High', 'Low', 'Close', 'Volume'],
        # Windows for the simple / exponential moving averages
        'ma_periods': [5, 10, 20, 50, 100, 200],
        # RSI window
        'rsi_period': 14,
        # Bollinger Bands window
        'bb_period': 20,
        # MACD: fast EMA span, slow EMA span, signal-line span
        'macd_params': {'fast': 12, 'slow': 26, 'signal': 9},
        # Average True Range window
        'atr_period': 14,
        # Window for the volume moving average
        'obv_period': 20,
        # Rate of Change look-back
        'roc_period': 12,
        # Stochastic settings: %K period, %K smoothing, %D period
        'stoch_params': {'k_period': 14, 'k_smooth': 3, 'd_period': 3},
    }

    # Sector ETF ticker -> sector name
    SECTOR_ETF_MAP = {
        'XLK': 'Information Technology',
        'XLF': 'Financials',
        'XLV': 'Healthcare',
        'XLE': 'Energy',
        'XLY': 'Consumer Discretionary',
        'XLP': 'Consumer Staples',
        'XLI': 'Industrials',
        'XLB': 'Materials',
        'XLU': 'Utilities',
        'XLRE': 'Real Estate',
        'XLC': 'Communication Services',
    }

    # Legacy sector spellings -> standardized names. The padded 'Technology '
    # key cannot match once the column has been stripped; it is kept so the
    # mapping itself is unchanged.
    SECTOR_NAME_MAP = {
        'Technology': 'Information Technology',
        'Technology ': 'Information Technology',
        'Consumer_Discretionary': 'Consumer Discretionary',
        'Consumer_Staples': 'Consumer Staples',
        'Communication_Services': 'Communication Services',
    }

    # Trim surrounding whitespace, then apply the standardized names
    df['Sector'] = df['Sector'].str.strip().replace(SECTOR_NAME_MAP)

    # Compare the sectors present in the data with the ETF-backed list
    sectors = sorted(df['Sector'].unique())
    expected_sectors = set(SECTOR_ETF_MAP.values())
    missing_sectors = expected_sectors.difference(sectors)
    extra_sectors = set(sectors).difference(expected_sectors)

    if missing_sectors:
        logging.warning(f"Missing expected sectors: {missing_sectors}")
    if extra_sectors:
        logging.warning(f"Found unexpected sectors: {extra_sectors}")

    logging.info(f"Found {len(sectors)} sectors: {sectors}")

    # Sector name -> sorted list of its tickers
    sector_tickers = {}
    for sector in sectors:
        in_sector = df['Sector'] == sector
        tickers = sorted(df.loc[in_sector, 'Ticker'].unique())
        sector_tickers[sector] = tickers
        logging.info(f"{sector}: {len(tickers)} stocks - {tickers}")

except Exception as e:
    logging.error(f"Error in data preparation: {str(e)}")
    raise

2025-07-26 07:35:57,089 - INFO - Loaded data from ..\data\processed\merged_data.csv
2025-07-26 07:35:57,205 - INFO - Found 11 sectors: ['Communication Services', 'Consumer Discretionary', 'Consumer Staples', 'Energy', 'Financials', 'Healthcare', 'Industrials', 'Information Technology', 'Materials', 'Real Estate', 'Utilities']
2025-07-26 07:35:57,249 - INFO - Communication Services: 6 stocks - ['CHTR', 'CMCSA', 'NFLX', 'SIRI', 'TMUS', 'WBD']
2025-07-26 07:35:57,295 - INFO - Consumer Discretionary: 14 stocks - ['ABNB', 'AMZN', 'BKNG', 'CVNA', 'DLTR', 'EBAY', 'LCID', 'MAR', 'MELI', 'ORLY', 'RIVN', 'ROST', 'SBUX', 'TSLA']
2025-07-26 07:35:57,331 - INFO - Consumer Staples: 6 stocks - ['COST', 'KDP', 'KHC', 'MDLZ', 'MNST', 'PEP']
2025-07-26 07:35:57,364 - INFO - Energy: 2 stocks - ['BKR', 'FANG']
2025-07-26 07:35:57,399 - INFO - Financials: 2 stocks - ['COIN', 'PYPL']
2025-07-26 07:35:57,449 - INFO - Healthcare: 11 stocks - ['ALGN', 'AMGN', 'BIIB', 'DXCM', 'GILD', 'IDXX', 'ILMN', 'ISRG', 'MR

In [15]:
# Process data and save results
try:
    # Create output directories if they don't exist
    enriched_dir = Path('../data/enriched')
    stocks_dir = enriched_dir / 'stocks'
    sectors_dir = enriched_dir / 'sectors'
    for dir_path in [enriched_dir, stocks_dir, sectors_dir]:
        dir_path.mkdir(parents=True, exist_ok=True)

    # Process stocks by sector
    feature_stats = {'stocks': [], 'sectors': []}

    for sector in tqdm(sectors, desc="Processing sectors"):
        try:
            logging.info(f"\nProcessing sector: {sector}")
            sector_ticks = sector_tickers[sector]

            # Skip if no tickers in sector
            if not sector_ticks:
                logging.warning(f"No stocks found for sector {sector}, skipping...")
                continue

            # Process each stock in the sector, remembering which tickers
            # actually produced a feature frame.
            processed_tickers = []
            sector_data = []
            sector_stock_stats = []
            for ticker in tqdm(sector_ticks, desc=f"Processing {sector} stocks", leave=False):
                stock_df, stats = process_stock_data(df, ticker, FEATURE_CONFIG)
                if stock_df is None:
                    continue
                processed_tickers.append(ticker)
                sector_data.append(stock_df)
                sector_stock_stats.append(stats)
                feature_stats['stocks'].append(stats)

            failed_tickers = [t for t in sector_ticks if t not in processed_tickers]
            if failed_tickers:
                logging.warning(f"{sector}: {len(failed_tickers)} stocks could not be processed - {failed_tickers}")

            if sector_data:
                # Combine sector data
                sector_df = pd.concat(sector_data, ignore_index=True)

                # Save individual stock files. Only successfully processed
                # tickers are written, straight from their own frames, so a
                # failed ticker no longer leaves a header-only CSV behind.
                for ticker, stock_data in zip(processed_tickers, sector_data):
                    stock_file = stocks_dir / f"{ticker}_features.csv"
                    stock_data.to_csv(stock_file, index=False)

                # Save sector data
                sector_file = sectors_dir / f"{sector.replace(' ', '_')}_features.csv"
                sector_df.to_csv(sector_file, index=False)
                logging.info(f"Saved {sector} features to {sector_file}")

                # Calculate sector statistics from this sector's stocks only.
                # Series.mean() skips NaN, so one stock with an undefined
                # statistic does not blank out the whole sector average.
                sector_stats_frame = pd.DataFrame(sector_stock_stats)
                sector_stats = {
                    'sector': sector,
                    # Number of stocks that were actually processed
                    'num_stocks': len(processed_tickers),
                    'avg_correlation': sector_stats_frame['correlation_with_sector'].mean(),
                    'avg_volatility': sector_stats_frame['avg_volatility'].mean(),
                    'avg_sharpe': sector_stats_frame['sharpe_ratio'].mean()
                }
                feature_stats['sectors'].append(sector_stats)

        except Exception as e:
            # Best effort: a failing sector is logged and the run continues.
            logging.error(f"Error processing sector {sector}: {str(e)}")
            continue

    # Save feature statistics
    stats_df = pd.DataFrame(feature_stats['stocks'])
    sector_stats_df = pd.DataFrame(feature_stats['sectors'])

    stats_df.to_csv(stocks_dir / 'stock_statistics.csv', index=False)
    sector_stats_df.to_csv(sectors_dir / 'sector_statistics.csv', index=False)

    logging.info("\nFeature engineering complete!")
    logging.info(f"Processed {len(feature_stats['stocks'])} stocks across {len(feature_stats['sectors'])} sectors")

    # Print sector coverage summary
    processed_sectors = set(s['sector'] for s in feature_stats['sectors'])
    missing_sectors = expected_sectors - processed_sectors
    if missing_sectors:
        logging.warning(f"Sectors with no processed data: {missing_sectors}")

    # Print stock distribution
    print("\nStock distribution by sector:")
    for sector_stats in sorted(feature_stats['sectors'], key=lambda x: x['sector']):
        print(f"- {sector_stats['sector']}: {sector_stats['num_stocks']} stocks")

except Exception as e:
    logging.error(f"Error in main processing: {str(e)}")
    raise

Processing sectors:   0%|          | 0/11 [00:00<?, ?it/s]2025-07-26 07:35:58,419 - INFO - 
Processing sector: Communication Services
2025-07-26 07:36:00,609 - INFO - Saved Communication Services features to ..\data\enriched\sectors\Communication_Services_features.csv
Processing sectors:   9%|▉         | 1/11 [00:02<00:21,  2.19s/it]2025-07-26 07:36:00,611 - INFO - 
Processing sector: Consumer Discretionary
2025-07-26 07:36:04,556 - INFO - Saved Consumer Discretionary features to ..\data\enriched\sectors\Consumer_Discretionary_features.csv
Processing sectors:  18%|█▊        | 2/11 [00:06<00:29,  3.22s/it]2025-07-26 07:36:04,559 - INFO - 
Processing sector: Consumer Staples
2025-07-26 07:36:06,447 - INFO - Saved Consumer Staples features to ..\data\enriched\sectors\Consumer_Staples_features.csv
Processing sectors:  27%|██▋       | 3/11 [00:08<00:20,  2.62s/it]2025-07-26 07:36:06,449 - INFO - 
Processing sector: Energy
2025-07-26 07:36:07,091 - INFO - Saved Energy features to ..\data\enr


Stock distribution by sector:
- Communication Services: 6 stocks
- Consumer Discretionary: 14 stocks
- Consumer Staples: 6 stocks
- Energy: 2 stocks
- Financials: 2 stocks
- Healthcare: 11 stocks
- Industrials: 7 stocks
- Information Technology: 51 stocks
- Materials: 3 stocks
- Real Estate: 3 stocks
- Utilities: 3 stocks
