In [19]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mutual_info_score
from pathlib import Path
import warnings
from datetime import datetime
import logging
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing
from scipy import stats
import networkx as nx

# Configure logging
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f'../reports/feature_selection_{timestamp}.log'
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler()
    ]
)

# Suppress warnings
warnings.filterwarnings('ignore')

# Set plotting style
sns.set_style("darkgrid")
sns.set_palette("husl", n_colors=11)  # One color per sector

# Customize plot appearance
plt.rcParams.update({
    'figure.figsize': [15, 8],
    'axes.grid': True,
    'axes.grid.which': 'both',
    'axes.grid.axis': 'both',
    'axes.facecolor': 'white',
    'figure.facecolor': 'white',
    'grid.color': '#CCCCCC',
    'grid.alpha': 0.5,
    'font.size': 12,
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
    'legend.frameon': True,
    'legend.framealpha': 0.8,
    'axes.spines.left': True,
    'axes.spines.bottom': True,
    'axes.spines.right': False,
    'axes.spines.top': False
})

# Create necessary directories
for dir_path in [
    '../data/smi_scores',
    '../reports/figures/features',
    '../reports/figures/sectors'
]:
    Path(dir_path).mkdir(parents=True, exist_ok=True)

# Set number of processes for parallel computation
N_PROCESSES = max(1, multiprocessing.cpu_count() - 1)

# Define feature groups with descriptions
FEATURE_GROUPS = {
    'Price': {
        'features': ['Open', 'High', 'Low', 'Close', 'Volume'],
        'description': 'Raw price and volume data'
    },
    'Technical': {
        'features': [
            'EMA_10', 'EMA_50',  # Trend following (renamed from Short/Long)
            'RSI',               # Momentum
            'MACD', 'MACD_Signal',  # Trend and momentum
            'MACD_Hist',         # Momentum divergence
            'BB_Upper', 'BB_Lower',  # Volatility bands
            'BB_Width'          # Volatility measure
        ],
        'description': 'Technical analysis indicators'
    },
    'Volatility': {
        'features': [
            'Volatility',        # Historical volatility
            'Daily_Return',      # Daily returns
            'Log_Return'         # Log returns
        ],
        'description': 'Volatility and return metrics'
    },
    'Volume': {
        'features': [
            'OBV',              # On-Balance Volume
            'Volume_MA',        # Volume moving average
            'Volume_Ratio'      # Current to average volume
        ],
        'description': 'Volume-based indicators'
    },
    'Sector': {
        'features': [
            'Relative_Strength',  # Relative strength vs sector
            'Sector_Return',     # Sector returns
            'RS_MA'             # Moving average of relative strength
        ],
        'description': 'Sector-relative indicators'
    }
}

# Feature selection configuration
FEATURE_CONFIG = {
    'min_samples': 100,        # Minimum samples required
    'min_class_pct': 5.0,      # Minimum class percentage
    'test_size': 0.2,          # Validation set size
    'cv_folds': 5,             # Cross-validation folds
    'smi_random_state': 42,    # Random state for reproducibility
    'correlation_threshold': 0.7  # Threshold for feature correlation
}

# Sector colors for consistent plotting
SECTOR_COLORS = {
    'Information Technology': '#2ecc71',
    'Healthcare': '#3498db',
    'Financials': '#9b59b6',
    'Consumer Discretionary': '#e74c3c',
    'Consumer Staples': '#f1c40f',
    'Energy': '#e67e22',
    'Industrials': '#1abc9c',
    'Materials': '#34495e',
    'Real Estate': '#95a5a6',
    'Utilities': '#7f8c8d',
    'Communication Services': '#d35400'
}

# Flatten feature list and add descriptions
FEATURE_COLUMNS = []
FEATURE_DESCRIPTIONS = {}
for group_name, group_info in FEATURE_GROUPS.items():
    FEATURE_COLUMNS.extend(group_info['features'])
    for feature in group_info['features']:
        FEATURE_DESCRIPTIONS[feature] = {
            'group': group_name,
            'description': group_info['description']
        }

logging.info(f"Configuration complete. Using {N_PROCESSES} processes for parallel computation.")
print(f"\nFeature Groups:")
for group, info in FEATURE_GROUPS.items():
    print(f"- {group} ({len(info['features'])} features): {info['description']}")
print(f"\nTotal features: {len(FEATURE_COLUMNS)}")


2025-07-27 22:31:41,208 - INFO - Configuration complete. Using 19 processes for parallel computation.



Feature Groups:
- Price (5 features): Raw price and volume data
- Technical (9 features): Technical analysis indicators
- Volatility (3 features): Volatility and return metrics
- Volume (3 features): Volume-based indicators
- Sector (3 features): Sector-relative indicators

Total features: 23


In [20]:
# Load and prepare dataset
try:
    # Load sector mapping for reference
    sector_mapping = pd.read_csv('../data/sector_mapping.csv')
    all_tickers = sector_mapping['Ticker'].unique()
    logging.info(f"Found {len(all_tickers)} tickers in sector mapping")
    
    # Load and combine enriched data from stocks directory
    stock_files = list(Path('../data/enriched/stocks').glob('*_features.csv'))
    if not stock_files:
        raise FileNotFoundError("No feature files found. Please run feature engineering notebook first.")
    
    # Load all stock data
    stock_data = []
    for file in tqdm(stock_files, desc="Loading stock data"):
        if 'statistics' not in file.name:  # Skip statistics file
            df = pd.read_csv(file)
            stock_data.append(df)
    
    # Combine all stock data
    df = pd.concat(stock_data, ignore_index=True)
    df['Date'] = pd.to_datetime(df['Date'])
    
    # Save combined dataset
    output_path = Path('../data/enriched/nasdaq_features.csv')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    logging.info(f"Combined dataset saved to {output_path}")
    
    # Verify data loading
    print("\nDataset Loading:")
    print(f"- Total records: {len(df):,}")
    print(f"- Date range: {df['Date'].min():%Y-%m-%d} to {df['Date'].max():%Y-%m-%d}")
    
    # Check for missing tickers
    available_tickers = df['Ticker'].unique()
    missing_tickers = set(all_tickers) - set(available_tickers)
    if missing_tickers:
        print("\nWarning: Missing data for tickers:")
        for ticker in sorted(missing_tickers):
            sector = sector_mapping[sector_mapping['Ticker'] == ticker]['Sector'].iloc[0]
            print(f"- {ticker} ({sector})")

# Get sectors and verify coverage
    sectors = df['Sector'].unique()
    sector_coverage = df.groupby('Sector')['Ticker'].nunique()
    total_sectors = sector_mapping['Sector'].nunique()
    
    if len(sectors) < total_sectors:
        missing_sectors = set(sector_mapping['Sector'].unique()) - set(sectors)
        print("\nWarning: Missing sectors:")
        for sector in missing_sectors:
            print(f"- {sector}")
    
    # Verify feature availability
    print("\nFeature Availability:")
    for group_name, group_info in FEATURE_GROUPS.items():
        available = [f for f in group_info['features'] if f in df.columns]
        missing = [f for f in group_info['features'] if f not in df.columns]
        
        print(f"\n{group_name} Features:")
        print(f"- Available ({len(available)}/{len(group_info['features'])}):")
        for feat in available:
            print(f"  * {feat}")
        
        if missing:
            print(f"- Missing ({len(missing)}):")
            for feat in missing:
                print(f"  * {feat}")
    
    # Data quality validation
    print("\nData Quality Validation:")
    quality_stats = {
        'total_stocks': len(available_tickers),
        'total_sectors': len(sectors),
        'avg_days_per_stock': df.groupby('Ticker').size().mean(),
        'min_days_per_stock': df.groupby('Ticker').size().min(),
        'missing_values_pct': (df[FEATURE_COLUMNS].isnull().sum() / len(df) * 100).mean(),
        'stocks_with_issues': []
    }
    
    # Validate each stock
    for ticker in tqdm(available_tickers, desc="Validating stocks"):
        stock_data = df[df['Ticker'] == ticker]
        issues = []
        
        # Check data volume
        if len(stock_data) < FEATURE_CONFIG['min_samples']:
            issues.append(f"Insufficient samples: {len(stock_data)}")
        
        # Check class balance
        if 'Label' in stock_data.columns:
            label_dist = stock_data['Label'].value_counts(normalize=True) * 100
            if label_dist.min() < FEATURE_CONFIG['min_class_pct']:
                issues.append(f"Class imbalance: min class = {label_dist.min():.1f}%")
        
        # Check missing values
        missing_pct = stock_data[FEATURE_COLUMNS].isnull().mean() * 100
        if missing_pct.max() > 0:
            issues.append(f"Missing values: max {missing_pct.max():.1f}%")
        
        if issues:
            quality_stats['stocks_with_issues'].append((ticker, issues))
    
    # Print quality summary
    print("\nQuality Summary:")
    print(f"- Stocks: {quality_stats['total_stocks']} across {quality_stats['total_sectors']} sectors")
    print(f"- Average days per stock: {quality_stats['avg_days_per_stock']:.1f}")
    print(f"- Minimum days per stock: {quality_stats['min_days_per_stock']}")
    print(f"- Average missing values: {quality_stats['missing_values_pct']:.2f}%")
    
    if quality_stats['stocks_with_issues']:
        print("\nStocks with Quality Issues:")
        for ticker, issues in quality_stats['stocks_with_issues']:
            print(f"- {ticker}:")
            for issue in issues:
                print(f"  * {issue}")
    
    # Save quality report
    quality_report_path = f'../reports/data_quality_{timestamp}.txt'
    with open(quality_report_path, 'w') as f:
        f.write("NASDAQ-100 Feature Selection Data Quality Report\n")
        f.write("=" * 50 + "\n\n")
        
        f.write("Dataset Overview:\n")
        f.write(f"- Total records: {len(df):,}\n")
        f.write(f"- Date range: {df['Date'].min():%Y-%m-%d} to {df['Date'].max():%Y-%m-%d}\n")
        f.write(f"- Stocks: {quality_stats['total_stocks']}\n")
        f.write(f"- Sectors: {quality_stats['total_sectors']}\n\n")
        
        f.write("Data Quality Metrics:\n")
        f.write(f"- Average days per stock: {quality_stats['avg_days_per_stock']:.1f}\n")
        f.write(f"- Minimum days per stock: {quality_stats['min_days_per_stock']}\n")
        f.write(f"- Average missing values: {quality_stats['missing_values_pct']:.2f}%\n\n")
        
        if quality_stats['stocks_with_issues']:
            f.write("Stocks with Quality Issues:\n")
            for ticker, issues in quality_stats['stocks_with_issues']:
                f.write(f"\n{ticker}:\n")
                for issue in issues:
                    f.write(f"- {issue}\n")
    
    print(f"\nDetailed quality report saved to: {quality_report_path}")
    print(f"\nReady to process {len(available_tickers)} stocks across {len(sectors)} sectors")
    
    # Set up for processing
    tickers = [t for t in available_tickers if t not in [i[0] for i in quality_stats['stocks_with_issues']]]
    ticker_sectors = df.groupby('Ticker')['Sector'].first()
    
except FileNotFoundError:
    logging.error("Required dataset not found. Please run feature engineering notebook first.")
    raise
except Exception as e:
    logging.error(f"Error loading data: {str(e)}")
    raise


2025-07-27 22:31:41,253 - INFO - Found 109 tickers in sector mapping
Loading stock data: 100%|██████████| 108/108 [00:03<00:00, 34.50it/s]
2025-07-27 22:32:02,087 - INFO - Combined dataset saved to ..\data\enriched\nasdaq_features.csv



Dataset Loading:
- Total records: 266,029
- Date range: 2015-01-02 to 2025-07-25

- ATVI (Information Technology)

- Materials 

Feature Availability:

Price Features:
- Available (5/5):
  * Open
  * High
  * Low
  * Close
  * Volume

Technical Features:
- Available (9/9):
  * EMA_10
  * EMA_50
  * RSI
  * MACD
  * MACD_Signal
  * MACD_Hist
  * BB_Upper
  * BB_Lower
  * BB_Width

Volatility Features:
- Available (3/3):
  * Volatility
  * Daily_Return
  * Log_Return

Volume Features:
- Available (3/3):
  * OBV
  * Volume_MA
  * Volume_Ratio

Sector Features:
- Available (3/3):
  * Relative_Strength
  * Sector_Return
  * RS_MA

Data Quality Validation:


Validating stocks: 100%|██████████| 108/108 [00:02<00:00, 43.64it/s]


Quality Summary:
- Stocks: 108 across 11 sectors
- Average days per stock: 2463.2
- Minimum days per stock: 929
- Average missing values: 0.60%

Stocks with Quality Issues:
- AAPL:
  * Missing values: max 0.8%
- ABNB:
  * Missing values: max 1.7%
- ADBE:
  * Missing values: max 0.8%
- ADI:
  * Missing values: max 0.8%
- ADP:
  * Missing values: max 0.8%
- ADSK:
  * Missing values: max 0.8%
- AEP:
  * Missing values: max 0.8%
- ALGN:
  * Missing values: max 7.5%
- AMAT:
  * Missing values: max 0.8%
- AMD:
  * Missing values: max 0.8%
- AMGN:
  * Missing values: max 0.8%
- AMZN:
  * Missing values: max 0.8%
- ANSS:
  * Missing values: max 0.8%
- APD:
  * Missing values: max 0.8%
- ASML:
  * Missing values: max 0.8%
- AVGO:
  * Missing values: max 14.1%
- BIIB:
  * Missing values: max 1.7%
- BKNG:
  * Missing values: max 10.6%
- BKR:
  * Missing values: max 0.8%
- CDNS:
  * Missing values: max 0.8%
- CHTR:
  * Missing values: max 33.6%
- CMCSA:
  * Missing values: max 33.6%
- COIN:
  * M




In [21]:
# Enhanced SMI calculation with sector context
def calculate_feature_importance(stock_data: pd.DataFrame, features: list, config: dict) -> tuple:
    """
    Calculate feature importance scores with sector context.
    
    Args:
        stock_data (pd.DataFrame): Stock data with features and labels
        features (list): List of features to analyze
        config (dict): Configuration parameters
        
    Returns:
        tuple: (feature scores, feature ranks, temporal stability)
    """
    try:
        # Verify data requirements
        if len(stock_data) < config['min_samples']:
            raise ValueError(f"Insufficient samples: {len(stock_data)}")
        
        # Create label (1 if next day's return is positive, 0 otherwise)
        stock_data['Label'] = (stock_data['Close'].shift(-1) / stock_data['Close'] - 1 > 0).astype(int)
        stock_data = stock_data.iloc[:-1]  # Remove last row since we can't calculate next day's return
        
        # Prepare features and labels
        X = stock_data[features].copy()
        y = stock_data['Label']
        
        # Handle missing and infinite values
        X = X.replace([np.inf, -np.inf], np.nan)
        X = X.fillna(method='ffill').fillna(method='bfill').fillna(0)
        
        # Scale features
        scaler = StandardScaler()
        X_scaled = pd.DataFrame(
            scaler.fit_transform(X),
            columns=features,
            index=X.index
        )
        
        # Calculate base SMI scores
        base_scores = pd.Series(
            mutual_info_classif(X_scaled, y, random_state=config['smi_random_state']),
            index=features
        )
        
        # Calculate temporal stability
        stability_scores = {}
        window_size = len(stock_data) // 5  # Use 5 windows
        
        for i in range(5):
            start_idx = i * window_size
            end_idx = start_idx + window_size
            window_data = X_scaled.iloc[start_idx:end_idx]
            window_labels = y.iloc[start_idx:end_idx]
            
            window_scores = pd.Series(
                mutual_info_classif(window_data, window_labels, 
                                  random_state=config['smi_random_state']),
                index=features
            )
            stability_scores[f'window_{i}'] = window_scores
        
        stability_df = pd.DataFrame(stability_scores)
        temporal_stability = 1 - stability_df.std(axis=1) / stability_df.mean(axis=1)
        
        # Calculate feature ranks
        ranks = base_scores.rank(ascending=False)
        
        return base_scores, ranks, temporal_stability
        
    except Exception as e:
        logging.error(f"Error in feature importance calculation: {str(e)}")
        return None, None, None

# Process stocks by sector
print("\nCalculating feature importance by sector...")
results = {
    'stocks': {},      # Stock-level results
    'sectors': {},     # Sector-level results
    'features': {},    # Feature-level statistics
    'correlations': {} # Feature correlations
}

# Process each sector
for sector in tqdm(sectors, desc="Processing sectors"):
    try:
        sector_tickers = df[df['Sector'] == sector]['Ticker'].unique()
        print(f"\nProcessing {sector} sector ({len(sector_tickers)} stocks)...")
        
        sector_scores = []
        sector_stability = []
        
        # Process each stock in sector
        for ticker in tqdm(sector_tickers, desc=f"Processing {sector} stocks"):
            try:
                # Get stock data
                stock_data = df[df['Ticker'] == ticker].copy()
                
                # Calculate feature importance
                scores, ranks, stability = calculate_feature_importance(
                    stock_data, FEATURE_COLUMNS, FEATURE_CONFIG
                )
                
                if scores is not None:
                    # Store stock results
                    results['stocks'][ticker] = {
                        'ticker': ticker,
                        'sector': sector,
                        'scores': scores,
                        'ranks': ranks,
                        'stability': stability,
                        'top_features': scores.nlargest(10).index.tolist(),
                        'top_scores': scores.nlargest(10).values.tolist()
                    }
                    
                    # Add to sector aggregates
                    sector_scores.append(scores)
                    sector_stability.append(stability)
                    
                    # Save stock-level results
                    stock_df = pd.DataFrame({
                        'Feature': scores.index,
                        'MI_Score': scores.values,
                        'Rank': ranks.values,
                        'Stability': stability.values,
                        'Feature_Group': [FEATURE_DESCRIPTIONS[f]['group'] for f in scores.index],
                        'Description': [FEATURE_DESCRIPTIONS[f]['description'] for f in scores.index]
                    })
                    stock_df = stock_df.sort_values('MI_Score', ascending=False)
                    stock_df.to_csv(f'../data/smi_scores/{ticker}_features.csv', index=False)
                    
            except Exception as e:
                logging.error(f"Error processing {ticker}: {str(e)}")
                continue
        
        # Calculate sector-level statistics
        if sector_scores:
            sector_scores_df = pd.DataFrame(sector_scores)
            sector_stability_df = pd.DataFrame(sector_stability)
            
            results['sectors'][sector] = {
                'mean_scores': sector_scores_df.mean(),
                'std_scores': sector_scores_df.std(),
                'mean_stability': sector_stability_df.mean(),
                'feature_ranks': sector_scores_df.mean().rank(ascending=False),
                'top_features': sector_scores_df.mean().nlargest(10).index.tolist(),
                'stock_count': len(sector_scores)
            }
            
            # Save sector-level results
            sector_df = pd.DataFrame({
                'Feature': results['sectors'][sector]['mean_scores'].index,
                'Mean_Score': results['sectors'][sector]['mean_scores'].values,
                'Std_Score': results['sectors'][sector]['std_scores'].values,
                'Mean_Stability': results['sectors'][sector]['mean_stability'].values,
                'Rank': results['sectors'][sector]['feature_ranks'].values,
                'Feature_Group': [FEATURE_DESCRIPTIONS[f]['group'] for f in results['sectors'][sector]['mean_scores'].index],
                'Description': [FEATURE_DESCRIPTIONS[f]['description'] for f in results['sectors'][sector]['mean_scores'].index]
            })
            sector_df = sector_df.sort_values('Mean_Score', ascending=False)
            sector_df.to_csv(f'../data/smi_scores/{sector}_features.csv', index=False)
            
            # Print sector summary
            print(f"\n{sector} Sector Summary:")
            print(f"- Processed {len(sector_scores)} stocks successfully")
            print("\nTop 10 Features:")
            top_features = sector_df.head(10)
            for _, row in top_features.iterrows():
                print(f"- {row['Feature']} ({row['Feature_Group']}): {row['Mean_Score']:.4f} ± {row['Std_Score']:.4f}")
            
    except Exception as e:
        logging.error(f"Error processing sector {sector}: {str(e)}")
        continue

# Calculate cross-sector statistics
if results['sectors']:
    # Combine sector scores
    sector_scores = pd.DataFrame({
        sector: data['mean_scores'] 
        for sector, data in results['sectors'].items()
    })
    
    # Calculate feature-level statistics
    for feature in FEATURE_COLUMNS:
        results['features'][feature] = {
            'global_rank': sector_scores.loc[feature].mean(),
            'rank_stability': 1 - sector_scores.loc[feature].std() / sector_scores.loc[feature].mean(),
            'top_sectors': sector_scores.loc[feature].nlargest(3).index.tolist(),
            'feature_group': FEATURE_DESCRIPTIONS[feature]['group']
        }
    
    # Calculate feature correlations
    feature_corr = sector_scores.corr()
    results['correlations'] = {
        'matrix': feature_corr,
        'high_correlations': []
    }
    
    # Find highly correlated features
    for i in range(len(feature_corr.columns)):
        for j in range(i+1, len(feature_corr.columns)):
            corr = feature_corr.iloc[i, j]
            if abs(corr) > FEATURE_CONFIG['correlation_threshold']:
                results['correlations']['high_correlations'].append({
                    'feature1': feature_corr.columns[i],
                    'feature2': feature_corr.columns[j],
                    'correlation': corr
                })

    # Save final results
print("\nSaving final results...")

# Initialize results structure if not already initialized
if 'correlations' not in results:
    results['correlations'] = {'high_correlations': []}

# 1. Create summary report
summary_path = f'../reports/feature_selection_summary_{timestamp}.txt'
with open(summary_path, 'w', encoding='utf-8') as f:
    f.write("NASDAQ-100 Feature Selection Summary\n")
    f.write("=" * 50 + "\n\n")
    
    # Overall statistics
    f.write("Analysis Coverage:\n")
    f.write(f"- Total stocks analyzed: {len(results['stocks'])}\n")
    f.write(f"- Sectors covered: {len(results['sectors'])}\n\n")
    
    # Sector summaries
    f.write("Sector-wise Feature Importance:\n")
    for sector, data in results['sectors'].items():
        f.write(f"\n{sector}:\n")
        f.write(f"- Stocks analyzed: {data['stock_count']}\n")
        f.write("- Top 5 features:\n")
        for feature in data['top_features'][:5]:
            score = data['mean_scores'][feature]
            stability = data['mean_stability'][feature]
            f.write(f"  * {feature}: score={score:.4f}, stability={stability:.4f}\n")
    
    # Feature correlations
    if results['correlations']['high_correlations']:
        f.write("\nHighly Correlated Features:\n")
        for corr in results['correlations']['high_correlations']:
            f.write(f"- {corr['feature1']} ↔ {corr['feature2']}: {corr['correlation']:.4f}\n")
    
    # Global feature ranking
    if 'features' in results:
        f.write("\nGlobal Feature Ranking:\n")
        global_ranks = pd.DataFrame(results['features']).T
        top_features = global_ranks.sort_values('global_rank', ascending=False).head(10)
        for feature, data in top_features.iterrows():
            f.write(f"- {feature} ({data['feature_group']}):\n")
            f.write(f"  * Global rank: {data['global_rank']:.4f}\n")
            f.write(f"  * Rank stability: {data['rank_stability']:.4f}\n")
            f.write(f"  * Best sectors: {', '.join(data['top_sectors'])}\n")

print(f"Summary report saved to: {summary_path}")
print("\nFeature selection analysis complete!")



Calculating feature importance by sector...


Processing sectors:   0%|          | 0/11 [00:00<?, ?it/s]


Processing Information Technology sector (51 stocks)...


Processing Information Technology stocks: 100%|██████████| 51/51 [00:42<00:00,  1.21it/s]
Processing sectors:   9%|▉         | 1/11 [00:42<07:01, 42.19s/it]


Information Technology Sector Summary:
- Processed 51 stocks successfully

Top 10 Features:
- OBV (Volume): 0.0182 ± 0.0273
- High (Price): 0.0182 ± 0.0278
- Log_Return (Volatility): 0.0172 ± 0.0265
- Daily_Return (Volatility): 0.0171 ± 0.0262
- Close (Price): 0.0170 ± 0.0293
- Open (Price): 0.0170 ± 0.0288
- EMA_10 (Technical): 0.0167 ± 0.0297
- Low (Price): 0.0149 ± 0.0286
- RSI (Technical): 0.0148 ± 0.0251
- BB_Width (Technical): 0.0146 ± 0.0272

Processing Consumer Discretionary sector (14 stocks)...


Processing Consumer Discretionary stocks: 100%|██████████| 14/14 [00:10<00:00,  1.37it/s]
Processing sectors:  18%|█▊        | 2/11 [00:52<03:30, 23.40s/it]


Consumer Discretionary Sector Summary:
- Processed 14 stocks successfully

Top 10 Features:
- Log_Return (Volatility): 0.0158 ± 0.0253
- Daily_Return (Volatility): 0.0155 ± 0.0251
- Open (Price): 0.0141 ± 0.0214
- OBV (Volume): 0.0128 ± 0.0187
- High (Price): 0.0119 ± 0.0178
- Close (Price): 0.0117 ± 0.0193
- BB_Lower (Technical): 0.0114 ± 0.0240
- EMA_10 (Technical): 0.0114 ± 0.0215
- BB_Width (Technical): 0.0109 ± 0.0190
- Volatility (Volatility): 0.0109 ± 0.0196

Processing Utilities sector (3 stocks)...


Processing Utilities stocks: 100%|██████████| 3/3 [00:02<00:00,  1.29it/s]
Processing sectors:  27%|██▋       | 3/11 [00:54<01:50, 13.80s/it]


Utilities Sector Summary:
- Processed 3 stocks successfully

Top 10 Features:
- Close (Price): 0.0081 ± 0.0140
- Low (Price): 0.0056 ± 0.0082
- Open (Price): 0.0053 ± 0.0089
- Sector_Return (Sector): 0.0053 ± 0.0071
- MACD_Signal (Technical): 0.0046 ± 0.0079
- RS_MA (Sector): 0.0035 ± 0.0017
- MACD (Technical): 0.0031 ± 0.0036
- High (Price): 0.0031 ± 0.0038
- Volatility (Volatility): 0.0031 ± 0.0029
- EMA_10 (Technical): 0.0030 ± 0.0029

Processing Healthcare sector (11 stocks)...


Processing Healthcare stocks: 100%|██████████| 11/11 [00:09<00:00,  1.11it/s]
Processing sectors:  36%|███▋      | 4/11 [01:04<01:25, 12.28s/it]


Healthcare Sector Summary:
- Processed 11 stocks successfully

Top 10 Features:
- OBV (Volume): 0.0180 ± 0.0211
- Open (Price): 0.0168 ± 0.0170
- Low (Price): 0.0151 ± 0.0177
- Log_Return (Volatility): 0.0147 ± 0.0183
- Daily_Return (Volatility): 0.0146 ± 0.0179
- BB_Lower (Technical): 0.0146 ± 0.0201
- Close (Price): 0.0129 ± 0.0187
- EMA_10 (Technical): 0.0126 ± 0.0158
- High (Price): 0.0120 ± 0.0130
- BB_Width (Technical): 0.0119 ± 0.0133

Processing Materials sector (3 stocks)...


Processing Materials stocks: 100%|██████████| 3/3 [00:03<00:00,  1.31s/it]
Processing sectors:  45%|████▌     | 5/11 [01:08<00:55,  9.28s/it]


Materials Sector Summary:
- Processed 3 stocks successfully

Top 10 Features:
- RSI (Technical): 0.0125 ± 0.0100
- MACD (Technical): 0.0102 ± 0.0098
- Volume (Price): 0.0089 ± 0.0054
- RS_MA (Sector): 0.0083 ± 0.0033
- Volume_MA (Volume): 0.0079 ± 0.0055
- Open (Price): 0.0059 ± 0.0049
- Close (Price): 0.0058 ± 0.0070
- MACD_Hist (Technical): 0.0052 ± 0.0080
- MACD_Signal (Technical): 0.0050 ± 0.0082
- Log_Return (Volatility): 0.0039 ± 0.0067

Processing Energy sector (2 stocks)...


Processing Energy stocks: 100%|██████████| 2/2 [00:01<00:00,  1.08it/s]
Processing sectors:  55%|█████▍    | 6/11 [01:10<00:33,  6.77s/it]


Energy Sector Summary:
- Processed 2 stocks successfully

Top 10 Features:
- BB_Lower (Technical): 0.0079 ± 0.0027
- High (Price): 0.0078 ± 0.0011
- BB_Width (Technical): 0.0073 ± 0.0020
- Volume (Price): 0.0053 ± 0.0076
- MACD_Hist (Technical): 0.0050 ± 0.0025
- Open (Price): 0.0046 ± 0.0064
- Volume_Ratio (Volume): 0.0045 ± 0.0064
- Sector_Return (Sector): 0.0039 ± 0.0055
- EMA_50 (Technical): 0.0034 ± 0.0047
- Volume_MA (Volume): 0.0016 ± 0.0022

Processing Communication Services sector (6 stocks)...


Processing Communication Services stocks: 100%|██████████| 6/6 [00:04<00:00,  1.23it/s]
Processing sectors:  64%|██████▎   | 7/11 [01:15<00:24,  6.16s/it]


Communication Services Sector Summary:
- Processed 6 stocks successfully

Top 10 Features:
- Daily_Return (Volatility): 0.0186 ± 0.0090
- Log_Return (Volatility): 0.0184 ± 0.0091
- Low (Price): 0.0151 ± 0.0105
- MACD_Signal (Technical): 0.0095 ± 0.0059
- Close (Price): 0.0094 ± 0.0147
- OBV (Volume): 0.0093 ± 0.0110
- Open (Price): 0.0091 ± 0.0068
- MACD (Technical): 0.0090 ± 0.0120
- High (Price): 0.0074 ± 0.0045
- EMA_10 (Technical): 0.0071 ± 0.0082

Processing Financials sector (2 stocks)...


Processing Financials stocks: 100%|██████████| 2/2 [00:01<00:00,  1.37it/s]
Processing sectors:  73%|███████▎  | 8/11 [01:17<00:14,  4.68s/it]


Financials Sector Summary:
- Processed 2 stocks successfully

Top 10 Features:
- RSI (Technical): 0.0567 ± 0.0496
- Close (Price): 0.0545 ± 0.0511
- EMA_50 (Technical): 0.0538 ± 0.0512
- MACD_Signal (Technical): 0.0538 ± 0.0264
- BB_Upper (Technical): 0.0534 ± 0.0756
- High (Price): 0.0508 ± 0.0584
- Low (Price): 0.0496 ± 0.0701
- Volatility (Volatility): 0.0489 ± 0.0692
- Open (Price): 0.0473 ± 0.0669
- BB_Lower (Technical): 0.0462 ± 0.0615

Processing Consumer Staples sector (6 stocks)...


Processing Consumer Staples stocks: 100%|██████████| 6/6 [00:05<00:00,  1.09it/s]
Processing sectors:  82%|████████▏ | 9/11 [01:22<00:09,  4.94s/it]


Consumer Staples Sector Summary:
- Processed 6 stocks successfully

Top 10 Features:
- Log_Return (Volatility): 0.0085 ± 0.0047
- Daily_Return (Volatility): 0.0084 ± 0.0049
- BB_Width (Technical): 0.0080 ± 0.0057
- Sector_Return (Sector): 0.0079 ± 0.0093
- BB_Upper (Technical): 0.0074 ± 0.0130
- BB_Lower (Technical): 0.0060 ± 0.0053
- High (Price): 0.0046 ± 0.0051
- Close (Price): 0.0043 ± 0.0056
- Volume_MA (Volume): 0.0042 ± 0.0071
- MACD_Hist (Technical): 0.0030 ± 0.0056

Processing Industrials sector (7 stocks)...


Processing Industrials stocks: 100%|██████████| 7/7 [00:05<00:00,  1.21it/s]
Processing sectors:  91%|█████████ | 10/11 [01:28<00:05,  5.22s/it]


Industrials Sector Summary:
- Processed 7 stocks successfully

Top 10 Features:
- Close (Price): 0.0133 ± 0.0176
- RSI (Technical): 0.0130 ± 0.0219
- High (Price): 0.0113 ± 0.0078
- OBV (Volume): 0.0102 ± 0.0103
- EMA_10 (Technical): 0.0085 ± 0.0160
- BB_Width (Technical): 0.0084 ± 0.0135
- Daily_Return (Volatility): 0.0074 ± 0.0114
- Log_Return (Volatility): 0.0071 ± 0.0106
- Relative_Strength (Sector): 0.0061 ± 0.0065
- BB_Upper (Technical): 0.0061 ± 0.0083

Processing Real Estate sector (3 stocks)...


Processing Real Estate stocks: 100%|██████████| 3/3 [00:02<00:00,  1.10it/s]
Processing sectors: 100%|██████████| 11/11 [01:31<00:00,  8.29s/it]


Real Estate Sector Summary:
- Processed 3 stocks successfully

Top 10 Features:
- Open (Price): 0.0085 ± 0.0064
- Sector_Return (Sector): 0.0075 ± 0.0077
- Volume_MA (Volume): 0.0067 ± 0.0059
- Low (Price): 0.0060 ± 0.0103
- Volume (Price): 0.0058 ± 0.0050
- MACD (Technical): 0.0055 ± 0.0064
- RS_MA (Sector): 0.0048 ± 0.0083
- Volatility (Volatility): 0.0044 ± 0.0048
- BB_Upper (Technical): 0.0042 ± 0.0052
- BB_Lower (Technical): 0.0039 ± 0.0050

Saving final results...
Summary report saved to: ../reports/feature_selection_summary_20250727_223141.txt

Feature selection analysis complete!





In [24]:
# Create visualizations
print("\nGenerating feature importance visualizations...")

try:
    # 1. Sector-wise Feature Importance Heatmap
    plt.figure(figsize=(20, 12))
    
    # Prepare data for heatmap
    sector_scores = pd.DataFrame({
        sector: data['mean_scores'] 
        for sector, data in results['sectors'].items()
    })
    
    # Sort features by global importance
    global_importance = sector_scores.mean(axis=1)
    sector_scores = sector_scores.loc[global_importance.sort_values(ascending=False).index]
    
    # Create heatmap
    sns.heatmap(sector_scores, cmap='YlOrRd', annot=True, fmt='.3f',
                xticklabels=True, yticklabels=True, center=0)
    plt.title('Feature Importance by Sector', pad=20)
    plt.xlabel('Sector', labelpad=10)
    plt.ylabel('Feature', labelpad=10)
    plt.xticks(rotation=45, ha='right')
    
    # Add feature group annotations
    feature_groups = [FEATURE_DESCRIPTIONS[f]['group'] for f in sector_scores.index]
    plt.gca().set_yticklabels([f"{feat} ({group})" for feat, group in zip(sector_scores.index, feature_groups)])
    
    plt.tight_layout()
    plt.savefig('../reports/figures/features/importance_heatmap.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # 2. Feature Group Performance by Sector
    plt.figure(figsize=(15, 10))
    
    # Calculate group-wise statistics
    group_stats = []
    for sector, data in results['sectors'].items():
        scores = data['mean_scores']
        stability = data['mean_stability']
        
        for group_name, group_info in FEATURE_GROUPS.items():
            group_features = [f for f in group_info['features'] if f in scores.index]
            if group_features:
                group_stats.append({
                    'Sector': sector,
                    'Feature_Group': group_name,
                    'Mean_Score': scores[group_features].mean(),
                    'Mean_Stability': stability[group_features].mean(),
                    'Feature_Count': len(group_features)
                })
    
    group_df = pd.DataFrame(group_stats)
    
    # Create grouped violin plot
    sns.violinplot(data=group_df, x='Feature_Group', y='Mean_Score',
                  hue='Sector', palette=SECTOR_COLORS, split=True)
    
    plt.title('Feature Group Performance Distribution', pad=20)
    plt.xlabel('Feature Group', labelpad=10)
    plt.ylabel('Mean Importance Score', labelpad=10)
    plt.xticks(rotation=45, ha='right')
    plt.legend(title='Sector', bbox_to_anchor=(1.05, 1))
    
    plt.tight_layout()
    plt.savefig('../reports/figures/features/group_performance.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # 3. Feature Stability Analysis
    plt.figure(figsize=(15, 8))
    
    # Calculate stability statistics
    stability_stats = pd.DataFrame({
        sector: data['mean_stability']
        for sector, data in results['sectors'].items()
    })
    
    # Sort by average stability
    avg_stability = stability_stats.mean(axis=1)
    stability_stats = stability_stats.loc[avg_stability.sort_values(ascending=False).index]
    
    # Create stability heatmap
    sns.heatmap(stability_stats, cmap='viridis', annot=True, fmt='.2f',
                xticklabels=True, yticklabels=True)
    plt.title('Feature Stability by Sector', pad=20)
    plt.xlabel('Sector', labelpad=10)
    plt.ylabel('Feature', labelpad=10)
    plt.xticks(rotation=45, ha='right')
    
    plt.tight_layout()
    plt.savefig('../reports/figures/features/stability_heatmap.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # 4. Feature Correlation Network (simplified to avoid group attribute issues)
    if results['correlations']['high_correlations']:
        plt.figure(figsize=(15, 15))
        
        # Create correlation network
        G = nx.Graph()
        
        # Create mapping of features to their groups
        feature_to_group = {}
        for group_name, group_info in FEATURE_GROUPS.items():
            for feature in group_info['features']:
                feature_to_group[feature] = group_name
        
        # Add nodes colored by feature group
        for feature in FEATURE_COLUMNS:
            group = feature_to_group.get(feature, 'Unknown')
            G.add_node(feature, group=group)
        
        # Add edges for highly correlated features
        for corr in results['correlations']['high_correlations']:
            G.add_edge(
                corr['feature1'],
                corr['feature2'],
                weight=abs(corr['correlation'])
            )
        
        # Ensure all nodes have group attribute
        for node in G.nodes():
            if 'group' not in G.nodes[node]:
                G.nodes[node]['group'] = feature_to_group.get(node, 'Unknown')
        
        # Set node colors by group
        node_colors = []
        for n in G.nodes():
            group = G.nodes[n]['group']
            if group in list(FEATURE_GROUPS.keys()):
                color_idx = list(FEATURE_GROUPS.keys()).index(group) / len(FEATURE_GROUPS)
                node_colors.append(plt.cm.tab20(color_idx))
            else:
                node_colors.append(plt.cm.tab20(0))  # Default color
        
        # Draw network
        pos = nx.spring_layout(G, k=1, iterations=50)
        
        # Draw nodes
        nx.draw_networkx_nodes(G, pos,
                              node_color=node_colors,
                              node_size=1000,
                              alpha=0.7)
        
        # Draw edges with varying thickness
        edge_weights = [G[u][v]['weight'] * 2 for u, v in G.edges()]
        nx.draw_networkx_edges(G, pos,
                              width=edge_weights,
                              alpha=0.5)
        
        # Add labels
        nx.draw_networkx_labels(G, pos, font_size=8)
        
        # Add legend
        legend_elements = [
            plt.Line2D([0], [0], marker='o', color='w',
                      markerfacecolor=plt.cm.tab20(i/len(FEATURE_GROUPS)),
                      label=group, markersize=10)
            for i, group in enumerate(FEATURE_GROUPS.keys())
        ]
        plt.legend(handles=legend_elements,
                  title='Feature Groups',
                  loc='center left',
                  bbox_to_anchor=(1, 0.5))
        
        plt.title('Feature Correlation Network\n(edges: |correlation| > 0.7)', pad=20)
        plt.axis('off')
        plt.tight_layout()
        plt.savefig('../reports/figures/features/correlation_network.png', dpi=300, bbox_inches='tight')
        plt.close()
    
    # 5. Sector-wise Feature Distribution
    for sector in results['sectors'].keys():
        plt.figure(figsize=(15, 6))
        
        # Get sector data
        sector_data = pd.DataFrame({
            'Score': results['sectors'][sector]['mean_scores'],
            'Stability': results['sectors'][sector]['mean_stability'],
            'Group': [FEATURE_DESCRIPTIONS[f]['group'] for f in results['sectors'][sector]['mean_scores'].index]
        })
        
        # Create scatter plot
        sns.scatterplot(data=sector_data,
                       x='Score', y='Stability',
                       hue='Group', size='Score',
                       sizes=(50, 400), alpha=0.6)
        
        plt.title(f'{sector} Sector - Feature Importance vs Stability', pad=20)
        plt.xlabel('Importance Score', labelpad=10)
        plt.ylabel('Temporal Stability', labelpad=10)
        
        # Add annotations for top features
        top_n = 5
        top_features = sector_data.nlargest(top_n, 'Score').index
        
        for feature in top_features:
            plt.annotate(
                feature,
                (sector_data.loc[feature, 'Score'],
                 sector_data.loc[feature, 'Stability']),
                xytext=(5, 5), textcoords='offset points',
                fontsize=8, alpha=0.8
            )
        
        plt.legend(title='Feature Group', bbox_to_anchor=(1.05, 1))
        plt.tight_layout()
        plt.savefig(f'../reports/figures/sectors/{sector}_features.png', dpi=300, bbox_inches='tight')
        plt.close()

except Exception as e:
    logging.error(f"Error creating visualizations: {str(e)}")
    raise

print("\nVisualization files saved:")
print("- Feature importance: ../reports/figures/features/")
print("- Sector analysis: ../reports/figures/sectors/")
print(f"- Summary report: {summary_path}")
print("\nFeature selection analysis complete!")


Generating feature importance visualizations...

Visualization files saved:
- Feature importance: ../reports/figures/features/
- Sector analysis: ../reports/figures/sectors/
- Summary report: ../reports/feature_selection_summary_20250727_223141.txt

Feature selection analysis complete!


In [25]:
# Data Quality Validation Script
print("Validating NASDAQ-100 dataset quality...")

try:
    # Load sector mapping for reference
    sector_mapping = pd.read_csv('../data/sector_mapping.csv')
    expected_tickers = set(sector_mapping['Ticker'].unique())
    print(f"\nExpected tickers from sector_mapping.csv: {len(expected_tickers)}")
    
    # Load the dataset
    df = pd.read_csv('../data/labeled_signals_nasdaq.csv')
    df['Date'] = pd.to_datetime(df['Date'])
    print(f"Dataset loaded: {df.shape[0]:,} rows, {df.shape[1]} columns")
    
    # Initialize validation results
    validation_results = []
    
    # Required columns by category
    REQUIRED_COLUMNS = {
        'Base': ['Date', 'Ticker', 'Sector', 'Open', 'High', 'Low', 'Close', 'Volume'],
        'Technical': ['EMA10', 'EMA50', 'RSI', 'MACD', 'MACD_Signal', 'MACD_Hist'],
        'Derived': ['volatility', 'volume_ma'],
        'Returns': ['Open_Return', 'High_Return', 'Low_Return', 'Close_Return'],
        'Labels': ['Label', 'Signal', 'future_return']
    }
    
    # Check for missing columns
    all_required = [col for cols in REQUIRED_COLUMNS.values() for col in cols]
    missing_cols = [col for col in all_required if col not in df.columns]
    if missing_cols:
        print("\n⚠️ Missing columns:")
        for col in missing_cols:
            print(f"- {col}")
    
    # Analyze each ticker
    available_tickers = df['Ticker'].unique()
    missing_tickers = expected_tickers - set(available_tickers)
    
    print(f"\nAnalyzing {len(available_tickers)} available tickers...")
    
    for ticker in tqdm(available_tickers, desc="Validating stocks"):
        ticker_data = df[df['Ticker'] == ticker]
        sector = ticker_data['Sector'].iloc[0]
        
        # Calculate metrics
        total_rows = len(ticker_data)
        date_range = (ticker_data['Date'].max() - ticker_data['Date'].min()).days
        missing_values = ticker_data[all_required].isnull().sum()
        has_missing = missing_values.sum() > 0
        
        # Check label distribution
        label_dist = ticker_data['Label'].value_counts()
        signal_dist = ticker_data['Signal'].value_counts()
        
        validation_results.append({
            'Ticker': ticker,
            'Sector': sector,
            'Total_Rows': total_rows,
            'Date_Range_Days': date_range,
            'Trading_Days_Per_Year': (total_rows / (date_range/365)) if date_range > 0 else 0,
            'Missing_Values': missing_values.sum(),
            'Missing_Columns': ', '.join(missing_values[missing_values > 0].index),
            'Unique_Labels': len(label_dist),
            'Label_Distribution': str(label_dist.to_dict()),
            'Signal_Distribution': str(signal_dist.to_dict()),
            'Mean_Forward_Return': ticker_data['future_return'].mean(),
            'Std_Forward_Return': ticker_data['future_return'].std(),
            'Status': 'Low Volume' if total_rows < 2000 else 'OK'
        })
    
    # Create validation DataFrame
    validation_df = pd.DataFrame(validation_results)
    
    # Print summary statistics
    print("\n=== Dataset Quality Summary ===")
    print(f"\nTicker Coverage:")
    print(f"- Expected tickers: {len(expected_tickers)}")
    print(f"- Available tickers: {len(available_tickers)}")
    print(f"- Missing tickers: {len(missing_tickers)}")
    
    print("\nData Volume:")
    low_volume = validation_df[validation_df['Total_Rows'] < 2000]
    print(f"- Stocks with <2000 rows: {len(low_volume)}")
    if not low_volume.empty:
        print("\nLow volume tickers:")
        for _, row in low_volume.iterrows():
            print(f"- {row['Ticker']} ({row['Sector']}): {row['Total_Rows']} rows")
    
    print("\nMissing Values:")
    has_missing = validation_df[validation_df['Missing_Values'] > 0]
    if not has_missing.empty:
        print(f"- Stocks with missing values: {len(has_missing)}")
        print("\nStocks with missing data:")
        for _, row in has_missing.iterrows():
            print(f"- {row['Ticker']}: {row['Missing_Values']} missing values in {row['Missing_Columns']}")
    else:
        print("✓ No missing values found")
    
    print("\nLabel Quality:")
    bad_labels = validation_df[validation_df['Unique_Labels'] < 2]
    if not bad_labels.empty:
        print(f"- Stocks with insufficient labels: {len(bad_labels)}")
        print("\nStocks with label issues:")
        for _, row in bad_labels.iterrows():
            print(f"- {row['Ticker']}: only {row['Unique_Labels']} unique labels")
    else:
        print("✓ All stocks have sufficient label variety")
    
    # Save detailed report
    validation_df.to_csv('../reports/data_quality_report.csv', index=False)
    print("\nDetailed quality report saved to: ../reports/data_quality_report.csv")
    
    # Print missing tickers by sector
    if missing_tickers:
        print("\nMissing Tickers by Sector:")
        sector_missing = {}
        for ticker in missing_tickers:
            sector = sector_mapping[sector_mapping['Ticker'] == ticker]['Sector'].iloc[0]
            if sector not in sector_missing:
                sector_missing[sector] = []
            sector_missing[sector].append(ticker)
        
        for sector, tickers in sector_missing.items():
            print(f"\n{sector} ({len(tickers)} missing):")
            for ticker in sorted(tickers):
                print(f"- {ticker}")
    
except FileNotFoundError as e:
    print(f"\n❌ Error: Required file not found - {str(e)}")
except Exception as e:
    print(f"\n❌ Error during validation: {str(e)}")


Validating NASDAQ-100 dataset quality...

Expected tickers from sector_mapping.csv: 109
Dataset loaded: 3,756 rows, 22 columns

⚠️ Missing columns:
- Open_Return
- High_Return
- Low_Return
- Close_Return

Analyzing 3 available tickers...


Validating stocks:   0%|          | 0/3 [00:00<?, ?it/s]


❌ Error during validation: "['Open_Return', 'High_Return', 'Low_Return', 'Close_Return'] not in index"





In [None]:
# Save detailed feature importance results
feature_importance_data = []

# Compile per-stock and per-sector statistics
for ticker, result in stock_results.items():
    sector = result['Sector']
    scores = result['Scores']
    
    # Get sector average scores
    sector_avg = sector_importance[sector]
    
    # Calculate deviation from sector average
    score_deviation = scores - sector_avg
    
    # Add to results
    for feature in FEATURE_COLUMNS:
        feature_importance_data.append({
            'Ticker': ticker,
            'Sector': sector,
            'Feature': feature,
            'Feature_Group': next(group for group, features in FEATURE_GROUPS.items() if feature in features),
            'MI_Score': scores[feature],
            'Sector_Avg': sector_avg[feature],
            'Deviation': score_deviation[feature]
        })

# Create and save detailed DataFrame
detailed_df = pd.DataFrame(feature_importance_data)
detailed_df.to_csv('../data/feature_importance_detailed.csv', index=False)

# Create feature group summary
group_summary = detailed_df.groupby(['Sector', 'Feature_Group'])['MI_Score'].agg(['mean', 'std']).round(4)
print("\nFeature Group Importance by Sector:")
print(group_summary)

# Save summary statistics
group_summary.to_csv('../data/feature_group_summary.csv')

print("\nFeature importance analysis files saved:")
print("1. feature_importance_smi.csv - Per-stock top features")
print("2. feature_importance_detailed.csv - Detailed per-feature scores")
print("3. feature_group_summary.csv - Feature group statistics")
print("4. sector_feature_importance.png - Heatmap visualization")
print("5. sector_top_features.png - Top features by sector")
