# 02 - Feature Extraction and Selection

This notebook demonstrates feature extraction and selection methods
for neural decoding.

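**Contents:**
1. ANOVA feature selection
2. RFE feature selection
3. Stability selection
4. Trial averaging
5. Time window extraction (EEG)
6. Combined feature-processing pipeline

Note: ROI extraction (`ROIExtractor`) is imported below but is not demonstrated in this notebook.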

In [None]:
import sys
sys.path.insert(0, '..')

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from core.dataset import DecodingDataset
from features.extractors import ROIExtractor, TimeWindowExtractor, TrialAverager
from features.selectors import ANOVASelector, RFESelector, StabilitySelector

## Create Synthetic Data

In [None]:
# Synthetic fMRI-like data.
# The dataset dimensions are named once so that the feature matrix, the run
# labels and the voxel names cannot drift apart when one of them is changed.
n_samples = 200
n_voxels = 5000  # Many voxels
n_runs = 5

X, y = make_classification(
    n_samples=n_samples,
    n_features=n_voxels,
    n_informative=100,
    n_redundant=100,
    n_classes=2,
    random_state=42
)

# One run label (1..n_runs) per sample, assigned in equal contiguous chunks
groups = np.repeat(np.arange(1, n_runs + 1), n_samples // n_runs)
assert groups.shape[0] == X.shape[0], "n_samples must be divisible by n_runs"

dataset = DecodingDataset(
    X=X,
    y=y,
    groups=groups,
    # Names are generated from the actual column count of X
    feature_names=[f"voxel_{i}" for i in range(X.shape[1])],
    class_names=["class_A", "class_B"],
    modality="fmri"
)

print(f"Original: {dataset.n_samples} samples, {dataset.n_features} features")

## 1. ANOVA Feature Selection

Select features with highest F-scores from one-way ANOVA.

In [None]:
# Keep the 500 features with the highest ANOVA F-scores
anova_selector = ANOVASelector(k=500)
anova_selector.fit(dataset.X, dataset.y)

# Array-level API: reduce the raw feature matrix
X_selected = anova_selector.transform(dataset.X)
n_after_anova = X_selected.shape[1]
print(f"After ANOVA: {n_after_anova} features")

# Dataset-level API: transform_dataset is given the full dataset object
selected_dataset = anova_selector.transform_dataset(dataset)
n_in_selected_dataset = selected_dataset.n_features
print(f"Selected dataset: {n_in_selected_dataset} features")

In [None]:
# Check F-scores
scores = anova_selector.get_scores()

# Number of features the selector kept in the previous cell. Derived from the
# transformed matrix rather than repeating the literal 500, so the marker
# follows whatever k the selector was built with.
n_selected = X_selected.shape[1]

# Score of the n_selected-th best feature, i.e. the lowest score in the top set
threshold = np.sort(scores)[-n_selected]

fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(scores, bins=50)
ax.set_xlabel('F-score')
ax.set_ylabel('Count')
ax.set_title('ANOVA F-score Distribution')
ax.axvline(x=threshold, color='red', linestyle='--', label='Threshold')
ax.legend()
plt.show()

## 2. RFE Feature Selection

Recursive Feature Elimination using classifier weights.

In [None]:
# Two-stage selection: ANOVA filter first, then RFE on what is left
# (RFE is slow on many features)

# Stage 1 - pre-select 1000 features with ANOVA
anova = ANOVASelector(k=1000)
anova.fit(dataset.X, dataset.y)
X_pre = anova.transform(dataset.X)

# Stage 2 - RFE to select the final features
rfe_params = dict(
    n_features=100,
    step=0.2,  # Remove 20% of features each step
    verbose=1,
)
rfe_selector = RFESelector(**rfe_params)
rfe_selector.fit(X_pre, dataset.y)

X_rfe = rfe_selector.transform(X_pre)
n_after_rfe = X_rfe.shape[1]
print(f"After RFE: {n_after_rfe} features")

In [None]:
# Feature rankings reported by the fitted RFE selector
rankings = rfe_selector.get_ranking()

# Count the entries ranked 1 and find the extremes of the ranking values
n_top_ranked = np.sum(rankings == 1)
lowest_rank, highest_rank = rankings.min(), rankings.max()

print(f"Best features (rank=1): {n_top_ranked}")
print(f"Ranking range: {lowest_rank} to {highest_rank}")

## 3. Stability Selection

Select features consistently chosen across bootstrap samples.

In [None]:
# Stability selection, run on the ANOVA pre-selected features (X_pre)
# NOTE(review): no seed is passed here. If StabilitySelector exposes a
# random_state parameter, set it so the bootstrap results are repeatable -
# confirm against its signature.
stability_params = dict(
    n_bootstrap=50,
    sample_fraction=0.75,
    threshold=0.6,  # Select features chosen in >60% of bootstraps
    n_jobs=-1,
)
stability_selector = StabilitySelector(**stability_params)
stability_selector.fit(X_pre, dataset.y)

X_stable = stability_selector.transform(X_pre)
n_stable = X_stable.shape[1]
print(f"Stable features: {n_stable}")

In [None]:
# Stability scores: histogram of per-feature selection frequency
stability_scores = stability_selector.get_stability_scores()

fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(stability_scores, bins=30)
ax.set(
    xlabel='Selection Frequency',
    ylabel='Count',
    title='Feature Stability Scores',
)
# Mark the same 0.6 cut-off that was passed to the selector
ax.axvline(x=0.6, color='red', linestyle='--', label='Threshold')
ax.legend()
plt.show()

## 4. Trial Averaging

Average multiple trials to increase the signal-to-noise ratio (SNR).

In [None]:
# Average every 5 trials within each class
averager = TrialAverager(n_per_average=5, random_state=42)
averaged_dataset = averager.fit_transform(dataset)

# Report how the sample count changed
n_trials_before = dataset.n_samples
n_trials_after = averaged_dataset.n_samples
print(f"Before averaging: {n_trials_before} samples")
print(f"After averaging: {n_trials_after} samples")
print(f"Trials per average: {averaged_dataset.metadata['n_per_average']}")

## 5. Time Window Extraction (EEG)

Extract features from specific time windows.

In [None]:
# Create synthetic EEG-like data
n_epochs = 100
n_channels = 64
n_times = 200  # 1 second at 200 Hz

# Seeded generator so the simulated epochs and labels are identical on every
# Restart & Run All (the unseeded global np.random state is not).
rng = np.random.default_rng(42)

# Simulate EEG: (n_epochs, n_channels, n_times) -> flatten
X_eeg = rng.standard_normal((n_epochs, n_channels * n_times))
y_eeg = rng.integers(0, 2, size=n_epochs)  # binary labels drawn from {0, 1}

eeg_dataset = DecodingDataset(
    X=X_eeg,
    y=y_eeg,
    metadata={
        "n_channels": n_channels,
        "n_times": n_times,
        "sfreq": 200,
        "tmin": -0.2,
        "tmax": 0.8
    },
    modality="eeg"
)

print(f"EEG dataset: {eeg_dataset.n_samples} epochs, {eeg_dataset.n_features} features")

In [None]:
# Extract specific time windows
# NOTE(review): window bounds are presumably seconds on the tmin/tmax axis
# stored in the dataset metadata - confirm against TimeWindowExtractor.
erp_windows = [
    (0.1, 0.2),   # P1 component
    (0.15, 0.25), # N170
    (0.3, 0.5),   # P300
]
window_extractor = TimeWindowExtractor(
    windows=erp_windows,
    aggregation="mean",
    flatten=True,
)

window_extractor.fit(eeg_dataset)
windowed_dataset = window_extractor.transform(eeg_dataset)

# Compare feature counts before and after windowing
n_features_raw = eeg_dataset.n_features
n_features_windowed = windowed_dataset.n_features
print(f"Original: {n_features_raw} features")
print(f"After windowing: {n_features_windowed} features")
print(f"Windows: {windowed_dataset.metadata['windows']}")

## 6. Pipeline: Combined Feature Processing

In [None]:
# Complete pipeline

# Create feature processing pipeline: standardise, then keep the
# 500 features with the highest ANOVA F-scores
feature_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('anova', ANOVASelector(k=500)),
])

# Fit and transform
# NOTE: the pipeline is fitted on the full dataset here for demonstration only.
# When estimating decoding accuracy, fit it inside each cross-validation
# training fold so the scaler and selector never see held-out samples
# (see 04_cross_validation.ipynb).
X_processed = feature_pipeline.fit_transform(dataset.X, dataset.y)
print(f"Processed shape: {X_processed.shape}")

## Next Steps

- **03_train_classifier.ipynb**: Training decoders with selected features
- **04_cross_validation.ipynb**: Proper CV with feature selection