# Data Preprocessing Pipeline

This notebook demonstrates the preprocessing steps for metabolomics data:
- Missing value imputation
- Normalization
- Batch effect correction
- Feature scaling
- Feature selection


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
sys.path.append('../src')

from preprocessing import MetabolomicsPreprocessor
from features import FeatureSelector
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plotting defaults: seaborn's 'whitegrid' style and a
# default figure size of (12, 6).
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))


In [None]:
# Read the synthetic urine metabolomics table (path is relative to the
# notebook's working directory).
data_path = Path('../data/synthetic/synthetic_urine_metabolomics.csv')
df = pd.read_csv(data_path)
print(f"Original data shape: {df.shape}")

# Binary target: 0 where diagnosis_label equals 'control', 1 for every other
# value. Any label that is not exactly 'control' is counted as class 1.
y = df['diagnosis_label'].ne('control').astype(int).to_numpy()

# np.bincount reports the number of samples in class 0 and class 1.
print(f"\nClass distribution: {np.bincount(y)}")


In [None]:
# Full preprocessing pipeline
# The options are collected in one mapping so the configuration reads as a
# unit; they are passed to the constructor as keyword arguments.
preprocessing_options = {
    'imputation_method': 'knn',
    'normalization_method': 'log2',
    'batch_correction': True,
    'scale_method': 'zscore',
}
preprocessor = MetabolomicsPreprocessor(**preprocessing_options)

# Fit and transform in a single call.
# NOTE(review): the whole frame, including the diagnosis_label column, is
# passed in; presumably the preprocessor selects the metabolite columns
# itself. Confirm against src/preprocessing.
X_processed = preprocessor.fit_transform(df)

# Report the output shape and the overall mean / standard deviation.
print(f"Final processed data shape: {X_processed.shape}")
print(f"Mean: {X_processed.mean():.4f}, Std: {X_processed.std():.4f}")


In [None]:
# Feature selection
# Selector options gathered in one mapping and passed as keyword arguments.
selector_options = {
    'method': 'univariate',
    'n_features': 200,
    'variance_threshold': 0.01,
}
feature_selector = FeatureSelector(**selector_options)

# Fit the selector on the preprocessed matrix and labels, then reduce it.
X_selected = feature_selector.fit_transform(X_processed, y)
print(f"After feature selection: {X_selected.shape}")

# Feature importance, labelled with the metabolite column names (columns of
# the raw table whose name starts with 'metab_').
# NOTE(review): assumes these names line up, in order, with the features the
# selector was fitted on. Confirm against src/features.
metab_cols = [name for name in df.columns if name.startswith('metab_')]
importance_df = feature_selector.get_feature_importance(metab_cols)

print(f"\nTop 10 features:")
print(importance_df.head(10))
