# Feature Engineering

This notebook extracts features for resistance prediction:
1. Cell-level embeddings
2. Cluster-level dynamics
3. Trajectory features
4. Pathway features (optional; disabled by default — set `include_pathways=True` when pathway genes are available)


In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns

# Make the `src` package importable: put the parent of the current working
# directory at the front of sys.path so the `from src...` imports below resolve.
# NOTE(review): assumes the notebook is launched from a subdirectory of the
# project root (the directory that contains `src/`) — confirm.
sys.path.insert(0, str(Path.cwd().parent))

# NOTE(review): get_resistance_pathway_genes is not used in the cells shown
# here; presumably kept for when pathway features are enabled — confirm.
from src.features import FeatureBuilder
from src.utils import load_adata, get_resistance_pathway_genes

# scanpy logging verbosity; level 2 corresponds to "info" on scanpy's
# documented 0-4 scale (errors, warnings, info, hints, debug).
sc.settings.verbosity = 2


In [None]:
# Read the annotated data object this notebook builds features from.
# The file name indicates trajectory annotations were already added upstream
# (presumably by an earlier notebook — confirm).
trajectory_adata_path = "../data/synthetic/adata_with_trajectories.h5ad"
adata = load_adata(trajectory_adata_path)

# Quick sanity check on the size of what was loaded.
print(f"Loaded: {adata.n_obs:,} cells, {adata.n_vars:,} genes")


In [None]:
# Wrap the loaded data in the project's feature extractor.
feature_builder = FeatureBuilder(adata)

# Feature families to compute, gathered in one place so they are easy to toggle.
extraction_options = {
    "include_clusters": True,
    "include_trajectory": True,
    "include_pathways": False,  # Set to True if pathway genes available
}

# Build the per-sample feature table with the options above.
sample_features = feature_builder.extract_sample_features(**extraction_options)

# Report how many samples were produced, which columns exist, and a preview.
print(f"Extracted features for {len(sample_features)} samples")
print(f"Feature columns: {list(sample_features.columns)}")
print("\nFirst few rows:")
print(sample_features.head())


In [None]:
# Histogram the first few numeric feature columns, one panel per column.
if len(sample_features) > 0:
    # Keep at most four numeric columns so the figure stays readable;
    # slicing an index shorter than four simply returns all of it.
    plot_columns = sample_features.select_dtypes(include=[np.number]).columns[:4]
    n_panels = len(plot_columns)

    if n_panels > 0:
        # squeeze=False always yields a 2-D array of axes, so the single-panel
        # case needs no special handling; row 0 holds every panel.
        fig, axes_grid = plt.subplots(
            1, n_panels, figsize=(4 * n_panels, 4), squeeze=False
        )

        for ax, column in zip(axes_grid[0], plot_columns):
            # Drop missing values before binning.
            values = sample_features[column].dropna()
            ax.hist(values, bins=20, edgecolor='black')
            ax.set_xlabel(column)
            ax.set_ylabel('Frequency')
            ax.set_title(f'Distribution of {column}')

        plt.tight_layout()
        plt.show()


In [None]:
# Save features
# Keep the destination in a single variable so the confirmation message always
# reports the path that was actually written (it was previously hard-coded twice).
features_csv_path = "../data/synthetic/sample_features.csv"

# index=False leaves the DataFrame index out of the CSV.
# NOTE(review): if sample identifiers live in the index rather than in a
# column, they will not be written — confirm against extract_sample_features.
sample_features.to_csv(features_csv_path, index=False)
print(f"Features saved to {features_csv_path}")
