# Data Types

This notebook demonstrates PerturbLab's type system for single-cell data.

## Features
- `CellData`: Enhanced AnnData wrapper with gene alignment
- `PerturbationData`: Perturbation-specific dataset with splitting methods
- Virtual gene views for zero-copy alignment
- Backed mode support for large datasets


In [None]:
import numpy as np
import pandas as pd
import anndata as ad
from perturblab.types import CellData, PerturbationData

# Create sample AnnData.
# The synthetic values are drawn from a locally seeded Generator so that
# Restart Kernel -> Run All reproduces exactly the same data and outputs.
SEED = 42
rng = np.random.default_rng(SEED)

n_obs, n_vars = 100, 50
X = rng.random((n_obs, n_vars))  # uniform floats in [0, 1), shape (n_obs, n_vars)
obs = pd.DataFrame({'cell_type': rng.choice(['A', 'B', 'C'], n_obs)})
var = pd.DataFrame(index=[f'Gene_{i}' for i in range(n_vars)])
adata = ad.AnnData(X, obs=obs, var=var)

print(f"Original AnnData: {adata.shape}")


## CellData: Enhanced AnnData Wrapper


In [None]:
# Wrap the AnnData object in CellData, naming the obs column with cell types
cell_data = CellData(
    adata,
    cell_type_col='cell_type',
)
print(f"CellData shape: {cell_data.shape}")
print(f"Cell types: {cell_data.cell_types}")

# The wrapped AnnData stays reachable through the .adata attribute
print(f"\nUnderlying AnnData: {cell_data.adata.shape}")

# Gene alignment (missing genes are filled with fill_value).
# 'Virtual_Gene' is not among the Gene_<i> names of the sample data,
# so it plays the role of the missing gene here.
present_genes = ['Gene_0', 'Gene_1', 'Gene_2']
missing_genes = ['Virtual_Gene']
target_genes = present_genes + missing_genes
aligned = cell_data.align_genes(
    target_genes,
    fill_value=0.0,
)
print(f"\nAligned shape: {aligned.shape}")
print(f"Target genes: {list(aligned.var_names)}")


## PerturbationData: Perturbation-Specific Dataset


In [None]:
# Create PerturbationData with perturbation labels.
# The labels are drawn from a locally seeded Generator so the counts printed
# below (and anything derived from them) are identical every time this cell
# runs, independent of the global NumPy random state.
condition_rng = np.random.default_rng(0)

pert_obs = obs.copy()  # copy so the original obs frame is left untouched
pert_obs['condition'] = condition_rng.choice(['ctrl', 'pert_A', 'pert_B'], n_obs)
pert_adata = ad.AnnData(X, obs=pert_obs, var=var)

# 'condition' is the column assigned above; 'ctrl' is one of its three labels.
pert_data = PerturbationData(
    pert_adata,
    perturbation_col='condition',
    control_label='ctrl',
    cell_type_col='cell_type'
)

print(f"PerturbationData shape: {pert_data.shape}")
print(f"Perturbations: {pert_data.perturbations}")
print(f"Control cells: {len(pert_data.control_cells)}")
print(f"Perturbed cells: {len(pert_data.perturbed_cells)}")


## Data Splitting for Perturbation Analysis


In [None]:
# Arguments shared by the preview call and the real split, kept in one place
# so both calls are guaranteed to use the same settings.
split_kwargs = {'split_type': 'simulation', 'test_size': 0.2}

# dry_run=True: preview the split without actually splitting
split_info = pert_data.split(**split_kwargs, dry_run=True)
print(f"Split preview: {split_info}")

# dry_run=False: actually split the data; the result unpacks into two parts
train_data, test_data = pert_data.split(**split_kwargs, dry_run=False)
print(f"\nTrain: {train_data.shape}, Test: {test_data.shape}")
