# Data Types

This notebook demonstrates PerturbLab's type system for single-cell data.

## Features
- `CellData`: Enhanced AnnData wrapper with gene alignment
- `PerturbationData`: Perturbation-specific dataset with splitting methods
- Virtual gene views for zero-copy alignment
- Backed mode support for large datasets


In [1]:
import numpy as np
import pandas as pd
import anndata as ad
from perturblab.types import CellData, PerturbationData

# Create sample AnnData
n_obs, n_vars = 100, 50
X = np.random.rand(n_obs, n_vars)
obs = pd.DataFrame({'cell_type': np.random.choice(['A', 'B', 'C'], n_obs)})
var = pd.DataFrame(index=[f'Gene_{i}' for i in range(n_vars)])
adata = ad.AnnData(X, obs=obs, var=var)

print(f"Original AnnData: {adata.shape}")


Original AnnData: (100, 50)


  return dispatch(args[0].__class__)(*args, **kw)


## CellData: Enhanced AnnData Wrapper


In [2]:
# Create CellData from AnnData
cell_data = CellData(adata, cell_type_col='cell_type')
print(f"CellData shape: {cell_data.shape}")
print(f"Cell types: {cell_data.cell_types}")

# Access underlying AnnData
print(f"\nUnderlying AnnData: {cell_data.adata.shape}")

# Gene alignment (missing genes are filled with fill_value)
target_genes = ['Gene_0', 'Gene_1', 'Gene_2', 'Virtual_Gene']
aligned = cell_data.align_genes(target_genes, fill_value=0.0)
print(f"\nAligned shape: {aligned.shape}")
print(f"Target genes: {list(aligned.var_names)}")


CellData shape: (100, 50)
Cell types: 0     B
1     C
2     A
3     A
4     B
     ..
95    C
96    B
97    C
98    B
99    A
Name: cell_type, Length: 100, dtype: object

Underlying AnnData: (100, 50)

Aligned shape: (100, 4)
Target genes: ['Gene_0', 'Gene_1', 'Gene_2', 'Virtual_Gene']


## PerturbationData: Perturbation-Specific Dataset


In [3]:
# Create PerturbationData with perturbation labels
pert_obs = obs.copy()
pert_obs['condition'] = np.random.choice(['ctrl', 'pert_A', 'pert_B'], n_obs)
pert_adata = ad.AnnData(X, obs=pert_obs, var=var)

pert_data = PerturbationData(
    pert_adata,
    perturbation_col='condition',
    control_label='ctrl',
    cell_type_col='cell_type'
)

print(f"PerturbationData shape: {pert_data.shape}")
print(f"Perturbations: {pert_data.perturbations}")
print(f"Control cells: {len(pert_data.control_labels)}")
print(f"Perturbed cells: {len(pert_data.perturbation_col)}")


PerturbationData shape: (100, 50)
Perturbations: 0     pert_A
1     pert_B
2     pert_B
3       ctrl
4     pert_A
       ...  
95    pert_A
96      ctrl
97    pert_A
98      ctrl
99    pert_A
Name: condition, Length: 100, dtype: object
Control cells: 1
Perturbed cells: 9


  return dispatch(args[0].__class__)(*args, **kw)


## Data Splitting for Perturbation Analysis


In [6]:
# Preview split without actually splitting (dry_run=True)
split_info = pert_data.split(
    split_type='simulation',
    test_size=0.2,
    dry_run=True
)
print(f"Split preview: {split_info}")

# Actually split the data
splits = pert_data.split(
    split_type='simulation',
    test_size=0.2,
    dry_run=False
)
print(f"\nTrain: {splits['train']}, \nTest: {splits['test']}")


[perturblab] [INFO] Split (simulation): Train=2, Val=0, Test=1 perts
[perturblab] [INFO] Added split labels to obs['split']
Split preview: CellDataset with n_obs × n_vars = 100 × 50
    copy
    3 cell types
    perturbation_col: 'condition'
    n_perturbations: 3
    control_labels: {'ctrl'}
[perturblab] [INFO] Split (simulation): Train=2, Val=0, Test=1 perts

Train: CellDataset with n_obs × n_vars = 62 × 50
    view
    3 cell types
    perturbation_col: 'condition'
    n_perturbations: 2
    control_labels: {'ctrl'}, 
Test: CellDataset with n_obs × n_vars = 38 × 50
    view
    3 cell types
    perturbation_col: 'condition'
    n_perturbations: 1
    control_labels: {'ctrl'}
