# Download Datasets

This notebook demonstrates how to download and use datasets from PerturbLab's resource registry.

## Features
- Automatic caching in `~/.cache/perturblab/`
- Support for scPerturb benchmark datasets (55+ datasets)
- Gene Ontology (GO) files
- Download progress tracking and support for resuming interrupted downloads


In [None]:
import os

import anndata as ad

from perturblab.data.resources import load_dataset, get_dataset, list_datasets

# Ask the resource registry for every dataset it knows about and report the total.
print("Available datasets:")
datasets = list_datasets()
print(f"Total: {len(datasets)} datasets")

# Show only a short preview so the cell output stays readable.
print("\nFirst 10 datasets:")
preview = datasets[:10]
for ds in preview:
    print(f"  - {ds}")


## Download scPerturb Benchmark Dataset


In [None]:
# Registry key of the scPerturb dataset to fetch.
# Note: norman_2019 is not available; use norman_2019_filtered instead.
scperturb_key = 'scperturb/norman_2019_filtered'

# The first call downloads the file; later calls are served from the cache.
h5ad_path = load_dataset(scperturb_key)
print(f"Dataset downloaded to: {h5ad_path}")

# Read the downloaded .h5ad file into an AnnData object and summarise it.
adata = ad.read_h5ad(h5ad_path)
print(f"\nDataset shape: {adata.shape}")
print(f"Genes: {adata.n_vars}, Cells: {adata.n_obs}")

# Preview at most the first five annotation columns on each axis.
print(f"\nObservations columns: {list(adata.obs.columns[:5])}")
print(f"Variables columns: {list(adata.var.columns[:5])}")


## Download Gene Ontology File


In [None]:
# Download the Gene Ontology file through the same registry entry point
# used for datasets.
go_path = load_dataset('go/go_basic')
print(f"GO file downloaded to: {go_path}")

# Report the on-disk size in mebibytes (bytes / 1024**2).
# `os` comes from the notebook's top import cell rather than a mid-cell import,
# so all dependencies are visible in one place.
file_size_mb = os.path.getsize(go_path) / (1024 * 1024)
print(f"File size: {file_size_mb:.2f} MB")

# Request the same resource again and compare the two returned paths;
# per the notebook's description, the second call is served from the cache.
go_path_2 = load_dataset('go/go_basic')
print(f"\nCached path (same): {go_path == go_path_2}")


## Get Dataset Resource Metadata


In [None]:
# Fetch the registry's resource object (instead of a downloaded path) so its
# metadata can be inspected.
resource = get_dataset('scperturb/norman_2019_filtered')
print(f"Resource key: {resource.key}")
print(f"Resource type: {type(resource).__name__}")
print(f"Has remote config: {resource._remote_config is not None}")

# NOTE(review): `_remote_config` is a private attribute, read here only for
# demonstration; it is presumably a dict-like mapping, since `.get` is called on it.
remote_config = resource._remote_config
if remote_config:
    print(f"Downloader: {remote_config.get('downloader', 'N/A')}")
