![](https://img.shields.io/badge/1/4-lightgrey)

# Register h5ad files of cellxgene-census

## Setup

In [None]:
# Commented-out CLI commands: initialise a LaminDB instance named
# "cellxgene-census" with S3 storage and the bionty schema, then close it.
# NOTE(review): presumably only needed when creating the instance from scratch.
# !lamin init --storage s3://lamindata --name cellxgene-census --schema bionty
# !lamin close

In [None]:
# Shell escape: load the laminlabs/cellxgene-census instance via the lamin CLI.
!lamin load laminlabs/cellxgene-census

In [None]:
import lamindb as ln
import lnschema_bionty as lb
import cellxgene_census

In [None]:
# Start lamindb tracking for this notebook.
# NOTE(review): presumably records the notebook and its run for provenance —
# confirm against the lamindb documentation.
ln.track()

In [None]:
# Census release to register; passed to open_soma() and used as the dataset version.
census_version = "2023-07-25"

## Register datasets

In [None]:
# Open the census at the pinned release; the handle is closed explicitly in
# the last cell of the notebook.
census = cellxgene_census.open_soma(census_version=census_version)

In [None]:
# Display the top-level census object.
census

In [None]:
# Display the "census_data" entry (indexed per organism further below).
census["census_data"]

In [None]:
# Display the "census_info" entry (holds the "datasets" table read below).
census["census_info"]

In [None]:
# Read the census "datasets" table and convert it to a pandas DataFrame.
datasets_table = census["census_info"]["datasets"].read().concat()
datasets_df = datasets_table.to_pandas()
# Show (n_rows, n_columns) as the cell output.
datasets_df.shape

In [None]:
# Preview the first rows of the datasets table.
datasets_df.head()

In [None]:
# Register every h5ad file of the pinned census release.
# The release folder is derived from `census_version` so the registered files
# always match the release opened with open_soma() and the version stored on
# the dataset record (previously the date was hard-coded in the path).
files = ln.File.from_dir(
    f"s3://cellxgene-data-public/cell-census/{census_version}/h5ads"
)
ln.save(files)

In [None]:
# Group all registered files into one dataset record named "cellxgene-census",
# versioned by the census release.
dataset = ln.Dataset(files, name="cellxgene-census", version=census_version)

In [None]:
# Save the dataset record.
dataset.save()

In [None]:
# One row per distinct collection, keyed by its collection id.
collection_columns = ["collection_id", "collection_name", "collection_doi"]
collections_df = (
    datasets_df[collection_columns].drop_duplicates().set_index("collection_id")
)

# Build one label per collection: the name is the collection name, the DOI goes
# into the description and the collection id is kept as the reference.
collections = [
    ln.ULabel(
        name=record.collection_name,
        description=record.collection_doi,
        reference=collection_id,
        reference_type="collection_id",
    )
    for collection_id, record in collections_df.iterrows()
]
ln.save(collections)

# Parent label under which all collection labels are attached as children.
is_collection = ln.ULabel(name="is_collection")
is_collection.save()
is_collection.children.set(collections)

In [None]:
# Rebind `collections` to the children of the parent label and `files` to a
# File query; both are filtered per dataset in the annotation loop below.
# NOTE(review): filter() without arguments presumably selects all files — confirm.
collections = is_collection.children
files = ln.File.filter()

In [None]:
# Categorical feature under which each file's collection label is attached.
feature_collection = ln.Feature(name="collection", type="category")
feature_collection.save()

In [None]:
# For every census dataset: find its h5ad file by key suffix, store
# "<title>|<dataset_id>" as the description, and attach the collection label.
for _, record in datasets_df.iterrows():
    h5ad_suffix = f"{record.dataset_id}.h5ad"
    # .one() is called on the filtered query, so exactly one match is expected.
    file = files.filter(key__endswith=h5ad_suffix).one()
    file.description = f"{record.dataset_title}|{record.dataset_id}"
    file.save()
    collection_label = collections.get(reference=record.collection_id)
    file.labels.add(collection_label, feature_collection)

## Annotate with species

In [None]:
# Categorical feature under which each file's organism label is attached.
feature_organism = ln.Feature(name="organism", type="category")
feature_organism.save()

In [None]:
# Fresh File query, filtered by description in the organism loops below.
files = ln.File.filter()

In [None]:
# Make human the active bionty organism; its record is used both to pick the
# census slice and as the label attached to each file.
lb.settings.organism = "human"

# Distinct dataset ids present in the human part of the census.
human_obs = census["census_data"][lb.settings.organism.scientific_name].obs
human_datasets = (
    human_obs.read(column_names=["dataset_id"])
    .concat()
    .to_pandas()
    .drop_duplicates()
)
print(human_datasets.shape)

# File descriptions embed the dataset id, so a substring match finds the file.
for dataset_id in human_datasets.dataset_id:
    file = files.filter(description__contains=dataset_id).one()
    file.labels.add(lb.settings.organism, feature_organism)

In [None]:
# Make mouse the active bionty organism; its record is used both to pick the
# census slice and as the label attached to each file.
lb.settings.organism = "mouse"

# Distinct dataset ids present in the mouse part of the census.
mouse_obs = census["census_data"][lb.settings.organism.scientific_name].obs
mouse_datasets = (
    mouse_obs.read(column_names=["dataset_id"])
    .concat()
    .to_pandas()
    .drop_duplicates()
)
print(mouse_datasets.shape)

# File descriptions embed the dataset id, so a substring match finds the file.
for dataset_id in mouse_datasets.dataset_id:
    file = files.filter(description__contains=dataset_id).one()
    file.labels.add(lb.settings.organism, feature_organism)

In [None]:
# Inspect the annotations of the last file handled by the preceding loop.
file.describe()

In [None]:
# Close the census handle opened with open_soma() earlier in the notebook.
census.close()