# Register single-cell RNA-seq datasets

In [1]:
import lamindb as ln
import bionty as bt
import wetlab as wl

# Start tracking this notebook run; the argument is the uid of the notebook's Transform.
ln.track("WrlPTZWhrZZP0001")

[92m→[0m connected lamindb: laminlabs/hubmap
[92m→[0m loaded Transform('WrlPTZWhrZZP0001'), re-started Run('XCvmcC6e...') at 2025-01-29 09:26:09 UTC
[92m→[0m notebook imports: bionty==1.0.0 fsspec==2024.9.0 lamindb==1.0.5 wetlab==1.0.1


In [2]:
# Load the curated metadata table (not the raw API output) from its registered artifact.
curated_metadata_artifact = ln.Artifact.get("Z8sZr9vMHjY5WjqL0001")
metadata = curated_metadata_artifact.load()

In [3]:
# The metadata table also lists a few ATAC datasets. Keep only the (sc-)RNA-seq rows
# here; ATAC is added later.
is_rna_seq = metadata["dataset_type"] == "RNAseq"
scrna_seq_only = metadata[is_rna_seq]

In [4]:
from fsspec.asyn import FSTimeoutError

# Metadata column that holds each file URL, paired with the name of the ULabel
# used to tag the resulting artifact. "expr_url" is last on purpose: the
# expression artifact receives additional annotations right after it is created.
URL_COLUMN_LABELS = (
    ("raw_expr_url", "raw"),
    ("secondary_analysis_url", "secondary"),
    ("scvelo_url", "scvelo"),
    ("expr_url", "expr"),
)


def _register_labeled_artifact(url, title, label_name):
    """Save an artifact for ``url`` and tag it with the ULabel named ``label_name``.

    Args:
        url: Location of the file, or a missing value (``None``, ``""``, NaN).
        title: Dataset title, used as the artifact description.
        label_name: Name of an existing ``ln.ULabel`` to attach.

    Returns:
        The saved artifact, or ``None`` when ``url`` is missing.

    Raises:
        FSTimeoutError: Propagated from the storage backend so the caller can
            skip the dataset.
    """
    # Missing cells of a DataFrame can surface as NaN, which is truthy; a plain
    # truthiness check would hand NaN to ln.Artifact. Require a non-empty string.
    if not (isinstance(url, str) and url):
        return None
    artifact = ln.Artifact(url, description=f"{title}").save()
    artifact.ulabels.add(ln.ULabel.filter(name=label_name).one())
    return artifact


def _annotate_expression_artifact(expr_af, ds):
    """Link the metadata records referenced by the row ``ds`` to ``expr_af``.

    Every lookup uses ``.one()``, so each referenced record is expected to
    exist exactly once in the corresponding registry.
    """
    expr_af.references.add(ln.Reference.filter(doi=ds["doi"]).one())
    expr_af.tissues.add(bt.Tissue.filter(name=ds["tissue"]).one())
    expr_af.ethnicities.add(bt.Ethnicity.filter(name=ds["ethnicity"]).one())
    expr_af.phenotypes.add(bt.Phenotype.filter(name=ds["sex"]).one())
    # assumes ds["diseases"] is a list-like of disease names (not a single string) — TODO confirm
    for disease_name in ds["diseases"]:
        expr_af.diseases.add(bt.Disease.filter(name=disease_name).one())
    expr_af.ulabels.add(ln.ULabel.filter(name=ds["suspension_type"]).one())
    expr_af.experimental_factors.add(
        bt.ExperimentalFactor.filter(name=ds["assay"]).one()
    )
    expr_af.donors.add(wl.Donor.filter(name=ds["donor_id"]).one())
    expr_af.biosamples.add(wl.Biosample.filter(name=ds["sample_id"]).one())
    expr_af.ulabels.add(ln.ULabel.filter(name="tissue").one())


for _row_idx, row in scrna_seq_only.iterrows():
    ds = row.to_dict()

    try:
        # Some rows do not have URLs. These are high level dataset views that may have metadata.
        artifacts = []
        expr_af = None
        for url_column, label_name in URL_COLUMN_LABELS:
            artifact = _register_labeled_artifact(
                ds[url_column], ds["title"], label_name
            )
            if artifact is None:
                continue
            artifacts.append(artifact)
            if label_name == "expr":
                expr_af = artifact

        # Only the expression artifact carries the dataset-level annotations.
        if expr_af is not None:
            _annotate_expression_artifact(expr_af, ds)

        # Attach to the existing collection for this UUID, or create it when
        # there is at least one artifact to put into it.
        ds_cl = ln.Collection.filter(key=ds["collection_uuid"]).one_or_none()
        if ds_cl:
            ds_cl.artifacts.add(*artifacts)
        elif artifacts:
            ds_cl = ln.Collection(
                artifacts, key=ds["collection_uuid"], description=ds["title"]
            ).save()
    except FSTimeoutError:
        # NOTE(review): a timeout part-way through a row skips the collection
        # step, so artifacts already saved for this row stay unlinked until the
        # cell is re-run.
        print(
            f"Timeout error for: {ds['title']} of collection UUID {ds['collection_uuid']}"
        )

[92m→[0m returning existing artifact with same hash: Artifact(uid='AzqCWQAKLMV3iTMA0000', is_latest=True, key='f6eb890063d13698feb11d39fa61e45a/raw_expr.h5ad', description='RNAseq data from the small intestine of a 67.0-year-old white female', suffix='.h5ad', otype='AnnData', size=67867992, hash='of_TeLP6cet2JBj3o_kZmQ', space_id=1, storage_id=2, run_id=11, created_by_id=3, created_at=2025-01-28 14:16:35 UTC); if you intended to query to track this artifact as an input, use: ln.Artifact.get()
[92m→[0m returning existing artifact with same hash: Artifact(uid='fWN781TxuZibkBOR0000', is_latest=True, key='f6eb890063d13698feb11d39fa61e45a/secondary_analysis.h5ad', description='RNAseq data from the small intestine of a 67.0-year-old white female', suffix='.h5ad', otype='AnnData', size=888111371, hash='ian3P5CN68AAvoDMC6sZLw', space_id=1, storage_id=2, run_id=11, created_by_id=3, created_at=2025-01-28 14:16:39 UTC); if you intended to query to track this artifact as an input, use: ln.Arti

In [5]:
# Finish the run that ln.track() started at the top of the notebook.
ln.finish()

[94m•[0m please hit CTRL + s to save the notebook in your editor .. [92m✓[0m
[92m→[0m finished Run('XCvmcC6e') after 1h at 2025-01-29 11:11:55 UTC
[92m→[0m go to: https://lamin.ai/laminlabs/hubmap/transform/WrlPTZWhrZZP0001
[92m→[0m to update your notebook from the CLI, run: lamin save /home/lukas/code/hubmap_registration/register_single_cell_rna.ipynb
