# Register single-cell RNA-seq datasets

In [None]:
import lamindb as ln
import bionty as bt
import wetlab as wl

[92m→[0m connected lamindb: laminlabs/hubmap


In [2]:
# Start run tracking for this notebook. The uid is pinned so lamindb keeps
# identifying the same notebook even if the file is renamed.
ln.track("WrlPTZWhrZZP")

[92m→[0m found notebook register-single-cell-rna.ipynb, making new version
[92m→[0m created Transform('WrlPTZWhrZZP0002'), started new Run('9wCIRsb1...') at 2025-05-21 11:14:29 UTC
[92m→[0m notebook imports: bionty==1.3.2 fsspec==2025.3.2 lamindb==1.5.3 wetlab==1.2.0
[94m•[0m recommendation: to identify the notebook across renames, pass the uid: ln.track("WrlPTZWhrZZP")


In [3]:
# This artifact holds the curated metadata table, not the raw API output.
metadata_artifact = ln.Artifact.get("Z8sZr9vMHjY5WjqL0005")
metadata = metadata_artifact.load()

In [4]:
# Preview the first five rows of the curated metadata table.
metadata.head(n=5)

Unnamed: 0,uuid,assay,rnaseq_assay_method,title,group_name,consortium,doi,publication_date,status,dataset_type,processing,organ,sample_category,analyte_class,bmi,age,ethnicity,sex,diseases,donor_id,sample_id,ancestor_id,raw_expr_url,expr_url,secondary_analysis_url,scvelo_url,tissue,suspension_type,tissue_type
0,8776e9183d5f85d90535a0b1b3b4e32a,10x 3' v3,Single Cell 3' v3,RNAseq data from the thymus of a 18-year-old b...,University of Florida TMC,HuBMAP,10.35079/HBM724.ZKSM.924,2020-08-22,protected,RNAseq,raw,TH,suspension,RNA,27.1,18,African American,male,[normal],HBM678.JKBB.893,HBM365.LNPG.969,268e8fb044f82a1497b5fd17918500ea,https://assets.hubmapconsortium.org/81a9fa68b2...,,,,thymus,cell,tissue
1,c03acf2de0caff5e5850e0f76d555e1b,10x 3' v3,Single Cell 3' v3,RNAseq data from the thymus of a 18-year-old b...,University of Florida TMC,HuBMAP,10.35079/HBM457.SQKR.279,2020-08-22,protected,RNAseq,raw,TH,suspension,RNA,27.1,18,African American,male,[normal],HBM678.JKBB.893,HBM363.KHLF.497,ac972fb45d1dc05548ecf400229a8038,https://assets.hubmapconsortium.org/e8d642084f...,,,,thymus,cell,tissue
2,b29f62452b8e333ffc62d2e69caa18fa,10x 3' v3,3`,RNAseq data from the large intestine of a 67-y...,Stanford TMC,HuBMAP,10.35079/HBM444.XJKC.552,2020-08-22,protected,RNAseq,raw,LI,block,RNA,30.2,67,European,female,"[hypertensive disorder, coronary artery disord...",HBM279.WPZP.978,HBM588.GSHN.453,e4ee92c09a755f8889cb8c37a669e160,https://assets.hubmapconsortium.org/c019a1cd35...,,,,large intestine,nucleus,tissue
3,a5234e06fed9a14ee8d29c5aa0258ba5,10x 3' v3,Single Cell 3' v3,RNAseq data from the lymph node of a 1-year-ol...,University of Florida TMC,HuBMAP,10.35079/HBM252.HMBK.543,2020-08-22,protected,RNAseq,raw,LY,suspension,RNA,21.8,1,European,male,[normal],HBM638.SMWG.276,HBM789.XWDB.222,6c717082627f452935b9f63d2d93f023,https://assets.hubmapconsortium.org/0576b972e0...,,,,lymph node,cell,tissue
4,20ee458e5ee361717b68ca72caf6044e,10x 3' v3,10x Chromium Single Cell 3' Reagent Kits v3.1,RNAseq data from the small intestine of a 67-y...,Stanford TMC,HuBMAP,10.35079/HBM983.LKMP.544,2022-11-30,protected,RNAseq,raw,SI,block,RNA,30.2,67,European,female,"[hypertensive disorder, coronary artery disord...",HBM279.WPZP.978,HBM555.LQJW.397,e80cd8fab25ec8e9cb41e3872e2129c7,https://assets.hubmapconsortium.org/f6eb890063...,https://assets.hubmapconsortium.org/f6eb890063...,https://assets.hubmapconsortium.org/f6eb890063...,https://assets.hubmapconsortium.org/f6eb890063...,small intestine,nucleus,tissue


In [5]:
from fsspec.asyn import FSTimeoutError

# (metadata column holding a file URL, name of the ULabel tagged on the artifact),
# listed in registration order. "expr_url" is last; the expression artifact is the
# only one that additionally receives the biological annotations.
URL_COLUMN_LABELS = (
    ("raw_expr_url", "raw"),
    ("secondary_analysis_url", "secondary"),
    ("scvelo_url", "scvelo"),
    ("expr_url", "expr"),
)


def _has_url(value) -> bool:
    """Return True only when ``value`` is a non-empty string.

    A missing cell can come out of a pandas row as ``None`` or as a float NaN.
    NaN is truthy, so a bare ``if value:`` test would treat it as a URL.
    """
    return isinstance(value, str) and bool(value)


def _register_artifact(url, title, label_name):
    """Save an artifact for ``url`` described by ``title`` and tag it with the ULabel ``label_name``.

    Returns the saved artifact. ``.one()`` raises if the ULabel lookup does not
    match exactly one record.
    """
    artifact = ln.Artifact(url, description=title).save()
    artifact.ulabels.add(ln.ULabel.filter(name=label_name).one())
    return artifact


def _annotate_expression_artifact(artifact, ds):
    """Link the expression artifact to the registry records named in the metadata row ``ds``.

    Every lookup uses ``.one()``, so a missing or ambiguous registry record raises
    instead of being skipped silently.
    """
    artifact.references.add(ln.Reference.filter(doi=ds["doi"]).one())
    artifact.tissues.add(bt.Tissue.filter(name=ds["tissue"]).one())
    artifact.ethnicities.add(bt.Ethnicity.filter(name=ds["ethnicity"]).one())
    artifact.phenotypes.add(bt.Phenotype.filter(name=ds["sex"]).one())
    for disease_name in ds["diseases"]:
        artifact.diseases.add(bt.Disease.filter(name=disease_name).one())
    artifact.ulabels.add(ln.ULabel.filter(name=ds["suspension_type"]).one())
    artifact.experimental_factors.add(
        bt.ExperimentalFactor.filter(name=ds["assay"]).one()
    )
    artifact.donors.add(wl.Donor.filter(name=ds["donor_id"]).one())
    artifact.biosamples.add(wl.Biosample.filter(name=ds["sample_id"]).one())
    # NOTE(review): the label is hard-coded to "tissue" although the metadata has a
    # "tissue_type" column — confirm whether ds["tissue_type"] was intended.
    artifact.ulabels.add(ln.ULabel.filter(name="tissue").one())


for _, row in metadata.iterrows():
    ds = row.to_dict()

    try:
        # Some rows do not have URLs. These are high level dataset views that may have metadata.
        artifacts = []

        for url_column, label_name in URL_COLUMN_LABELS:
            url = ds[url_column]
            if not _has_url(url):
                continue

            artifact = _register_artifact(url, ds["title"], label_name)
            artifacts.append(artifact)

            if label_name == "expr":
                _annotate_expression_artifact(artifact, ds)

        # Each dataset maps to one collection keyed by its UUID: extend it when it
        # already exists, otherwise create it only if there is something to put in it.
        ds_cl = ln.Collection.filter(key=ds["uuid"]).one_or_none()

        if ds_cl:
            ds_cl.artifacts.add(*artifacts)
        elif artifacts:
            ds_cl = ln.Collection(
                artifacts, key=ds["uuid"], description=ds["title"]
            ).save()
    except FSTimeoutError:
        # Best effort: report the dataset whose remote file timed out and move on
        # to the next row instead of aborting the whole registration run.
        print(f"Timeout error for: {ds['title']} of collection UUID {ds['uuid']}")

[92m→[0m returning existing artifact with same hash: Artifact(uid='AzqCWQAKLMV3iTMA0000', is_latest=True, key='f6eb890063d13698feb11d39fa61e45a/raw_expr.h5ad', description='RNAseq data from the small intestine of a 67-year-old white female', suffix='.h5ad', otype='AnnData', size=67867992, hash='of_TeLP6cet2JBj3o_kZmQ', n_observations=6000, space_id=1, storage_id=2, run_id=11, created_by_id=3, created_at=2025-01-28 14:16:35 UTC); to track this artifact as an input, use: ln.Artifact.get()
[92m→[0m returning existing artifact with same hash: Artifact(uid='fWN781TxuZibkBOR0000', is_latest=True, key='f6eb890063d13698feb11d39fa61e45a/secondary_analysis.h5ad', description='RNAseq data from the small intestine of a 67-year-old white female', suffix='.h5ad', otype='AnnData', size=888111371, hash='ian3P5CN68AAvoDMC6sZLw', n_observations=5956, space_id=1, storage_id=2, run_id=11, created_by_id=3, created_at=2025-01-28 14:16:39 UTC); to track this artifact as an input, use: ln.Artifact.get()


In [6]:
# Close the run that ln.track() started at the top of the notebook.
ln.finish()

[94m•[0m please hit CMD + s to save the notebook in your editor  [92m✓[0m
[93m![0m cells [(0, None), (None, 2)] were not run consecutively
[92m→[0m finished Run('9wCIRsb1') after 1h at 2025-05-21 12:39:12 UTC
[92m→[0m go to: https://lamin.ai/laminlabs/hubmap/transform/WrlPTZWhrZZP0002
[92m→[0m to update your notebook from the CLI, run: lamin save /Users/altananamsaraeva/Desktop/Lamin/hubmap-registration/scrna/register-single-cell-rna.ipynb
