# Generate single-cell metadata table

In [1]:
!lamin connect laminlabs/hubmap

[92m→[0m connected lamindb: laminlabs/hubmap


In [None]:
from query import get_dataset_info, get_dataset_urls, scRNAseqDataset
from datetime import datetime, timezone

from rich.progress import (
    Progress,
    SpinnerColumn,
    TimeElapsedColumn,
    BarColumn,
    TextColumn,
)

import lamindb as ln
import pandas as pd

[92m→[0m connected lamindb: laminlabs/hubmap


In [3]:
# Start lamindb run tracking for this notebook under the given transform uid
ln.track("oWI9cCguyzHn0005")

[92m→[0m created Transform('oWI9cCguyzHn0005'), started new Run('vArdZ4aF...') at 2025-01-31 11:51:24 UTC
[92m→[0m notebook imports: lamindb==1.0.5 pandas==2.2.3 query rich==13.9.4


In [4]:
# Load the HubMAP metadata table (exported for all datasets via the UI download button)
metadata_table_path = ln.Artifact.get("9G4UaeVKSY0zy7SX0000").cache()
df = pd.read_csv(metadata_table_path, sep="\t")

# Assays for which metadata should be collected.
# A few RNA + ATAC assays are listed too, although only single-cell RNA-seq
# has been registered so far.
single_cell_assays = [
    "SNARE-seq2",
    "SNARE2-RNAseq",
    "snRNAseq-10xGenomics-v3",
    "snRNAseq",
    "scRNAseq-10xGenomics",
    "scRNAseq-10xGenomics-v3",
    "sciRNAseq",
    "scRNAseq-10xGenomics-v2",
]

# Keep only the rows whose assay type is one of the assays listed above
is_single_cell_assay = df["assay_type"].isin(single_cell_assays)
hubmap_sc_only = df.loc[is_single_cell_assay]

In [None]:
# Analyte-class labels that are searched for (as substrings) in the dataset types.
# The compound labels come first: "DNA + RNA" contains "RNA"/"DNA" and
# "Lipid + metabolite" contains "Lipid", so they could never be selected if the
# shorter labels were tried before them. The remaining labels keep their priority.
_ANALYTE_CLASSES = (
    "DNA + RNA",
    "Lipid + metabolite",
    "RNA",
    "Protein",
    "DNA",
    "Metabolite",
    "Lipid",
    "Nucleic acid + protein",
    "Endogenous fluorophore",
    "Polysaccharide",
    "Peptide",
)


def _first_donor_value(donor_metadata: list, term: str):
    """Return the first ``data_value`` whose preferred term equals ``term``, else ""."""
    return next(
        (
            item.get("data_value", "")
            for item in donor_metadata
            if item.get("grouping_concept_preferred_term") == term
        ),
        "",
    )


def _first_sample_value(samples: list, key: str):
    """Return the first truthy ``key`` value found among ``samples``, else ""."""
    return next((sample.get(key, "") for sample in samples if sample.get(key)), "")


def _infer_analyte_class(dataset_info: dict) -> str:
    """Return the first analyte class found in the dataset's or a descendant's type, else ""."""
    candidate_types = [dataset_info.get("dataset_type") or ""]
    candidate_types.extend(
        descendant.get("dataset_type") or ""
        for descendant in dataset_info.get("descendants") or []
    )
    return next(
        (
            label
            for label in _ANALYTE_CLASSES
            if any(label in dataset_type for dataset_type in candidate_types)
        ),
        "",
    )


def _format_publication_date(timestamp_ms) -> str:
    """Format a millisecond epoch timestamp as a UTC ``YYYY-MM-DD`` date ("" if missing)."""
    if not timestamp_ms:
        return ""
    return datetime.fromtimestamp(timestamp_ms / 1000, tz=timezone.utc).strftime(
        "%Y-%m-%d"
    )


def create_hubmap_metadata_df(hubmap_metadata: pd.DataFrame) -> pd.DataFrame:
    """Fetches dataset URLs and metadata using the API to collect it in a DataFrame.

    For every row of ``hubmap_metadata`` the dataset info and the h5ad file URLs are
    requested via ``get_dataset_info`` / ``get_dataset_urls`` and flattened into one
    record. Missing fields are stored as empty strings; ``diseases`` falls back to
    ``["normal"]`` when the donor has no "Medical History" entries.

    Args:
        hubmap_metadata: Table with the columns ``uuid``, ``assay_type`` and
            ``rnaseq_assay_method``; one row per dataset.

    Returns:
        A DataFrame with one row per input row, indexed by ``uuid``. The ``assay`` and
        ``rnaseq_assay_method`` values are taken from the same input row as the uuid.
    """
    # Read the needed columns once instead of re-filtering the whole frame per uuid.
    uuids = hubmap_metadata["uuid"].values
    assays = hubmap_metadata["assay_type"].values
    assay_methods = hubmap_metadata["rnaseq_assay_method"].values

    data = []
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        TextColumn("[blue]{task.fields[uuid]}"),
        TimeElapsedColumn(),
    ) as progress:
        task = progress.add_task(
            "[cyan]Processing datasets...", total=len(hubmap_metadata), uuid=""
        )
        for uuid, assay, assay_method in zip(uuids, assays, assay_methods):
            progress.update(task, uuid=uuid)
            # NOTE(review): assumes the API returns a non-empty sequence whose first
            # element is the dataset record - confirm against `query.get_dataset_info`.
            dataset_info = get_dataset_info(uuid)[0]

            # `or` fallbacks also cover keys that are present but explicitly null.
            donor = dataset_info.get("donor") or {}
            donor_metadata = (donor.get("metadata") or {}).get(
                "organ_donor_data"
            ) or []
            origin_samples = dataset_info.get("origin_samples") or []
            source_samples = dataset_info.get("source_samples") or []
            ancestor_ids = dataset_info.get("immediate_ancestor_ids")

            dataset_urls = get_dataset_urls(
                uuid,
                file_types=[
                    "raw_expr.h5ad",
                    "expr.h5ad",
                    "secondary_analysis.h5ad",
                    "scvelo.h5ad",
                ],
                dataset_class=scRNAseqDataset,
            )
            urls = {
                "raw_expr_url": dataset_urls.raw_expr or "",
                "expr_url": dataset_urls.expr or "",
                "secondary_analysis_url": dataset_urls.secondary_analysis or "",
                "scvelo_url": dataset_urls.scvelo or "",
            }

            row = {
                "assay": assay,
                "rnaseq_assay_method": assay_method,
                "title": dataset_info.get("title", ""),
                "group_name": dataset_info.get("group_name", ""),
                "consortium": "HuBMAP",
                "doi": dataset_info.get("registered_doi", ""),
                "publication_date": _format_publication_date(
                    dataset_info.get("published_timestamp")
                ),
                "status": dataset_info.get("data_access_level", ""),
                "dataset_type": dataset_info.get("dataset_type", ""),
                "processing": "raw",
                "organ": _first_sample_value(origin_samples, "organ"),
                "sample_category": _first_sample_value(
                    source_samples, "sample_category"
                ),
                "analyte_class": _infer_analyte_class(dataset_info),
                "bmi": _first_donor_value(donor_metadata, "Body Mass Index"),
                "age": _first_donor_value(donor_metadata, "Age"),
                "ethnicity": _first_donor_value(donor_metadata, "Race"),
                "sex": _first_donor_value(donor_metadata, "Sex"),
                "diseases": [
                    item.get("data_value", "")
                    for item in donor_metadata
                    if item.get("grouping_concept_preferred_term") == "Medical History"
                ]
                or ["normal"],
                "donor_id": donor.get("hubmap_id", ""),
                "sample_id": _first_sample_value(source_samples, "hubmap_id"),
                # Always the first ancestor ID
                "collection_uuid": ancestor_ids[0] if ancestor_ids else "",
                **urls,
            }
            data.append(row)
            progress.update(task, advance=1)

    df = pd.DataFrame(data, index=uuids)
    df.index.name = "uuid"

    return df

In [6]:
# Collect the metadata for every single-cell dataset (API lookups per uuid),
# then display the resulting table
single_cell_metadata_df = create_hubmap_metadata_df(hubmap_sc_only)
single_cell_metadata_df

Output()

Unnamed: 0_level_0,assay,rnaseq_assay_method,title,group_name,consortium,doi,publication_date,status,dataset_type,processing,organ,sample_category,analyte_class,bmi,age,ethnicity,sex,diseases,donor_id,sample_id,collection_uuid,raw_expr_url,expr_url,secondary_analysis_url,scvelo_url
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
a5234e06fed9a14ee8d29c5aa0258ba5,scRNAseq-10xGenomics,Single Cell 3' v3,RNAseq data from the lymph node of a 1.0-year-...,University of Florida TMC,HuBMAP,10.35079/HBM252.HMBK.543,2020-08-22,protected,RNAseq,raw,LY,suspension,RNA,21.80,1.0,White,Male,[normal],HBM638.SMWG.276,HBM789.XWDB.222,6c717082627f452935b9f63d2d93f023,,,,
c03acf2de0caff5e5850e0f76d555e1b,scRNAseq-10xGenomics,Single Cell 3' v3,RNAseq data from the thymus of a 18.0-year-old...,University of Florida TMC,HuBMAP,10.35079/HBM457.SQKR.279,2020-08-22,protected,RNAseq,raw,TH,suspension,RNA,27.10,18.0,Black or African American,Male,[normal],HBM678.JKBB.893,HBM363.KHLF.497,ac972fb45d1dc05548ecf400229a8038,,,,
8776e9183d5f85d90535a0b1b3b4e32a,scRNAseq-10xGenomics,Single Cell 3' v3,RNAseq data from the thymus of a 18.0-year-old...,University of Florida TMC,HuBMAP,10.35079/HBM724.ZKSM.924,2020-08-22,protected,RNAseq,raw,TH,suspension,RNA,27.10,18.0,Black or African American,Male,[normal],HBM678.JKBB.893,HBM365.LNPG.969,268e8fb044f82a1497b5fd17918500ea,,,,
b29f62452b8e333ffc62d2e69caa18fa,snRNAseq,3`,RNAseq data from the large intestine of a 67.0...,Stanford TMC,HuBMAP,10.35079/HBM444.XJKC.552,2020-08-22,protected,RNAseq,raw,LI,block,RNA,30.20,67.0,White,Female,"[Hypertension, Coronary Artery Disease, Cardia...",HBM279.WPZP.978,HBM588.GSHN.453,e4ee92c09a755f8889cb8c37a669e160,,,,
20ee458e5ee361717b68ca72caf6044e,snRNAseq-10xGenomics-v3,10x Chromium Single Cell 3' Reagent Kits v3.1,RNAseq data from the small intestine of a 67.0...,Stanford TMC,HuBMAP,10.35079/HBM983.LKMP.544,2022-11-30,protected,RNAseq,raw,SI,block,RNA,30.20,67.0,White,Female,"[Hypertension, Coronary Artery Disease, Cardia...",HBM279.WPZP.978,HBM555.LQJW.397,e80cd8fab25ec8e9cb41e3872e2129c7,https://assets.hubmapconsortium.org/f6eb890063...,https://assets.hubmapconsortium.org/f6eb890063...,https://assets.hubmapconsortium.org/f6eb890063...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
c45049b3115ed7c8d39f7ec3b0a06020,scRNAseq-10xGenomics-v3,"Chromium Single Cell 3' GEM, Library & Gel Bea...",RNAseq data from the knee (right) of a 24.0-ye...,TMC - University of Connecticut and Scripps,HuBMAP,10.35079/HBM629.QFBP.955,2023-12-18,protected,RNAseq,raw,RN,block,RNA,36.90,24.0,Unknown,Male,[normal],HBM993.XNPC.882,HBM452.VSCR.855,9c63cd124b1e33aa5ea5a1f54f01f309,https://assets.hubmapconsortium.org/e4d23bbb49...,https://assets.hubmapconsortium.org/e4d23bbb49...,https://assets.hubmapconsortium.org/e4d23bbb49...,
13fe2b69c62dc2c3bbe43c9177973033,snRNAseq-10xGenomics-v3,snRNAseq-10Xgenomics,RNAseq data from the bladder of a 19.0-year-ol...,University of California San Diego TMC,HuBMAP,10.35079/HBM325.FBQQ.678,2023-06-12,protected,RNAseq,raw,BL,section,RNA,27.70,19.0,Unknown,Male,[normal],HBM894.PKMC.242,HBM553.GXPD.548,a11a58186c6d136360c3d0b863b0c00c,https://assets.hubmapconsortium.org/494be31798...,https://assets.hubmapconsortium.org/494be31798...,https://assets.hubmapconsortium.org/494be31798...,
18ec0bb859d2a0422b68f2f6491a7e15,snRNAseq-10xGenomics-v3,snRNAseq-10Xgenomics,RNAseq data from the bladder of a 22.0-year-ol...,University of California San Diego TMC,HuBMAP,10.35079/HBM449.JKGC.452,2023-06-12,protected,RNAseq,raw,BL,section,RNA,,22.0,Black or African American,Male,[normal],HBM256.BRQS.425,HBM854.KCQP.863,a6920284fccd858507978948faca6d19,https://assets.hubmapconsortium.org/6c57274e7a...,https://assets.hubmapconsortium.org/6c57274e7a...,https://assets.hubmapconsortium.org/6c57274e7a...,
976144220fb6be94f1f71298062f0bda,snRNAseq-10xGenomics-v3,snRNAseq-10Xgenomics,RNAseq data from the bladder of a 43.0-year-ol...,University of California San Diego TMC,HuBMAP,10.35079/HBM735.WZWF.739,2023-06-12,protected,RNAseq,raw,BL,section,RNA,24.70,43.0,White,Male,[normal],HBM568.HLJW.252,HBM636.WLKR.529,4ff2d3993ea5aeeeb360d163ab217dc6,https://assets.hubmapconsortium.org/d67c61063b...,https://assets.hubmapconsortium.org/d67c61063b...,https://assets.hubmapconsortium.org/d67c61063b...,


In [7]:
# Register the metadata table as a lamindb artifact under a fixed key and save it
sc_af = ln.Artifact.from_df(
    single_cell_metadata_df,
    key="2024-12-20_15-35-09/meta_scrna_original.parquet",
    description="Single-cell metadata information to use for ingestion.",
).save()

[92m→[0m creating new artifact version for key='2024-12-20_15-35-09/meta_scrna_original.parquet' (storage: 's3://lamin-us-west-2/sznqFqn7xUoI')
... uploading ZmKRFUAwmX5RK9d80004.parquet: 100.0%
[93m![0m The cache path /home/lukas/.cache/lamindb/lamin-us-west-2/sznqFqn7xUoI/2024-12-20_15-35-09/meta_scrna_original.parquet already exists, replacing it.


In [8]:
# Mark the tracked run as finished
ln.finish()

[92m→[0m finished Run('vArdZ4aF') after 47m at 2025-01-31 12:38:31 UTC
[92m→[0m go to: https://lamin.ai/laminlabs/hubmap/transform/oWI9cCguyzHn0005
[92m→[0m to update your notebook from the CLI, run: lamin save /home/lukas/code/hubmap_registration/generate_single_cell_metadata_table.ipynb
