# Register single-cell metadata

Standardized columns:
- `tissue`
- `ethnicity`
- `sex`
- `diseases` # contains lists of terms
- `suspension_type`
- `assay`
- `donor_id`
- `biosample`
- `tissue_type`

In [1]:
import lamindb as ln
import bionty as bt
import wetlab as wl

ln.track()

[92m→[0m connected lamindb: laminlabs/hubmap
[92m→[0m found notebook register-single-cell-metadata.ipynb, making new version
[92m→[0m created Transform('5znJrS1UjwCi0006'), started new Run('6e5CL25d...') at 2025-05-21 10:48:25 UTC
[92m→[0m notebook imports: bionty==1.3.2 lamindb==1.5.3 wetlab==1.2.0
[94m•[0m recommendation: to identify the notebook across renames, pass the uid: ln.track("5znJrS1UjwCi")


In [2]:
meta_df = ln.Artifact.get("ZmKRFUAwmX5RK9d80006").load()
meta_df.head()

Unnamed: 0,uuid,assay,rnaseq_assay_method,title,group_name,consortium,doi,publication_date,status,dataset_type,processing,organ,sample_category,analyte_class,bmi,age,ethnicity,sex,diseases,donor_id,sample_id,ancestor_id,raw_expr_url,expr_url,secondary_analysis_url,scvelo_url
0,8776e9183d5f85d90535a0b1b3b4e32a,scRNAseq-10xGenomics,Single Cell 3' v3,RNAseq data from the thymus of a 18-year-old b...,University of Florida TMC,HuBMAP,10.35079/HBM724.ZKSM.924,2020-08-22,protected,RNAseq,raw,TH,suspension,RNA,27.1,18,Black or African American,Male,[normal],HBM678.JKBB.893,HBM365.LNPG.969,268e8fb044f82a1497b5fd17918500ea,https://assets.hubmapconsortium.org/81a9fa68b2...,,,
1,c03acf2de0caff5e5850e0f76d555e1b,scRNAseq-10xGenomics,Single Cell 3' v3,RNAseq data from the thymus of a 18-year-old b...,University of Florida TMC,HuBMAP,10.35079/HBM457.SQKR.279,2020-08-22,protected,RNAseq,raw,TH,suspension,RNA,27.1,18,Black or African American,Male,[normal],HBM678.JKBB.893,HBM363.KHLF.497,ac972fb45d1dc05548ecf400229a8038,https://assets.hubmapconsortium.org/e8d642084f...,,,
2,b29f62452b8e333ffc62d2e69caa18fa,snRNAseq,3`,RNAseq data from the large intestine of a 67-y...,Stanford TMC,HuBMAP,10.35079/HBM444.XJKC.552,2020-08-22,protected,RNAseq,raw,LI,block,RNA,30.2,67,White,Female,"[Hypertension, Coronary Artery Disease, Cardia...",HBM279.WPZP.978,HBM588.GSHN.453,e4ee92c09a755f8889cb8c37a669e160,https://assets.hubmapconsortium.org/c019a1cd35...,,,
3,a5234e06fed9a14ee8d29c5aa0258ba5,scRNAseq-10xGenomics,Single Cell 3' v3,RNAseq data from the lymph node of a 1-year-ol...,University of Florida TMC,HuBMAP,10.35079/HBM252.HMBK.543,2020-08-22,protected,RNAseq,raw,LY,suspension,RNA,21.8,1,White,Male,[normal],HBM638.SMWG.276,HBM789.XWDB.222,6c717082627f452935b9f63d2d93f023,https://assets.hubmapconsortium.org/0576b972e0...,,,
4,20ee458e5ee361717b68ca72caf6044e,snRNAseq-10xGenomics-v3,10x Chromium Single Cell 3' Reagent Kits v3.1,RNAseq data from the small intestine of a 67-y...,Stanford TMC,HuBMAP,10.35079/HBM983.LKMP.544,2022-11-30,protected,RNAseq,raw,SI,block,RNA,30.2,67,White,Female,"[Hypertension, Coronary Artery Disease, Cardia...",HBM279.WPZP.978,HBM555.LQJW.397,e80cd8fab25ec8e9cb41e3872e2129c7,https://assets.hubmapconsortium.org/f6eb890063...,https://assets.hubmapconsortium.org/f6eb890063...,https://assets.hubmapconsortium.org/f6eb890063...,https://assets.hubmapconsortium.org/f6eb890063...


## `Reference`

In [3]:
# collect all existing references with DOIs
existing_refs_by_doi = {ref.doi: ref for ref in ln.Reference.filter() if ref.doi}

reference_names = ln.Reference.inspect(meta_df.title, mute=True).non_validated

try:
    ln.settings.creation.search_names = False
    references = []

    for _, row in (
        meta_df[["title", "doi", "publication_date"]].drop_duplicates().iterrows()
    ):
        title = row.title
        doi = row.doi
        publication_date = row.publication_date

        if doi in existing_refs_by_doi:
            ref = existing_refs_by_doi[row.doi]

            if ref.name != title:
                print(f"Updating reference name from {ref.name} to {title}.")
                ref.name = title
                ref.save()

        elif title in reference_names:
            references.append(ln.Reference(name=title, doi=doi, date=publication_date))

finally:
    ln.settings.creation.search_names = True

Updating reference name from RNAseq data from the thymus of a 18.0-year-old black or african american male to RNAseq data from the thymus of a 18-year-old black or african american male.
Updating reference name from RNAseq data from the thymus of a 18.0-year-old black or african american male to RNAseq data from the thymus of a 18-year-old black or african american male.
Updating reference name from RNAseq data from the large intestine of a 67.0-year-old white female to RNAseq data from the large intestine of a 67-year-old white female.
Updating reference name from RNAseq data from the lymph node of a 1.0-year-old white male to RNAseq data from the lymph node of a 1-year-old white male.
Updating reference name from RNAseq data from the small intestine of a 67.0-year-old white female to RNAseq data from the small intestine of a 67-year-old white female.
Updating reference name from RNAseq data from the spleen of a 18.0-year-old white male to RNAseq data from the spleen of a 18-year-old 

In [5]:
references

[]

In [6]:
ln.save(references)

In [7]:
ln.Reference.filter(run_id=ln.context.run.id).count()

0

## Tissue

In [8]:
meta_df.organ.unique()

array(['TH', 'LI', 'LY', 'SI', 'SP', 'HT', 'LK', 'RK', 'LN', 'RN', 'BL',
       'RL', 'LV'], dtype=object)

In [9]:
mapper = {
    "LY": "lymph node",
    "TH": "thymus",
    "LI": "large intestine",
    "SI": "small intestine",
    "SP": "spleen",
    "HT": "heart",
    "LK": "kidney (left)",
    "RK": "kidney (right)",
    "LL": "lung (left)",
    "RL": "lung (right)",
    "LV": "liver",
    "LN": "knee (left)",
    "RN": "knee (right)",
    "BL": "bladder organ",
}
meta_df["tissue"] = meta_df.organ.map(mapper)

In [10]:
try:
    ln.settings.creation.search_names = False
    for name in [
        "kidney (left)",
        "kidney (right)",
        "lung (right)",
        "knee (right)",
        "knee (left)",
    ]:
        tissue = bt.Tissue.filter(name=name).one_or_none()
        if tissue is None:
            tissue = bt.Tissue(name=name).save()
            parent = bt.Tissue.using("laminlabs/hubmap").get(name=name.split(" ")[0])
            tissue.parents.add(parent)
finally:
    ln.settings.creation.search_names = True

In [11]:
bt.Tissue.inspect(meta_df.tissue).non_validated

[]

In [12]:
bt.Tissue.filter(run_id=ln.context.run.id).count()

0

## Ethnicity

In [13]:
meta_df.ethnicity.unique()

array(['Black or African American', 'White', '', 'Unknown'], dtype=object)

In [14]:
meta_df["ethnicity"] = meta_df.ethnicity.map(
    {
        "": "na",
        "Black or African American": "African American",
        "White": "European",
        "Unknown": "unknown",
    }
)

In [15]:
bt.Ethnicity.inspect(meta_df.ethnicity).non_validated

[]

## Sex

In [16]:
meta_df["sex"].unique()

array(['Male', 'Female', ''], dtype=object)

In [17]:
meta_df["sex"] = meta_df.sex.str.lower().replace([""], "unknown")

In [18]:
bt.Phenotype.inspect(meta_df.sex).non_validated

[]

## Disease

In [19]:
diseases = list(set(meta_df.diseases.apply(list).sum()))
diseases

['Type 2 Diabetes Mellitus',
 'Liver failure',
 'Gastrointestinal disease',
 'normal',
 'Coronary Heart Disease',
 'Coronary Artery Disease',
 'Asthma',
 'Autistic disorder',
 'Type 1 Diabetes Mellitus',
 'Heart failure with reduced ejection fraction',
 'Chronic Obstructive Lung Disease',
 'Cardiac Arrest',
 'Hyperlipidimia',
 'Hypertension',
 'Kidney cancer',
 'Obstructive sleep apnea']

In [20]:
synonyms_mapper = bt.Disease.standardize(diseases, return_mapper=True)

... synchronizing df_all__mondo__2024-06-04__Disease.parquet: 100.0%

In [21]:
# https://www.ebi.ac.uk/ols4/ontologies/snomed/classes/http%253A%252F%252Fsnomed.info%252Fid%252F703272007
# note: we use lower case for names to be consistent with mondo
bt.Disease(
    name="heart failure with reduced ejection fraction", ontology_id="SNOMED:703272007"
).save()

# the rest we map to mondo
synonyms_mapper.update(
    {
        "Hypertension": "hypertensive disorder",
        "Hyperlipidimia": "hyperlipidemia",
        "Autistic disorder": "autism",
    }
)

[92m→[0m returning existing Disease record with same name: 'heart failure with reduced ejection fraction'


In [22]:
meta_df["diseases"] = meta_df.diseases.apply(
    lambda x: [synonyms_mapper.get(i, i) for i in x]
)

In [23]:
bt.Disease.inspect(meta_df.diseases).non_validated

[]

## `suspension_type`

In [24]:
meta_df.assay.unique()

array(['scRNAseq-10xGenomics', 'snRNAseq', 'snRNAseq-10xGenomics-v3',
       'sciRNAseq', 'scRNAseq-10xGenomics-v3', 'scRNAseq-10xGenomics-v2'],
      dtype=object)

In [25]:
meta_df["suspension_type"] = meta_df.assay.map(
    {
        "scRNAseq-10xGenomics": "cell",
        "scRNAseq-10xGenomics-v2": "cell",
        "scRNAseq-10xGenomics-v3": "cell",
        "snRNAseq": "nucleus",
        "snRNAseq-10xGenomics-v3": "nucleus",
        "sciRNAseq": "nucleus",
    }
)

In [26]:
ln.ULabel.inspect(meta_df["suspension_type"]).non_validated

[]

## `assay`

In [27]:
meta_df.loc[meta_df.assay == "SNARE-seq2", "rnaseq_assay_method"] = "SNARE-seq2"

In [28]:
meta_df[["assay", "rnaseq_assay_method"]].drop_duplicates()

Unnamed: 0,assay,rnaseq_assay_method
0,scRNAseq-10xGenomics,Single Cell 3' v3
2,snRNAseq,3`
4,snRNAseq-10xGenomics-v3,10x Chromium Single Cell 3' Reagent Kits v3.1
6,snRNAseq,10x Chromium Single Cell 3_ Reagent Kits v7
7,snRNAseq,10x Chromium Single Cell 3_ Reagent Kits v8
9,snRNAseq-10xGenomics-v3,Single Cell Multiome ATAC + Gene Expression - ...
10,sciRNAseq,sciRNAseq
13,snRNAseq-10xGenomics-v3,snRNAseq-10Xgenomics
14,snRNAseq-10xGenomics-v3,10x Chromium multiome Reagent Kits
17,snRNAseq,snRNAseq-10Xgenomics


In [29]:
mapper = {
    "Single Cell 3' v3": "10x 3' v3",
    "3`": "10x 3' v3",
    "10x Chromium Single Cell 3' Reagent Kits v3.1": "10x 3' v3",
    "Single Cell 3' v1": "10x 3' v1",
    "10x Chromium Single Cell 3_ Reagent Kits v7": "10x 3' v3",
    "10x Chromium Single Cell 3_ Reagent Kits v8": "10x 3' v3",
    "Single Cell Multiome ATAC + Gene Expression - Chromium Next GEM Single Cell 3' v3.1 (dual index)": "10x multiome",
    "sciRNAseq": "sci-RNA-seq3",
    "10x Chromium multiome Reagent Kits": "10x multiome",
    "snRNAseq-10Xgenomics": "10x 3' v3",
    "SNARE2-RNAseq-RNA": "SNARE-seq2",
    "SNARE-Seq2-RNA": "SNARE-seq2",
    "3' end single cell RNA sequencing-bcl2fastq": "10x 3' v2",
    "Chromium Next GEM Single Cell 3' Kit v3.1, 4 rxns PN-1000269": "10x 3' v3",
    "Chromium Single Cell 3' GEM, Library & Gel Bead Kit v3, 4 rxns PN-1000092": "10x 3' v3",
    "10x Chromium Next GEM Single Cell Multiome ATAC + Gene Expression Reagent Bundle": "10x multiome",
    "10x Chromium Single Cell 3_ Reagent Kits v4": "10x 3' v3",
    "10x Chromium Single Cell 3_ Reagent Kits v9": "10x 3' v3",
    "10x Chromium Single Cell 3_ Reagent Kits v10": "10x 3' v3",
    "10x Chromium Single Cell 3_ Reagent Kits v3": "10x 3' v3",
    "10x Chromium Single Cell 3_ Reagent Kits v5": "10x 3' v3",
    "10x Chromium Single Cell 3_ Reagent Kits v6": "10x 3' v3",
}

In [30]:
meta_df["assay"] = meta_df.rnaseq_assay_method.replace(mapper)

In [31]:
bt.ExperimentalFactor.inspect(meta_df.assay).non_validated

[]

## `donor_id`

In [32]:
ethnicities = bt.Ethnicity.lookup().dict()

try:
    ln.settings.creation.search_names = False
    donors = []
    for _, row in (
        meta_df[["donor_id", "age", "bmi", "ethnicity"]].drop_duplicates().iterrows()
    ):
        donor = wl.Donor.filter(name=row.donor_id).one_or_none()
        if donor is None:
            ethnicity = ethnicities.get(row.ethnicity)
            if row.age == "":
                age = None
            else:
                age = int(float(row.age))
            bmi = None if row.bmi == "" else row.bmi
            donor = wl.Donor(
                name=row.donor_id,
                batch="unknown",
                age=age,
                bmi=bmi,
                ethnicity=ethnicity,
            )
            donors.append(donor)
finally:
    ln.settings.creation.search_names = True

In [33]:
donors

[]

In [34]:
ln.save(donors)

## `biosample`

In [35]:
try:
    ln.settings.creation.search_names = False
    samples = []
    for _, row in meta_df[["sample_id"]].drop_duplicates().iterrows():
        sample = wl.Biosample.filter(name=row.sample_id).one_or_none()
        if sample is None:
            sample = wl.Biosample(name=row.sample_id)
            samples.append(sample)
finally:
    ln.settings.creation.search_names = True

In [36]:
samples

[]

In [37]:
ln.save(samples)

## `tissue_type`

In [38]:
meta_df["tissue_type"] = "tissue"

## `expression types`

In [39]:
expression_type = ln.ULabel(name="expression_type").save()
raw_ul = ln.ULabel(name="raw").save()
expr_ul = ln.ULabel(name="expr").save()
secondary_ul = ln.ULabel(name="secondary").save()
scvelo_ul = ln.ULabel(name="scvelo").save()

expression_type.children.set([raw_ul, expr_ul, secondary_ul, scvelo_ul])

[92m→[0m returning existing ULabel record with same name: 'expression_type'
[92m→[0m returning existing ULabel record with same name: 'raw'
[92m→[0m returning existing ULabel record with same name: 'expr'
[92m→[0m returning existing ULabel record with same name: 'secondary'
[92m→[0m returning existing ULabel record with same name: 'scvelo'


## Save the curated metadata table

In [40]:
artifact = ln.Artifact.from_df(
    meta_df, key="hubmap_metadata/meta_scrna_curated.parquet"
).save()

artifact

[92m→[0m creating new artifact version for key='hubmap_metadata/meta_scrna_curated.parquet' (storage: 's3://lamin-us-west-2/sznqFqn7xUoI')
... uploading Z8sZr9vMHjY5WjqL0005.parquet: 100.0%
[93m![0m replacing the existing cache path /Users/altananamsaraeva/Library/Caches/lamindb/lamin-us-west-2/sznqFqn7xUoI/hubmap_metadata/meta_scrna_curated.parquet


Artifact(uid='Z8sZr9vMHjY5WjqL0005', is_latest=True, key='hubmap_metadata/meta_scrna_curated.parquet', suffix='.parquet', kind='dataset', otype='DataFrame', size=72410, hash='nY1xbR0gTF1K891-YQ5rig', n_observations=210, space_id=1, storage_id=1, run_id=34, created_by_id=5, created_at=2025-05-21 10:52:31 UTC)

In [41]:
ln.finish()

[94m•[0m please hit CMD + s to save the notebook in your editor  [92m✓[0m
[93m![0m cells [(3, 5)] were not run consecutively
[92m→[0m finished Run('6e5CL25d') after 4m at 2025-05-21 10:52:37 UTC
[92m→[0m go to: https://lamin.ai/laminlabs/hubmap/transform/5znJrS1UjwCi0006
[92m→[0m to update your notebook from the CLI, run: lamin save /Users/altananamsaraeva/Desktop/Lamin/hubmap-registration/scrna/register-single-cell-metadata.ipynb
