# Register metadata

Standardized columns:
- `tissue`
- `ethnicity`
- `sex`
- `diseases` # contains lists of terms
- `suspension_type`
- `assay`
- `donor_id`
- `biosample`
- `tissue_type`

In [1]:
import lamindb as ln
import bionty as bt
import wetlab as wl

ln.track()

[92m→[0m connected lamindb: laminlabs/hubmap
[92m→[0m there already is a notebook with `key` 'register-metadata.ipynb', creating new version '5znJrS1UjwCi0003'
[92m→[0m created Transform('5znJrS1UjwCi0003'), started new Run('kn6hazMA...') at 2025-01-31 13:30:36 UTC
[92m→[0m notebook imports: bionty==1.0.0 lamindb==1.0.5 wetlab==1.0.1


In [2]:
meta_df = ln.Artifact.get("ZmKRFUAwmX5RK9d80004").load()
meta_df.head()

Unnamed: 0_level_0,assay,rnaseq_assay_method,title,group_name,consortium,doi,publication_date,status,dataset_type,processing,organ,sample_category,analyte_class,bmi,age,ethnicity,sex,diseases,donor_id,sample_id,collection_uuid,raw_expr_url,expr_url,secondary_analysis_url,scvelo_url
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
a5234e06fed9a14ee8d29c5aa0258ba5,scRNAseq-10xGenomics,Single Cell 3' v3,RNAseq data from the lymph node of a 1.0-year-...,University of Florida TMC,HuBMAP,10.35079/HBM252.HMBK.543,2020-08-22,protected,RNAseq,raw,LY,suspension,RNA,21.8,1.0,White,Male,[normal],HBM638.SMWG.276,HBM789.XWDB.222,6c717082627f452935b9f63d2d93f023,,,,
c03acf2de0caff5e5850e0f76d555e1b,scRNAseq-10xGenomics,Single Cell 3' v3,RNAseq data from the thymus of a 18.0-year-old...,University of Florida TMC,HuBMAP,10.35079/HBM457.SQKR.279,2020-08-22,protected,RNAseq,raw,TH,suspension,RNA,27.1,18.0,Black or African American,Male,[normal],HBM678.JKBB.893,HBM363.KHLF.497,ac972fb45d1dc05548ecf400229a8038,,,,
8776e9183d5f85d90535a0b1b3b4e32a,scRNAseq-10xGenomics,Single Cell 3' v3,RNAseq data from the thymus of a 18.0-year-old...,University of Florida TMC,HuBMAP,10.35079/HBM724.ZKSM.924,2020-08-22,protected,RNAseq,raw,TH,suspension,RNA,27.1,18.0,Black or African American,Male,[normal],HBM678.JKBB.893,HBM365.LNPG.969,268e8fb044f82a1497b5fd17918500ea,,,,
b29f62452b8e333ffc62d2e69caa18fa,snRNAseq,3`,RNAseq data from the large intestine of a 67.0...,Stanford TMC,HuBMAP,10.35079/HBM444.XJKC.552,2020-08-22,protected,RNAseq,raw,LI,block,RNA,30.2,67.0,White,Female,"[Hypertension, Coronary Artery Disease, Cardia...",HBM279.WPZP.978,HBM588.GSHN.453,e4ee92c09a755f8889cb8c37a669e160,,,,
20ee458e5ee361717b68ca72caf6044e,snRNAseq-10xGenomics-v3,10x Chromium Single Cell 3' Reagent Kits v3.1,RNAseq data from the small intestine of a 67.0...,Stanford TMC,HuBMAP,10.35079/HBM983.LKMP.544,2022-11-30,protected,RNAseq,raw,SI,block,RNA,30.2,67.0,White,Female,"[Hypertension, Coronary Artery Disease, Cardia...",HBM279.WPZP.978,HBM555.LQJW.397,e80cd8fab25ec8e9cb41e3872e2129c7,https://assets.hubmapconsortium.org/f6eb890063...,https://assets.hubmapconsortium.org/f6eb890063...,https://assets.hubmapconsortium.org/f6eb890063...,


## `Reference`

In [3]:
reference_names = ln.Reference.inspect(meta_df.title, mute=True).non_validated
try:
    ln.settings.creation.search_names = False
    references = []
    for _, row in (
        meta_df[["title", "doi", "publication_date"]].drop_duplicates().iterrows()
    ):
        if row.title in reference_names:
            references.append(
                ln.Reference(name=row.title, doi=row.doi, date=row.publication_date)
            )
finally:
    ln.settings.creation.search_names = True

In [4]:
ln.save(references)

In [5]:
ln.Reference.filter(run_id=ln.context.run.id).count()

0

## Tissue

In [6]:
meta_df.organ.unique()

array(['LY', 'TH', 'LI', 'SI', 'SP', 'HT', 'LK', 'RK', 'RL', 'LV', 'RN',
       'LN', 'BL'], dtype=object)

In [7]:
mapper = {
    "LY": "lymph node",
    "TH": "thymus",
    "LI": "large intestine",
    "SI": "small intestine",
    "SP": "spleen",
    "HT": "heart",
    "LK": "kidney (left)",
    "RK": "kidney (right)",
    "LL": "lung (left)",
    "RL": "lung (right)",
    "LV": "liver",
    "LN": "knee (left)",
    "RN": "knee (right)",
    "BL": "bladder organ",
}
meta_df["tissue"] = meta_df.organ.map(mapper)

In [8]:
try:
    ln.settings.creation.search_names = False
    for name in [
        "kidney (left)",
        "kidney (right)",
        "lung (right)",
        "knee (right)",
        "knee (left)",
    ]:
        tissue = bt.Tissue.filter(name=name).one_or_none()
        if tissue is None:
            tissue = bt.Tissue(name=name).save()
            parent = bt.Tissue.get(name=name.split(" ")[0])
            tissue.parents.add(parent)
finally:
    ln.settings.creation.search_names = True

In [9]:
bt.Tissue.inspect(meta_df.tissue).non_validated

[]

In [10]:
bt.Tissue.filter(run_id=ln.context.run.id).count()

0

## Ethnicity

In [11]:
meta_df.ethnicity.unique()

array(['White', 'Black or African American', '', 'Unknown'], dtype=object)

In [12]:
meta_df["ethnicity"] = meta_df.ethnicity.map(
    {
        "": "na",
        "Black or African American": "African American",
        "White": "European",
        "Unknown": "unknown",
    }
)

In [13]:
bt.Ethnicity.inspect(meta_df.ethnicity).non_validated

[]

## Sex

In [14]:
meta_df["sex"].unique()

array(['Male', 'Female', ''], dtype=object)

In [15]:
meta_df["sex"] = meta_df.sex.str.lower().replace([""], "unknown")

In [16]:
bt.Phenotype.inspect(meta_df.sex).non_validated

[]

## Disease

In [17]:
diseases = list(set(meta_df.diseases.apply(list).sum()))
diseases

['Hypertension',
 'normal',
 'Kidney cancer',
 'Liver failure',
 'Type 2 Diabetes Mellitus',
 'Asthma',
 'Hyperlipidimia',
 'Obstructive sleep apnea',
 'Cardiac Arrest',
 'Autistic disorder',
 'Heart failure with reduced ejection fraction',
 'Coronary Heart Disease',
 'Coronary Artery Disease',
 'Chronic Obstructive Lung Disease',
 'Type 1 Diabetes Mellitus',
 'Gastrointestinal disease']

In [18]:
synonyms_mapper = bt.Disease.standardize(diseases, return_mapper=True)

In [19]:
# https://www.ebi.ac.uk/ols4/ontologies/snomed/classes/http%253A%252F%252Fsnomed.info%252Fid%252F703272007
# note: we use lower case for names to be consistent with mondo
bt.Disease(
    name="heart failure with reduced ejection fraction", ontology_id="SNOMED:703272007"
).save()
# the rest we map to mondo
synonyms_mapper.update(
    {
        "Hypertension": "hypertensive disorder",
        "Hyperlipidimia": "hyperlipidemia",
        "Autistic disorder": "autism",
    }
)

[92m→[0m returning existing Disease record with same name: 'heart failure with reduced ejection fraction'


In [20]:
meta_df["diseases"] = meta_df.diseases.apply(
    lambda x: [synonyms_mapper.get(i, i) for i in x]
)

In [21]:
bt.Disease.inspect(meta_df.diseases).non_validated

[]

## `suspension_type`

In [22]:
meta_df.assay.unique()

array(['scRNAseq-10xGenomics', 'snRNAseq', 'snRNAseq-10xGenomics-v3',
       'sciRNAseq', 'SNARE-seq2', 'SNARE2-RNAseq',
       'scRNAseq-10xGenomics-v2', 'scRNAseq-10xGenomics-v3'], dtype=object)

In [23]:
meta_df["suspension_type"] = meta_df.assay.map(
    {
        "scRNAseq-10xGenomics": "cell",
        "scRNAseq-10xGenomics-v2": "cell",
        "scRNAseq-10xGenomics-v3": "cell",
        "snRNAseq": "nucleus",
        "snRNAseq-10xGenomics-v3": "nucleus",
        "sciRNAseq": "nucleus",
        "SNARE-seq2": "nucleus",
        "SNARE2-RNAseq": "nucleus",
    }
)

In [24]:
ln.ULabel.inspect(meta_df["suspension_type"]).non_validated

[]

## `assay`

In [25]:
meta_df.loc[meta_df.assay == "SNARE-seq2", "rnaseq_assay_method"] = "SNARE-seq2"

In [26]:
meta_df[["assay", "rnaseq_assay_method"]].drop_duplicates()

Unnamed: 0_level_0,assay,rnaseq_assay_method
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1
a5234e06fed9a14ee8d29c5aa0258ba5,scRNAseq-10xGenomics,Single Cell 3' v3
b29f62452b8e333ffc62d2e69caa18fa,snRNAseq,3`
20ee458e5ee361717b68ca72caf6044e,snRNAseq-10xGenomics-v3,10x Chromium Single Cell 3' Reagent Kits v3.1
a04d0138ed6b28810c5afa01d392bbd5,snRNAseq,10x Chromium Single Cell 3_ Reagent Kits v7
a078805198f9f7f022b83de898a608a9,snRNAseq,10x Chromium Single Cell 3_ Reagent Kits v8
53d1bd99fee337798f475832c2ba4fa6,snRNAseq-10xGenomics-v3,Single Cell Multiome ATAC + Gene Expression - ...
42b73e4e346571372f4babe678688dfd,sciRNAseq,sciRNAseq
0523eb73b97a09a4ec7397fc1906988f,snRNAseq-10xGenomics-v3,10x Chromium multiome Reagent Kits
db9e2006effe3168b4b4f28b96e65d34,snRNAseq-10xGenomics-v3,snRNAseq-10Xgenomics
e81e134b69e297f7442c6e4050ffb464,SNARE-seq2,SNARE-seq2


In [27]:
mapper = {
    "Single Cell 3' v3": "10x 3' v3",
    "3`": "10x 3' v3",
    "10x Chromium Single Cell 3' Reagent Kits v3.1": "10x 3' v3",
    "Single Cell 3' v1": "10x 3' v1",
    "10x Chromium Single Cell 3_ Reagent Kits v7": "10x 3' v3",
    "10x Chromium Single Cell 3_ Reagent Kits v8": "10x 3' v3",
    "Single Cell Multiome ATAC + Gene Expression - Chromium Next GEM Single Cell 3' v3.1 (dual index)": "10x multiome",
    "sciRNAseq": "sci-RNA-seq3",
    "10x Chromium multiome Reagent Kits": "10x multiome",
    "snRNAseq-10Xgenomics": "10x 3' v3",
    "SNARE2-RNAseq-RNA": "SNARE-seq2",
    "SNARE-Seq2-RNA": "SNARE-seq2",
    "3' end single cell RNA sequencing-bcl2fastq": "10x 3' v2",
    "Chromium Next GEM Single Cell 3' Kit v3.1, 4 rxns PN-1000269": "10x 3' v3",
    "Chromium Single Cell 3' GEM, Library & Gel Bead Kit v3, 4 rxns PN-1000092": "10x 3' v3",
    "10x Chromium Next GEM Single Cell Multiome ATAC + Gene Expression Reagent Bundle": "10x multiome",
    "10x Chromium Single Cell 3_ Reagent Kits v4": "10x 3' v3",
    "10x Chromium Single Cell 3_ Reagent Kits v9": "10x 3' v3",
    "10x Chromium Single Cell 3_ Reagent Kits v10": "10x 3' v3",
    "10x Chromium Single Cell 3_ Reagent Kits v3": "10x 3' v3",
    "10x Chromium Single Cell 3_ Reagent Kits v5": "10x 3' v3",
    "10x Chromium Single Cell 3_ Reagent Kits v6": "10x 3' v3",
}

In [28]:
bt.ExperimentalFactor(name="SNARE-seq2").save()

[92m→[0m returning existing ExperimentalFactor record with same name: 'SNARE-seq2'


ExperimentalFactor(uid='4hJHgoOo', name='SNARE-seq2', created_by_id=1, run_id=3, space_id=1, created_at=2025-01-24 13:13:02 UTC)

In [29]:
meta_df["assay"] = meta_df.rnaseq_assay_method.replace(mapper)

In [30]:
bt.ExperimentalFactor.inspect(meta_df.assay).non_validated

[]

## `donor_id`

In [31]:
ethnicities = bt.Ethnicity.lookup().dict()

try:
    ln.settings.creation.search_names = False
    donors = []
    for _, row in (
        meta_df[["donor_id", "age", "bmi", "ethnicity"]].drop_duplicates().iterrows()
    ):
        donor = wl.Donor.filter(name=row.donor_id).one_or_none()
        if donor is None:
            ethnicity = ethnicities.get(row.ethnicity)
            if row.age == "":
                age = None
            else:
                age = int(float(row.age))
            bmi = None if row.bmi == "" else row.bmi
            donor = wl.Donor(
                name=row.donor_id,
                batch="unknown",
                age=age,
                bmi=bmi,
                ethnicity=ethnicity,
            )
            donors.append(donor)
finally:
    ln.settings.creation.search_names = True

In [32]:
ln.save(donors)

## `biosample`

In [33]:
try:
    ln.settings.creation.search_names = False
    samples = []
    for _, row in meta_df[["sample_id"]].drop_duplicates().iterrows():
        sample = wl.Biosample.filter(name=row.sample_id).one_or_none()
        if sample is None:
            sample = wl.Biosample(name=row.sample_id)
            samples.append(sample)
finally:
    ln.settings.creation.search_names = True

In [34]:
ln.save(samples)

## `tissue_type`

In [35]:
meta_df["tissue_type"] = "tissue"

## `expression types`

In [36]:
expression_type = ln.ULabel(name="expression_type").save()
raw_ul = ln.ULabel(name="raw").save()
expr_ul = ln.ULabel(name="expr").save()
secondary_ul = ln.ULabel(name="secondary").save()
scvelo_ul = ln.ULabel(name="scvelo").save()
expression_type.children.set([raw_ul, expr_ul, secondary_ul, scvelo_ul])

[92m→[0m returning existing ULabel record with same name: 'expression_type'
[92m→[0m returning existing ULabel record with same name: 'raw'
[92m→[0m returning existing ULabel record with same name: 'expr'
[92m→[0m returning existing ULabel record with same name: 'secondary'


## Save the curated metadata table

In [37]:
artifact = ln.Artifact.from_df(
    meta_df, key="2024-12-20_15-35-09/meta_scrna_curated.parquet"
).save()
artifact

[92m→[0m creating new artifact version for key='2024-12-20_15-35-09/meta_scrna_curated.parquet' (storage: 's3://lamin-us-west-2/sznqFqn7xUoI')
... uploading Z8sZr9vMHjY5WjqL0002.parquet: 100.0%
[93m![0m The cache path /home/lukas/.cache/lamindb/lamin-us-west-2/sznqFqn7xUoI/2024-12-20_15-35-09/meta_scrna_curated.parquet already exists, replacing it.


Artifact(uid='Z8sZr9vMHjY5WjqL0002', is_latest=True, key='2024-12-20_15-35-09/meta_scrna_curated.parquet', suffix='.parquet', kind='dataset', otype='DataFrame', size=87365, hash='bgNFzbe7m_Xci37nncSV4g', space_id=1, storage_id=1, run_id=13, created_by_id=3, created_at=2025-01-31 13:31:55 UTC)

In [38]:
ln.finish()

[92m→[0m finished Run('kn6hazMA') after 1m at 2025-01-31 13:31:58 UTC
[92m→[0m go to: https://lamin.ai/laminlabs/hubmap/transform/5znJrS1UjwCi0003
[92m→[0m to update your notebook from the CLI, run: lamin save /home/lukas/code/hubmap_registration/register-metadata.ipynb
