# Register bulk metadata

Standardized columns:
- `tissue`
- `ethnicity`
- `sex`
- `diseases` (each value is a list of terms)
- `suspension_type`
- `assay`
- `donor_id`
- `biosample`
- `tissue_type`

In [1]:
import lamindb as ln
import bionty as bt
import wetlab as wl

# Start tracking this notebook so that records created below are linked to its run.
ln.track()

[92m→[0m connected lamindb: laminlabs/hubmap
[92m→[0m there already is a notebook with key 'register-bulk-metadata.ipynb', creating new version '7vMsNv3hPGOg0001'
[92m→[0m created Transform('7vMsNv3hPGOg0001'), started new Run('s8BQzj6s...') at 2025-05-13 14:41:46 UTC
[92m→[0m notebook imports: bionty==1.3.0 lamindb==1.4.0 wetlab==1.2.0


In [2]:
# Fetch the registered metadata artifact by uid and load it into memory.
source_artifact = ln.Artifact.get("j5RQEPmtKuTeo0lO0002")
meta_df = source_artifact.load()
meta_df.head()

Unnamed: 0,uuid,assay,rnaseq_assay_method,title,group_name,consortium,doi,publication_date,status,dataset_type,processing,organ,sample_category,analyte_class,bmi,age,ethnicity,sex,diseases,donor_id,sample_id,ancestor_id,expression_matrices_url
0,2c77b1cdf33dbed3dbfb74e4b578300e,bulk RNA,NEBNext Ultra II RNA Library Prep Kit,RNAseq data from the large intestine of a 67-y...,Stanford TMC,HuBMAP,10.35079/HBM756.GJDX.884,2020-08-22,protected,RNAseq,raw,LI,block,RNA,30.2,67.0,White,Female,"[Hypertension, Coronary Artery Disease, Cardia...",HBM279.WPZP.978,HBM488.XJKW.383,d0cb764459fd1d1c6f04fdce2e982831,https://assets.hubmapconsortium.org/9e7b040f23...
1,f84c8edc36a65f248c2649ebbe52ad35,bulk RNA,NEBNext Ultra II RNA Library Prep Kit,RNAseq data from the large intestine of a 67-y...,Stanford TMC,HuBMAP,10.35079/HBM454.ZWSD.895,2020-08-22,protected,RNAseq,raw,LI,block,RNA,30.2,67.0,White,Female,"[Hypertension, Coronary Artery Disease, Cardia...",HBM279.WPZP.978,HBM875.RVTT.868,3d742332ef8ca26f34f7ee5b9da3381c,https://assets.hubmapconsortium.org/1c141b1558...
2,311837bf483627cc967e40092a251096,bulk RNA,NEBNext Ultra II RNA Library Prep Kit,RNAseq data from the small intestine of a 67-y...,Stanford TMC,HuBMAP,10.35079/HBM656.ZCCL.743,2020-08-22,protected,RNAseq,raw,SI,block,RNA,30.2,67.0,White,Female,"[Hypertension, Coronary Artery Disease, Cardia...",HBM279.WPZP.978,HBM229.NKWB.488,35e16f13caab262f446836f63cf4ad42,https://assets.hubmapconsortium.org/a8a603e704...
3,c67e024e8fcc25166aac2e2e883208df,bulk-RNA,KAPA RNA HyperPrep Kit with RiboErase,RNAseq data from the placenta of a 34-year-old...,TMC - University of California San Diego focus...,HuBMAP,10.35079/HBM669.BFZR.956,2025-02-26,protected,RNAseq,raw,PL,block,RNA,,,,,[normal],HBM683.TPQL.293,HBM265.XLDG.695,0236e72a6ce04abc0008ac2a40cf1b35,https://assets.hubmapconsortium.org/7d7ba54632...
4,73507265c43d750e6ec5e48d1e2e0b92,bulk-RNA,KAPA RNA HyperPrep Kit with RiboErase,RNAseq data from the placenta of a 34-year-old...,TMC - University of California San Diego focus...,HuBMAP,10.35079/HBM765.RPVQ.584,2025-02-26,protected,RNAseq,raw,PL,block,RNA,,,,,[normal],HBM864.XRGJ.897,HBM464.GLHP.566,c97c86db1d6c7f62445c06d70289c87a,https://assets.hubmapconsortium.org/fc80dbe141...


## `Reference`

In [3]:
# Titles that do not validate against existing Reference records.
reference_names = ln.Reference.inspect(meta_df.title, mute=True).non_validated
# One candidate per distinct (title, doi, publication_date) combination.
reference_fields = meta_df[["title", "doi", "publication_date"]].drop_duplicates()
try:
    # Switched off only while the records are constructed; restored in `finally`.
    ln.settings.creation.search_names = False
    references = [
        ln.Reference(name=record.title, doi=record.doi, date=record.publication_date)
        for _, record in reference_fields.iterrows()
        if record.title in reference_names
    ]
finally:
    ln.settings.creation.search_names = True

In [4]:
# Persist the Reference records built in the previous cell.
ln.save(references)

In [5]:
# Number of Reference records attributed to the current run.
ln.Reference.filter(run_id=ln.context.run.id).count()

0

## Tissue

In [6]:
# Distinct organ codes present in the table.
meta_df["organ"].unique()

array(['LI', 'SI', 'PL'], dtype=object)

In [7]:
# Two-letter organ codes -> tissue names written to the `tissue` column.
mapper = dict(
    LY="lymph node",
    TH="thymus",
    LI="large intestine",
    SI="small intestine",
    SP="spleen",
    HT="heart",
    LK="kidney (left)",
    RK="kidney (right)",
    LL="lung (left)",
    RL="lung (right)",
    LV="liver",
    LN="knee (left)",
    RN="knee (right)",
    BL="bladder organ",
    PL="placenta",
)
meta_df["tissue"] = meta_df["organ"].map(mapper)

In [8]:
# Register the side-specific tissue names used by the organ-code mapper, each as
# a child of its base organ (the text before the first space, e.g. "kidney").
try:
    # Switched off only while the records are created; restored in `finally`.
    ln.settings.creation.search_names = False
    for name in [
        "kidney (left)",
        "kidney (right)",
        # "lung (left)" is produced by the organ-code mapper as well, so it is
        # registered together with its right-hand counterpart.
        "lung (left)",
        "lung (right)",
        "knee (left)",
        "knee (right)",
    ]:
        tissue = bt.Tissue.filter(name=name).one_or_none()
        if tissue is None:
            # Look the parent up before saving, so a missing parent raises
            # without leaving a parentless child record behind.
            parent = bt.Tissue.get(name=name.split(" ")[0])
            tissue = bt.Tissue(name=name).save()
            tissue.parents.add(parent)
finally:
    ln.settings.creation.search_names = True

In [9]:
# Tissue values that do not validate against the Tissue registry.
bt.Tissue.inspect(meta_df["tissue"]).non_validated

[]

In [10]:
# Number of Tissue records attributed to the current run.
bt.Tissue.filter(run_id=ln.context.run.id).count()

0

## Ethnicity

In [11]:
# Distinct raw ethnicity labels present in the table.
meta_df["ethnicity"].unique()

array(['White', ''], dtype=object)

In [12]:
# Translate the raw ethnicity labels to the names validated in the next cell.
# `replace` is used instead of `map` so that a label without an entry is kept
# as-is (and can be reported by the inspection step) rather than silently
# becoming NaN; it also makes re-running this cell harmless, because values
# that were already translated are left untouched.
meta_df["ethnicity"] = meta_df.ethnicity.replace(
    {
        "": "na",
        "Black or African American": "African American",
        "White": "European",
        "Unknown": "unknown",
    }
)

In [13]:
# Ethnicity values that do not validate against the Ethnicity registry.
bt.Ethnicity.inspect(meta_df["ethnicity"]).non_validated

[]

## Sex

In [14]:
# Distinct raw sex labels present in the table.
meta_df.sex.unique()

array(['Female', ''], dtype=object)

In [15]:
# Lower-case the labels, then label empty strings as "unknown".
sex_lowercase = meta_df["sex"].str.lower()
meta_df["sex"] = sex_lowercase.replace([""], "unknown")

In [16]:
# Sex values that do not validate against the Phenotype registry.
bt.Phenotype.inspect(meta_df["sex"]).non_validated

[]

## Disease

In [17]:
# Distinct disease terms over all rows (every cell of `diseases` holds a
# list-like of terms). A set comprehension avoids the repeated list
# concatenation of `Series.sum()`, also works when the column is empty, and
# sorting makes the displayed order reproducible between runs.
diseases = sorted({term for terms in meta_df.diseases for term in terms})
diseases

['Cardiac Arrest', 'Coronary Artery Disease', 'Hypertension', 'normal']

In [18]:
# Ask the Disease registry for a mapping from the raw terms to standardized names.
synonyms_mapper = bt.Disease.standardize(diseases, return_mapper=True)

In [19]:
# This term is registered with its SNOMED id:
# https://www.ebi.ac.uk/ols4/ontologies/snomed/classes/http%253A%252F%252Fsnomed.info%252Fid%252F703272007
# Its name is written in lower case to stay consistent with Mondo.
bt.Disease(
    name="heart failure with reduced ejection fraction",
    ontology_id="SNOMED:703272007",
).save()
# The remaining raw terms are mapped to Mondo names by hand.
synonyms_mapper["Hypertension"] = "hypertensive disorder"
synonyms_mapper["Hyperlipidimia"] = "hyperlipidemia"
synonyms_mapper["Autistic disorder"] = "autism"

[92m→[0m returning existing Disease record with same name: 'heart failure with reduced ejection fraction'


In [20]:
def _standardize_terms(terms):
    """Return `terms` with each entry replaced by its mapped name, if it has one."""
    return [synonyms_mapper.get(term, term) for term in terms]


meta_df["diseases"] = meta_df["diseases"].apply(_standardize_terms)

In [21]:
# Disease values that do not validate against the Disease registry.
bt.Disease.inspect(meta_df["diseases"]).non_validated

[]

## `suspension_type`

In [22]:
# Distinct raw assay labels present in the table.
meta_df["assay"].unique()

array(['bulk RNA', 'bulk-RNA'], dtype=object)

In [23]:
# Both spellings of the bulk RNA assay label get the suspension type "tissue".
bulk_assay_labels = ["bulk RNA", "bulk-RNA"]
meta_df["suspension_type"] = meta_df["assay"].map(
    dict.fromkeys(bulk_assay_labels, "tissue")
)

In [24]:
# Suspension-type values that do not validate against the ULabel registry.
ln.ULabel.inspect(meta_df.suspension_type).non_validated

[]

## `assay`

In [25]:
# Distinct (assay, library-prep method) combinations.
meta_df.loc[:, ["assay", "rnaseq_assay_method"]].drop_duplicates()

Unnamed: 0,assay,rnaseq_assay_method
0,bulk RNA,NEBNext Ultra II RNA Library Prep Kit
3,bulk-RNA,KAPA RNA HyperPrep Kit with RiboErase


In [26]:
# Library-prep kit names -> shortened assay names written to the `assay` column.
# Note: this rebinds `mapper`, which earlier held the organ-code mapping.
mapper = {
    "NEBNext Ultra II RNA Library Prep Kit": "NEBNext Ultra II",
    "KAPA RNA HyperPrep Kit with RiboErase": "KAPA RNA HyperPrep + RiboErase",
}

In [27]:
# Overwrite `assay` with the shortened kit names; unmapped methods are kept as-is.
meta_df["assay"] = meta_df["rnaseq_assay_method"].replace(mapper)

In [28]:
# Assay values that do not validate against the ExperimentalFactor registry.
bt.ExperimentalFactor.inspect(meta_df["assay"]).non_validated

[]

In [29]:
# bt.ExperimentalFactor(name="NEBNext Ultra II").save()

In [30]:
# bt.ExperimentalFactor(name="KAPA RNA HyperPrep + RiboErase").save()

## `donor_id`

In [31]:
# Build unsaved Donor records for every donor id that is not registered yet.
ethnicities = bt.Ethnicity.lookup().dict()

# Keep exactly one row per donor. De-duplicating on all four columns would let a
# donor whose rows disagree on age/bmi/ethnicity through more than once, and the
# existence check below only sees the database, so it would not catch the
# resulting same-named duplicates. The first row of each donor wins.
donor_rows = meta_df[["donor_id", "age", "bmi", "ethnicity"]].drop_duplicates(
    subset="donor_id"
)

try:
    # Switched off only while the records are constructed; restored in `finally`.
    ln.settings.creation.search_names = False
    donors = []
    for _, row in donor_rows.iterrows():
        donor = wl.Donor.filter(name=row.donor_id).one_or_none()
        if donor is None:
            # Labels without an Ethnicity record resolve to None.
            ethnicity = ethnicities.get(row.ethnicity)
            # Missing values are stored as empty strings in this table.
            if row.age == "":
                age = None
            else:
                # Ages are written like 67.0; store them as whole years.
                age = int(float(row.age))
            bmi = None if row.bmi == "" else row.bmi
            donor = wl.Donor(
                name=row.donor_id,
                batch="unknown",
                age=age,
                bmi=bmi,
                ethnicity=ethnicity,
            )
            donors.append(donor)
finally:
    ln.settings.creation.search_names = True

In [32]:
# Persist the Donor records built in the previous cell.
ln.save(donors)

## `biosample`

In [33]:
# Build unsaved Biosample records for every sample id that is not registered yet.
# NOTE(review): the column list at the top of the notebook mentions `biosample`,
# while this cell only reads `sample_id` — confirm whether a column is expected.
try:
    # Switched off only while the records are constructed; restored in `finally`.
    ln.settings.creation.search_names = False
    samples = []
    for sample_id in meta_df["sample_id"].drop_duplicates():
        if wl.Biosample.filter(name=sample_id).one_or_none() is None:
            samples.append(wl.Biosample(name=sample_id))
finally:
    ln.settings.creation.search_names = True

In [34]:
# Persist the Biosample records built in the previous cell.
ln.save(samples)

## `tissue_type`

In [35]:
# Every row gets the same constant tissue type.
meta_df["tissue_type"] = "tissue"

## Expression types

In [36]:
# Parent label plus its four child labels; each is saved as it is created.
expression_type = ln.ULabel(name="expression_type").save()
raw_ul, expr_ul, secondary_ul, scvelo_ul = (
    ln.ULabel(name=label_name).save()
    for label_name in ("raw", "expr", "secondary", "scvelo")
)
expression_type.children.set([raw_ul, expr_ul, secondary_ul, scvelo_ul])

[92m→[0m returning existing ULabel record with same name: 'expression_type'
[92m→[0m returning existing ULabel record with same name: 'raw'
[92m→[0m returning existing ULabel record with same name: 'expr'
[92m→[0m returning existing ULabel record with same name: 'secondary'
[92m→[0m returning existing ULabel record with same name: 'scvelo'


## Save the curated metadata table

In [37]:
# Store the curated table under its registry key and display the saved artifact.
curated_key = "hubmap_metadata/meta_bulkrna_curated.parquet"
artifact = ln.Artifact.from_df(meta_df, key=curated_key).save()
artifact

[92m→[0m creating new artifact version for key='hubmap_metadata/meta_bulkrna_curated.parquet' (storage: 's3://lamin-us-west-2/sznqFqn7xUoI')
... uploading ajd285FK90Z0ZBN10001.parquet: 100.0%
[93m![0m replacing the existing cache path /Users/altananamsaraeva/Library/Caches/lamindb/lamin-us-west-2/sznqFqn7xUoI/hubmap_metadata/meta_bulkrna_curated.parquet


Artifact(uid='ajd285FK90Z0ZBN10001', is_latest=True, key='hubmap_metadata/meta_bulkrna_curated.parquet', suffix='.parquet', kind='dataset', otype='DataFrame', size=27716, hash='H3Equ2kOSqMQdo4hQ_MClA', n_observations=80, space_id=1, storage_id=1, run_id=32, created_by_id=5, created_at=2025-05-13 14:43:39 UTC)

In [38]:
# Close the tracked run started at the top of the notebook.
ln.finish()

[94m•[0m please hit CMD + s to save the notebook in your editor . [92m✓[0m
[92m→[0m finished Run('s8BQzj6s') after 1m at 2025-05-13 14:43:45 UTC
[92m→[0m go to: https://lamin.ai/laminlabs/hubmap/transform/7vMsNv3hPGOg0001
[92m→[0m to update your notebook from the CLI, run: lamin save /Users/altananamsaraeva/Desktop/Lamin/hubmap-registration/bulk/register-bulk-metadata.ipynb
