# Clean Clinical Data

Clean the clinical data spreadsheet by:
- Standardizing ontologies
- Fixing spelling, categorical data, etc.
- Adjusting Features and Labels
- Adding the result to `LaminDB` as a parquet file

## Setup

In [None]:
import re

import bionty as bt
import lamindb as ln
import more_itertools as mit
import natsort as ns
import pandas as pd
from upath import UPath

In [None]:
# Git repository assigned to LaminDB's `sync_git_repo` setting for this notebook.
ln.settings.sync_git_repo = "https://github.com/karadavis-lab/nbl.git"

In [None]:
# Start LaminDB tracking for this notebook under the "Neuroblastoma" project.
ln.track(project="Neuroblastoma")

In [None]:
# Organism used by the bionty lookups below.
bt.settings.organism = "human"

## Validate Clinical Data

In [None]:
# Root of the raw inputs, given relative to the working directory.
raw_data_path = UPath("../../data/raw/")
# Spreadsheet with the clinical annotations.
clinical_data_path = raw_data_path.joinpath("Clinical Data", "FOVs_UIDv2.xlsx")

# Image and segmentation-label directories of the cohort.
fov_dir = raw_data_path.joinpath("nbl_cohort", "images")
label_dir = raw_data_path.joinpath("nbl_cohort", "segmentation", "labels")

In [None]:
# Load the clinical spreadsheet into a DataFrame (pandas reads the first sheet by default).
clinical_data: pd.DataFrame = pd.read_excel(clinical_data_path)

Standardize Field of View Names / IDs

In [None]:
# Naturally sorted entries of the image directory; the "[!.]*/" glob skips names starting with ".".
all_fovs = ns.natsorted(fov_dir.glob("[!.]*/"))
all_fov_names = [fov.name for fov in all_fovs]

# NOTE(review): read as a regular expression, "Hu-*" means "Hu" followed by zero or more
# hyphens, and `search` matches anywhere in the name -- confirm this is the intended
# control filter (it looks like a glob).
control_pattern = re.compile(r"Hu-*")
fov_names = [name for name in all_fov_names if not control_pattern.search(name)]

In [None]:
def convert_fov(row: pd.Series, fovs: list[str]):
    """Adjusts the name of the FOV.

    Parameters
    ----------
    row : pd.DataFrame
        The row of the clinical data.

    Returns
    -------
    str
        The full name of the FOV.
    """
    for fov in fovs:
        if row["fov"] == fov.split("-")[2]:
            return fov
    return None


# Replace each short FOV id with its full name; rows without a matching name get None.
clinical_data["fov"] = clinical_data.apply(convert_fov, fovs=fov_names, axis=1)

Removing Miscellaneous Whitespace from Column Names / Values

In [None]:
# Strip leading/trailing whitespace from every column name.
clinical_data.columns = clinical_data.columns.str.strip()

Paired Sequence Cleaning

In [None]:
# Convert the column to booleans: exactly "No" becomes False, everything else True.
# NOTE(review): missing values and variants such as "No " or "no" therefore become True.
clinical_data["Paired sequence"] = clinical_data["Paired sequence"].map(lambda answer: not (answer == "No"))

In [None]:
# Public Tissue and Ethnicity ontologies from bionty, plus their lookup objects, which the
# replacement tables below use to obtain standardized term names via attribute access.
tissues = bt.Tissue.public()
ethnicitys = bt.Ethnicity.public()
tissues_lookup = tissues.lookup()
ethnicity_lookup = ethnicitys.lookup()

In [None]:
# Map the free-text spreadsheet values onto a small controlled vocabulary, column by column.
# Keys are the raw cell values (including their stray whitespace and typos), so only cells
# that equal a key exactly are rewritten.
clinical_data = clinical_data.replace(
    to_replace={
        "Classification of specimen": {
            "Diagnosis": "Diagnosis",
            "post-chemotherapy, local control surgery ": "Post-Chemotherapy",
            "Diagnosis ": "Diagnosis",
            "post-chemotherpy, local control surgery (mild paraspinal disease progression requiring laminectomy)": "Post-Chemotherapy",
            "post-chemotherapy (local control surgery, 4 cycles of ANBL0531) ": "Post-Chemotherapy",
            "relapse (after 2 cycles of topo/cyclo)": "Relapse",
            "Progressive disease (re-resection, s/p chemotherapy) ": "Disease Progression",
            "post-chemotherapy, local control surgery (s/p 4 cycles of induction chemo per ANBL0531) ": "Post-Chemotherapy",
            "post-chemotherapy (5 cycles ANBL0532) ": "Post-Chemotherapy",
            "Relapsed": "Relapse",
            "CCHS, post-chemo therapy, local control surgery (7 cycles of ANBL0531, stable disease after 6 cycles and then 1 cycle of topo/cyclo) ": "Post-Chemotherapy",
            "relapse, brain metastases": "Relapse",
            "post-chemotherapy, local control surgery (2nd)": "Post-Chemotherapy",
            "post-chemotherapy, local control surgery (s/p 4 cycles of induction per ANBL0531) ": "Post-Chemotherapy",
            "post-chemotherapy, local control surgery (8 cyles of ANBL0531 therapy with minimal response)  ": "Post-Chemotherapy",
            "Diagnosis (after a period of observation) ": "Diagnosis",
            "disease progression after upfront surgery (posterior mediastinum)": "Disease Progression",
            "post-chemotherapy, local control surgery": "Post-Chemotherapy",
        },
        # Normalize whitespace and casing of every observed value. Missing values are skipped:
        # NaN is a float and has no `.strip()`, so including it would raise AttributeError.
        "Sex": {s: s.strip().lower().capitalize() for s in clinical_data["Sex"].dropna().unique()},
        "Race": {
            "Black": ethnicity_lookup.african.name,
            "White": ethnicity_lookup.european.name,
            "white": ethnicity_lookup.european.name,
            "Other": ethnicity_lookup.undefined_ancestry_population.name,
            "Arabic ": ethnicity_lookup.arab.name,
            "Asian ": ethnicity_lookup.asian.name,
            "other (egyptian)": ethnicity_lookup.egyptian.name,
            "?black ": ethnicity_lookup.african.name,
            "white ": ethnicity_lookup.european.name,
        },
        "Biopsy/surgery location": {
            "abdominal mass": tissues_lookup.abdominal_segment_element.name,
            "letfy adrenal mass": tissues_lookup.left_adrenal_gland.name,
            "Right adrenal ": tissues_lookup.right_adrenal_gland.name,
            "Abdominal mass": tissues_lookup.abdominal_segment_element.name,
            "Spinal/paraspinal ": tissues_lookup.paraspinal_region.name,
            "RP mass ": tissues_lookup.retroperitoneal_space.name,
            "abdominal mass/thoracic region mass excision": tissues_lookup.thoracic_cavity_element.name,
            "abdominal mass/diagphramtic mass": tissues_lookup.diaphragm.name,
            "left adrenal tumor": tissues_lookup.left_adrenal_gland.name,
            "pelvic mass": tissues_lookup.pelvic_region_element.name,
            "abdominal tumor resection ": tissues_lookup.abdominal_segment_element.name,
            "Retroperitoneal": tissues_lookup.retroperitoneal_space.name,
            "Abdominal/Retroperitoneal": tissues_lookup.retroperitoneal_space.name,
            "Pelvic mass, s/p 2 cycles of ANBL0531, limited response to chemo with tumor growth": tissues_lookup.pelvic_region_element.name,
            "Paraspinal ": tissues_lookup.paraspinal_region.name,
            "paraspinal ": tissues_lookup.paraspinal_region.name,
            "Right Adrenal": tissues_lookup.right_adrenal_gland.name,
            "Liver": tissues_lookup.liver.name,
            "abdominal tumor": tissues_lookup.abdominal_segment_element.name,
            "Abdominal tumor, lymph nodes": tissues_lookup.abdominal_lymph_node.name,
            "Brain mets, relapse during maintenance GD2 antibody": tissues_lookup.brain.name,
            "paraspinal mass": tissues_lookup.paraspinal_region.name,
            "abdominal tumor resection": tissues_lookup.abdominal_segment_element.name,
            "right adrenal mass": tissues_lookup.right_adrenal_gland.name,
            "right adrenal gland resection ": tissues_lookup.right_adrenal_gland.name,
            "retroperitoneal mass": tissues_lookup.retroperitoneal_space.name,
            "neck mass": tissues_lookup.neck.name,
            "abdominal/paraspinal mass resection": tissues_lookup.paraspinal_region.name,
            "right chect, posterior mediastinal ": tissues_lookup.posterior_mediastinum.name,
            "retroperitoneal": tissues_lookup.retroperitoneal_space.name,
            "abdominal tumor resection after 4 cycles of ANBL0531 ": tissues_lookup.abdominal_segment_element.name,
            "right adrenal gland": tissues_lookup.right_adrenal_gland.name,
            "right apical chest mass resection": tissues_lookup.chest.name,
            "abdominal mass/liver nodule": tissues_lookup.liver.name,
            "pelvic tumor": tissues_lookup.pelvic_region_element.name,
            "right neck mass": tissues_lookup.neck.name,
            "Abd mass": tissues_lookup.abdominal_segment_element.name,
            "abdominal mass biopsy": tissues_lookup.abdominal_segment_element.name,
            "right axilla": tissues_lookup.axilla.name,
            "thoracic tumor": tissues_lookup.thoracic_cavity_element.name,
            "left chest mass": tissues_lookup.chest.name,
            "b/l adrenal masses": tissues_lookup.adrenal_tissue.name,
            "adrenalectomy": tissues_lookup.adrenal_tissue.name,
            "lefty adrenal mass": tissues_lookup.left_adrenal_gland.name,
        },
        "Risk": {
            "Intermediate": "Intermediate",
            "High": "High",
            "Inrtermediate, mild disease progression": "Intermediate",
            "intermediate": "Intermediate",
            "High (relapsed)": "High",
            "Intermediate ": "Intermediate",
            "Low": "Low",
            "High ": "High",
            "Low (would be IR now?)": "Low",
            "High (due to nodular ganglioneuroblastoma)": "High",
            "Intermediate  ": "Intermediate",
        },
    },
)

In [None]:
# Columns standardized by the replacement table above; store them as pandas categoricals.
cols_to_cat_one = ["Classification of specimen", "Risk", "Biopsy/surgery location", "Sex", "Race"]
clinical_data[cols_to_cat_one] = clinical_data[cols_to_cat_one].astype("category")

In [None]:
# Name of the combined urine VMA/HVA column, reused by the following cells.
u_vma_hva = "Urine VMA/HVA (g/g Cr)"

In [None]:
# Normalize the VMA/HVA text column: "n/a " becomes missing, and ">227/>227" is rewritten
# to "227/227" (the ">" bound is dropped).
clinical_data = clinical_data.replace(to_replace={u_vma_hva: {"n/a ": pd.NA, ">227/>227": "227/227"}})
# Fill missing values in this column with pd.NA.
clinical_data = clinical_data.fillna({u_vma_hva: pd.NA})

In [None]:
# Split each "VMA/HVA" string on "/" into two columns and convert them to numbers;
# pieces that cannot be parsed become NaN (errors="coerce").
# NOTE(review): `vma_hva_df` is not referenced again in the visible notebook and the source
# column is dropped in the next cell -- confirm whether these values should be joined back.
vma_hva_df = (
    clinical_data[u_vma_hva]
    .str.split("/", expand=True)
    .rename(columns={0: "VMA (g Cr)", 1: "HVA (g Cr)"})
    .apply(pd.to_numeric, errors="coerce")
)

In [None]:
# Remove the combined VMA/HVA text column from the clinical table.
clinical_data = clinical_data.drop(columns=[u_vma_hva])

In [None]:
# Column name used by the fill step in the next cell.
hva_vma_days_from_biopsy = "HVA/VMA days from biopsy"

In [None]:
# Fill missing values in the "HVA/VMA days from biopsy" column with pd.NA.
clinical_data = clinical_data.fillna({hva_vma_days_from_biopsy: pd.NA})

In [None]:
# Trim surrounding whitespace and normalize casing (first character upper, rest lower)
# so the replacement table in the next cell only has to list one spelling per value.
for column in ("17q gain", "11q loss/LOH", "7q gain", "1p loss/LOH", "ALK"):
    trimmed = clinical_data[column].str.strip()
    clinical_data[column] = trimmed.str.capitalize()

In [None]:
# Rewrite the free-text genomic annotations as "|"-separated tokens (e.g. "Yes|WC|relative|4N").
# The first five columns were stripped and `.str.capitalize()`d in the previous cell, so their
# keys must be written in that form (first character upper-case, the rest lower-case) to match.
clinical_data = clinical_data.replace(
    to_replace={
        "17q gain": {
            "Yes  (wc, relative, 4n)": "Yes|WC|relative|4N",
            "Yes  (relative, 5n)": "Yes|relative|5N",
            "Yes , (relative, (5n)": "Yes|relative|5N",
            "Yes, relative, 4n)": "Yes|relative|4N",
            "Yes (wc, relative 5n)": "Yes|WC|relative|5N",
            "Yes (wc, relative 4n)": "Yes|WC|relative|4N",
            "Yes (relative wc, 4-5n)": "Yes|WC|relative|4N|5N",
            "Yes (relative, wc, 6n)": "Yes|WC|relative|6N",
            "Yes, 4n (relative)": "Yes|relative|4N",
            "Yes, wc, relative, 4n)": "Yes|WC|relative|4N",
            "Yes (6n)": "Yes|6N",
            "Yes (wc, relative, 4n)": "Yes|WC|relative|4N",
            "Yes (wc, relatve, 4n)": "Yes|WC|relative|4N",
            "Yes (relative, 4n)": "Yes|relative|4N",
            "Yes (wc relative gain)": "Yes|WC|relative|gain",
            "Yes (wc, 4n)": "Yes|WC|4N",
            "Yes 9wc, relative, 4n)": "Yes|WC|relative|4N",
            "Yes (wc, releative, 4n)": "Yes|WC|relative|4N",
        },
        "7q gain": {
            "No ": "No",
            "Yes ": "Yes",
            "Yes (wc)": "Yes|WC",
            # Stray ")" removed from the replacement token.
            "yes (wc, relative, 4n)": "Yes|WC|relative|4N",
            "Yes": "Yes",
            "Yes (relative, 6n)": "Yes|relative|6N",
            "No": "No",
            "Yes (wc, relagtive 5n)": "Yes|WC|relative|5N",
            # Trailing "|" removed; it produced an empty token when the value is split on "|".
            "Yes (relative, 4-5n)": "Yes|relative|4N|5N",
            "Yes (wc, relative, 4n)": "Yes|WC|relative|4N",
            "Yes, wc, relative, 4n)": "Yes|WC|relative|4N",
            "Yes (wc, 4n)": "Yes|WC|4N",
            "Yes (relative, 4n)": "Yes|relative|4N",
            "Yes 9wc, relative, 4Nn": "Yes|WC|relative|4N",
            # The same raw value as it looks after `.str.capitalize()` lower-cased the "N";
            # without this key the capitalized cell is never matched.
            "Yes 9wc, relative, 4nn": "Yes|WC|relative|4N",
            "Yes (wc, releative, 4n)": "Yes|WC|relative|4N",
            "Yes (wc, relative, 4n) ": "Yes|WC|relative|4N",
            "Yes (wc relative gain)": "Yes|WC|relative|gain",
            "Yes 9wc, relative, 4n)": "Yes|WC|relative|4N",
        },
        "1p loss/LOH": {
            "No": "No",
            "Yes (relative, 2n)": "Yes|relative|2N",
            "Yes (relative, 2n, cnloh)": "Yes|relative|2N|cnLOH",
            "Yes": "Yes",
            "Yes (wc)": "Yes|WC",
            "No?": "No",
        },
        "11q loss/LOH": {
            "No": "No",
            "Yes": "Yes",
            # Tokenized the same way as "Yes (cn neutral loh)" below.
            "Yes, (cn neutral loh)": "Yes|neutral|cnLOH",
            "Yes (relative, 2n, cnloh)": "Yes|relative|2N|cnLOH",
            "Yes (deletion)": "Yes|deletion",
            "Yes (relative, 2n, wc, cn loh)": "Yes|relative|WC|2N|cnLOH",
            "Yes (wc)": "Yes|WC",
            "Yes (relative, wc, 2n)": "Yes|relative|WC|2N",
            "Yes (wc, relative loss)": "Yes|relative|WC",
            "Yes (cn neutral loh)": "Yes|neutral|cnLOH",
        },
        "ALK": {
            "Wt": "WT",
            "F1245l (somatic)": "F1245L|somatic",
            "Wt (alk gain)": "WT|ALK gain",
            "Wt (phox2b wt)": "WT|Phox2B WT",
            "Wt/phox2b with a heterozygous polyalanine expansion (20/33).": "WT|Phox2B with a heterozygous polyalanine expansion (20/33)",
            "N/a": pd.NA,
            "Arg1275gln": "Arg1275Gln",
            "Wt (diagnosis and this specimen)": "WT",
            "wt": "WT",
            "WT (PHOX2b WT)": "WT|Phox2B WT",
            "Wt / phox2b wt": "WT|Phox2B WT",
            "F1174l": "F1174L",
        },
        "Other mutations (source)": {
            "none (FoundationOne)": pd.NA,
            "NUDT15 (NM_018283.2), c.415C>T (p.Arg139Cys)": "NUDT15 (NM_018283.2)|c.415C>T (p.Arg139Cys)",
            "BRAF Gly469Ala (CHOP NGS)": "BRAF Gly469Ala|CHOP NGS",
        },
        "Genomic studies done": {
            "SNP array, ALK seq, MYCN FISH": "SNP array|ALK seq|MYCN FISH",
            "SNP, ALK seq": "SNP array|ALK seq",
            "SNP array, ALK seq": "SNP array|ALK seq",
            "SNP, ALK/PHOX2B sequencing ": "SNP array|ALK/PHOX2B sequencing",
            "CHOP NGS, SNP array": "CHOP NGS|SNP array",
            "CHOP NGS": "CHOP NGS",
            "SNP array": "SNP array",
            "SNP array, ALK seq (tumor and germline)": "SNP array|ALK seq (tumor and germline)",
            "SNP array, ALK seq ": "SNP array|ALK seq",
            "SNP, ALK seq, Foundation one (no mutations) ": "SNP array|ALK seq|Foundation one (no mutations)",
            "SNP array (post chemo), ALK seq": "SNP array (post chemo)|ALK seq",
            "SNP array, CHOP NGS": "SNP array|CHOP NGS",
            "b/l SNP and CHOP NGS": "b/l SNP array|CHOP NGS",
        },
    },
)

In [None]:
# Genomic annotation columns rewritten by the replacement table above.
cols_set_one = [
    "17q gain",
    "11q loss/LOH",
    "7q gain",
    "1p loss/LOH",
    "ALK",
    "Other mutations (source)",
    "Genomic studies done",
]

# Store them as pandas categoricals.
clinical_data[cols_set_one] = clinical_data[cols_set_one].astype("category")

In [None]:
# Trim surrounding whitespace and normalize casing (first character upper, rest lower)
# ahead of the replacement table further down.
# NOTE(review): the `.str` accessor yields NaN for cells that are not strings, so a purely
# numeric cell (e.g. a stage entered as a number) would be blanked here -- confirm the
# spreadsheet stores these columns as text.
for column in (
    "INSS stage",
    "INRG stage",
    "Ploidy value",
    "MKI",
    "Degree of differentiation",
    "Histolgic classification - INPC",
):
    trimmed = clinical_data[column].str.strip()
    clinical_data[column] = trimmed.str.capitalize()

In [None]:
# Delete embedded newlines and non-breaking spaces (U+00A0) from the cell text.
clinical_data["Degree of differentiation"] = clinical_data["Degree of differentiation"].replace(
    "\n|\xa0", "", regex=True
)

In [None]:
# Correct the misspelled column header ("Histolgic" -> "Histologic").
clinical_data = clinical_data.rename(columns={"Histolgic classification - INPC": "Histologic classification - INPC"})

In [None]:
from typing import Any

import rapidfuzz as rfuzz

# Canonical "Degree of differentiation" labels that `map_rapidfuzz` matches free text against.
choices = ["Poorly Differentiated", "Differentiating", "Undifferentiated"]


def map_rapidfuzz(text: Any):
    """Map a text to a choice using rapidfuzz.

    Parameters
    ----------
    text
        Text to map.

    Returns
    -------
        The mapped choice.
    """
    if not isinstance(text, str):
        return None

    result = rfuzz.process.extractOne(text, choices, scorer=rfuzz.fuzz.partial_ratio)

    # result is a tuple: (choice, score, index) or None if choices are empty
    c, s, i = result
    if result and s > 80:  # Apply a confidence threshold (e.g., 80)
        return c  # Return the best matching choice
    else:
        print(f"Warning: Low rapidfuzz match score for: {text} -> {result}")
        return None

In [None]:
# Preview of the fuzzy mapping; the result is displayed, not assigned back to the table.
clinical_data["Degree of differentiation"].apply(map_rapidfuzz)

In [None]:
# Collapse the staging / histology free text onto controlled values. All columns except
# "Genomics source" were stripped and `.str.capitalize()`d earlier, so their keys are written
# in that form. Ploidy values use "|"-separated tokens (e.g. "Hyperdiploid|3N|4N").
clinical_data = clinical_data.replace(
    to_replace={
        "INSS stage": {"4s": "4S", "2a": "2A", "2b": "2B", "2b??": "2B"},
        "INRG stage": {
            "M": "M",
            "L2": "L2",
            "Ms": "MS",
            "M (from diagnosis)": "M",
            "L1": "L1",
        },
        "Ploidy value": {
            "Hyperdiplod (3-4n)": "Hyperdiploid|3N|4N",
            "Diploid": "Diploid",
            "Diploid (diagnosis)": "Diploid",
            # Keeps the 3N|4N tokens, like the other "(3-4n)" spellings in this table.
            "Hyperdiploid (3-4n)": "Hyperdiploid|3N|4N",
            "Hyeperdiplod (3n)": "Hyperdiploid|3N",
            "Hyperdiploid (3n)": "Hyperdiploid|3N",
            "Hyperdiploid (3-4n) with scas": "Hyperdiploid|3N|4N",
            "Hyperdiploid (3n of 10 chromosomes)": "Hyperdiploid|3N",
            "Hyperdipoid (3n)": "Hyperdiploid|3N",
            "Hyperdiploid (3n) w/ scas": "Hyperdiploid|3N",
            "Diploid (hyperdiploid at diagnosis?)": "Diploid",
            "Hyperdip)loid (3n)": "Hyperdiploid|3N",
            "Hyperidiploid/with scas": "Hyperdiploid",
            "Hyperploid (3n)": "Hyperdiploid|3N",
            "Hyperdiploid": "Hyperdiploid",
            "Hyperdiploid (near 4n)": "Hyperdiploid|4N",
            "Hyeperdiplid (3n)": "Hyperdiploid|3N",
        },
        "MKI": {
            "Intermediate": "Intermediate",
            "Low": "Low",
            "High": "High",
            "High (diagnostic)": "High",
            "Low (<1%, diagnostic)": "Low",
            "Intermediate (diagnosis)": "Intermediate",
            "Low/intermediate": "Intermediate",
            "Diagnosis = low": "Low",
            "High (from diagnosis)": "High",
            "Low (diagnosis)": "Low",
            "Low (diagnostic)": "Low",
            "High (and one clone with low)": "High",
        },
        "Histologic classification - INPC": {
            "Favorable histology": "Favorable",
            "Favorbale histology, diagnosis = favorable histology": "Favorable",
            "Unfavorable histology": "Unfavorable",
            "Unfavorable histology (diagnosis)": "Unfavorable",
            "Favorable (diagnosis)": "Favorable",
            "Diagnosis = unfavorable histology": "Unfavorable",
            "Unfavorable histology (from diagnosis)": "Unfavorable",
            "Favorable histology (diagnosis)": "Favorable",
            "Favorable histology (diagnostic)": "Favorable",
            "N/a": pd.NA,
            "Unfavorable histology (diagnostic tumor)": "Unfavorable",
            "Unfavorable histology = diagnosis": "Unfavorable",
            "Unfavorable histology (nodular gangloneuroblastoma with a poorly differentiated neuroblastic component)": "Unfavorable",
            "Diagnosis = favorable histology": "Favorable",
            "Favorable (diagnostic)": "Favorable",
            "Unfavorable": "Unfavorable",
            "Unfavorable hiostology (diagnostic)": "Unfavorable",
            "Favorable  (diagnosis)": "Favorable",
            "Unfavorbale histology": "Unfavorable",
        },
        "Genomics source": {
            "This specimen ": "This specimen",
            "Diagnostic specimen": "Diagnostic specimen",
            "none": pd.NA,
            "this specimen": "This specimen",
            "Diagnostic tumor ": "Diagnostic specimen",
            "This specimen": "This specimen",
            "This specimen?": "This specimen",
        },
    },
)
# Snap the free-text differentiation grade onto one of the canonical labels.
clinical_data["Degree of differentiation"] = clinical_data["Degree of differentiation"].apply(map_rapidfuzz)
# True unless the text contains "No". Missing cells are left missing (na_action="ignore")
# instead of reaching the substring test, which raises TypeError on a float NaN.
clinical_data["MYCN amplification"] = clinical_data["MYCN amplification"].map(
    lambda x: "No" not in x, na_action="ignore"
)

In [None]:
# Staging / histology columns (plus the UID) to store as pandas categoricals.
cols_set_two = [
    "INSS stage",
    "INRG stage",
    "Ploidy value",
    "MKI",
    "Degree of differentiation",
    "Histologic classification - INPC",
    "Genomics source",
    "UID",
    # "patient_ID",
]

clinical_data[cols_set_two] = clinical_data[cols_set_two].astype("category")

In [None]:
# Rename columns to the feature names used by the LaminDB schema defined further down.
# Keys that are not present in the table are ignored by `rename`.
clinical_data = clinical_data.rename(
    columns={
        "Race": "Ethnicity",
        "Biopsy/surgery location": "Tissue",
        "11q loss/LOH": "11q LOH",
        "1p loss/LOH": "1p LOH",
        "Classification of specimen": "Classification",
        "fov": "FOV",
        "Age (days) at time of diagnosis (relapse)": "Age (days) at Diagnosis",
        "Age (days) at time of biospy": "Age (days) at Biopsy",
        # Target matches the schema feature "HVA/VMA days from biopsy"; the previous target
        # ("HVA/VMA (days) from biopsy") produced a column name the schema does not declare.
        "HVA/VMA days from biospy": "HVA/VMA days from biopsy",
    },
)
clinical_data["FOV"] = clinical_data["FOV"].astype("category")

In [None]:
# Fill every remaining missing value in the table with pd.NA.
clinical_data = clinical_data.fillna(value=pd.NA)

## Validate Clinical Data Schema

In [None]:
# Keyword arguments for every feature of the clinical table, in the order they are created.
# `nullable` is listed only for the features that set it explicitly; the others rely on
# the library default.
_feature_specs = [
    {"name": "FOV", "dtype": ln.ULabel, "description": "Identifier for the imaging Field of View"},
    {"name": "patient_ID", "dtype": int, "description": "Unique identifier for the patient"},
    {"name": "UID", "dtype": ln.ULabel, "description": "Unique identifier for the sample or record"},
    {
        "name": "Age (days) at Diagnosis",
        "dtype": int,
        "description": "Patient's age in days at the time of diagnosis",
    },
    {
        "name": "Classification",
        "dtype": ln.ULabel,
        "description": "Clinical or histological classification of the specimen/tumor",
    },
    {
        "name": "Age (days) at Biopsy",
        "dtype": int,
        "nullable": True,
        "description": "Patient's age in days at the time of biopsy",
    },
    {"name": "Sex", "dtype": ln.ULabel, "nullable": True, "description": "Biological sex of the patient"},
    {
        "name": "Ethnicity",
        "dtype": bt.Ethnicity,
        "nullable": True,
        "description": "Self-reported or assigned ethnicity of the patient",
    },
    {"name": "Tissue", "dtype": bt.Tissue, "description": "Type of tissue biopsied or imaged"},
    {
        "name": "HVA/VMA days from biopsy",
        "dtype": float,
        "nullable": True,
        "description": "Number of days between HVA/VMA measurement and biopsy",
    },
    {
        "name": "Clinical presentation",
        "dtype": str,
        "nullable": True,
        "description": "Description of the patient's symptoms and presentation at diagnosis",
    },
    {
        "name": "Risk",
        "dtype": ln.ULabel,
        "description": "Assigned clinical risk group (e.g., low, intermediate, high)",
    },
    {
        "name": "INSS stage",
        "dtype": ln.ULabel,
        "nullable": True,
        "description": "International Neuroblastoma Staging System (INSS) stage",
    },
    {
        "name": "INRG stage",
        "dtype": ln.ULabel,
        "nullable": True,
        "description": "International Neuroblastoma Risk Group (INRG) staging system stage",
    },
    {
        "name": "Ploidy value",
        "dtype": ln.ULabel,
        "nullable": True,
        "description": "Ploidy status of the tumor cells (e.g., diploid, hyperdiploid)",
    },
    {
        "name": "MKI",
        "dtype": ln.ULabel,
        "nullable": True,
        "description": "Mitotic-Karyorrhectic Index (MKI) value",
    },
    {
        "name": "Degree of differentiation",
        "dtype": ln.ULabel,
        "nullable": True,
        "description": "Histological degree of tumor cell differentiation",
    },
    {
        "name": "Histologic classification - INPC",
        "dtype": ln.ULabel,
        "nullable": True,
        "description": "International Neuroblastoma Pathology Classification (INPC) category",
    },
    {
        "name": "Genomics source",
        "dtype": ln.ULabel,
        "nullable": True,
        "description": "Source material used for genomic analysis (e.g., tumor, blood)",
    },
    {
        "name": "MYCN amplification",
        "dtype": bool,
        "nullable": True,
        "description": "Presence (True) or absence (False) of MYCN gene amplification",
    },
    {
        "name": "17q gain",
        "dtype": ln.ULabel,
        "nullable": True,
        "description": "Presence or status of chromosome 17q gain",
    },
    {
        "name": "7q gain",
        "dtype": ln.ULabel,
        "nullable": True,
        "description": "Presence or status of chromosome 7q gain",
    },
    {
        "name": "1p LOH",
        "dtype": ln.ULabel,
        "nullable": True,
        "description": "Presence or status of Loss of Heterozygosity (LOH) on chromosome 1p",
    },
    {
        "name": "11q LOH",
        "dtype": ln.ULabel,
        "nullable": True,
        "description": "Presence or status of Loss of Heterozygosity (LOH) on chromosome 11q",
    },
    {
        "name": "ALK",
        "dtype": ln.ULabel,
        "nullable": True,
        "description": "Status of ALK (Anaplastic Lymphoma Kinase) gene alteration (e.g., mutation, amplification)",
    },
    {
        "name": "Other mutations (source)",
        "dtype": ln.ULabel,
        "nullable": True,
        "description": "Details of other relevant mutations identified and their source",
    },
    {
        "name": "Genomic studies done",
        "dtype": ln.ULabel,
        "nullable": True,
        "description": "Description of the types of genomic studies performed",
    },
    {
        "name": "treatment btw biopsies",
        "dtype": str,
        "nullable": True,
        "description": "Details of any treatment received between biopsies",
    },
    {
        "name": "OS time (days)",
        "dtype": int,
        "nullable": True,
        "description": "Overall Survival (OS) time in days from diagnosis",
    },
]

# Each feature is created and saved in list order, then the schema itself is saved.
schema = ln.Schema(
    name="Neuroblastoma Clinical Schema",
    features=[ln.Feature(**spec).save() for spec in _feature_specs],
).save()

In [None]:
# Display the single stored artifact whose key contains "clinical_data".
# NOTE(review): `.one()` presumably fails unless exactly one such artifact exists -- confirm
# this cell is only meant to be run after the artifact has been saved.
ln.Artifact.filter(key__contains="clinical_data").one()

In [None]:
# NOTE(review): destructive reset cell. It deletes the stored schema and then every Feature
# and ULabel record returned by the lookups. Run top-to-bottom, this removes the schema that
# the curator cell right below relies on -- confirm this cell is meant for manual use only.
ln.Schema.filter().one().delete()

# The lookups can yield entries that are not records; only actual records are deleted.
for feature in ln.Feature.lookup():
    if isinstance(feature, ln.Feature):
        feature.delete()
for label in ln.ULabel.lookup():
    if isinstance(label, ln.ULabel):
        label.delete()

In [None]:
# Build a curator for the cleaned table and run a first validation against the schema.
curator = ln.curators.DataFrameCurator(clinical_data, schema)

curator.validate()

In [None]:
# For every entry the curator reports as not validated, ask it to add the new values
# (presumably registering the unseen category values for that feature -- verify against
# the lamindb curator docs).
for c in curator.cat.non_validated:
    curator.cat.add_new_from(c)

In [None]:
# Validate again now that the new values have been added.
curator.validate()

In [None]:
# Save the curated table as an artifact under the key "clinical_data.parquet".
curator.save_artifact(key="clinical_data.parquet", description="Sample Level Clinical Data")

In [None]:
# Columns whose categories are "|"-separated token lists.
cols_to_explode = [
    "Ploidy value",
    "17q gain",
    "7q gain",
    "1p LOH",
    "11q LOH",
    "ALK",
    "Other mutations (source)",
    "Genomic studies done",
]

# Print, per column, the set of distinct non-empty tokens found across its categories.
for col in cols_to_explode:
    split_categories = clinical_data[col].cat.categories.str.split("|")
    print((col, {token for token in mit.collapse(split_categories) if len(token) > 0}))

In [None]:
# Create Tissue records from the distinct values of the "Tissue" column and save them.
# Note: this rebinds `tissues`, which earlier held the public Tissue ontology.
tissues = bt.Tissue.from_values(clinical_data["Tissue"].unique().tolist())
ln.save(tissues)

In [None]:
# Create Ethnicity records from the "Ethnicity" column and save them.
# NOTE(review): the whole column is passed here, whereas the Tissue cell passes
# `.unique().tolist()` -- confirm the difference is intentional.
ethnicities = bt.Ethnicity.from_values(clinical_data["Ethnicity"])
ln.save(ethnicities)

Channel Validation

In [None]:
# Distinct marker names: the file stems of every "*.tiff" found one level below the FOV directory.
sample_fov_markers = {tiff_path.stem for tiff_path in fov_dir.glob("*/*.tiff")}

In [None]:
# Public CellMarker ontology from bionty.
cell_markers = bt.CellMarker.public()

# Check the marker names found on disk against the ontology's `name` field.
inspected_markers = cell_markers.inspect(values=sample_fov_markers, field=cell_markers.name)

In [None]:
# Ask the ontology for a mapper (return_mapper=True) from the given marker names to their
# standardized names; the next cell treats it as a dict keyed by the original name.
standardized_markers_mapper = cell_markers.standardize(
    values=sample_fov_markers, field=cell_markers.name, return_mapper=True
)

In [None]:
# Marker names with the standardized spelling substituted where the mapper has one;
# names without an entry are kept unchanged.
copied_markers = [standardized_markers_mapper.get(marker, marker) for marker in sample_fov_markers.copy()]

In [None]:
# Re-inspect after standardization; names that still do not validate get a CellMarker
# record created directly from the name.
inspected_markers2 = cell_markers.inspect(values=copied_markers, field=cell_markers.name)
manually_added_markers = [bt.CellMarker(name=n) for n in inspected_markers2.non_validated]

In [None]:
# CellMarker records for the names that validated against the ontology.
# (The variable name is spelled "valdiated"; it is kept because the next cell uses it.)
valdiated_markers = bt.CellMarker.from_values(values=inspected_markers2.validated, field="name")

In [None]:
# Save both the ontology-backed markers and the manually created ones.
ln.save(valdiated_markers)
ln.save(manually_added_markers)

Finishing up


In [None]:
# Close the LaminDB run started by `ln.track` at the top of the notebook.
ln.finish()