This notebook annotates the combined single-cell and organoid profiles.
The platemap is merged back onto each profile by well so that the sample metadata (treatment, dose, and unit) is retained.


In [1]:
import argparse
import pathlib
import sys

import pandas as pd

cwd = pathlib.Path.cwd()

# Locate the repository root: the closest directory (cwd itself or one of its
# ancestors) that contains a ".git" folder.
root_dir = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / ".git").is_dir()
    ),
    None,
)
if root_dir is None:
    # Fail with a clear message instead of a TypeError on `None / "utils"`.
    raise FileNotFoundError(
        f"Could not find a .git directory in {cwd} or any of its parents."
    )

# Project helpers live in <root>/utils, so that folder must be importable
# before the two imports below.
sys.path.append(str(root_dir / "utils"))
from arg_parsing_utils import parse_args
from notebook_init_utils import bandicoot_check, init_notebook

root_dir, in_notebook = init_notebook()

# `resolve()` does not expand "~"; without `expanduser()` the path would be
# interpreted as "<cwd>/~/mnt/bandicoot" rather than a folder in the home directory.
profile_base_dir = bandicoot_check(
    pathlib.Path("~/mnt/bandicoot").expanduser().resolve(), root_dir
)

In [2]:
# Interactive runs use a fixed example patient; script runs take the patient
# ID from the parsed command-line arguments.
if in_notebook:
    patient = "NF0014_T1"
else:
    args = parse_args()
    patient = args["patient"]

In [3]:
def annotate_profiles(
    profile_df: pd.DataFrame,
    platemap_df: pd.DataFrame,
    patient: str,
    drug_information: "pd.DataFrame | None" = None,
) -> pd.DataFrame:
    """
    Annotate profiles with treatment, dose, and unit information from the platemap.

    The well is derived from the part of ``image_set`` before the first "-",
    the platemap is left-joined on that well, and the drug information table
    is left-joined on the treatment name. The input frame is not modified.

    Parameters
    ----------
    profile_df : pd.DataFrame
        Profile DataFrame containing an ``image_set`` column.
        Could be either single-cell or organoid profiles.
    platemap_df : pd.DataFrame
        Platemap DataFrame containing ``well_position``, ``treatment``,
        ``dose``, and ``unit`` columns.
    patient : str
        Patient ID to annotate the profiles with.
    drug_information : pd.DataFrame, optional
        Drug information table with a ``Treatment`` column; its remaining
        columns are appended to the profile. When omitted, the table is read
        from ``4.processing_image_based_profiles/data/drugs/drug_information.csv``
        under the module-level ``root_dir``.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with ``patient`` as the first column, ``unit``,
        ``dose``, and ``treatment`` inserted starting at position 2, a
        ``Well`` column, and the drug information columns appended.
    """
    if drug_information is None:
        drug_information = pd.read_csv(
            pathlib.Path(
                f"{root_dir}/4.processing_image_based_profiles/data/drugs/drug_information.csv"
            )
        )

    # Work on a copy so the caller's DataFrame is never mutated.
    annotated_df = profile_df.copy()
    annotated_df["Well"] = annotated_df["image_set"].str.split("-").str[0]
    annotated_df.insert(2, "Well", annotated_df.pop("Well"))

    # Left joins keep every profile row even when a well/treatment has no match.
    annotated_df = annotated_df.merge(
        platemap_df[["well_position", "treatment", "dose", "unit"]],
        left_on="Well",
        right_on="well_position",
        how="left",
    ).drop(columns=["well_position"])
    annotated_df = annotated_df.merge(
        drug_information, how="left", left_on="treatment", right_on="Treatment"
    ).drop(columns=["Treatment"])

    # Each column is inserted at position 1, so the final order is
    # unit, dose, treatment (the reverse of the iteration order).
    for col in ["treatment", "dose", "unit"]:
        annotated_df.insert(1, col, annotated_df.pop(col))
    annotated_df.insert(0, "patient", patient)
    return annotated_df

In [4]:
# pathing
# Everything for this patient lives under <profile_base_dir>/data/<patient>.
patient_data_dir = f"{profile_base_dir}/data/{patient}"
profiles_dir = f"{patient_data_dir}/image_based_profiles"
combined_profiles_dir = f"{profiles_dir}/1.combined_profiles"
annotated_profiles_dir = f"{profiles_dir}/2.annotated_profiles"

# input paths: strict resolution raises if a file is missing
sc_merged_path = pathlib.Path(f"{combined_profiles_dir}/sc.parquet").resolve(
    strict=True
)
organoid_merged_path = pathlib.Path(
    f"{combined_profiles_dir}/organoid.parquet"
).resolve(strict=True)
platemap_path = pathlib.Path(f"{patient_data_dir}/platemap/platemap.csv").resolve(
    strict=True
)

# output paths: these files need not exist yet
sc_annotated_output_path = pathlib.Path(
    f"{annotated_profiles_dir}/sc_anno.parquet"
).resolve()
organoid_annotated_output_path = pathlib.Path(
    f"{annotated_profiles_dir}/organoid_anno.parquet"
).resolve()

# both outputs share one parent directory, so creating it once is enough
organoid_annotated_output_path.parent.mkdir(parents=True, exist_ok=True)

In [5]:
# load the combined single-cell and organoid profiles
sc_merged, organoid_merged = (
    pd.read_parquet(parquet_path)
    for parquet_path in (sc_merged_path, organoid_merged_path)
)
# load the platemap and preview its first rows
platemap = pd.read_csv(platemap_path)
platemap.head()

Unnamed: 0,WellRow,WellCol,well_position,treatment,dose,unit
0,C,2,C2,Staurosporine,10,nM
1,D,2,D2,Digoxin,1,uM
2,E,2,E2,Digoxin,1,uM
3,F,2,F2,Onalespib,1,uM
4,G,2,G2,Staurosporine,10,nM


In [6]:
# annotate both profile levels with the same platemap and patient ID
sc_merged = annotate_profiles(
    profile_df=sc_merged, platemap_df=platemap, patient=patient
)
organoid_merged = annotate_profiles(
    profile_df=organoid_merged, platemap_df=platemap, patient=patient
)

In [7]:
def _split_patient_tumor(profile_df: pd.DataFrame) -> pd.DataFrame:
    """
    Split the combined patient/tumor ID into separate columns.

    The ``patient`` column is renamed to ``patient_tumor`` and its value is
    split on the first "_" into new ``patient`` and ``tumor`` columns, which
    are appended at the end of the frame. A new DataFrame is returned.

    Parameters
    ----------
    profile_df : pd.DataFrame
        Annotated profile with a ``patient`` column holding the combined ID,
        or a ``patient_tumor`` column if this step has already been applied.

    Returns
    -------
    pd.DataFrame
        Profile with ``patient_tumor``, ``patient``, and ``tumor`` columns.
    """
    # Rename only once: re-running the cell must not create a second
    # "patient_tumor" column from the already-split "patient" column.
    if "patient_tumor" not in profile_df.columns:
        profile_df = profile_df.rename(columns={"patient": "patient_tumor"})
    # n=1 caps the split at two parts; reindex guarantees both parts exist
    # (missing tumor becomes NaN) when an ID has no underscore.
    id_parts = (
        profile_df["patient_tumor"]
        .str.split("_", n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    return profile_df.assign(patient=id_parts[0], tumor=id_parts[1])


sc_merged = _split_patient_tumor(sc_merged)
organoid_merged = _split_patient_tumor(organoid_merged)

In [8]:
# columns that describe the sample rather than hold a measured feature
metadata_features_list = [
    "patient_tumor",
    "patient",
    "tumor",
    "object_id",
    "unit",
    "dose",
    "Well",
    "treatment",
    "image_set",
    "parent_organoid",
    "single_cell_count",
    "Target",
    "Class",
    "Therapeutic_Categories",
]
# prepend "Metadata_" to metadata features; names absent from a frame are
# simply ignored by rename, so one mapping serves both profile levels
metadata_rename_map = {
    feature: "Metadata_" + feature for feature in metadata_features_list
}
sc_merged = sc_merged.rename(columns=metadata_rename_map)
organoid_merged = organoid_merged.rename(columns=metadata_rename_map)

In [9]:
# preview the first five annotated single-cell rows
sc_merged.head(n=5)

Unnamed: 0,Metadata_patient_tumor,Metadata_object_id,Metadata_unit,Metadata_dose,Metadata_treatment,Metadata_image_set,Metadata_Well,Metadata_parent_organoid,Area.Size.Shape_Nuclei_VOLUME,Area.Size.Shape_Nuclei_CENTER.X,...,Intensity_Cytoplasm_Mito_MIN.INTENSITY.EDGE_y,Intensity_Cytoplasm_Mito_STD.INTENSITY_y,Intensity_Cytoplasm_Mito_STD.INTENSITY.EDGE_y,Intensity_Cytoplasm_Mito_UPPER.QUARTILE.INTENSITY_y,Intensity_Cytoplasm_Mito_VOLUME_y,Metadata_Target,Metadata_Class,Metadata_Therapeutic_Categories,Metadata_patient,Metadata_tumor
0,NF0014_T1,255,uM,1,Fimepinostat,E5-2,E5,19,31267.0,1247.273682,...,,,,,,PI3K and HDAC inhibitor,Small Molecule,Investigational,NF0014,T1
1,NF0014_T1,25,uM,1,Fimepinostat,D5-2,D5,-1,35478.0,1115.733765,...,,,,,,PI3K and HDAC inhibitor,Small Molecule,Investigational,NF0014,T1
2,NF0014_T1,51,uM,1,Fimepinostat,D5-2,D5,-1,9615.0,1050.518799,...,,,,,,PI3K and HDAC inhibitor,Small Molecule,Investigational,NF0014,T1
3,NF0014_T1,63,uM,1,Fimepinostat,D5-2,D5,-1,3784.0,708.924438,...,,,,,,PI3K and HDAC inhibitor,Small Molecule,Investigational,NF0014,T1
4,NF0014_T1,76,uM,1,Fimepinostat,D5-2,D5,-1,2883.0,414.661469,...,,,,,,PI3K and HDAC inhibitor,Small Molecule,Investigational,NF0014,T1


In [10]:
# preview the first five annotated organoid rows
organoid_merged.head(n=5)

Unnamed: 0,Metadata_patient_tumor,Metadata_object_id,Metadata_unit,Metadata_dose,Metadata_treatment,Metadata_image_set,Metadata_Well,Metadata_single_cell_count,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_CENTER.X,...,Texture_Organoid_Mito_Inverse.Difference.Moment_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3,Metadata_Target,Metadata_Class,Metadata_Therapeutic_Categories,Metadata_patient,Metadata_tumor
0,NF0014_T1,19,uM,1,Fimepinostat,E5-2,E5,1.0,3268026.0,938.570068,...,0.888459,8.625683,1.346913,702.361139,188.51203,PI3K and HDAC inhibitor,Small Molecule,Investigational,NF0014,T1
1,NF0014_T1,15,uM,1,Fimepinostat,D5-2,D5,11.0,5570203.0,950.582825,...,0.853993,7.177906,1.556027,297.955168,85.399048,PI3K and HDAC inhibitor,Small Molecule,Investigational,NF0014,T1
2,NF0014_T1,47,uM,1,Ketotifen,G6-1,G6,9.0,6134128.0,795.807983,...,0.875041,13.244656,1.343332,1153.970316,303.579081,histamine H1 receptor antagonist,Small Molecule,Anti-Allergic Agents,NF0014,T1
3,NF0014_T1,40,uM,10,Mirdametinib,G8-1,G8,8.0,6416256.0,769.689636,...,0.943387,5.15355,0.835848,479.284825,121.910734,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,NF0014,T1
4,NF0014_T1,37,uM,10,Mirdametinib,C9-2,C9,13.0,14239851.0,817.411133,...,0.818021,12.778783,2.293907,971.36421,258.637076,MEK1/2 inhibitor,Small Molecule,Kinase Inhibitor,NF0014,T1


In [11]:
# save annotated profiles (row index is not written)
for annotated_df, output_path in (
    (sc_merged, sc_annotated_output_path),
    (organoid_merged, organoid_annotated_output_path),
):
    annotated_df.to_parquet(output_path, index=False)