# Perform organoid-level quality control

In [1]:
import pathlib
import sys

import pandas as pd
from cosmicqc import find_outliers

cwd = pathlib.Path.cwd()

if (cwd / ".git").is_dir():
    root_dir = cwd
else:
    root_dir = None
    for parent in cwd.parents:
        if (parent / ".git").is_dir():
            root_dir = parent
            break
sys.path.append(str(root_dir / "utils"))
from arg_parsing_utils import parse_args
from notebook_init_utils import bandicoot_check, init_notebook

root_dir, in_notebook = init_notebook()

profile_base_dir = bandicoot_check(
    pathlib.Path("~/mnt/bandicoot").resolve(),
    # pathlib.Path("/home/jenna/mnt/bandicoot").resolve(), # for Jenna's machine
    root_dir,
)

## Load in all the organoid profiles and concat together

In [2]:
# Path to patient folders
path_to_patients = pathlib.Path(f"{profile_base_dir}/data/")

# Get all organoid profiles per patient folder and concatenate them
dfs = []
for patient_folder in path_to_patients.iterdir():
    organoid_file = (
        patient_folder / "image_based_profiles/1.combined_profiles" / "organoid.parquet"
    )
    if organoid_file.exists():
        df = pd.read_parquet(organoid_file)
        df["patient_id"] = patient_folder.name
        # Group by image_set and count organoids
        organoid_counts = (
            df.groupby("image_set")["object_id"].count().rename("organoid_count")
        )
        df = df.merge(organoid_counts, on="image_set", how="left")
        dfs.append(df)
orig_organoid_profiles_df = pd.concat(dfs, ignore_index=True)

# Print the shape and head of the combined organoid profiles DataFrame
print(orig_organoid_profiles_df.shape)
orig_organoid_profiles_df.head()

(1914, 456)


Unnamed: 0,object_id,image_set,single_cell_count,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_CENTER.X,Area.Size.Shape_Organoid_CENTER.Y,Area.Size.Shape_Organoid_CENTER.Z,Area.Size.Shape_Organoid_BBOX.VOLUME,Area.Size.Shape_Organoid_MIN.X,Area.Size.Shape_Organoid_MAX.X,...,Texture_Organoid_Mito_Entropy_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Organoid_Mito_Inverse.Difference.Moment_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3,patient_id,organoid_count
0,21,C11-4,,5282315.0,1002.888428,679.062927,7.713356,35986212.0,16.0,1540.0,...,1.564286,-0.244654,0.523945,0.874871,2.938546,1.289419,73.722398,24.623565,NF0040_T1,1
1,10,D11-5,2.0,1889020.0,975.019226,975.120178,6.426039,15213330.0,451.0,1540.0,...,0.989514,-0.224091,0.354557,0.915352,1.661935,0.845816,44.580736,16.679122,NF0040_T1,1
2,47,G2-5,9.0,9465658.0,712.868835,763.379944,13.076691,43674840.0,178.0,1523.0,...,2.293802,-0.278417,0.676309,0.831954,7.074844,1.832367,240.844218,74.031633,NF0040_T1,1
3,42,B4-2,9.0,6224228.0,795.943298,836.297668,11.416876,17215416.0,448.0,1305.0,...,1.359151,-0.505711,0.77373,0.923982,3.88675,1.07432,149.021829,38.515785,NF0040_T1,1
4,32,C6-6,12.0,17594120.0,641.527527,777.975464,13.057177,49613848.0,0.0,1285.0,...,2.902803,-0.434663,0.891047,0.824446,9.783484,2.193819,435.787377,115.327795,NF0040_T1,1


## Perform a first round of QC by flagging any row with NaNs in metadata

We check for NaNs in the `object_id` and/or the `single_cell_count` column and flag them because:
   - An organoid can not exist if there aren't any cells.
   - NaN in object_id would be incorrect as that means the object/organoid does not exist (will have all NaNs in the feature space).

In [3]:
organoid_profiles_df = orig_organoid_profiles_df.copy()
organoid_profiles_df["cqc.nan_detected"] = (
    organoid_profiles_df[["object_id", "single_cell_count"]].isna().any(axis=1)
)

# Print the number of organoids flagged
flagged_count = organoid_profiles_df["cqc.nan_detected"].sum()
print(f"Number of organoids flagged: {flagged_count}")

organoid_profiles_df.head()

Number of organoids flagged: 861


Unnamed: 0,object_id,image_set,single_cell_count,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_CENTER.X,Area.Size.Shape_Organoid_CENTER.Y,Area.Size.Shape_Organoid_CENTER.Z,Area.Size.Shape_Organoid_BBOX.VOLUME,Area.Size.Shape_Organoid_MIN.X,Area.Size.Shape_Organoid_MAX.X,...,Texture_Organoid_Mito_Information.Measure.of.Correlation.1_256.3,Texture_Organoid_Mito_Information.Measure.of.Correlation.2_256.3,Texture_Organoid_Mito_Inverse.Difference.Moment_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3,patient_id,organoid_count,cqc.nan_detected
0,21,C11-4,,5282315.0,1002.888428,679.062927,7.713356,35986212.0,16.0,1540.0,...,-0.244654,0.523945,0.874871,2.938546,1.289419,73.722398,24.623565,NF0040_T1,1,True
1,10,D11-5,2.0,1889020.0,975.019226,975.120178,6.426039,15213330.0,451.0,1540.0,...,-0.224091,0.354557,0.915352,1.661935,0.845816,44.580736,16.679122,NF0040_T1,1,False
2,47,G2-5,9.0,9465658.0,712.868835,763.379944,13.076691,43674840.0,178.0,1523.0,...,-0.278417,0.676309,0.831954,7.074844,1.832367,240.844218,74.031633,NF0040_T1,1,False
3,42,B4-2,9.0,6224228.0,795.943298,836.297668,11.416876,17215416.0,448.0,1305.0,...,-0.505711,0.77373,0.923982,3.88675,1.07432,149.021829,38.515785,NF0040_T1,1,False
4,32,C6-6,12.0,17594120.0,641.527527,777.975464,13.057177,49613848.0,0.0,1285.0,...,-0.434663,0.891047,0.824446,9.783484,2.193819,435.787377,115.327795,NF0040_T1,1,False


## Process non-NaN rows to detect abnormally small and large organoids and flag them

In [4]:
# Set the metadata columns to be used in the QC process
metadata_columns = [
    "patient_id",
    "image_set",
    "object_id",
    "single_cell_count",
    "organoid_count",
    "cqc.nan_detected",
]

In [5]:
# Process each plate (patient_id) independently in the combined dataframe
for plate_name, plate_df in organoid_profiles_df.groupby("patient_id"):
    print(f"Processing plate: {plate_name}")

    # Only process the rows that are not flagged
    filtered_plate_df = plate_df[~plate_df["cqc.nan_detected"]]

    # Find outlier organoids based on the 'Area.Size.Shape_Organoid_VOLUME' column
    print("Finding small organoid outliers...")
    small_size_outliers = find_outliers(
        df=filtered_plate_df,
        metadata_columns=metadata_columns,
        feature_thresholds={
            "Area.Size.Shape_Organoid_VOLUME": -1,  # Detect very small organoids
        },
    )

    # Ensure the column exists before assignment
    plate_df["cqc.small_organoid_outlier"] = False
    plate_df.loc[small_size_outliers.index, "cqc.small_organoid_outlier"] = True

    print("Finding large organoid outliers...")
    large_size_outliers = find_outliers(
        df=filtered_plate_df,
        metadata_columns=metadata_columns,
        feature_thresholds={
            "Area.Size.Shape_Organoid_VOLUME": 3,  # Detect very large organoids
        },
    )

    # Ensure the column exists before assignment
    plate_df["cqc.large_organoid_outlier"] = False
    plate_df.loc[large_size_outliers.index, "cqc.large_organoid_outlier"] = True

    # Update original dataframe so flags persist
    organoid_profiles_df.loc[plate_df.index, :] = plate_df

    # Print number of outliers (only in filtered rows)
    small_count = filtered_plate_df.index.intersection(small_size_outliers.index).shape[
        0
    ]
    large_count = filtered_plate_df.index.intersection(large_size_outliers.index).shape[
        0
    ]
    print(f"Small organoid outliers found: {small_count}")
    print(f"Large organoid outliers found: {large_count}")

    # Save updated plate_df with flag columns included
    output_folder = (
        path_to_patients / plate_name / "image_based_profiles/1a.qc_profiles"
    )
    output_folder.mkdir(parents=True, exist_ok=True)
    output_file = output_folder / "organoid_flagged_outliers.parquet"
    plate_df.to_parquet(output_file, index=False)
    print(f"Saved organoid profiles with outlier flags to {output_file}\n")

Processing plate: NF0014_T1
Finding small organoid outliers...
Number of outliers: 9 (9.47%)
Outliers Range:
Area.Size.Shape_Organoid_VOLUME Min: 1187129.0
Area.Size.Shape_Organoid_VOLUME Max: 2721536.0
Finding large organoid outliers...
Number of outliers: 2 (2.11%)
Outliers Range:
Area.Size.Shape_Organoid_VOLUME Min: 45764776.0
Area.Size.Shape_Organoid_VOLUME Max: 66231340.0
Small organoid outliers found: 9
Large organoid outliers found: 2
Saved organoid profiles with outlier flags to ~/mnt/bandicoot/NF1_organoid_data/data/NF0014_T1/image_based_profiles/1a.qc_profiles/organoid_flagged_outliers.parquet

Processing plate: NF0016_T1
Finding small organoid outliers...
Number of outliers: 2 (3.85%)
Outliers Range:
Area.Size.Shape_Organoid_VOLUME Min: 138472.0
Area.Size.Shape_Organoid_VOLUME Max: 1473650.0
Finding large organoid outliers...
Number of outliers: 0 (0.00%)
Outliers Range:
Area.Size.Shape_Organoid_VOLUME Min: nan
Area.Size.Shape_Organoid_VOLUME Max: nan
Small organoid outliers

In [6]:
# Print example output of the flagged organoid profiles
print(f"Example flagged organoid profiles: {plate_name}")
print(plate_df.shape)
plate_df.head()

Example flagged organoid profiles: SARCO361_T1
(354, 459)


Unnamed: 0,object_id,image_set,single_cell_count,Area.Size.Shape_Organoid_VOLUME,Area.Size.Shape_Organoid_CENTER.X,Area.Size.Shape_Organoid_CENTER.Y,Area.Size.Shape_Organoid_CENTER.Z,Area.Size.Shape_Organoid_BBOX.VOLUME,Area.Size.Shape_Organoid_MIN.X,Area.Size.Shape_Organoid_MAX.X,...,Texture_Organoid_Mito_Inverse.Difference.Moment_256.3,Texture_Organoid_Mito_Sum.Average_256.3,Texture_Organoid_Mito_Sum.Entropy_256.3,Texture_Organoid_Mito_Sum.Variance_256.3,Texture_Organoid_Mito_Variance_256.3,patient_id,organoid_count,cqc.nan_detected,cqc.small_organoid_outlier,cqc.large_organoid_outlier
631,,C11-4,,,,,,,,,...,,,,,,SARCO361_T1,0,True,False,False
632,1.0,D11-5,2.0,148301.0,889.718628,941.887329,5.000566,1084102.0,349.0,1242.0,...,0.993754,0.281485,0.077583,16.484955,6.564278,SARCO361_T1,1,False,True,False
633,,G2-5,,,,,,,,,...,,,,,,SARCO361_T1,0,True,False,False
634,,C6-6,,,,,,,,,...,,,,,,SARCO361_T1,0,True,False,False
635,,E5-2,,,,,,,,,...,,,,,,SARCO361_T1,0,True,False,False
