# Process single cell profiles

## Import libraries

In [1]:
import pathlib
import pprint

import pandas as pd

from pycytominer import annotate, normalize, feature_select

## Set paths and variables

In [2]:
# Set this flag to True for cleaned data (applied QC), or False for no QC applied
use_cleaned_data = False

# Path to directories
converted_dir = pathlib.Path("./data/converted_profiles")
cleaned_dir = pathlib.Path("./data/cleaned_profiles")

# Set the directory based on the flag
data_dir = cleaned_dir if use_cleaned_data else converted_dir

# output path for single-cell profiles
output_dir = pathlib.Path("./data/single_cell_profiles")
output_dir.mkdir(parents=True, exist_ok=True)

# operations to perform for feature selection
feature_select_ops = [
    "variance_threshold",
    "correlation_threshold",
    "blocklist",
    "drop_na_columns",
]

# Extract the plate names from the file name.
# A set removes duplicates (same plate found in more than one sub-directory) and
# sorting makes the processing order deterministic between runs.
plate_names = sorted(
    {file.stem.replace("_converted", "") for file in converted_dir.rglob("*.parquet")}
)

# Suffix carried by the output file names written by the processing loop below:
# runs without QC end in "_no_QC", runs on cleaned data carry no suffix.
# (The previous "_QC" value never matched a cleaned output, and as a glob fragment
# it also matched "_no_QC" files, so the filter picked the wrong plates.)
qc_suffix = "" if use_cleaned_data else "_no_QC"

# Filter out plates that were already processed for the current QC setting.
# A plate counts as processed only when its final (feature selected) output exists,
# so a run that was interrupted part-way through is picked up again.
to_process = [
    plate
    for plate in plate_names
    if not (output_dir / f"{plate}_sc_feature_selected{qc_suffix}.parquet").exists()
]

print("Plate names to process:")
pprint.pprint(to_process)

Plate names to process:
['CARD-CelIns-CX7_251110170001']


## Set dictionary with plates to process

In [3]:
# Create plate info dictionary
# Every plate maps to the absolute path of its profile parquet file (searched
# recursively under data_dir) and to the platemap used for annotation.
plate_info_dictionary = {}
for name in to_process:
    # Sort the matches so the selected file does not depend on filesystem order
    profile_matches = sorted(data_dir.rglob(f"{name}_*.parquet"))

    # Fail with a readable message instead of an IndexError when nothing matches
    if not profile_matches:
        raise FileNotFoundError(
            f"No profile file matching '{name}_*.parquet' was found under {data_dir}"
        )

    # More than one match is ambiguous; keep the first one but make it visible
    if len(profile_matches) > 1:
        print(
            f"WARNING: {len(profile_matches)} profile files match plate {name}; "
            f"using {profile_matches[0]}"
        )

    plate_info_dictionary[name] = {
        "profile_path": str(profile_matches[0].resolve(strict=True)),
        "platemap_path": "../0.download_data/metadata/dmso_training_platemap.csv",
    }

# View the dictionary to assess that all info is added correctly
pprint.pprint(plate_info_dictionary, indent=4)

{   'CARD-CelIns-CX7_251110170001': {   'platemap_path': '../0.download_data/metadata/dmso_training_platemap.csv',
                                        'profile_path': '/home/jenna/predicting_cardiac_fibrosis_etiologies/3.preprocessing_profiles/data/converted_profiles/Plate_2_redo/CARD-CelIns-CX7_251110170001_converted.parquet'}}


## Process data with pycytominer

In [4]:
# Determine suffix based on use_cleaned_data
suffix = "_no_QC" if not use_cleaned_data else ""


def _as_metadata_column(col: str) -> str:
    """Return the column name to use so the column is kept downstream.

    Image_FileName, Image_PathName and BoundingBox columns receive a
    "Metadata_" prefix. Columns that already start with "Metadata_" are returned
    unchanged so the prefix is never added twice. All other columns are returned
    unchanged.
    """
    if col.startswith("Metadata_"):
        return col
    if "Image_FileName" in col or "Image_PathName" in col:
        return col.replace("Image_FileName", "Metadata_Image_FileName").replace(
            "Image_PathName", "Metadata_Image_PathName"
        )
    if "BoundingBox" in col:
        return f"Metadata_{col}"
    return col


for plate, info in plate_info_dictionary.items():
    print(f"Performing pycytominer pipeline for {plate}")

    # Dynamically set output file names based on the suffix
    output_annotated_file = str(output_dir / f"{plate}_sc_annotated{suffix}.parquet")
    output_normalized_file = str(output_dir / f"{plate}_sc_normalized{suffix}.parquet")
    output_feature_select_file = str(
        output_dir / f"{plate}_sc_feature_selected{suffix}.parquet"
    )

    profile_df = pd.read_parquet(info["profile_path"])
    platemap_df = pd.read_csv(info["platemap_path"])

    # Rename Image_FileName and Image_PathName and BoundingBox columns to keep downstream
    profile_df = profile_df.rename(
        columns={col: _as_metadata_column(col) for col in profile_df.columns}
    )

    print("Performing annotation for", plate, "...")
    # Step 1: Annotation
    annotate(
        profiles=profile_df,
        platemap=platemap_df,
        join_on=["Metadata_well_position", "Image_Metadata_Well"],
        output_file=output_annotated_file,
        output_type="parquet",
    )

    # Load the annotated parquet file to fix metadata columns names
    annotated_df = pd.read_parquet(output_annotated_file)

    # Rename columns using the rename() function
    column_name_mapping = {
        "Image_Metadata_Site": "Metadata_Site",
    }
    annotated_df = annotated_df.rename(columns=column_name_mapping)

    # Fix NaN treatment issue in Metadata_treatment column: missing values become the
    # string "None" so they can be selected by the normalization query below
    annotated_df["Metadata_treatment"] = (
        annotated_df["Metadata_treatment"].replace({None: "None"}).fillna("None")
    )

    # Save the modified DataFrame back to the same location
    annotated_df.to_parquet(output_annotated_file, index=False)

    # Normalize to the None treatments
    samples = "Metadata_heart_number == 2 and Metadata_treatment == 'None'"

    # Stop early with a clear message if the query selects no reference cells,
    # since there would be nothing to fit the normalization on
    if annotated_df.query(samples).empty:
        raise ValueError(
            f"No cells in {plate} match the normalization samples query: {samples}"
        )

    print(
        "Performing normalization for", plate, "using this samples parameter:", samples
    )

    # Step 2: Normalization (the result is written to output_normalized_file, so the
    # return value is not kept)
    normalize(
        profiles=output_annotated_file,
        method="standardize",
        output_file=output_normalized_file,
        output_type="parquet",
        samples=samples,
    )

    print("Performing feature selection for", plate, "...")
    # Step 3: Feature selection
    feature_select(
        output_normalized_file,
        operation=feature_select_ops,
        na_cutoff=0,
        output_file=output_feature_select_file,
        output_type="parquet",
    )

    # Load back in the feature selected data to drop specific features that leaked in (Costes and Location features)
    feature_selected_df = pd.read_parquet(output_feature_select_file)
    cols_to_drop = [
        col
        for col in feature_selected_df.columns
        if ("Costes" in col or "Location" in col) and not col.startswith("Metadata_")
    ]
    feature_selected_df = feature_selected_df.drop(columns=cols_to_drop)
    feature_selected_df.to_parquet(output_feature_select_file, index=False)
    print(
        f"Annotation, normalization, and feature selection have been performed for {plate}"
    )

Performing pycytominer pipeline for CARD-CelIns-CX7_251110170001
Performing annotation for CARD-CelIns-CX7_251110170001 ...
Performing normalization for CARD-CelIns-CX7_251110170001 using this samples parameter: Metadata_heart_number == 2 and Metadata_treatment == 'None'
Performing feature selection for CARD-CelIns-CX7_251110170001 ...
Annotation, normalization, and feature selection have been performed for CARD-CelIns-CX7_251110170001


In [5]:
# Check output files
# Locate the feature selected outputs on disk rather than reusing a variable set
# inside the processing loop: when every plate was already processed the loop does
# not run and that variable would not exist on a fresh kernel.
output_suffix = "" if use_cleaned_data else "_no_QC"
feature_selected_files = sorted(
    output_dir.glob(f"*_sc_feature_selected{output_suffix}.parquet")
)
if not feature_selected_files:
    raise FileNotFoundError(
        f"No feature selected profiles with suffix '{output_suffix}' found in {output_dir}"
    )

for feature_selected_file in feature_selected_files:
    test_df = pd.read_parquet(feature_selected_file)

    # Test if Costes and Location features were dropped (not including columns that start with Metadata_)
    for col in test_df.columns:
        # Skip metadata columns
        if col.startswith("Metadata_"):
            continue
        if "Costes" in col or "Location" in col:
            raise ValueError(
                f"Feature selection failed to drop {col} from the feature selected data."
            )

    # Print the number of features (do not have Metadata_* prefix)
    non_metadata_features = [
        col for col in test_df.columns if not col.startswith("Metadata_")
    ]
    print(feature_selected_file.name)
    print(f"Number of features: {len(non_metadata_features)}")
    print(test_df.shape)

# Preview the last checked file
test_df.head(2)

Number of features: 1007
(9933, 1051)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_treatment,Metadata_cell_type,Metadata_heart_failure_type,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InfoMeas2_Mitochondria_3_02_256,Nuclei_Texture_InfoMeas2_PM_3_00_256,Nuclei_Texture_InfoMeas2_PM_3_02_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_02_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_00_256,Nuclei_Texture_SumEntropy_Mitochondria_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_03_256
0,B,2,2,DMSO,Healthy,,658.950181,133.526354,655.840801,145.906197,...,1.06654,0.508829,1.255416,-0.343713,-1.050713,-0.850892,0.125617,1.346377,0.496842,-0.148968
1,B,2,2,DMSO,Healthy,,447.02193,78.739348,484.574235,96.405302,...,1.252567,2.124057,2.248762,0.211392,-1.420681,-1.165493,-0.01707,2.290233,2.231246,0.249139
