# Split DMSO plate into data splits for three different models

1. Healthy versus failing (IDC) hearts; whole plate
2. Healthy versus failing (DCM subtype) hearts; filter plate
3. Healthy versus failing (HCM subtype) hearts; filter plate

In [1]:
import pathlib
import random

import pandas as pd
from sklearn.model_selection import train_test_split

## Set paths and variables

In [2]:
# Set random state for the whole notebook to ensure reproducibility
random_state = 0
random.seed(random_state)

# Set if training model on redo plate (true) or original (false)
redo_plate = True

# Path to directory with feature selected profiles
# (strict=True raises immediately if the directory does not exist)
path_to_feature_selected_data = pathlib.Path(
    "../3.preprocessing_profiles/data/single_cell_profiles/"
).resolve(strict=True)

# The two plates only differ in their acquisition ID and output folder name
if redo_plate:  # redo plate processing
    plate_id = "251110170001"
    split_dir_name = "redo_DMSO_plate"
else:  # process original plate
    plate_id = "251023130003"
    split_dir_name = "original_DMSO_plate"

# Find feature selected parquet file (QC applied)
feature_selected_path = (
    path_to_feature_selected_data
    / f"CARD-CelIns-CX7_{plate_id}_sc_feature_selected.parquet"
)

# Make directory for split data. parents=True is required so that a fresh
# checkout without an existing ./data_splits folder does not raise
# FileNotFoundError.
output_dir = pathlib.Path("./data_splits") / split_dir_name
output_dir.mkdir(parents=True, exist_ok=True)

# Print out the files found
print(f"Found feature selected file: {feature_selected_path.stem.split('_')[:2]}")

Found feature selected file: ['CARD-CelIns-CX7', '251110170001']


## Load in feature selected data

In [3]:
# Read the feature selected single-cell profiles into a DataFrame
feature_selected_df = pd.read_parquet(feature_selected_path)

# Report the file name, the first "_"-separated token of the file stem
# (printed as the plate name), and the DataFrame dimensions
plate_name = feature_selected_path.stem.split("_")[0]
print(
    f"Loaded file: {feature_selected_path.name}",
    f"Plate name: {plate_name}",
    f"Shape: {feature_selected_df.shape}",
    sep="\n",
)

# Preview the first rows
feature_selected_df.head()

Loaded file: CARD-CelIns-CX7_251110170001_sc_feature_selected.parquet
Plate name: CARD-CelIns-CX7
Shape: (8852, 1030)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_treatment,Metadata_cell_type,Metadata_heart_failure_type,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InfoMeas2_PM_3_02_256,Nuclei_Texture_InfoMeas2_PM_3_03_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_00_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256
0,B,2,2,DMSO,Healthy,,658.950181,133.526354,655.840801,145.906197,...,1.503626,0.573396,0.682092,-1.192413,-0.972346,0.088417,0.181171,-0.227029,0.945871,-0.176102
1,B,2,2,DMSO,Healthy,,714.793001,165.772897,650.947077,167.051936,...,0.296864,0.045975,0.216872,0.151119,0.249376,-0.134875,-0.322306,-0.142915,-0.398037,-0.119133
2,B,2,2,DMSO,Healthy,,652.687541,189.770385,619.407101,199.34778,...,0.115447,-0.101385,0.173914,-0.259197,-0.084661,-0.122071,-0.245183,-0.273957,-0.405488,-0.141397
3,B,2,2,DMSO,Healthy,,888.717949,208.954635,871.204118,158.089794,...,1.183911,0.966689,-2.448364,-2.023826,-1.889796,-1.007221,1.067133,-0.254258,0.636258,0.246546
4,B,2,2,DMSO,Healthy,,599.612215,235.496689,568.763322,236.223716,...,0.295199,0.763092,-0.951318,0.042545,-2.385016,-0.173919,-0.194049,-0.216982,0.629716,-0.033578


## Drop all rows from heart #47 (due to over-confluence)

In [4]:
# Keep every row whose heart number is not 47 (heart #47 is excluded due to
# over-confluence). NOTE: this rebinds feature_selected_df, so all later
# cells operate on the reduced DataFrame.
not_heart_47 = feature_selected_df["Metadata_heart_number"].ne(47)
feature_selected_df = feature_selected_df.loc[not_heart_47]
print(f"Shape after dropping Heart #47: {feature_selected_df.shape}")

Shape after dropping Heart #47: (7431, 1030)


## Perform splits for model #1 (healthy versus failing IDC)

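Hold out all media-only rows (treatment `None`, which only applies to heart #2) and one randomly selected well from each heart; the held-out rows are excluded from the training and testing splits.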

In [5]:
# All splits for model #1 (all hearts) are written to their own sub-folder
model_output_dir = output_dir.joinpath("model_all_hearts")
model_output_dir.mkdir(exist_ok=True, parents=True)

In [6]:
# Rows with treatment "None" are always held out (only applies to heart 2)
holdout_mask = feature_selected_df["Metadata_treatment"].eq("None")

# From the remaining rows, draw one row per heart and keep its well label.
# The fixed random_state makes the selected wells reproducible.
wells_by_heart = feature_selected_df.loc[~holdout_mask].groupby(
    "Metadata_heart_number"
)["Metadata_Well"]
random_wells = wells_by_heart.apply(
    lambda wells: wells.dropna().sample(1, random_state=random_state)
).explode()
print(f"Randomly selected wells for holdout: {random_wells.tolist()}")

# Index labels of every held-out row: the treatment "None" rows plus all rows
# that belong to one of the randomly selected wells
in_random_well = feature_selected_df["Metadata_Well"].isin(random_wells)
holdout_idx = feature_selected_df.loc[holdout_mask].index.union(
    feature_selected_df.index[in_random_well]
)

# Partition the data into the holdout rows and the rows left for model 1
holdout_df = feature_selected_df.loc[holdout_idx].copy()
model_1_df = feature_selected_df.drop(index=holdout_idx).copy()

# Write the holdout set for model_1 to disk
holdout_path = model_output_dir / "holdout_split.parquet"
holdout_df.to_parquet(holdout_path)

print(f"Holdout set shape: {holdout_df.shape}")

Randomly selected wells for holdout: ['D05', 'G07', 'F09', 'B06', 'E11']
Holdout set shape: (2042, 1030)


In [7]:
print(f"Model 1 data shape (after dropping holdout rows): {model_1_df.shape}")

# Sanity check: the holdout rows and the remaining rows must add up to the
# number of rows that were available before splitting
n_holdout_rows = len(holdout_df)
n_model_rows = len(model_1_df)
assert n_holdout_rows + n_model_rows == len(
    feature_selected_df
), "Holdout + remaining does not equal original after splitting"

# Fraction of the remaining data used for testing (30%; training gets 70%)
test_ratio = 0.30

# Split into training and testing sets, stratified by cell type so that both
# sets keep the same cell type proportions
train_df, test_df = train_test_split(
    model_1_df,
    stratify=model_1_df["Metadata_cell_type"],
    test_size=test_ratio,
    random_state=random_state,
)

# Write both splits to the model output directory
for split_df, split_file in (
    (train_df, "training_split.parquet"),
    (test_df, "testing_split.parquet"),
):
    split_df.to_parquet(model_output_dir / split_file)

print(f"Training data shape: {train_df.shape}")
print(f"Testing data shape: {test_df.shape}")

# Show the proportion of each cell type in the training set
print("\nCell type distribution in training set:")
train_cell_type_fractions = train_df["Metadata_cell_type"].value_counts(normalize=True)
print(train_cell_type_fractions)

Model 1 data shape (after dropping holdout rows): (5389, 1030)
Training data shape: (3772, 1030)
Testing data shape: (1617, 1030)

Cell type distribution in training set:
Failing    0.673383
Healthy    0.326617
Name: Metadata_cell_type, dtype: float64


## Perform splits for model #2 (healthy versus failing DCM subtype)

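Filter for only healthy hearts and failing hearts from DCM patients. Hold out all media-only rows (treatment `None`, which only applies to heart #2) and one randomly selected well from each heart in the filtered data; the held-out rows are excluded from the training and testing splits.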

In [8]:
# All splits for model #2 (DCM subtype) are written to their own sub-folder
model_output_dir = output_dir.joinpath("model_DCM")
model_output_dir.mkdir(exist_ok=True, parents=True)

In [9]:
# Keep rows that are either healthy, or failing with heart failure type DCM
cell_type = feature_selected_df["Metadata_cell_type"]
failure_type = feature_selected_df["Metadata_heart_failure_type"]
is_healthy = cell_type.eq("Healthy")
is_failing_dcm = cell_type.eq("Failing") & failure_type.eq("DCM")
filtered_df = feature_selected_df.loc[is_healthy | is_failing_dcm].copy()

# Report the size of the filtered data and which hearts remain
print(f"Filtered DataFrame shape (Healthy + Failing DCM): {filtered_df.shape}")
remaining_hearts = filtered_df["Metadata_heart_number"].unique()
print("Unique heart numbers in filtered DataFrame:", remaining_hearts)
filtered_df.head()

Filtered DataFrame shape (Healthy + Failing DCM): (5100, 1030)
Unique heart numbers in filtered DataFrame: [ 2  7 23 25]


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_treatment,Metadata_cell_type,Metadata_heart_failure_type,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InfoMeas2_PM_3_02_256,Nuclei_Texture_InfoMeas2_PM_3_03_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_00_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256
0,B,2,2,DMSO,Healthy,,658.950181,133.526354,655.840801,145.906197,...,1.503626,0.573396,0.682092,-1.192413,-0.972346,0.088417,0.181171,-0.227029,0.945871,-0.176102
1,B,2,2,DMSO,Healthy,,714.793001,165.772897,650.947077,167.051936,...,0.296864,0.045975,0.216872,0.151119,0.249376,-0.134875,-0.322306,-0.142915,-0.398037,-0.119133
2,B,2,2,DMSO,Healthy,,652.687541,189.770385,619.407101,199.34778,...,0.115447,-0.101385,0.173914,-0.259197,-0.084661,-0.122071,-0.245183,-0.273957,-0.405488,-0.141397
3,B,2,2,DMSO,Healthy,,888.717949,208.954635,871.204118,158.089794,...,1.183911,0.966689,-2.448364,-2.023826,-1.889796,-1.007221,1.067133,-0.254258,0.636258,0.246546
4,B,2,2,DMSO,Healthy,,599.612215,235.496689,568.763322,236.223716,...,0.295199,0.763092,-0.951318,0.042545,-2.385016,-0.173919,-0.194049,-0.216982,0.629716,-0.033578


In [10]:
# Rows with treatment "None" are always held out (only applies to heart 2)
holdout_mask = filtered_df["Metadata_treatment"].eq("None")

# From the remaining rows, draw one row per heart and keep its well label.
# The fixed random_state makes the selected wells reproducible.
wells_by_heart = filtered_df.loc[~holdout_mask].groupby("Metadata_heart_number")[
    "Metadata_Well"
]
random_wells = wells_by_heart.apply(
    lambda wells: wells.dropna().sample(1, random_state=random_state)
).explode()
print(f"Randomly selected wells for holdout: {random_wells.tolist()}")

# Index labels of every held-out row: the treatment "None" rows plus all rows
# that belong to one of the randomly selected wells
in_random_well = filtered_df["Metadata_Well"].isin(random_wells)
holdout_idx = filtered_df.loc[holdout_mask].index.union(
    filtered_df.index[in_random_well]
)

# Partition the filtered data into the holdout rows and the rows left for model 2
holdout_df = filtered_df.loc[holdout_idx].copy()
model_2_df = filtered_df.drop(index=holdout_idx).copy()

# Write the holdout set for model_2 to disk
holdout_path = model_output_dir / "holdout_split.parquet"
holdout_df.to_parquet(holdout_path)

print(f"Holdout set shape: {holdout_df.shape}")

Randomly selected wells for holdout: ['D05', 'G07', 'F09', 'B06']
Holdout set shape: (1748, 1030)


In [11]:
# Report the size of the data left for model 2 (label fixed: this cell
# previously printed "Model 1" for the model 2 DataFrame)
print(f"Model 2 data shape (after dropping holdout rows): {model_2_df.shape}")

# Sanity check: holdout rows and remaining rows must add up to the filtered data
assert (
    holdout_df.shape[0] + model_2_df.shape[0] == filtered_df.shape[0]
), "Holdout + remaining does not equal original after splitting"

# Set the ratio of the test data to 30% (training data will be 70%)
test_ratio = 0.30

# Split data into training and test sets, stratified by cell type so both
# sets keep the same cell type proportions
train_df, test_df = train_test_split(
    model_2_df,
    test_size=test_ratio,
    stratify=model_2_df["Metadata_cell_type"],
    random_state=random_state,
)

# Save training and test data
train_df.to_parquet(model_output_dir / "training_split.parquet")
test_df.to_parquet(model_output_dir / "testing_split.parquet")

print(f"Training data shape: {train_df.shape}")
print(f"Testing data shape: {test_df.shape}")

Model 1 data shape (after dropping holdout rows): (3352, 1030)
Training data shape: (2346, 1030)
Testing data shape: (1006, 1030)


## Perform splits for model #3 (healthy versus failing HCM subtype)

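Filter for only healthy hearts and failing hearts from HCM patients. Hold out all media-only rows (treatment `None`, which only applies to heart #2) and one randomly selected well from each heart in the filtered data; the held-out rows are excluded from the training and testing splits.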

In [12]:
# All splits for model #3 (HCM subtype) are written to their own sub-folder
model_output_dir = output_dir.joinpath("model_HCM")
model_output_dir.mkdir(exist_ok=True, parents=True)

In [13]:
# Keep rows that are either healthy, or failing with heart failure type HCM
cell_type = feature_selected_df["Metadata_cell_type"]
failure_type = feature_selected_df["Metadata_heart_failure_type"]
is_healthy = cell_type.eq("Healthy")
is_failing_hcm = cell_type.eq("Failing") & failure_type.eq("HCM")
filtered_df = feature_selected_df.loc[is_healthy | is_failing_hcm].copy()

# Report the size of the filtered data and which hearts remain
print(f"Filtered DataFrame shape (Healthy + Failing HCM): {filtered_df.shape}")
remaining_hearts = filtered_df["Metadata_heart_number"].unique()
print("Unique heart numbers in filtered DataFrame:", remaining_hearts)
filtered_df.head()

Filtered DataFrame shape (Healthy + Failing HCM): (5576, 1030)
Unique heart numbers in filtered DataFrame: [ 2  7 46]


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_treatment,Metadata_cell_type,Metadata_heart_failure_type,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InfoMeas2_PM_3_02_256,Nuclei_Texture_InfoMeas2_PM_3_03_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_03_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_02_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_00_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Hoechst_3_03_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_01_256
0,B,2,2,DMSO,Healthy,,658.950181,133.526354,655.840801,145.906197,...,1.503626,0.573396,0.682092,-1.192413,-0.972346,0.088417,0.181171,-0.227029,0.945871,-0.176102
1,B,2,2,DMSO,Healthy,,714.793001,165.772897,650.947077,167.051936,...,0.296864,0.045975,0.216872,0.151119,0.249376,-0.134875,-0.322306,-0.142915,-0.398037,-0.119133
2,B,2,2,DMSO,Healthy,,652.687541,189.770385,619.407101,199.34778,...,0.115447,-0.101385,0.173914,-0.259197,-0.084661,-0.122071,-0.245183,-0.273957,-0.405488,-0.141397
3,B,2,2,DMSO,Healthy,,888.717949,208.954635,871.204118,158.089794,...,1.183911,0.966689,-2.448364,-2.023826,-1.889796,-1.007221,1.067133,-0.254258,0.636258,0.246546
4,B,2,2,DMSO,Healthy,,599.612215,235.496689,568.763322,236.223716,...,0.295199,0.763092,-0.951318,0.042545,-2.385016,-0.173919,-0.194049,-0.216982,0.629716,-0.033578


In [14]:
# Rows with treatment "None" are always held out (only applies to heart 2)
holdout_mask = filtered_df["Metadata_treatment"].eq("None")

# From the remaining rows, draw one row per heart and keep its well label.
# The fixed random_state makes the selected wells reproducible.
wells_by_heart = filtered_df.loc[~holdout_mask].groupby("Metadata_heart_number")[
    "Metadata_Well"
]
random_wells = wells_by_heart.apply(
    lambda wells: wells.dropna().sample(1, random_state=random_state)
).explode()
print(f"Randomly selected wells for holdout: {random_wells.tolist()}")

# Index labels of every held-out row: the treatment "None" rows plus all rows
# that belong to one of the randomly selected wells
in_random_well = filtered_df["Metadata_Well"].isin(random_wells)
holdout_idx = filtered_df.loc[holdout_mask].index.union(
    filtered_df.index[in_random_well]
)

# Partition the filtered data into the holdout rows and the rows left for model 3
holdout_df = filtered_df.loc[holdout_idx].copy()
model_3_df = filtered_df.drop(index=holdout_idx).copy()

# Write the holdout set for model_3 to disk
holdout_path = model_output_dir / "holdout_split.parquet"
holdout_df.to_parquet(holdout_path)

print(f"Holdout set shape: {holdout_df.shape}")

Randomly selected wells for holdout: ['D05', 'G07', 'E11']
Holdout set shape: (1779, 1030)


In [15]:
# Report the size of the data left for model 3 (label fixed: this cell
# previously printed "Model 1" for the model 3 DataFrame)
print(f"Model 3 data shape (after dropping holdout rows): {model_3_df.shape}")

# Sanity check: holdout rows and remaining rows must add up to the filtered data
assert (
    holdout_df.shape[0] + model_3_df.shape[0] == filtered_df.shape[0]
), "Holdout + remaining does not equal original after splitting"

# Set the ratio of the test data to 30% (training data will be 70%)
test_ratio = 0.30

# Split data into training and test sets, stratified by cell type so both
# sets keep the same cell type proportions
train_df, test_df = train_test_split(
    model_3_df,
    test_size=test_ratio,
    stratify=model_3_df["Metadata_cell_type"],
    random_state=random_state,
)

# Save training and test data
train_df.to_parquet(model_output_dir / "training_split.parquet")
test_df.to_parquet(model_output_dir / "testing_split.parquet")

print(f"Training data shape: {train_df.shape}")
print(f"Testing data shape: {test_df.shape}")

Model 1 data shape (after dropping holdout rows): (3797, 1030)


Training data shape: (2657, 1030)
Testing data shape: (1140, 1030)
