# Split DMSO plate into data splits for three different models

1. Healthy versus failing (IDC) hearts; whole plate
2. Healthy versus failing (DCM subtype) hearts; filter plate
3. Healthy versus failing (HCM subtype) hearts; filter plate

In [1]:
import pathlib
import random

import pandas as pd
from sklearn.model_selection import train_test_split

## Set paths and variables

In [2]:
# Set random state for the whole notebook to ensure reproducibility
random_state = 0
random.seed(random_state)

# Path to directory with feature selected profiles
path_to_feature_selected_data = pathlib.Path(
    "../3.preprocessing_profiles/data/single_cell_profiles/"
).resolve(strict=True)

# Find feature selected parquet file (QC applied)
feature_selected_files = list(
    path_to_feature_selected_data.glob("*_feature_selected.parquet")
)

# Make directory for split data
output_dir = pathlib.Path("./data_splits")
output_dir.mkdir(exist_ok=True)

# Print out the files found
print(f"Found {len(feature_selected_files)} feature selected files:")
for file in feature_selected_files:
    print(f"- {file.name}")

Found 1 feature selected files:
- CARD-CelIns-CX7_251023130003_sc_feature_selected.parquet


## Load in feature selected data

In [3]:
# Load the feature selected file as a DataFrame
feature_selected_file = feature_selected_files[0]
plate = pathlib.Path(feature_selected_file).stem.split("_")[0]
feature_selected_df = pd.read_parquet(feature_selected_file)

print(f"Loaded file: {feature_selected_file.name}")
print(f"Plate name: {plate}")
print(f"Shape: {feature_selected_df.shape}")
feature_selected_df.head()

Loaded file: CARD-CelIns-CX7_251023130003_sc_feature_selected.parquet
Plate name: CARD-CelIns-CX7
Shape: (10849, 1017)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_treatment,Metadata_cell_type,Metadata_heart_failure_type,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_Actin_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Actin_3_02_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_00_256,Nuclei_Texture_SumVariance_Actin_3_01_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Hoechst_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_03_256
0,B,2,2,DMSO,Healthy,,222.51552,91.821668,227.192782,133.566315,...,0.765752,0.661913,0.010225,0.548116,0.614551,-0.222328,-0.267423,-0.193864,-0.321576,-0.178085
1,B,2,2,DMSO,Healthy,,257.395516,243.480384,229.375293,218.997448,...,-0.668562,0.549299,-1.121856,-0.809301,-1.058964,-0.213325,-0.225344,-0.153449,-0.249951,-0.109043
2,B,2,2,DMSO,Healthy,,544.413009,128.311129,568.12414,217.837756,...,1.850038,1.872326,0.394096,-1.071822,-0.33206,-0.297086,-0.272545,0.122763,-0.125239,-0.184862
3,B,2,2,DMSO,Healthy,,461.582781,64.372517,477.790045,65.533781,...,-0.932234,-0.673369,-2.366211,-1.47799,-2.033278,0.772664,3.76417,0.975736,0.815612,1.578534
4,B,2,2,DMSO,Healthy,,306.047773,204.265332,293.359455,192.636878,...,-1.209433,0.117505,1.429133,0.690794,0.497838,-0.177676,-0.46283,-0.38081,-0.28234,-0.229182


## Drop all rows from heart #47 (due to over-confluence)

In [4]:
# Drop rows from heart #47 (due to over-confluence)
feature_selected_df = feature_selected_df[
    feature_selected_df["Metadata_heart_number"] != 47
]
print(f"Shape after dropping Heart #47: {feature_selected_df.shape}")

Shape after dropping Heart #47: (9180, 1017)


## Perform splits for model #1 (healthy versus failing IDC)

Holdout all of heart #2 (media) and one random well from each heart from the model.

In [5]:
# Set output directory for this model
model_output_dir = output_dir / "model_all_hearts"
model_output_dir.mkdir(parents=True, exist_ok=True)

In [6]:
# Hold out all rows where treatment is None (only applies to heart 2)
holdout_mask = feature_selected_df["Metadata_treatment"] == "None"

# Randomly hold out one well per other heart (make sampling reproducible)
random_wells = (
    feature_selected_df[~holdout_mask]
    .groupby("Metadata_heart_number")["Metadata_Well"]
    .apply(lambda x: x.dropna().sample(1, random_state=random_state))
    .explode()
)
print(f"Randomly selected wells for holdout: {random_wells.tolist()}")

# Combine with heart 2 / 'None' treatment rows
holdout_idx = feature_selected_df[holdout_mask].index.union(
    feature_selected_df.index[feature_selected_df["Metadata_Well"].isin(random_wells)]
)

# Create holdout and remaining dataframes
holdout_df = feature_selected_df.loc[holdout_idx].copy()
model_1_df = feature_selected_df.drop(holdout_idx).copy()

# Save holdout set for model_1
holdout_df.to_parquet(model_output_dir / "holdout_split.parquet")

print(f"Holdout set shape: {holdout_df.shape}")

Randomly selected wells for holdout: ['F08', 'D06', 'G02', 'E10', 'C11']
Holdout set shape: (2600, 1017)


In [7]:
print(f"Model 1 data shape (after dropping holdout rows): {model_1_df.shape}")

# Sanity check
assert (
    holdout_df.shape[0] + model_1_df.shape[0] == feature_selected_df.shape[0]
), "Holdout + remaining does not equal original after splitting"

# Set the ratio of the test data to 30% (training data will be 70%)
test_ratio = 0.30

# Split data into training and test sets
train_df, test_df = train_test_split(
    model_1_df,
    test_size=test_ratio,
    stratify=model_1_df["Metadata_cell_type"],
    random_state=random_state,
)

# Save training and test data
train_df.to_parquet(model_output_dir / "training_split.parquet")
test_df.to_parquet(model_output_dir / "testing_split.parquet")

print(f"Training data shape: {train_df.shape}")
print(f"Testing data shape: {test_df.shape}")

# Print out the number of samples per cell type in each split
print("\nCell type distribution in training set:")
print(train_df["Metadata_cell_type"].value_counts(normalize=True))

Model 1 data shape (after dropping holdout rows): (6580, 1017)
Training data shape: (4606, 1017)
Testing data shape: (1974, 1017)

Cell type distribution in training set:
Failing    0.680634
Healthy    0.319366
Name: Metadata_cell_type, dtype: float64


## Perform splits for model #2 (healthy versus failing DCM subtype)

Filter for only healthy and failing hearts from DCM patients. Holdout all of heart #2 (media) and one random well from each heart from the model.

In [8]:
# Set output directory for this model
model_output_dir = output_dir / "model_DCM"
model_output_dir.mkdir(parents=True, exist_ok=True)

In [9]:
# Filter the feature_selected_df for healthy hearts and failing hearts with DCM
filtered_df = feature_selected_df[
    (feature_selected_df["Metadata_cell_type"] == "Healthy")
    | (
        (feature_selected_df["Metadata_cell_type"] == "Failing")
        & (feature_selected_df["Metadata_heart_failure_type"] == "DCM")
    )
].copy()

print(f"Filtered DataFrame shape (Healthy + Failing DCM): {filtered_df.shape}")
print(
    "Unique heart numbers in filtered DataFrame:",
    filtered_df["Metadata_heart_number"].unique(),
)
filtered_df.head()

Filtered DataFrame shape (Healthy + Failing DCM): (6521, 1017)
Unique heart numbers in filtered DataFrame: [ 2  7 23 25]


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_treatment,Metadata_cell_type,Metadata_heart_failure_type,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_Actin_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Actin_3_02_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_00_256,Nuclei_Texture_SumVariance_Actin_3_01_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Hoechst_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_03_256
0,B,2,2,DMSO,Healthy,,222.51552,91.821668,227.192782,133.566315,...,0.765752,0.661913,0.010225,0.548116,0.614551,-0.222328,-0.267423,-0.193864,-0.321576,-0.178085
1,B,2,2,DMSO,Healthy,,257.395516,243.480384,229.375293,218.997448,...,-0.668562,0.549299,-1.121856,-0.809301,-1.058964,-0.213325,-0.225344,-0.153449,-0.249951,-0.109043
2,B,2,2,DMSO,Healthy,,544.413009,128.311129,568.12414,217.837756,...,1.850038,1.872326,0.394096,-1.071822,-0.33206,-0.297086,-0.272545,0.122763,-0.125239,-0.184862
3,B,2,2,DMSO,Healthy,,461.582781,64.372517,477.790045,65.533781,...,-0.932234,-0.673369,-2.366211,-1.47799,-2.033278,0.772664,3.76417,0.975736,0.815612,1.578534
4,B,2,2,DMSO,Healthy,,306.047773,204.265332,293.359455,192.636878,...,-1.209433,0.117505,1.429133,0.690794,0.497838,-0.177676,-0.46283,-0.38081,-0.28234,-0.229182


In [10]:
# Hold out all rows where treatment is None (only applies to heart 2)
holdout_mask = filtered_df["Metadata_treatment"] == "None"

# Randomly hold out one well per other heart (make sampling reproducible)
random_wells = (
    filtered_df[~holdout_mask]
    .groupby("Metadata_heart_number")["Metadata_Well"]
    .apply(lambda x: x.dropna().sample(1, random_state=random_state))
    .explode()
)
print(f"Randomly selected wells for holdout: {random_wells.tolist()}")

# Combine with heart 2 / 'None' treatment rows
holdout_idx = filtered_df[holdout_mask].index.union(
    filtered_df.index[filtered_df["Metadata_Well"].isin(random_wells)]
)

# Create holdout and remaining dataframes
holdout_df = filtered_df.loc[holdout_idx].copy()
model_2_df = filtered_df.drop(holdout_idx).copy()

# Save holdout set for model_2
holdout_df.to_parquet(model_output_dir / "holdout_split.parquet")

print(f"Holdout set shape: {holdout_df.shape}")

Randomly selected wells for holdout: ['F08', 'D06', 'G02', 'E10']
Holdout set shape: (2251, 1017)


In [11]:
print(f"Model 1 data shape (after dropping holdout rows): {model_2_df.shape}")

# Sanity check
assert (
    holdout_df.shape[0] + model_2_df.shape[0] == filtered_df.shape[0]
), "Holdout + remaining does not equal original after splitting"

# Set the ratio of the test data to 30% (training data will be 70%)
test_ratio = 0.30

# Split data into training and test sets
train_df, test_df = train_test_split(
    model_2_df,
    test_size=test_ratio,
    stratify=model_2_df["Metadata_cell_type"],
    random_state=random_state,
)

# Save training and test data
train_df.to_parquet(model_output_dir / "training_split.parquet")
test_df.to_parquet(model_output_dir / "testing_split.parquet")

print(f"Training data shape: {train_df.shape}")
print(f"Testing data shape: {test_df.shape}")

Model 1 data shape (after dropping holdout rows): (4270, 1017)
Training data shape: (2989, 1017)
Testing data shape: (1281, 1017)


## Perform splits for model #3 (healthy versus failing HCM subtype)

Filter for only healthy and failing hearts from HCM patients. Holdout all of heart #2 (media) and one random well from each heart from the model.

In [12]:
# Set output directory for this model
model_output_dir = output_dir / "model_HCM"
model_output_dir.mkdir(parents=True, exist_ok=True)

In [13]:
# Filter the feature_selected_df for healthy hearts and failing hearts with HCM
filtered_df = feature_selected_df[
    (feature_selected_df["Metadata_cell_type"] == "Healthy")
    | (
        (feature_selected_df["Metadata_cell_type"] == "Failing")
        & (feature_selected_df["Metadata_heart_failure_type"] == "HCM")
    )
].copy()

print(f"Filtered DataFrame shape (Healthy + Failing HCM): {filtered_df.shape}")
print(
    "Unique heart numbers in filtered DataFrame:",
    filtered_df["Metadata_heart_number"].unique(),
)
filtered_df.head()

Filtered DataFrame shape (Healthy + Failing HCM): (6719, 1017)
Unique heart numbers in filtered DataFrame: [ 2  7 46]


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_heart_number,Metadata_treatment,Metadata_cell_type,Metadata_heart_failure_type,Metadata_Nuclei_Location_Center_X,Metadata_Nuclei_Location_Center_Y,Metadata_Cells_Location_Center_X,Metadata_Cells_Location_Center_Y,...,Nuclei_Texture_InverseDifferenceMoment_Actin_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Actin_3_02_256,Nuclei_Texture_InverseDifferenceMoment_ER_3_00_256,Nuclei_Texture_InverseDifferenceMoment_Mitochondria_3_00_256,Nuclei_Texture_InverseDifferenceMoment_PM_3_00_256,Nuclei_Texture_SumVariance_Actin_3_01_256,Nuclei_Texture_SumVariance_ER_3_01_256,Nuclei_Texture_SumVariance_Hoechst_3_01_256,Nuclei_Texture_SumVariance_Mitochondria_3_03_256,Nuclei_Texture_SumVariance_PM_3_03_256
0,B,2,2,DMSO,Healthy,,222.51552,91.821668,227.192782,133.566315,...,0.765752,0.661913,0.010225,0.548116,0.614551,-0.222328,-0.267423,-0.193864,-0.321576,-0.178085
1,B,2,2,DMSO,Healthy,,257.395516,243.480384,229.375293,218.997448,...,-0.668562,0.549299,-1.121856,-0.809301,-1.058964,-0.213325,-0.225344,-0.153449,-0.249951,-0.109043
2,B,2,2,DMSO,Healthy,,544.413009,128.311129,568.12414,217.837756,...,1.850038,1.872326,0.394096,-1.071822,-0.33206,-0.297086,-0.272545,0.122763,-0.125239,-0.184862
3,B,2,2,DMSO,Healthy,,461.582781,64.372517,477.790045,65.533781,...,-0.932234,-0.673369,-2.366211,-1.47799,-2.033278,0.772664,3.76417,0.975736,0.815612,1.578534
4,B,2,2,DMSO,Healthy,,306.047773,204.265332,293.359455,192.636878,...,-1.209433,0.117505,1.429133,0.690794,0.497838,-0.177676,-0.46283,-0.38081,-0.28234,-0.229182


In [14]:
# Hold out all rows where treatment is None (only applies to heart 2)
holdout_mask = filtered_df["Metadata_treatment"] == "None"

# Randomly hold out one well per other heart (make sampling reproducible)
random_wells = (
    filtered_df[~holdout_mask]
    .groupby("Metadata_heart_number")["Metadata_Well"]
    .apply(lambda x: x.dropna().sample(1, random_state=random_state))
    .explode()
)
print(f"Randomly selected wells for holdout: {random_wells.tolist()}")

# Combine with heart 2 / 'None' treatment rows
holdout_idx = filtered_df[holdout_mask].index.union(
    filtered_df.index[filtered_df["Metadata_Well"].isin(random_wells)]
)

# Create holdout and remaining dataframes
holdout_df = filtered_df.loc[holdout_idx].copy()
model_3_df = filtered_df.drop(holdout_idx).copy()

# Save holdout set for model_3
holdout_df.to_parquet(model_output_dir / "holdout_split.parquet")

print(f"Holdout set shape: {holdout_df.shape}")

Randomly selected wells for holdout: ['F08', 'D06', 'C11']
Holdout set shape: (2308, 1017)


In [15]:
print(f"Model 1 data shape (after dropping holdout rows): {model_3_df.shape}")

# Sanity check
assert (
    holdout_df.shape[0] + model_3_df.shape[0] == filtered_df.shape[0]
), "Holdout + remaining does not equal original after splitting"

# Set the ratio of the test data to 30% (training data will be 70%)
test_ratio = 0.30

# Split data into training and test sets
train_df, test_df = train_test_split(
    model_3_df,
    test_size=test_ratio,
    stratify=model_3_df["Metadata_cell_type"],
    random_state=random_state,
)

# Save training and test data
train_df.to_parquet(model_output_dir / "training_split.parquet")
test_df.to_parquet(model_output_dir / "testing_split.parquet")

print(f"Training data shape: {train_df.shape}")
print(f"Testing data shape: {test_df.shape}")

Model 1 data shape (after dropping holdout rows): (4411, 1017)
Training data shape: (3087, 1017)
Testing data shape: (1324, 1017)
