In [1]:
import anndata as ad
import os
import re
import numpy as np
import squidpy as sq
import scanpy as sc

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [56]:
# Load the AnnData object holding the Ave Int Harmony UMAP and its assigned clusters.
# NOTE(review): the earlier comment said 26 clusters while the file name says 25 - confirm.
h5ad_path = "protein_ave_int_25_clusters.h5ad"
adata = ad.read_h5ad(h5ad_path)

In [68]:
# Collect the IDs of all cells whose `leiden` label is "23" so their nuclear
# images can be cropped.
# Paste the printed comma-separated list into
# Step_6_Nuclear 3D image cropping docker/crop_open.sh

in_cluster = adata.obs['leiden'] == "23"
cell_names = adata.obs.loc[in_cluster].index.tolist()
cell_string = ",".join(cell_names)
# Plain print (not rich display) so the output can be copied without quotes.
print(cell_string)

138309,138317,133832,136579,136632,136720,136767,136864,136883,136929,137008,137020,137032,137071,137076,137109,137137,137254,137280,137503,137512,137513,137546,137549,137553,137560,137598,137610,137620,137627,137634,137635,137638,137643,137644,137647,137658,137663,137665,137683,137686,137688,137689,137691,137692,137693,137701,137702,137704,137706,137710,137718,137731,137733,137734,137737,137748,137761,137762,137771,137775,137785,137787,137791,137794,137801,137802,137804,137813,137814,137815,137818,137821,137823,137828,137830,137836,137837,137849,137850,137851,137855,137859,137876,137880,137881,137883,137886,137892,137908,137913,137914,137924,137925,137927,137928,137935,137937,137943,137949,137958,137963,137966,137978,137982,137993,137994,138001,138005,138006,138018,138019,138021,138030,138031,138034,138042,138045,138053,138054,138056,138059,138065,138067,138073,138074,138079,138082,138087,138088,138089,138096,138097,138100,138108,138109,138110,138119,138120,138124,138132,138134,138136

In [62]:
# Select clusters to extract the image dataset for explainable machine learning
# Import the generated .csv file into Step_7_and_12_nuclei_measure_and_write_ML_images/Step_12_write_ML_image_dataset.py

np.random.seed(42)  # global NumPy seed; the sampling below also passes random_state explicitly
cluster_col = "leiden"  # change if using a different clustering key

# Specify clusters for image classes
selected_clusters = ["0", "1", "2", "5"]

# Upper bound on the number of cells drawn from each selected cluster
max_cells_per_cluster = 2000

# Surface requested clusters that have no cells instead of dropping them
# silently: a typo here would otherwise yield a dataset missing a class.
present_clusters = set(adata.obs[cluster_col].unique())
missing_clusters = [c for c in selected_clusters if c not in present_clusters]
if missing_clusters:
    print(
        f"Warning: no cells found in {cluster_col!r} for clusters "
        f"{missing_clusters}; they will be skipped."
    )

# Prepare a list to hold sampled data
sampled_dfs = []

# Iterate over each cluster and sample up to `max_cells_per_cluster` cells.
# observed=True skips empty categories when the column is categorical and
# avoids the pandas FutureWarning about the changing default.
for cluster, group in adata.obs.groupby(cluster_col, observed=True):
    if cluster not in selected_clusters:
        continue
    n_sample = min(max_cells_per_cluster, len(group))  # cluster may be smaller than the cap

    sampled_cells = group.sample(n=n_sample, replace=False, random_state=42)
    sampled_cells = sampled_cells[[cluster_col]].copy()
    sampled_cells.index.name = "cell_id"
    sampled_dfs.append(sampled_cells)

if not sampled_dfs:
    raise ValueError(
        f"None of the selected clusters {selected_clusters} contain cells "
        f"in adata.obs[{cluster_col!r}]."
    )

# Concatenate all sampled cells into a single DataFrame with a `cell_id` column
result_df = pd.concat(sampled_dfs).reset_index()

# Make sure obs_names are strings before they are used as lookup keys.
adata.obs_names = adata.obs_names.astype(str)

# Only for the specific way of image file organization
# Match the folder names which are based on RNA clusters
# Cast the sampled IDs as well so they always match the string obs_names,
# regardless of the index dtype at the time the cells were sampled.
sampled_ids = result_df["cell_id"].astype(str)

# Another column for the folder name
new_cluster_col = "cluster"  # RNA cluster

# Restrict adata.obs to the sampled cells, in sampling order
matching_cells = adata.obs.loc[sampled_ids]

# Extract the RNA cluster assignment, with the cell ID as a regular column
new_cluster_df = (
    matching_cells[[new_cluster_col]]
    .rename_axis("cell_id")
    .reset_index()
)

# Attach the clustering used for sampling. Read it from `cluster_col` (not a
# hard-coded "leiden") so changing the key above stays consistent, and take it
# positionally from `matching_cells`, whose rows are in the same order as
# `new_cluster_df`. `to_numpy()` yields plain values rather than a categorical.
new_cluster_df["protein_cluster"] = matching_cells[cluster_col].to_numpy()

# Fraction of each protein cluster assigned to the training set; the rest is validation
train_fraction = 0.7
output_csv = "./sampling_2000_cells_cluster/protein_clusters_cells.csv"

assignment_dfs = []

# Split within each protein cluster so every class keeps the same train/val ratio.
for cluster, group in new_cluster_df.groupby("protein_cluster", observed=True):
    n_train = int(len(group) * train_fraction)

    # Shuffle deterministically, then cut: first n_train rows -> train, remainder -> val
    shuffled = group.sample(frac=1, random_state=42)
    train_df = shuffled.iloc[:n_train].assign(assignment="train")
    val_df = shuffled.iloc[n_train:].assign(assignment="val")

    assignment_dfs.extend([train_df, val_df])

# Save to CSV
split_df = pd.concat(assignment_dfs).reset_index(drop=True)
# to_csv raises OSError if the target directory is missing, so create it first.
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
split_df.to_csv(output_csv, index=False)