In [2]:
import scanpy as sc
import numpy as np

# Define datasets and, in the same order, the adata.obs column holding each one's cell type labels
datasets = ["forebrain", "pancreas", "gastrulation_erythroid", "dentategyrus_lamanno_P5"]
cell_type_keys = ["Clusters", "clusters", "celltype", "clusters"]
# zip() silently stops at the shorter list, so fail loudly if the two fall out of sync
assert len(datasets) == len(cell_type_keys), "each dataset needs exactly one cell type key"

# Number of gene programs reported at each end of the ranking
TOP_N = 5


def rank_gene_programs(mean_activation, n_top=TOP_N):
    """Return indices of the highest and lowest entries of a 1-D activation vector.

    Parameters
    ----------
    mean_activation : array-like
        One value per gene program.
    n_top : int
        How many indices to return at each end.

    Returns
    -------
    (highest_idx, lowest_idx) : tuple of np.ndarray
        ``highest_idx`` is ordered from largest value downwards,
        ``lowest_idx`` from smallest value upwards.
    """
    # Sort once and slice both ends from the same ordering
    order = np.argsort(mean_activation)
    return order[::-1][:n_top], order[:n_top]


def print_ranked_programs(header, names, values, indices):
    """Print ``header`` followed by one ``name: value`` line per index in ``indices``."""
    print(header)
    for idx in indices:
        print(f"{names[idx]}: {values[idx]}")


# Loop through datasets and process each
for dataset, cell_type_key in zip(datasets, cell_type_keys):
    adata = sc.read_h5ad(f"benchmark/imVelo/{dataset}/imVelo_{dataset}.h5ad")

    # Gene program activations (one row per cell), the cell type labels,
    # and the gene program names used to label the columns of z
    z = adata.obsm["z"]
    cell_types = adata.obs[cell_type_key]
    gene_program_names = np.array(adata.uns["terms"])

    # Drop missing labels first: a NaN label never compares equal to anything,
    # so it would select zero cells and produce an all-NaN mean.
    for cell_type in cell_types.dropna().unique():
        # Plain boolean array so the row selection does not depend on the pandas index
        cell_type_mask = (cell_types == cell_type).to_numpy()

        # Mean activation across this cell type's cells, one value per gene program
        mean_activation = np.mean(z[cell_type_mask, :], axis=0)

        activated_idx, inactivated_idx = rank_gene_programs(mean_activation, TOP_N)

        print_ranked_programs(
            f"Top {TOP_N} activated gene programs for cell type: {cell_type} in dataset: {dataset}",
            gene_program_names,
            mean_activation,
            activated_idx,
        )
        print_ranked_programs(
            f"Top {TOP_N} inactivated gene programs for cell type: {cell_type} in dataset: {dataset}",
            gene_program_names,
            mean_activation,
            inactivated_idx,
        )

        print("-" * 50)


Top 5 activated gene programs for cell type: Neuroblast 2 in dataset: forebrain
PREFOLDIN_MEDIATED_TRANSFER_OF: 16.43832520212993
PLATELET_ACTIVATION_SIGNALING_: 10.233322860725433
HEMOSTASIS: 9.225308384761274
PYRAMIDAL_CELLS: 8.447613077470098
3_UTR_MEDIATED_TRANSLATIONAL_R: 5.8996798446379515
Top 5 inactivated gene programs for cell type: Neuroblast 2 in dataset: forebrain
RESPONSE_TO_ELEVATED_PLATELET_: -29.640441971131597
LOSS_OF_NLP_FROM_MITOTIC_CENTR: -8.113931739665418
CARDIOMYOCYTES: -6.831082119999162
AXON_GUIDANCE: -6.097901012524065
ACTIVATION_OF_THE_MRNA_UPON_BI: -5.934456652905568
--------------------------------------------------
Top 5 activated gene programs for cell type: Radial Glia 2 in dataset: forebrain
MUSCLE_CONTRACTION: 8.715580810983496
TRAF6_MEDIATED_INDUCTION_OF_NF: 6.246776203434151
APOPTOTIC_EXECUTION_PHASE: 6.046907224748508
DESTABILIZATION_OF_MRNA_BY_AUF: 5.756696156349527
CIRCADIAN_REPRESSION_OF_EXPRES: 5.667605136352849
Top 5 inactivated gene programs f