In [None]:
import pandas as pd                     # Data manipulation and analysis (DataFrames)
import matplotlib.pyplot as plt         # Plotting and figure generation
import seaborn as sns                   # Statistical visualization with clean styles
import scanpy as sc                     # Single-cell / spatial omics analysis toolkit
import numpy as np                      # Numerical computing and arrays
from sklearn.decomposition import PCA   # Dimensionality reduction (principal components)
from sklearn.neighbors import NearestNeighbors  # k-NN search / graph construction
import igraph as ig                     # Graph data structures and algorithms
import leidenalg as la                  # Leiden community detection (graph clustering)
from umap.umap_ import fuzzy_simplicial_set  # UMAP fuzzy graph construction (advanced)
import umap                             # UMAP embeddings for DR/visualization
import os                               # OS utilities (paths, files)
import math                             # Math functions
import warnings                         # Control/suppress warnings
warnings.filterwarnings('ignore')       # Hide non-critical warnings for cleaner logs
import gzip                             # Read/write .gz compressed files
import json                             # JSON (configs/metadata I/O)
import pickle                           # Serialize/deserialize Python objects

# import tensorflow as tf               # (Optional) Deep learning backend (currently disabled)
import sys                              # Access Python path, argv, etc.
sys.path.append('/home/shamini/')       # Add custom code directory to Python path

# --- Duplicates below (already imported above) ---
from sklearn.decomposition import PCA   # (Duplicate) PCA already imported
from sklearn.neighbors import NearestNeighbors  # (Duplicate) already imported
import igraph as ig                     # (Duplicate) already imported
import leidenalg as la                  # (Duplicate) already imported
# from umap.umap_ import fuzzy_simplicial_set  # (Duplicate) already imported
import umap                             # (Duplicate) already imported

from sklearn.model_selection import train_test_split  # Split arrays into train/test sets

In [None]:
# List of 25 hex colour codes.
# Each entry is annotated with its name (where one was recorded) and its own
# RGB triplet, decoded from the hex value on the same line.
colors_palette = [
    '#ebac23',  # (no name recorded)  235,172,35
    '#b80058',  # lipstick            184,0,88
    '#008cf9',  # azure               0,140,249
    '#006e00',  # green               0,110,0
    '#00bbad',  # caribbean           0,187,173
    '#d163e6',  # lavender            209,99,230
    '#b24502',  # brown               178,69,2
    '#ff9287',  # coral               255,146,135
    '#5954d6',  # indigo              89,84,214
    '#00c6f8',  # turquoise           0,198,248
    '#878500',  # olive               135,133,0
    '#00a76c',  # jade                0,167,108
    '#274d52',  # plantation          39,77,82
    '#c7a2a6',  # eunry               199,162,166
    '#818b70',  # battleship          129,139,112
    '#604e3c',  # kabul               96,78,60
    '#8c9fb7',  # balihai             140,159,183
    '#796880',  # rum                 121,104,128
    '#56641a',  # fernfrond           86,100,26
    '#c0affb',  # perfume             192,175,251
    '#e6a176',  # apricot             230,161,118
    '#00678a',  # orient              0,103,138
    '#984464',  # vinrouge            152,68,100
    '#5eccab',  # downy               94,204,171
    '#bdbdbd',  # gray                189,189,189
]

In [None]:
# Project root; every other path in this cell is derived from it.
# All directory strings keep their trailing '/' because later cells build file
# paths by plain string concatenation.
working_dir = '/home/mystique27m/ext_gpu_hd/hackathon/'

### create output directories
main_out = f'{working_dir}out/'                         # base folder for all outputs
src_obj_dir = f'{main_out}script01b_output_objects/'    # objects read by this notebook
dst_obj_dir = f'{main_out}script01c_output_objects/'    # objects written by this notebook

# Create the destination folder (and any missing parents); no error if it exists.
os.makedirs(dst_obj_dir, exist_ok=True)

In [None]:
# File names of the AnnData objects found in the source directory.
# os.listdir() returns entries in arbitrary order, and a later cell loads
# adatas_filenames[0]; sorting makes that choice identical on every run and
# machine. Entries that are not .h5ad files (sub-folders, checkpoints, logs)
# are skipped because the selected name is handed straight to sc.read_h5ad().
adatas_filenames = sorted(
    name for name in os.listdir(src_obj_dir) if name.endswith('.h5ad')
)

# Reference AnnData object (gbmap) whose obs labels are used in later cells.
gbmap = sc.read_h5ad('/home/mystique27m/ext_gpu_hd/xenium/data/single_cell/gbmap/gbmap.h5ad')

# Alternative location of another gbmap file (disabled):
#gbmap = sc.read_h5ad('/home/shamini/data1/data_orig/data/single_cell/gbmap/core_gbmap.h5ad')

In [None]:
# Label columns that are read from gbmap.obs one at a time; the same names are
# used for the columns written to adata.obs.
# NOTE(review): levels 1-3 presumably run from coarse to fine, and
# 'celltype_original' / 'cell_type' presumably hold provider vs. harmonised
# labels - confirm against the gbmap documentation.
annotations_to_compare = [
    *(f'annotation_level_{level}' for level in (1, 2, 3)),
    'celltype_original',
    'cell_type',
]

In [None]:
# Load the first object from the source directory.
if not adatas_filenames:
    raise FileNotFoundError(f'No input objects found in {src_obj_dir}')
adata = sc.read_h5ad(src_obj_dir+adatas_filenames[0])

# Gene names of the loaded object.
genes = adata.var_names

### subset gbmap
# Keep only the part of feature_name before the first '_' as the gene name.
gbmap.var['gene'] = [gene.split('_')[0] for gene in gbmap.var['feature_name']]

# Genes present in both objects, sorted so the order is reproducible.
shared_genes = sorted(set(gbmap.var['gene']).intersection(set(adata.var_names)))
if not shared_genes:
    raise ValueError(
        'No genes are shared between gbmap and adata; '
        'check that both objects use the same gene identifiers.'
    )

# Select exactly ONE gbmap variable per shared gene, in shared_genes order.
#  - Order matters: adata is later subset with adata[:, shared_genes], and the
#    two tables are compared position by position, so both must list the genes
#    in the same order.
#  - Stripping the '_...' suffix can make several variables collapse onto one
#    gene name; duplicated columns would break the later per-gene comparison.
#    The first variable of each gene is kept.
#    NOTE(review): keep-first is a pragmatic choice - confirm it is the
#    desired handling of repeated symbols.
gbmap_var_shared = gbmap.var.loc[gbmap.var['gene'].isin(shared_genes), 'gene']
is_repeat = gbmap_var_shared.duplicated(keep='first')
if is_repeat.any():
    print(f'------ {int(is_repeat.sum())} gbmap variables repeat an already selected gene; keeping the first of each')
gbmap_var_shared = gbmap_var_shared[~is_repeat]
gbmap_var_names = pd.Index(
    pd.Series(gbmap_var_shared.index.to_numpy(), index=gbmap_var_shared.to_numpy())
    .loc[shared_genes]
    .to_numpy()
)

# Restrict (and reorder) gbmap to the selected variables.
gbmap = gbmap[:, gbmap_var_names]

# Dense cells x genes table of the reference:
#   rows    = gbmap.obs.index
#   columns = gene names, in shared_genes order
#   values  = gbmap.X
# .toarray() only exists on sparse matrices; fall back to np.asarray for dense X.
gbmap_X = gbmap.X.toarray() if hasattr(gbmap.X, 'toarray') else np.asarray(gbmap.X)
gbmap_obs = pd.DataFrame(gbmap_X, index=gbmap.obs.index, columns=gbmap.var['gene'])

In [None]:
import gc          # Standard-library garbage-collector interface (imported in this cell, not in the top import cell)
gc.collect()       # Run a full collection now; as the last expression of the cell, its integer return value is displayed as the cell output

In [None]:
# Display the reference expression table (a bare last expression is rendered as the cell output).
gbmap_obs   # DataFrame built from gbmap.X after gbmap was subset to the shared genes
            # - Rows (index): gbmap.obs.index
            # - Columns: gbmap.var.gene (gene names)
            # - Values: the entries of gbmap.X for each row/column pair

In [None]:
def _pearson_cells_vs_profiles(cell_expr, profiles, chunk_size=50_000):
    """Pearson correlation of every cell with every reference profile.

    Parameters
    ----------
    cell_expr : pd.DataFrame
        genes x cells expression values.
    profiles : pd.DataFrame
        genes x classes mean expression. Its index must equal
        ``cell_expr.index`` (same genes, same order).
    chunk_size : int, default 50_000
        Number of cells handled per matrix product; bounds peak memory.

    Returns
    -------
    pd.DataFrame
        classes x cells matrix of Pearson r. An entry is NaN when either
        vector has zero variance, which is also what ``np.corrcoef`` yields.

    Raises
    ------
    ValueError
        If the gene indexes of the two inputs differ in content or order.
    """
    if not cell_expr.index.equals(profiles.index):
        raise ValueError('cell_expr and profiles must share the same gene index in the same order')

    # Centre each profile once; r = <a_c, b_c> / (|a_c| * |b_c|).
    prof = profiles.to_numpy(dtype=float)
    prof = prof - prof.mean(axis=0, keepdims=True)
    prof_norm = np.sqrt((prof ** 2).sum(axis=0))

    n_cells = cell_expr.shape[1]
    corr = np.empty((prof.shape[1], n_cells), dtype=float)
    for start in range(0, n_cells, chunk_size):
        stop = min(start + chunk_size, n_cells)
        cells = cell_expr.iloc[:, start:stop].to_numpy(dtype=float)
        cells = cells - cells.mean(axis=0, keepdims=True)
        cell_norm = np.sqrt((cells ** 2).sum(axis=0))
        # 0/0 for constant vectors is expected and intentionally left as NaN.
        with np.errstate(divide='ignore', invalid='ignore'):
            corr[:, start:stop] = (prof.T @ cells) / np.outer(prof_norm, cell_norm)

    # Remove floating-point overshoot beyond [-1, 1] (np.corrcoef clips too).
    np.clip(corr, -1.0, 1.0, out=corr)
    return pd.DataFrame(corr, index=profiles.columns, columns=cell_expr.columns)


# Restrict adata to the shared genes, in shared_genes order. Take a real copy:
# obs columns are added below, and writing to a view would trigger an implicit
# copy (the corresponding warning is silenced in this notebook).
adata = adata[:, shared_genes].copy()

# genes x cells dense table of adata.X
# (.toarray() only exists on sparse matrices; fall back to np.asarray for dense X)
adata_X = adata.X.toarray() if hasattr(adata.X, 'toarray') else np.asarray(adata.X)
adf = pd.DataFrame(adata_X, index=adata.obs.index, columns=adata.var_names).T

for annotation in annotations_to_compare:
    print(f'--- Annotation: {annotation}')
    print(f'------ shape of gbmap after filtering: {gbmap.shape}')

    # Attach the current labels and average the reference expression per label.
    # observed=True skips categories that have no cells, whose mean would be NaN.
    gbmap_obs['annotation'] = gbmap.obs[annotation]
    df = (
        gbmap_obs.groupby('annotation', observed=True)
        .mean()
        .rename_axis('', axis=1)
        .rename_axis('', axis=0)
        .T
    )
    # df: genes x labels (mean profile of each label)
    print(f'------ shape of gbmap after grouping: {df.shape} \n')

    # Put the profile rows in exactly the gene order of adf. The correlation is
    # computed on raw arrays (position by position, index labels are ignored),
    # so a different gene order would silently correlate mismatched genes.
    df = df.loc[~df.index.duplicated(keep='first')]
    missing_genes = adf.index.difference(df.index)
    if len(missing_genes) > 0:
        raise ValueError(f'{len(missing_genes)} shared genes are missing from the gbmap profiles')
    df = df.loc[adf.index]

    ### pearson correlation of each cell of adf with each averaged label profile of df
    # rows = labels, columns = cell ids
    final_correlations = _pearson_cells_vs_profiles(adf, df)

    # Best-matching label and its correlation per cell. Cells whose correlations
    # are all NaN (e.g. constant expression) are left out of idxmax, which does
    # not accept all-NA input in newer pandas; reindexing gives them NaN.
    scorable = final_correlations.notna().any(axis=0)
    scores = final_correlations.loc[:, scorable].idxmax(axis=0)
    values = final_correlations.max(axis=0)

    # Align with the cell order of adata.obs
    scores = scores.reindex(adata.obs.index)
    values = values.reindex(adata.obs.index)

    adata.obs[f'{annotation}'] = scores.values        # transferred label
    adata.obs[f'{annotation}_corr'] = values.values   # its correlation

# Save the labelled object
adata.write_h5ad(dst_obj_dir+'adata_gbmap_labeled.h5ad')

print('Completed correlation analysis for adata')