In [None]:
import pandas as pd                     # Data manipulation and analysis (DataFrames)
import matplotlib.pyplot as plt         # Plotting and figure generation
import seaborn as sns                   # Statistical visualization with clean styles
import scanpy as sc                     # Single-cell / spatial omics analysis toolkit
import numpy as np                      # Numerical computing and arrays

from sklearn.decomposition import PCA   # Dimensionality reduction (principal components)
from sklearn.neighbors import NearestNeighbors  # k-NN graph construction

import igraph as ig                     # Graph data structures and algorithms
import leidenalg as la                  # Leiden community detection (clustering)

from umap.umap_ import fuzzy_simplicial_set  # Build UMAP fuzzy simplicial graph
import umap                             # UMAP dimensionality reduction and visualization

import os                               # OS utilities (file paths, directories)
import math                             # Math functions and constants
import pickle                           # Save/load serialized Python objects

import warnings                         # Control/suppress warnings
warnings.filterwarnings('ignore')       # Suppress non-critical warnings for cleaner logs

# import tensorflow as tf               # (Optional) Deep learning framework (disabled)

import sys                              # Access system-specific parameters and Python path

In [None]:
# Everything below is resolved relative to the current user's home directory
home_dir = os.path.expanduser('~')

# Make the local Banksy checkout importable: project root first, then the
# `banksy` and `banksy_utils` sub-folders (appended in that order)
banksy_root = os.path.join(home_dir, 'Banksy_py')
for banksy_path in (
    banksy_root,
    os.path.join(banksy_root, 'banksy'),
    os.path.join(banksy_root, 'banksy_utils'),
):
    sys.path.append(banksy_path)

# Project layout: <home>/ext_gpu_hd/hackathon/out/<per-script folders>
working_dir = os.path.join(home_dir, 'ext_gpu_hd', 'hackathon')
main_out = os.path.join(working_dir, 'out')

# Input: AnnData objects written by the script01c step
src_adatas = os.path.join(main_out, 'script01c_output_objects')

# Output: Banksy-processed objects from this notebook (script02a).
# NOTE: the trailing slash is kept because file names are appended to this
# string by plain concatenation further down.
out_obj_destdir = os.path.join(main_out, 'script02a_output_banksy_objects/')

# Create the output folder (and any missing parents); no error if it exists
os.makedirs(out_obj_destdir, exist_ok=True)

In [None]:
# Show what the script01c step produced (typically .h5ad files).
# os.listdir() returns entries in arbitrary order, so sort the listing to keep
# the displayed order stable across runs and machines.
sorted(os.listdir(src_adatas))

In [None]:
'''
1. FIRST WE WILL PERFORM A NEAREST NEIGHBOR BASED DISTANCE CALCULATION
   TO COMPUTE THE NECESSARY DISTANCES BETWEEN THE CELLS
'''
# Step 1: set up the nearest-neighbour framework used by the Banksy analysis

# Banksy helpers (importable because the Banksy folders were added to sys.path)
from banksy.main import median_dist_to_nearest_neighbour
from banksy.initialize_banksy import initialize_banksy

# set params
# ==========
plot_graph_weights = True                   # Flag to visualize neighbor graph weights
k_geom = 30                                 # Number of spatial neighbours per cell
max_m = 1                                   # Azimuthal transform order
nbr_weight_decay = "scaled_gaussian"        # Decay function for neighbor weighting
                                            # Options: "reciprocal", "uniform", "ranked", or "scaled_gaussian"

# Pick the input object by position in a SORTED listing.
# os.listdir() returns entries in arbitrary order, so indexing the raw listing
# could load a different sample on another machine or after the folder changes.
adata_file_index = 1                        # position (0-based) in the sorted listing
adata_files = sorted(os.listdir(src_adatas))
if adata_file_index >= len(adata_files):
    raise IndexError(
        f"Expected at least {adata_file_index + 1} entries in {src_adatas}, "
        f"found {len(adata_files)}"
    )

adata = sc.read_h5ad(os.path.join(src_adatas, adata_files[adata_file_index]))

In [None]:
# Expose the per-cell (x, y) coordinates from .obs as an array under .obsm['spatial']
adata.obsm['spatial'] = adata.obs[['x', 'y']].values

# Median distance to the closest neighbours, computed from the 'spatial' coordinates.
# Per the original notes this median is the `sigma` scale; the value is kept in
# `nbrs` and is not passed to initialize_banksy below.
nbrs = median_dist_to_nearest_neighbour(adata, key='spatial')

# Names of the x column, the y column and the .obsm key holding the coordinates
coord_keys = ('x', 'y', 'spatial')

# All diagnostic plots are switched off (the angular-feature plot in particular
# was noted as slowing the run down)
plot_flags = dict(
    plt_edge_hist=False,
    plt_nbr_weights=False,
    plt_agf_angles=False,
    plt_theta=False,
)

# Build the Banksy dictionary from the k_geom-neighbour spatial graph
banksy_dict = initialize_banksy(
    adata,
    coord_keys,
    k_geom,
    nbr_weight_decay=nbr_weight_decay,
    max_m=max_m,
    **plot_flags,
)

# Silence FutureWarning messages from here on
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
'''
2. NEXT WE WILL CONSTRUCT A BANKSY MATRIX
'''
# Step 2: build the Banksy feature matrix that is clustered in step 3

from banksy.embed_banksy import generate_banksy_matrix

### ------------------------------------------------------------------
### main hyperparameters for the banksy algorithm
### ------------------------------------------------------------------

# Number(s) of principal components used for the reduction in step 3
pca_dims = [21]

# Lambda values to evaluate. Per the original notes, lambda sets the weight of
# the spatial neighbourhood: the low value (0.1) targets global clustering and
# the high value (0.8) domain-specific clustering.
lamda_list = [0.1, 0.8]

# Returns the updated Banksy dictionary together with the Banksy matrix
banksy_dict, banksy_matrix = generate_banksy_matrix(
    adata, banksy_dict, lamda_list, max_m, verbose=False
)

In [None]:
### Baseline for comparison: expression-only ("non-spatial") entry in banksy_dict

from banksy.main import concatenate_all

# Wrap just the expression matrix (adata.X) with concatenate_all, passing 0 as
# the second argument, so the baseline is stored in the same form as the
# spatial entries
nonspatial_adata = concatenate_all([adata.X], 0, adata=adata)

# Stored under the dummy lambda key 0.0
banksy_dict['nonspatial'] = {0.0: {"adata": nonspatial_adata}}

In [None]:
### append non-spatial results to the banksy_dict for comparison
# Add baseline (non-spatial) results so we can compare them with spatial Banksy outputs

from banksy.main import concatenate_all
# Utility function to combine matrices into an AnnData-compatible format

# This cell appears twice in the notebook. Only build the baseline when it is
# missing, so a second run does not recompute the matrix and overwrite an
# entry that already exists.
if 'nonspatial' not in banksy_dict:
    banksy_dict['nonspatial'] = {   # Create a new entry 'nonspatial' inside banksy_dict
        0.0: {
            "adata": concatenate_all([adata.X], 0, adata=adata),
            # Use only the expression matrix (adata.X) without spatial information
            # 0.0 = dummy lambda value (no spatial weighting)
            # Store as AnnData object for downstream non-spatial clustering
        }
    }

In [None]:
'''
3. BANKSY APPLIES PCA AND UMAP OVER THE SPATIAL DERIVED MATRIX, FOLLOWING BY LEIDEN CLUSTERING
'''
# Step 3: PCA -> UMAP -> Leiden clustering on every entry of banksy_dict

# Free whatever memory can be reclaimed before the heavy steps
import gc
gc.collect()

from banksy_utils.umap_pca import pca_umap
from banksy.cluster_methods import run_Leiden_partition

# Reduce each entry of banksy_dict to `pca_dims` components and add UMAP
# embeddings; the variance plot and verbose logging are disabled
pca_umap(banksy_dict, pca_dims=pca_dims, add_umap=True,
         plt_remaining_var=False, verbose=False)

# Seed passed to the Leiden partitioning for reproducibility
seed = 329

# Leiden resolution(s) to run; extend the list (e.g. .1 ... .5) to scan more
resolutions = [0.6]

# Clustering options, values unchanged from the original call. Per the
# original notes, num_iterations=-1 means "iterate until convergence".
leiden_kwargs = dict(
    num_nn=50,
    num_iterations=-1,
    partition_seed=seed,
    match_labels=True,
    verbose=False,
)

results_df, max_num_labels = run_Leiden_partition(
    banksy_dict, resolutions, **leiden_kwargs
)

# Partition names: one row of results_df per clustering run
p_names = results_df.index

# Without any partition there is nothing to copy; fail with a clear message
# instead of a NameError further down
if len(p_names) == 0:
    raise ValueError("results_df contains no partitions; nothing to copy into adata")

# Copy the cluster labels of every partition into adata.obs
for p_name in p_names:
    labels = results_df.loc[p_name, 'relabeled']     # cluster labels (after relabeling)
    adata_results = results_df.loc[p_name, "adata"]  # clustered AnnData object

    # Labels and embeddings are only meaningful if both objects describe the
    # same cells in the same order. Replacing adata.obs with a reindexed frame
    # would relabel the rows without reordering adata.X, so check instead.
    if not adata_results.obs.index.equals(adata.obs.index):
        raise ValueError(
            f"Cell index of partition {p_name!r} does not match adata.obs; "
            "refusing to copy labels onto misaligned cells"
        )

    label_name = f"labels_{p_name}"                  # column name for labels
    print(label_name)

    # Store labels as strings, then as a categorical column
    adata_results.obs[label_name] = np.char.mod('%d', labels.dense)
    adata_results.obs[label_name] = adata_results.obs[label_name].astype('category')

    adata.obs[label_name] = adata_results.obs[label_name]   # add labels to adata.obs

# Save the reduced representations into adata.obsm.
# The original code took these from whichever partition the loop visited last;
# that choice is kept but made explicit.
# TODO(review): confirm the last row of results_df is the intended source.
embedding_source = results_df.loc[p_names[-1], "adata"]
n_pcs = pca_dims[0]
adata.obsm[f'pc{n_pcs}_banksy'] = embedding_source.obsm[f'reduced_pc_{n_pcs}'].copy()
adata.obsm[f'umap{n_pcs}_banksy'] = embedding_source.obsm[f'reduced_pc_{n_pcs}_umap'].copy()

# Sample name used in the output file names: the 'sample_id' of the first cell.
# .iloc[0] is explicit positional access; plain [0] on a label-indexed Series
# is deprecated in pandas and will be treated as a label lookup.
sample_name = banksy_dict['nonspatial'][0.0]['adata'].obs['sample_id'].iloc[0]

# File name -> object to pickle (written in this order)
pickle_outputs = {
    f'banksy_results_{sample_name}.pkl': banksy_dict,
    f'banksy_results_{sample_name}_results_df.pkl': results_df,
    f'banksy_results_{sample_name}_max_num_labels.pkl': max_num_labels,
    f'banksy_results_{sample_name}_p_names.pkl': p_names,
}

# os.path.join gives the same paths as before and no longer depends on
# out_obj_destdir ending with a path separator
for file_name, obj in pickle_outputs.items():
    with open(os.path.join(out_obj_destdir, file_name), 'wb') as f:
        pickle.dump(obj, f)

# Keep FutureWarning messages suppressed for cleaner output
warnings.simplefilter(action='ignore', category=FutureWarning)