In [None]:
# imports and global matplotlib configuration

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import normalized_mutual_info_score
from sklearn.preprocessing import MinMaxScaler
import pickle
import matplotlib.colors as mcolors
from scipy.ndimage import gaussian_filter1d
import seaborn as sns
import matplotlib as mpl
from tqdm import tqdm
from matplotlib import cm
from scipy.cluster.hierarchy import linkage, dendrogram,fcluster
import networkx as nx
from scipy.spatial import distance
from scipy.spatial.distance import squareform
from scipy.spatial.distance import pdist
import matplotlib.colors as mcolors
from scipy.ndimage import gaussian_filter
from skimage import measure
mpl.rcParams['pdf.fonttype'] = 42
import scipy
import scanpy as sc
import zarr
# import tarfile
import io
import anndata as ad
import os

## Position cell-type transcriptomic centroids onto STalign-registered Slide-seq data from Langlieb et al., using their prefiltering

In [None]:
# For every STalign-registered Slide-seq puck: keep the beads that fall on our
# matching section, keep only beads with a single confident cell-type call, give
# each of those beads the snRNA-seq average expression of its cell type, and
# save one bead x gene table (plus registered coordinates) per section.

# The cluster-average table does not depend on the puck, so it is read once and
# subset per puck inside the loop instead of being re-read on every iteration.
cluster_avg_expression_all = pd.read_csv('/data/LBA_DATA/Macosko/Single_Nuc_Cluster_Avg_Expression.csv.gz', index_col=0)
# Column labels are truncated at the first '='; labels that become identical
# are averaged together before saving (see the groupby at the end of the loop).
cluster_avg_expression_all.columns = [_.split('=')[0] for _ in cluster_avg_expression_all.columns]

# The nearest-cell matching cell reads "./gene_expression/gexpr_<section>.h5",
# so the tables are written there rather than into the working directory.
gexpr_output_dir = './gene_expression'
os.makedirs(gexpr_output_dir, exist_ok=True)

for file_select in tqdm(os.listdir('./zenodo/csv/macosko_STalign/')):
    # File names are parsed as "<prefix>_<section>__<...>_<puck>.<ext>"; anything
    # else in the directory (e.g. ".ipynb_checkpoints") is skipped, not fatal.
    try:
        puck_num_select = int(file_select.split('_')[-1].split('.')[0])
        section_select = int(file_select.split('__')[0].split('_')[1])
    except (ValueError, IndexError):
        print(f"Skipping unrecognized file name: {file_select}")
        continue

    print(section_select)

    # prepare the puck with STAligned coordinates
    puck_name = f'Puck_Num_{puck_num_select}'
    puck = ad.read_h5ad(f"/data/LBA_DATA/Macosko/All_Pucks_h5ad/{puck_name}.h5ad")
    matrix_file = f'/data/LBA_DATA/Macosko/Mapping_Matrices/{puck_name}.mapping.mtx.gz'
    matrix = scipy.io.mmread(matrix_file)
    metadata_file = f'/data/LBA_DATA/Macosko/Mapping_Matrices/{puck_name}.mapping.metadata.tsv.gz'
    metadata = pd.read_csv(metadata_file, sep='\t', index_col=0)
    cell_types_file = f'/data/LBA_DATA/Macosko/Mapping_Matrices/{puck_name}.mapping.MappedCellTypes.txt'
    cell_types = np.loadtxt(cell_types_file, dtype=str)
    # bead x cell-type mapping weights, indexed like the mapping metadata
    ctass = pd.DataFrame(matrix.toarray(), columns=cell_types)
    ctass.index = metadata.index
    ctass['puckID'] = puck_name
    c1 = pd.read_csv(f"/data/LBA_DATA/Macosko/macosko_STalign/MSI_{section_select}__Macosko_{puck_num_select}.csv", index_col=0)

    # prefilter the cell types puck using spatial matching to our correspondent section
    puck_triplets = puck.obs[['CCF_Y', 'CCF_Z', 'CCF_X']]
    c1_triplets = c1[['y_index', 'z_index', 'x_index']]
    c1_tuples = set(c1_triplets.itertuples(index=False, name=None))
    mask = np.array(
        [triplet in c1_tuples for triplet in puck_triplets.itertuples(index=False, name=None)],
        dtype=bool,
    )
    puck_filtered = puck[mask].copy()
    # NOTE(review): the two assignments below are positional. They assume the rows
    # of c1 are in the same order as the retained puck beads; a length mismatch
    # raises, but a different ordering would go unnoticed. Confirm upstream.
    c1['acronym'] = np.array(puck_filtered.obs['CCF_acronym'])
    c1['puckID'] = np.array(puck_filtered.obs.index)
    c1 = c1.set_index('puckID').loc[ctass.index]

    # drop the columns that are not in the snRNA data
    # (columns[:-1] leaves out the 'puckID' column appended above)
    celltypes_intersect = np.intersect1d(ctass.columns[:-1], cluster_avg_expression_all.index)
    ctass = ctass.loc[:, celltypes_intersect]
    dfCluster_average_Expression = cluster_avg_expression_all.loc[ctass.columns, :]

    # follow Langlieb et al guidelines on RCTD believable cell type assignments:
    # weights below 0.3 are discarded and only beads left with exactly one
    # cell type are kept
    ctass[ctass < 0.3] = 0.0
    ctass = ctass > 0
    ctass = ctass.loc[ctass.sum(axis=1) == 1, :]

    # assign the scRNA-seq centroid for each cell type singlet found by Slide-seq
    active_clusters = ctass.astype(int).idxmax(axis=1)
    cell_x_gene = dfCluster_average_Expression.loc[active_clusters].copy()
    cell_x_gene.index = ctass.index
    cell_x_gene['y_index'] = c1.loc[cell_x_gene.index, 'y_index_new']
    cell_x_gene['z_index'] = c1.loc[cell_x_gene.index, 'z_index_new']

    # save the imputed gene expression puck to file
    # Columns sharing a label are averaged. groupby(axis=1) is deprecated since
    # pandas 2.1, so the frame is grouped on its transpose instead.
    cell_x_gene_avg = cell_x_gene.T.groupby(level=0).mean().T
    # NOTE(review): the file is keyed by section only, so if several pucks map to
    # the same section the last one processed overwrites the earlier ones.
    output_filename = os.path.join(gexpr_output_dir, f"gexpr_{section_select}.h5")
    cell_x_gene_avg.to_hdf(output_filename, key="table", mode='w')

## Prepare to compare cell types with their neighboring lipizones

In [None]:
# For each section, find for every pixel of `dat` the closest imputed cell
# (Euclidean distance in the y_index/z_index plane) and keep it only if it lies
# within a radius of 4; pixels with no cell in range get NaN.
# NOTE(review): `dat` is not defined in the cells shown here; it must already
# exist in the kernel with 'Section', 'y_index' and 'z_index' columns.
all_results = []

for i in tqdm(range(1, 33)):
    try:
        gexpr = pd.read_hdf(f"./gene_expression/gexpr_{i}.h5", key="table")

        sec = dat.loc[dat['Section'] == i,:]
        sec = sec.loc[sec['z_index'] > 456/2,:]

        # rows = cells, columns = pixels. `cdist` is reached through the imported
        # `scipy.spatial.distance` module; a bare `cdist` is not imported anywhere.
        distance_matrix = distance.cdist(gexpr[['y_index', 'z_index']],
                                         sec[['y_index', 'z_index']],
                                         metric='euclidean')

        # Anything farther than 4 (or NaN) is made ineligible in place, so the
        # dense matrix is held once instead of as several DataFrame copies.
        distance_matrix[~(distance_matrix <= 4)] = np.inf # this is still relatively large as a radius

        # find closest cells and their indices
        nearest_row = distance_matrix.argmin(axis=0)
        has_match = np.isfinite(distance_matrix[nearest_row, np.arange(distance_matrix.shape[1])])
        # Pixels without any cell in range become NaN. This avoids idxmin on
        # all-NaN columns, whose behaviour is deprecated in recent pandas.
        closest_indices = pd.Series(gexpr.index.to_numpy()[nearest_row], index=sec.index).where(has_match)

        all_results.append(closest_indices)
        del gexpr, distance_matrix

    except Exception as e:
        # best-effort: a section that cannot be processed (e.g. no gexpr file)
        # is reported and skipped
        print(f"Error processing section {i}: {str(e)}")
        continue

pixelclosestcells = pd.concat(all_results)
# NOTE(review): this is a pandas HDF5 table despite the ".h5ad" extension.
pixelclosestcells.to_hdf("pixelclosestcells.h5ad", key="table")
pixelclosestcells