In [None]:
import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"]       = "1"
os.environ["MKL_NUM_THREADS"]       = "1"
os.environ["NUMEXPR_NUM_THREADS"]   = "1"
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42

# Load the two input tables used throughout the notebook: the lipidomic atlas
# and the filtered MERFISH cell table.
# NOTE(review): presumably one row per lipidomic pixel / per MERFISH cell — confirm against the Zenodo record.
atlas = pd.read_parquet("./zenodo/maindata_2.parquet")
merfish = pd.read_parquet("./zenodo/multimodal/cell_filtered_w500genes.parquet")

## Prepare the data by acronym-constrained neighbor matching of cell types to lipidomic voxels in CCF

In [None]:
# Work on the atlas under the name used by the rest of this section.
datavignettes = atlas
# CCF coordinates of the lipidomic pixels that have a complete position.
lipidsinallen = datavignettes[['xccf','yccf','zccf']].dropna()
# CCF coordinates of the MERFISH cells, renamed to the atlas column names.
# An explicit copy is taken because columns are added to this frame later;
# without it those writes target a slice of `merfish` (chained assignment).
merfishinallen = merfish[['x_ccf', 'y_ccf', 'z_ccf']].copy()
merfishinallen.columns = ['xccf','yccf','zccf']

In [None]:
from allensdk.core.mouse_connectivity_cache import MouseConnectivityCache
# Fetch the Allen annotation volume and look up a structure id for every MERFISH cell.
mcc = MouseConnectivityCache(manifest_file='mouse_connectivity_manifest.json')
annotation, _ = mcc.get_annotation_volume()
# Convert CCF coordinates into integer voxel indices of the annotation volume.
# NOTE(review): the factor 40 presumably means mm coordinates on a 25 um grid — confirm.
# NOTE(review): astype(int) truncates toward zero and fails on NaN; a negative index would
# silently wrap around to the other end of the volume.
merfish['x_index'] = (merfish['x_ccf']*40).astype(int)
merfish['y_index'] = (merfish['y_ccf']*40).astype(int)
merfish['z_index'] = (merfish['z_ccf']*40).astype(int)
merfish['id'] = annotation[merfish['x_index'], merfish['y_index'], merfish['z_index']]
# Bring both id columns to the same string representation so they can be used as matching keys.
datavignettes = datavignettes.dropna(subset=['id'])
datavignettes['id'] = datavignettes['id'].astype(int).astype(str)
merfish['id'] = merfish['id'].astype(str)
merfishinallen['id'] = merfish['id'].values
#drop vascular and immune cells first...
merfishinallen['division'] = merfish['division'].values
merfishinallen = merfishinallen.loc[~merfishinallen['division'].isin(['6 Vascular', '7 Immune']),:]
# Keep only lipidomic pixels with a CCF position, and freeze a copy that later cells
# use as the full table (`datavignettes` itself is rebound inside the matching loop).
datavignettes =datavignettes.dropna(subset=['xccf'])
datavignettess = datavignettes.copy()

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from threadpoolctl import threadpool_limits, threadpool_info
# Cap the native thread pools (BLAS/OpenMP) for the rest of the session.
# NOTE(review): the returned limiter object is discarded; confirm the limit is meant to stay in force globally.
threadpool_limits(limits=8)
import os
# NOTE(review): OMP_NUM_THREADS was already set to "1" in the first cell, before numpy was imported.
# Changing it here, after the numeric libraries are loaded, presumably has no effect on this process
# (it is still inherited by child processes) — confirm which value is intended.
os.environ['OMP_NUM_THREADS'] = '6'

# 1) pre-group merfish and build trees
# One KD-tree (cell positions) and one label array (cluster of each cell) per
# annotation id; built once and reused for every section below.
trees, feats = {}, {}
retained_cells = merfish.loc[merfishinallen.index, :]  # vascular/immune cells were removed from merfishinallen
for region_id, region_cells in retained_cells.groupby('id'):
    trees[region_id] = cKDTree(region_cells[['x_ccf','y_ccf','z_ccf']].to_numpy())
    feats[region_id] = region_cells['labels_cluster'].to_numpy()

from tqdm import tqdm
import pandas as pd

# Radius of the neighbour search around each lipidomic pixel, in the units of the
# CCF coordinate columns (presumably mm — TODO confirm).
thr = 0.05 
# Parallel lists: index label of every matched pixel and its assigned cluster label.
idxs = []
modes = []    

for sec in tqdm(datavignettess['SectionID'].unique()):
    # NOTE: this rebinds the module-level name `datavignettes` to a single section.
    datavignettes = datavignettess[datavignettess['SectionID'] == sec]

    # Matching is constrained to cells carrying the same annotation id as the pixel.
    for id_, dsub in datavignettes.groupby('id'):
        tree = trees.get(id_)
        if tree is None:
            # no MERFISH cell was annotated with this id
            continue

        # query points in this vignette
        qpts = dsub[['xccf','yccf','zccf']].to_numpy()
        nbrs_list = tree.query_ball_point(qpts, r=thr)

        # cluster labels of the MERFISH cells that share this annotation id
        arr = feats[id_]

        for i, nbrs in enumerate(nbrs_list):
            # pixels without any cell inside the radius stay unassigned
            if nbrs:
                idxs.append(dsub.index[i])
                # build a Series, take its mode, pull out the first (in case of ties)
                most_common = pd.Series(arr[nbrs]).mode().iat[0]
                modes.append(most_common)


# Assignment table: one row per matched pixel holding its modal cluster label.
celltypes = pd.DataFrame(modes, index = idxs, columns = ["putative_celltype"])

# Lookup from cluster to its colour and coarser taxonomy labels, keyed by `cluster`.
annotation_cols = ['subclass_color', 'labels_division', 'labels_class', 'labels_subclass', 'labels_supertype']
mappingtab = merfish[['cluster'] + annotation_cols].drop_duplicates()
mappingtab.index = mappingtab['cluster']

# Attach every annotation column by looking up the assigned cluster.
for annotation_col in annotation_cols:
    celltypes[annotation_col] = celltypes['putative_celltype'].map(mappingtab[annotation_col])

celltypes.to_parquet("merfishsubclassannotated_acronymized.parquet")

In [None]:
# Reload the stored assignment table and restore the full (all-section) pixel table.
celltypes = pd.read_parquet("./zenodo/multimodal/merfishsubclassannotated_acronymized.parquet")
datavignettes = datavignettess 
# `result` is the same object as `celltypes`; the columns below are added to both names.
result = celltypes
for carried_col in ('SectionID', 'xccf', 'yccf', 'zccf', 'boundary'):
    result[carried_col] = datavignettes.loc[result.index, carried_col]

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Overview figure: up to 40 randomly chosen sections, every matched pixel drawn in
# the colour of its assigned MERFISH subclass.
N_PANELS = 40
section_ids = result['SectionID'].unique()
# A seeded generator makes the selection repeatable, and the sample size is capped
# because sampling without replacement raises when more items are requested than exist.
rng = np.random.default_rng(0)
selected_sections = rng.choice(section_ids, size=min(N_PANELS, len(section_ids)), replace=False)

fig, axes = plt.subplots(nrows=4, ncols=10, figsize=(20, 8))
axes = axes.flatten() 
for ax, section_id in zip(axes, selected_sections):
    data = result[result['SectionID'] == section_id]
    ax.scatter(data['zccf'], -data['yccf'], c=data['subclass_color'], s=0.1, rasterized = True)

    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title(f"Section {section_id}", fontsize=8)
    for spine in ax.spines.values():
        spine.set_visible(False)

# Blank out panels that received no section (only happens with fewer than 40 sections).
for ax in axes[len(selected_sections):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# `neighs` is the same object as `celltypes`; expose the assigned cluster under the MERFISH column name.
neighs = celltypes
neighs['labels_cluster'] = neighs['putative_celltype']
# Restrict the atlas to the matched pixels, in the row order of `neighs`.
atlas = atlas.loc[neighs.index,:]
# Turn each level into the cumulative path from level_1 down to that level
# (level_i becomes the already-concatenated level_(i-1) followed by level_i).
# NOTE: not idempotent — running this cell twice concatenates the codes again.
for i in range(2, 11):
    atlas['level_'+str(i)] = atlas['level_'+str(i-1)].astype(str) + atlas['level_'+str(i)].astype(str)

atlas['level_9']

## Clean cell types vs lipizones

In [None]:
import scipy.cluster.hierarchy as sch
from tqdm import tqdm

# Lipidomic hierarchy levels and MERFISH taxonomy levels that are compared pairwise.
LABEL1s = ['level_6', 'level_7', 'level_8', 'level_9', 'level_10', 'lipizone_names']
LABEL2s = ['labels_subclass', 'labels_supertype', 'labels_cluster']

# Pairs sharing fewer pixels than this are not scored.
NMATCHEDPIXELS = 100
# Score threshold, used both to keep a cell-type column and to call a lipizone a match.
MINCOLOC = 200 ############ HARDCORE

lipizonescelltypes = []
for LABEL1 in tqdm(LABEL1s):
    for LABEL2 in LABEL2s:
        # atlas and neighs are row-aligned, so the two label arrays pair up by position.
        lipizoneZ = atlas[LABEL1].copy().values
        celltypeZ = neighs[LABEL2].copy().values

        # Pixel counts, rows = lipizone code, columns = cell type (computed once, reused below).
        counts = pd.crosstab(lipizoneZ, celltypeZ)

        # Enrichment 1: share of each cell type falling in each lipizone,
        # divided by that lipizone's mean share over cell types.
        normalized_df = counts / counts.sum() # fraction 
        normalized_df1 = (normalized_df.T / normalized_df.T.mean()).T ## switch to enrichments

        # Enrichment 2: share of each lipizone taken by each cell type,
        # divided by that cell type's mean share over lipizones.
        cmat = counts.T
        normalized_df = cmat / cmat.sum() # fraction 
        normalized_df2 = (normalized_df.T / normalized_df.T.mean()) ## switch to enrichments

        # Colocalization score = product of the two enrichments (lipizone x cell type).
        normalized_df = normalized_df2 * normalized_df1
        normalized_df[counts < NMATCHEDPIXELS] = 0 #####################
        # MINCOLOC replaces the literal 200 previously repeated here (same value).
        normalized_df = normalized_df.loc[:, normalized_df.sum() > MINCOLOC] #####################

        # Order columns by hierarchical clustering, then rows by the position of their best column.
        linkage = sch.linkage(sch.distance.pdist(normalized_df.T), method='weighted', optimal_ordering=True)
        order = sch.leaves_list(linkage)
        normalized_df = normalized_df.iloc[:, order]

        order = np.argmax(normalized_df.values, axis=1)
        order = np.argsort(order)
        normalized_df = normalized_df.iloc[order,:]

        # Lipizone codes whose best score exceeds MINCOLOC, strongest first.
        maxcoloc = normalized_df.max(axis=1).sort_values()
        lipizonesthatarecelltypes = maxcoloc[maxcoloc > MINCOLOC][::-1]
        # Best-scoring cell type of each of those codes (.loc already imposes the order).
        lipizonescelltype = pd.DataFrame(normalized_df.idxmax(axis=1).loc[lipizonesthatarecelltypes.index])
        lipizonescelltype.columns = ['putative_celltype']
        # Strip everything up to and including the first '=' from the label.
        lipizonescelltype['putative_celltype'] = (
            lipizonescelltype['putative_celltype']
              .str.replace(r'^.*?=', '', regex=True)
        )
        lipizonescelltype['colocalization'] = maxcoloc.loc[lipizonescelltype.index]
        lipizonescelltype['label1'] = LABEL1
        lipizonescelltype['label2'] = LABEL2
        lipizonescelltypes.append(lipizonescelltype)
        
colocalization_all_levels = pd.concat(lipizonescelltypes)
colocalization_all_levels.to_csv("colocalization_all_levels.csv")
colocalization_all_levels

In [None]:
# One row per lipizone with its code at every compared level, indexed by lipizone name.
lipizonesxlevels = atlas[LABEL1s].drop_duplicates().reset_index().iloc[:,1:]
lipizonesxlevels.index = lipizonesxlevels.lipizone_names
# {level: {code: matched cell type}}.
# NOTE(review): a code matched under several MERFISH granularities occurs several times in
# colocalization_all_levels; to_dict() keeps the LAST occurrence, i.e. the last label2 in
# LABEL2s order, not the highest colocalization — confirm this is intended.
mapping = {
    lvl: grp['putative_celltype'].to_dict()
    for lvl, grp in colocalization_all_levels.groupby('label1')
}
# Replace every code by its matched cell type (NaN where the code has no match).
result = lipizonesxlevels.copy()
for lvl in result.columns:
    result[lvl] = result[lvl].map(mapping.get(lvl, {}))

frommacoscko = pd.read_csv("./zenodo/csv/lipizone_celltype_correspondence_macoscko.csv", index_col=0)
# Prefer the finest level: take the first level, from lipizone up to level_6, that has a match.
priority = ['lipizone_names', 'level_10', 'level_9', 'level_8', 'level_7', 'level_6']
result['celltype'] = result[priority].bfill(axis=1).iloc[:, 0]
result['levelmatch'] = result[priority].apply(lambda row: row.first_valid_index(), axis=1)
orig = lipizonesxlevels
col_all = colocalization_all_levels  
# Same last-occurrence rule as `mapping`, so label2 / colocalization describe the same entry.
map_label2 = col_all['label2'].to_dict()
map_coloc  = col_all['colocalization'].to_dict()
def safe_get_code(idx, lvl):
    """Return the code of lipizone `idx` at level `lvl`, or NaN when no level matched."""
    if pd.isna(lvl) or lvl not in orig.columns:
        return np.nan
    return orig.at[idx, lvl]

# Code of each lipizone at the level where its match was found.
result['orig_code'] = [
    safe_get_code(idx, lvl)
    for idx, lvl in zip(result.index, result['levelmatch'])
]

result['label2']        = result['orig_code'].map(map_label2)
result['colocalization'] = result['orig_code'].map(map_coloc)
# Explicit copy: a column is added right below, which would otherwise be a write
# into a slice of `result` (SettingWithCopyWarning / no-op under copy-on-write).
celltypematched = result[["celltype", "levelmatch", "label2", "colocalization"]].copy()
# Independent assignment from the second dataset, where available for the lipizone.
celltypematched['celltype_macoscko'] = [
    frommacoscko.loc[x, 'putative_celltype'] if x in frommacoscko.index else np.nan
    for x in celltypematched.index
]
celltypematched

In [None]:
# Fresh, unfiltered copy of the atlas used for plotting.
data = pd.read_parquet("./zenodo/maindata_2.parquet")
# Voxel indices of the MERFISH cells (same conversion as for the annotation lookup).
merfish['x_index'] = (merfish['x_ccf']*40).astype(int)
merfish['y_index'] = (merfish['y_ccf']*40).astype(int)
merfish['z_index'] = (merfish['z_ccf']*40).astype(int)
# Flag MERFISH cells lying on a region boundary, read from the precomputed volume.
border = np.load("./zenodo/mixed/eroded_annot.npy")
merfish['boundary'] = border[merfish['x_index'], merfish['y_index'], merfish['z_index']]
# Assigned cell type per pixel, NaN where no cell was matched. The column is created
# with object dtype because string labels are written into it; writing strings into
# a float column is deprecated in recent pandas.
data['ct'] = np.nan
data['ct'] = data['ct'].astype(object)
data.loc[neighs.index, 'ct'] = neighs['putative_celltype']
# Lipizones without any match get a score of 0; .assign returns a new frame so no
# write goes into a possibly-sliced `celltypematched`.
celltypematched = celltypematched.assign(colocalization=celltypematched["colocalization"].fillna(0))
# Strongest colocalization first.
celltypematched = celltypematched.sort_values(by="colocalization")[::-1]
celltypematched

In [None]:
unique_sections = data["SectionID"].unique()[:28]
unique_sections2 = merfish["brain_section_label"].unique()[::2][::-1]


def _draw_section_grid(table, section_col, section_values, row_selector):
    """Draw a 4x8 grid with one section per panel.

    Rows of `table` chosen by `row_selector(section)` are drawn in red; rows with
    boundary == 1 are drawn in black on top as the anatomical outline.
    """
    fig, axs = plt.subplots(4, 8, figsize=(32, 16))
    axs = axs.flatten()

    # zip stops at whichever runs out first: panels or sections
    for ax, section_value in zip(axs, section_values):
        section = table[table[section_col] == section_value]
        highlighted = section.loc[row_selector(section),:]

        ax.scatter(highlighted['z_index'], -highlighted['y_index'],
                   c="red", s=1.0,
                   alpha=1, zorder=1, rasterized=True)

        outline = section.loc[section['boundary'] == 1,:]
        ax.scatter(outline['z_index'], -outline['y_index'],
                   c='black', s=0.01, rasterized=True, zorder=2, alpha=0.9)

        ax.set_aspect('equal')

    for ax in axs:
        ax.axis('off')

    plt.tight_layout()
    plt.show()


for XXX in celltypematched.index[:10]: # the best ones
    print(celltypematched.loc[XXX,:])

    # 1) where the lipizone itself lies in the lipidomic sections
    _draw_section_grid(data, "SectionID", unique_sections,
                       lambda sec: sec['lipizone_names'].isin([XXX]))

    ct = celltypematched.loc[XXX,:]['celltype']
    lab = celltypematched.loc[XXX,:]['label2']

    if pd.notna(ct):
        # 2) where the matched cell type lies in the MERFISH sections
        _draw_section_grid(merfish, "brain_section_label", unique_sections2[:32],
                           lambda sec: sec[lab] == ct)

        # 3) where that cell type was assigned to lipidomic pixels
        _draw_section_grid(data, "SectionID", unique_sections,
                           lambda sec: sec['ct'] == ct)

## Decide on a threshold for calling a lipizone a match to a cell-type territory

In [None]:
# Number of pixels per (lipizone, anatomical division), ignoring pixels labelled "General".
counts = data.loc[data['division'] != "General",:].groupby(['lipizone_names','division']).size()
counts

In [None]:
# Most frequent anatomical division of each lipizone, ignoring pixels labelled "General".
# Lipizones whose pixels are all "General" do not appear in the result.
modal_division = (
    data.loc[data['division'] != "General",:]
    .groupby('lipizone_names')['division']
    .agg(lambda x: x.value_counts().idxmax())
)

In [None]:
# remove cerebellum and fiber tracts to be conservative
# (lipizones whose most frequent division is "Cerebellum" or "fiber tracts" are left out)

lipizonesthatarenotCBorfibers = modal_division.loc[(modal_division != "fiber tracts") & (modal_division != "Cerebellum")].index
len(lipizonesthatarenotCBorfibers) # so here we study 399 lipizones

In [None]:
# Keep the complete table under a separate name, then restrict the working table
# to lipizones that are not predominantly cerebellar or fiber-tract.
celltypematchedall = celltypematched.copy()
celltypematched = celltypematched.loc[celltypematched.index.isin(lipizonesthatarenotCBorfibers),:]
celltypematched

In [None]:
# put a hard threshold on colocalization and define a lipizone as mapping to a cell type territory (at some level for the two hierarchies)
# i want to reintroduce the cerebellar and fiber tract guys that perfectly match
# A lipizone is called a territory if ANY of the following holds:
#   - it is outside cerebellum / fiber tracts and scores above 200,
#   - it scores above 1200, wherever it lies (rows come from the unrestricted table),
#   - it is outside cerebellum / fiber tracts and has a match in the second dataset.

theyrecelltypeterritories = pd.concat([celltypematched.loc[celltypematched['colocalization'] > 200,:], celltypematchedall.loc[celltypematchedall['colocalization'] > 1200,:], celltypematched.loc[celltypematched['celltype_macoscko'].notna(),:]])
# A lipizone satisfying several criteria is kept once.
theyrecelltypeterritories = theyrecelltypeterritories.loc[
    ~theyrecelltypeterritories.index.duplicated(keep='first')
]
theyrecelltypeterritories # so 304 that match with a permissive threshold at 200 and the OR on the macoscko data...

In [None]:
# Fiber-tract and cerebellar lipizones that were NOT called cell-type territories.
fiber = np.setdiff1d(modal_division.loc[(modal_division == "fiber tracts")].index, theyrecelltypeterritories.index)
CB = np.setdiff1d(modal_division.loc[(modal_division == "Cerebellum")].index, theyrecelltypeterritories.index)

len(CB)+len(fiber) # 122 that are cerebellar or fiber tract therefore difficult to track, mostly are cerebellar - large diversity there!

In [None]:
len(CB)

In [None]:
len(fiber)

In [None]:
# Remaining lipizones: not called a territory, and not among the unmatched
# cerebellar / fiber-tract lipizones either.
whouare = celltypematchedall.loc[~celltypematchedall.index.isin(theyrecelltypeterritories.index),:]
whouare = whouare.loc[~whouare.index.isin(CB),:]
whouare = whouare.loc[~whouare.index.isin(fiber),:]
whouare

In [None]:
# Number of lipizones called territories, divided by those territories plus the
# unmatched cerebellar / fiber-tract lipizones.
# NOTE(review): the denominator leaves out `whouare` (unmatched lipizones outside
# cerebellum / fiber tracts) — confirm this is the intended base for the percentage below.
theyrecelltypeterritories.shape[0] / (len(fiber) + len(CB) + theyrecelltypeterritories.shape[0]) # 72% lipizones at some level overlap CT territories

In [None]:
# assess agreement between two ST technologies
macosckoonly = pd.isna(theyrecelltypeterritories['celltype']) & ~pd.isna(theyrecelltypeterritories['celltype_macoscko'])
np.sum(macosckoonly)

In [None]:
# Territories with a MERFISH-derived cell type only (no match in the second dataset).
merfishonly = ~pd.isna(theyrecelltypeterritories['celltype']) & pd.isna(theyrecelltypeterritories['celltype_macoscko'])
np.sum(merfishonly)

In [None]:
# Territories with a cell type from both sources.
both = ~pd.isna(theyrecelltypeterritories['celltype']) & ~pd.isna(theyrecelltypeterritories['celltype_macoscko'])
np.sum(both)

In [None]:
# characterize lipizones that are not cell types: what divisions are they found in? what branch of our hierarchy?
# Count the unmatched lipizones per modal anatomical division.
# NOTE: modal_division[x] raises KeyError for a lipizone absent from modal_division
# (i.e. one whose pixels are all labelled "General").
division_difficultmatch = pd.Series([modal_division[x] for x in whouare.index])
numeratore = division_difficultmatch.value_counts()
numeratore

In [None]:
# Modal division of every lipizone except the unmatched cerebellar / fiber-tract
# ones, as a plain {lipizone: division} dict.
to_drop = set(CB) | set(fiber)

is_retained = ~modal_division.index.isin(list(to_drop))
filtered_modal_division = modal_division[is_retained].to_dict()

len(filtered_modal_division)

In [None]:
# Per division: unmatched lipizones divided by all retained lipizones of that division.
denominatore = pd.Series(list(filtered_modal_division.values())).value_counts()
# Divisions without any unmatched lipizone get a numerator of 0.
num_aligned = numeratore.reindex(denominatore.index, fill_value=0)
ratio = num_aligned / denominatore
ratio.sort_values() # hindbrain, striatum, midbrain. complex, mixed, myelinated... good!

In [None]:
# Turn each level of `data` into the cumulative path from level_1 down to that level.
# NOTE: not idempotent — running this cell twice concatenates the codes again.
for i in range(2, 11):
    data['level_'+str(i)] = data['level_'+str(i-1)].astype(str) + data['level_'+str(i)].astype(str)
    
# Most frequent level_2 branch of each lipizone.
modal_lev2 = (
    data
    .groupby('lipizone_names')['level_2']
    .agg(lambda x: x.value_counts().idxmax())
)

# Same ratio as computed per anatomical division, now per level_2 branch:
# unmatched lipizones divided by retained lipizones of that branch.
division_difficultmatch = pd.Series([modal_lev2[x] for x in whouare.index])
numeratore = division_difficultmatch.value_counts()

to_drop = set(CB) | set(fiber)

filtered_modal_lev2 = {
    k: v
    for k, v in modal_lev2.items()
    if k not in to_drop
}

denominatore = pd.Series(list(filtered_modal_lev2.values())).value_counts()

num_aligned = numeratore.reindex(denominatore.index, fill_value=0)
ratio = num_aligned / denominatore
ratio # it's as expected overwhelmingly WM stuff!!!! builds a strong link to downstream analyses! especially 11 enters in the 111 callosal et al branch :)

## Are there lipotypes?

In [None]:
theyrecelltypeterritories.to_csv("./zenodo/csv/theyrecelltypeterritories.csv")

In [None]:
theyrecelltypeterritories['celltype'].value_counts()[:10] # for these cell types we have > 1 lipizone, what could explain that?

In [None]:
theyrecelltypeterritories.loc[theyrecelltypeterritories['celltype'] == theyrecelltypeterritories['celltype'].value_counts()[:10].index[0],:]

In [None]:
# Cell types matched by the largest number of lipizones. Iterating over the actual
# index (instead of range(10)) avoids an IndexError when fewer than ten exist.
top_celltypes = theyrecelltypeterritories['celltype'].value_counts()[:10].index
unique_sections = data["SectionID"].unique()[:28]

for celltype in top_celltypes:
    # lipizones that all map to this cell type
    lipotype_h1 = theyrecelltypeterritories.loc[theyrecelltypeterritories['celltype'] == celltype,:].index
    # One fixed tab20 slot per lipizone, so a lipizone keeps the same colour in every
    # panel (codes derived per section would change from panel to panel).
    colour_slot = {name: k % 20 for k, name in enumerate(lipotype_h1)}

    fig, axs = plt.subplots(4, 8, figsize=(32, 16))
    axs = axs.flatten()

    for ax, section_value in zip(axs, unique_sections):
        section = data[data["SectionID"] == section_value]
        filtered_section = section.loc[section['lipizone_names'].isin(lipotype_h1),:]
        print(len(filtered_section['lipizone_color'].unique()))

        # vmin/vmax pin the colour scale so slot k always maps to the k-th tab20 colour
        ax.scatter(filtered_section['z_index'], -filtered_section['y_index'],
                        c=filtered_section['lipizone_names'].map(colour_slot).astype(float),
                        cmap="tab20", vmin=0, vmax=19, s=1.0,
                        alpha=1, zorder=1, rasterized=True)  

        filtered_section_contour = section.loc[section['boundary'] == 1,:]
        ax.scatter(filtered_section_contour['z_index'], -filtered_section_contour['y_index'],
                        c='black', s=0.01, rasterized=True, zorder=2, alpha=0.9)

        ax.set_aspect('equal')

    for ax in axs:
        ax.axis('off') 

    plt.tight_layout()
    plt.show()

In [None]:
# it's cortical and striatal mostly! these are the "many lipidomic matches"

In [None]:
# Cell types matched by exactly two lipizones (at most ten of them). Iterating over
# the actual index avoids an IndexError when fewer than ten such cell types exist.
lipizones_per_celltype = theyrecelltypeterritories['celltype'].value_counts()
paired_celltypes = lipizones_per_celltype[lipizones_per_celltype == 2][:10].index
unique_sections = data["SectionID"].unique()[:28]

for celltype in paired_celltypes:
    # the two lipizones that map to this cell type
    lipotype_h1 = theyrecelltypeterritories.loc[theyrecelltypeterritories['celltype'] == celltype,:].index
    # One fixed tab20 slot per lipizone, so a lipizone keeps the same colour in every
    # panel (codes derived per section would change from panel to panel).
    colour_slot = {name: k % 20 for k, name in enumerate(lipotype_h1)}

    fig, axs = plt.subplots(4, 8, figsize=(32, 16))
    axs = axs.flatten()

    for ax, section_value in zip(axs, unique_sections):
        section = data[data["SectionID"] == section_value]
        filtered_section = section.loc[section['lipizone_names'].isin(lipotype_h1),:]
        print(len(filtered_section['lipizone_color'].unique()))

        # vmin/vmax pin the colour scale so slot k always maps to the k-th tab20 colour
        ax.scatter(filtered_section['z_index'], -filtered_section['y_index'],
                        c=filtered_section['lipizone_names'].map(colour_slot).astype(float),
                        cmap="tab20", vmin=0, vmax=19, s=1.0,
                        alpha=1, zorder=1, rasterized=True)  

        filtered_section_contour = section.loc[section['boundary'] == 1,:]
        ax.scatter(filtered_section_contour['z_index'], -filtered_section_contour['y_index'],
                        c='black', s=0.01, rasterized=True, zorder=2, alpha=0.9)

        ax.set_aspect('equal')

    for ax in axs:
        ax.axis('off') 

    plt.tight_layout()
    plt.show() # cerebellum double layering in there!!!

In [None]:
theyrecelltypeterritories['celltype'].value_counts()[theyrecelltypeterritories['celltype'].value_counts() > 1] # 66 cell types appear with 1+ lipizones

In [None]:
len(theyrecelltypeterritories['celltype'].unique())

## Maybe the right granularity to study the cell types vs the lipizones is the subclass

In [None]:
import scipy.cluster.hierarchy as sch
from tqdm import tqdm

# Same colocalization scan as before, restricted to the subclass granularity and
# keeping the full score matrix of every lipidomic level.
LABEL1s = ['level_6', 'level_7', 'level_8', 'level_9', 'level_10', 'lipizone_names']
LABEL2s = ['labels_subclass']

normalized_dfs = []
# Pairs sharing fewer pixels than this are not scored.
NMATCHEDPIXELS = 100
# Minimum summed score for a subclass column to be kept.
MINCOLOC = 25 ############ HARDCORE

lipizonescelltypes = []
for LABEL1 in tqdm(LABEL1s):
    for LABEL2 in LABEL2s:
        # atlas and neighs are row-aligned, so the two label arrays pair up by position.
        lipizoneZ = atlas[LABEL1].copy().values
        celltypeZ = neighs[LABEL2].copy().values
        # Enrichment 1: share of each subclass falling in each lipizone, divided by
        # that lipizone's mean share over subclasses.
        cmat = pd.crosstab(lipizoneZ, celltypeZ)
        normalized_df = cmat / cmat.sum() # fraction 
        normalized_df = (normalized_df.T / normalized_df.T.mean()).T ## switch to enrichments
        normalized_df1 = normalized_df.copy()

        # Enrichment 2: share of each lipizone taken by each subclass, divided by
        # that subclass's mean share over lipizones.
        # NOTE: `cmat` (subclass x lipizone counts) of the LAST iteration is read by later cells.
        cmat = pd.crosstab(lipizoneZ, celltypeZ).T
        normalized_df = cmat / cmat.sum() # fraction 
        normalized_df = (normalized_df.T / normalized_df.T.mean()) ## switch to enrichments
        normalized_df2 = normalized_df.copy()

        # Colocalization score = product of the two enrichments (lipizone x subclass).
        normalized_df = normalized_df2 * normalized_df1
        normalized_df[cmat.T < NMATCHEDPIXELS] = 0 #####################
        normalized_df = normalized_df.loc[:, normalized_df.sum() > MINCOLOC] #####################

        # Order columns by hierarchical clustering, then rows by the position of their best column.
        linkage = sch.linkage(sch.distance.pdist(normalized_df.T), method='weighted', optimal_ordering=True)
        order = sch.leaves_list(linkage)
        normalized_df = normalized_df.iloc[:, order]

        order = np.argmax(normalized_df.values, axis=1)
        order = np.argsort(order)
        normalized_df = normalized_df.iloc[order,:]

        normalized_dfs.append(normalized_df)

In [None]:
import pandas as pd

# For every lipidomic level: does the subclass reach a score above 100 in some lipizone?
bools = [(df.max() > 100) for df in normalized_dfs]

# A subclass counts as detected when that is true at any level.
combined = pd.concat(bools, axis=1).any(axis=1)

combined

In [None]:
np.sum(combined) # even at 100 just a minority of subclasses is captured! at any level!

In [None]:
# now we'd need to characterize the subclasses that are found vs those that are not, what distinguishes them?

# `cmat` is the subclass x lipizone count table left over from the last loop iteration.
# NOTE(review): the cutoff applied here is MINCOLOC (25), a score threshold reused as a
# pixel count; the "30 pixels" quoted below does not match it — confirm the intended cutoff.
cmat.sum(axis=1).sort_values()[cmat.sum(axis=1).sort_values() > MINCOLOC] # so 289 subclasses have at least 30 pixels mapped to our data

# but only 97 are detected. why?

In [None]:
# Keep subclasses with more than MINCOLOC mapped pixels, ordered by pixel count.
cmat = cmat.loc[cmat.sum(axis=1).sort_values()[cmat.sum(axis=1).sort_values() > MINCOLOC].index,:]

# Of those, the subclasses that were not detected at any lipidomic level.
cmat_undetected = cmat.loc[~cmat.index.isin(combined.index[combined]),:]
cmat_undetected

In [None]:
cmat_undetected.index

In [None]:
import pickle
from allensdk.core.mouse_connectivity_cache import MouseConnectivityCache
# Look up the annotation id of every MERFISH cell again (ids are kept numeric here).
mcc = MouseConnectivityCache(manifest_file='mouse_connectivity_manifest.json')
annotation, _ = mcc.get_annotation_volume()
merfish['x_index'] = (merfish['x_ccf']*40).astype(int)
merfish['y_index'] = (merfish['y_ccf']*40).astype(int)
merfish['z_index'] = (merfish['z_ccf']*40).astype(int)
merfish['id'] = annotation[merfish['x_index'], merfish['y_index'], merfish['z_index']]

file_path = './zenodo/mixed/allen_name_to_annots.pkl'

# Mapping from division name to the annotation ids it contains.
# NOTE(security): pickle.load executes arbitrary code; only load this file from a trusted source.
with open(file_path, 'rb') as file:
    allen_name_to_annots = pickle.load(file)

divisions = ['Olfactory areas', 'Isocortex', 'Hippocampal formation', 'Cortical subplate', 'Striatum', 'Pallidum', 'Thalamus', 'Hypothalamus', 'Midbrain', 'Hindbrain', 'Cerebellum', 'fiber tracts', 'ventricular systems']#, ventricular systems']

# NOTE: this overwrites the original merfish['division'] column (which held values
# such as '6 Vascular' / '7 Immune') with the anatomical division.
merfish['division'] = "General"
for i in divisions:
    # Single .loc assignment instead of merfish['division'][mask] = i: the chained
    # form writes to a temporary under copy-on-write and warns in current pandas.
    merfish.loc[merfish['id'].isin(allen_name_to_annots[i]), 'division'] = i
    
merfish['division'].value_counts()

In [None]:
# Per anatomical division: cells belonging to undetected subclasses ...
numeratore = merfish.loc[merfish['labels_subclass'].isin(cmat_undetected.index), 'division'].value_counts()

# ... divided by all MERFISH cells of that division.
denominatore = merfish.loc[:, 'division'].value_counts()

num_aligned = numeratore.reindex(denominatore.index, fill_value=0)
ratio = num_aligned / denominatore
ratio.sort_values() # wm/mixed regions stand out, reassuringly. only the HY is surprising... and the overall high number

In [None]:
# Most frequent anatomical division of every undetected subclass.
modal_mer = (
     merfish.loc[merfish['labels_subclass'].isin(cmat_undetected.index), :]
    .groupby('labels_subclass')['division']
    .agg(lambda x: x.value_counts().idxmax())
)
modal_mer

In [None]:
modal_mer.value_counts() # again in line, with the HY but indeed in HY there are only 2 prevalent, large lipizones...

In [None]:
# Pixels per lipizone, summed over the undetected subclasses whose modal division is the hypothalamus.
check = cmat_undetected.loc[modal_mer[modal_mer == "Hypothalamus"].index,:].sum().sort_values()
# Share of those pixels held by two hard-coded lipizones ("_1" and "Paratrochlear nucleus").
(check["_1"] + check["Paratrochlear nucleus"]) / check.sum() # they overall cover 33% of the voxels in the region, making fine-grained analysis there very difficult

## Go on a lipizone vs cluster adventure to estimate cell type lipidomes and do nice plots

In [None]:
import scipy.cluster.hierarchy as sch

# Colocalization score matrix between lipizones and MERFISH clusters (finest levels of both).
LABEL1 = "lipizone_names"
LABEL2 = "labels_cluster"

# atlas and neighs are row-aligned, so the two label arrays pair up by position.
lipizoneZ = atlas[LABEL1].copy().values
celltypeZ = neighs[LABEL2].copy().values
# Enrichment 1: share of each cluster falling in each lipizone, divided by that
# lipizone's mean share over clusters.
cmat = pd.crosstab(lipizoneZ, celltypeZ)
normalized_df = cmat / cmat.sum() # fraction 
normalized_df = (normalized_df.T / normalized_df.T.mean()).T ## switch to enrichments
normalized_df1 = normalized_df.copy()

# Enrichment 2: share of each lipizone taken by each cluster, divided by that
# cluster's mean share over lipizones.
cmat = pd.crosstab(lipizoneZ, celltypeZ).T
normalized_df = cmat / cmat.sum() # fraction 
normalized_df = (normalized_df.T / normalized_df.T.mean()) ## switch to enrichments
normalized_df2 = normalized_df.copy()

# Score = product of the two enrichments; pairs with fewer than 20 shared pixels are
# zeroed and only clusters with a summed score above 200 are kept.
normalized_df = normalized_df2 * normalized_df1
normalized_df[cmat.T < 20] = 0 #####################
normalized_df = normalized_df.loc[:, normalized_df.sum() > 200] #####################

# Order columns by hierarchical clustering, then rows by the position of their best column.
linkage = sch.linkage(sch.distance.pdist(normalized_df.T), method='weighted', optimal_ordering=True)
order = sch.leaves_list(linkage)
normalized_df = normalized_df.iloc[:, order]

order = np.argmax(normalized_df.values, axis=1)
order = np.argsort(order)
normalized_df = normalized_df.iloc[order,:]

In [None]:
plt.imshow(normalized_df, vmin=0, vmax=400)

In [None]:
normalized_df = normalized_df.loc[:, normalized_df.sum() > 0]

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import to_rgb
from matplotlib.patches import Patch
import random

# One colour per MERFISH class; the shuffle uses a fixed seed so colours are stable between runs.
unique_categories = neighs['labels_class'].unique().tolist()
n_categories      = len(unique_categories)
beautiful_colors = sns.color_palette("husl", n_categories).as_hex()
random.Random(0).shuffle(beautiful_colors)
color_dict = {cat: beautiful_colors[i] for i, cat in enumerate(unique_categories)}
color_dict['02 NP-CT-L6b Glut'] = "#000000"
color_dict['01 IT-ET Glut'] = "gray"

# Class of every matrix COLUMN. The columns of normalized_df are cluster labels, so the
# class has to be looked up per cluster; taking neighs['labels_class'] by position gives
# the class of the j-th pixel, which is unrelated to the j-th column.
cluster_to_class = (
    neighs[['labels_cluster', 'labels_class']]
    .drop_duplicates(subset='labels_cluster')
    .set_index('labels_cluster')['labels_class']
)
categories = cluster_to_class.reindex(normalized_df.columns).to_numpy()
# columns whose class is unknown are drawn in light grey
col_colors = [color_dict.get(cat, "#cccccc") for cat in categories]

# Colour scale: 0 up to the 98th percentile of the scores. If that percentile is 0
# (very sparse matrix), fall back to the maximum so the division below is defined.
vmin, vmax = 0, np.percentile(normalized_df.values, 98)
if not vmax > vmin:
    vmax = max(float(normalized_df.values.max()), 1.0)

# Kept under its own name so the `data` table used by the plotting cells is not overwritten.
enrichment = normalized_df.to_numpy()
nrows, ncols = enrichment.shape

# Blend from white (score 0) to the column's class colour (score >= vmax), vectorised.
nv = np.clip((enrichment - vmin) / (vmax - vmin), 0, 1)[:, :, None]
base_colors = np.array([to_rgb(color) for color in col_colors])[None, :, :]
img = (1 - nv) * np.array([1, 1, 1]) + nv * base_colors

plt.figure(figsize=(20, 20))
plt.imshow(img, aspect='auto')
plt.xticks(
    ticks=np.arange(ncols),
    labels=[categories[j] for j in range(ncols)],
    rotation=90
)
plt.yticks([])
plt.title('Colocalization of lipizones and cell types')
plt.tight_layout()
#plt.savefig("celltype_vs_lipizones_merfish_matrix.pdf")
plt.show()

# Separate figure holding only the class -> colour legend.
legend_elements = [
    Patch(facecolor=color, edgecolor='black', label=label)
    for label, color in color_dict.items()
]

plt.figure(figsize=(8, 6))
plt.legend(
    handles=legend_elements,
    loc='center left',
    bbox_to_anchor=(1, 0.5),
    title="Categories"
)
plt.axis('off')
plt.title("Color Legend")
plt.tight_layout()
#plt.savefig("celltype_vs_lipizones_merfish.pdf")
plt.show()

In [None]:
# Zero every score below 200, in place. NOTE: this mutates normalized_df, so all
# later cells see the thresholded matrix.
normalized_df[normalized_df < 200] = 0 #### DANGEROUS, focus on cell type - matching lipizones only...

In [None]:
# Drop lipizones (rows) and clusters (columns) that have no score left after thresholding.
normalized_df = normalized_df.loc[normalized_df.sum(axis=1) > 0,normalized_df.sum() > 0]
normalized_df # rows and columns cleaned up

In [None]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import igraph as ig
import leidenalg
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

def top_n_feats(df, prefix, ns=(1,2,3,4,5,6,7,8,9,10)):
    """Return, for every row of `df`, its n-th largest value for each n in `ns`.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table; each row is ranked independently.
    prefix : str
        Prefix of the output column names (``f"{prefix}_top{n}"``).
    ns : iterable of int
        1-based ranks to extract (1 = row maximum).

    Returns
    -------
    pandas.DataFrame
        Same index as `df`, one column per requested rank. A rank larger than the
        number of columns of `df` is filled with NaN instead of raising.
    """
    arr = df.values
    n_rows, n_cols = arr.shape
    max_n = max(ns)
    # never ask for more values than a row holds
    k = min(max_n, n_cols)

    if k < n_cols:
        # partial selection of the k largest values per row (unordered), then gather them
        idx = np.argpartition(-arr, k-1, axis=1)[:, :k]
        row_idx = np.arange(n_rows)[:, None]
        top_vals = arr[row_idx, idx]
    else:
        # every column is needed
        top_vals = arr
    # order the selected values from largest to smallest
    sorted_top = np.sort(top_vals, axis=1)[:, ::-1]

    if k < max_n:
        # pad the ranks that do not exist with NaN
        padding = np.full((n_rows, max_n - k), np.nan)
        sorted_top = np.hstack([sorted_top.astype(float), padding])

    out = {}
    for n in ns:
        out[f"{prefix}_top{n}"] = sorted_top[:, n-1]
    return pd.DataFrame(out, index=df.index)

# Describe every lipizone by its ten largest colocalization scores.
dfs       = [normalized_df]
prefixes  = ['normalized_df']

blocks = [ top_n_feats(df, p) for df, p in zip(dfs, prefixes) ]
combined = pd.concat(blocks, axis=1)

# The scaler is fitted on the transposed table, i.e. each lipizone's score profile is
# standardised across its own top-n values.
# NOTE(review): confirm that per-lipizone (rather than per-feature) scaling is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(combined.T).T

# k-nearest-neighbour graph between lipizones (capped so that small tables do not fail).
nbrs = NearestNeighbors(n_neighbors=min(10, len(X_scaled)), metric='euclidean').fit(X_scaled)
distances, indices = nbrs.kneighbors(X_scaled)

# undirected edge set without self-loops
edges = set()
for i, neigh in enumerate(indices):
    for j in neigh:
        if i != j:
            edges.add(tuple(sorted((i, j))))
# The vertex count is given explicitly so the membership vector always has one entry
# per lipizone, even if a vertex ended up without edges.
g = ig.Graph(n=len(X_scaled), edges=sorted(edges), directed=False)

# Fixed seed: later cells refer to clusters by number, so the partition must be reproducible.
partition = leidenalg.find_partition(
    g,
    leidenalg.RBConfigurationVertexPartition,
    resolution_parameter=0.1,
    seed=0
)

labels = np.array(partition.membership)
combined['cluster'] = labels

In [None]:
# Thresholded score matrix in grey scale. "Greys" is the canonical Matplotlib colormap
# name; the "Grays" spelling is not registered in all Matplotlib versions.
plt.imshow(normalized_df, vmin=0, vmax=1000, cmap="Greys")

In [None]:
normalized_df.max().sort_values()

In [None]:
normalized_df.max(axis=1).sort_values()

In [None]:
plt.imshow(normalized_df.loc[combined.index[combined['cluster'] == 2],:], vmin=200, vmax=210)

In [None]:
plt.imshow(normalized_df.loc[combined.index[combined['cluster'] == 1],:], vmin=200, vmax=800)

In [None]:
plt.imshow(normalized_df.loc[combined.index[combined['cluster'] == 0],:], vmin=200, vmax=800)

In [None]:
# Bar plots of the ten strongest cell-type matches for up to 5 example lipizones of cluster 3.
# Sampling is seeded and without replacement (no duplicate panels) and is capped at the
# cluster size, so an empty or small cluster does not raise.
members = combined.index[combined['cluster'] == 3].to_numpy()
examples = np.random.default_rng(0).choice(members, size=min(5, len(members)), replace=False)
for iii in examples:
    serie = normalized_df.loc[iii,:].sort_values()[::-1]

    plt.figure(figsize=(4, 3))
    serie[:10].plot(kind='bar')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

In [None]:
# Bar plots of the ten strongest cell-type matches for up to 10 example lipizones of cluster 2.
# Sampling is seeded and without replacement (no duplicate panels) and is capped at the
# cluster size, so an empty or small cluster does not raise.
members = combined.index[combined['cluster'] == 2].to_numpy()
examples = np.random.default_rng(0).choice(members, size=min(10, len(members)), replace=False)
for iii in examples:
    serie = normalized_df.loc[iii,:].sort_values()[::-1]

    plt.figure(figsize=(4, 3))
    serie[:10].plot(kind='bar')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

In [None]:
# Bar plots of the ten strongest cell-type matches for up to 10 example lipizones of cluster 1.
# Sampling is seeded and without replacement (no duplicate panels) and is capped at the
# cluster size, so an empty or small cluster does not raise.
members = combined.index[combined['cluster'] == 1].to_numpy()
examples = np.random.default_rng(0).choice(members, size=min(10, len(members)), replace=False)
for iii in examples:
    serie = normalized_df.loc[iii,:].sort_values()[::-1]

    plt.figure(figsize=(4, 3))
    serie[:10].plot(kind='bar')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

In [None]:
# Bar plots of the ten strongest cell-type matches for up to 10 example lipizones of cluster 0.
# Sampling is seeded and without replacement (no duplicate panels) and is capped at the
# cluster size, so an empty or small cluster does not raise.
members = combined.index[combined['cluster'] == 0].to_numpy()
examples = np.random.default_rng(0).choice(members, size=min(10, len(members)), replace=False)
for iii in examples:
    serie = normalized_df.loc[iii,:].sort_values()[::-1]

    plt.figure(figsize=(4, 3))
    serie[:10].plot(kind='bar')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

In [None]:
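# Interpretation of the clusters found above:
# clusters 3 and 2 --> one-to-one match between a lipizone and a cell type
# cluster 1 --> one strongly prevalent cell type, plus other, weaker matches
# cluster 0 --> multiple matching cell types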

In [None]:
# Scores of the lipizones in cluster 1, each row rescaled by its own maximum.
tmp1 = normalized_df.loc[combined.index[combined['cluster'] == 1],:]
tmp1 = (tmp1.T / tmp1.max(axis=1)).T
# keep cell types whose rescaled scores sum to more than 0.95 over these lipizones
tmp1 = tmp1.loc[:,tmp1.sum() > 0.95]
# Order columns by hierarchical clustering, then rows by the position of their best column.
linkage = sch.linkage(sch.distance.pdist(tmp1.T), method='weighted', optimal_ordering=True)
order = sch.leaves_list(linkage)
tmp1 = tmp1.iloc[:, order]

order = np.argmax(tmp1.values, axis=1)
order = np.argsort(order)
tmp1 = tmp1.iloc[order,:]
plt.imshow(tmp1, vmin=0.1, vmax=1)

In [None]:
# Scores of the lipizones in cluster 0, each row rescaled by its own maximum.
tmp1 = normalized_df.loc[combined.index[combined['cluster'] == 0],:]
tmp1 = (tmp1.T / tmp1.max(axis=1)).T
# keep cell types whose rescaled scores sum to more than 0.95 over these lipizones
tmp1 = tmp1.loc[:,tmp1.sum() > 0.95]
# Order columns by hierarchical clustering, then rows by the position of their best column.
linkage = sch.linkage(sch.distance.pdist(tmp1.T), method='weighted', optimal_ordering=True)
order = sch.leaves_list(linkage)
tmp1 = tmp1.iloc[:, order]

order = np.argmax(tmp1.values, axis=1)
order = np.argsort(order)
tmp1 = tmp1.iloc[order,:]
plt.imshow(tmp1, vmin=0.1, vmax=1)

In [None]:
# Scores of the lipizones in clusters 2 and 3 together, each row rescaled by its own maximum.
tmp1 = normalized_df.loc[combined.index[(combined['cluster'] == 2) | (combined['cluster'] == 3)],:]
tmp1 = (tmp1.T / tmp1.max(axis=1)).T
# keep cell types whose rescaled scores sum to more than 0.95 over these lipizones
tmp1 = tmp1.loc[:,tmp1.sum() > 0.95]
# Order columns by hierarchical clustering, then rows by the position of their best column.
linkage = sch.linkage(sch.distance.pdist(tmp1.T), method='weighted', optimal_ordering=True)
order = sch.leaves_list(linkage)
tmp1 = tmp1.iloc[:, order]

order = np.argmax(tmp1.values, axis=1)
order = np.argsort(order)
tmp1 = tmp1.iloc[order,:]
plt.imshow(tmp1, vmin=0.1, vmax=1)

In [None]:
# All lipizones, each row rescaled by its own maximum.
tmp1 = normalized_df.copy()
tmp1 = (tmp1.T / tmp1.max(axis=1)).T
# keep only cell types that are the top match (rescaled score of 1) of at least one lipizone
tmp1 = tmp1.loc[:,tmp1.max() == 1]
# Order columns by hierarchical clustering, then rows by the position of their best column.
linkage = sch.linkage(sch.distance.pdist(tmp1.T), method='weighted', optimal_ordering=True)
order = sch.leaves_list(linkage)
tmp1 = tmp1.iloc[:, order]

order = np.argmax(tmp1.values, axis=1)
order = np.argsort(order)
tmp1 = tmp1.iloc[order,:]
# "Greys" is the canonical Matplotlib colormap name; the "Grays" spelling is not
# registered in all Matplotlib versions.
plt.imshow(tmp1, vmin=0.1, vmax=1, cmap="Greys")

## Define the cell-type lipidomes

In [None]:
# Start again from the unfiltered atlas.
atlas = pd.read_parquet("./zenodo/maindata_2.parquet")

# The first 173 columns are rescaled to [0, 1] between their 0.5th and 99.5th percentiles.
# NOTE(review): presumably these are the lipid intensity columns — confirm the count.
cols = atlas.columns[:173]
vmin = atlas[cols].quantile(0.005)
vmax = atlas[cols].quantile(0.995)

atlas.loc[:, cols] = (atlas.loc[:, cols] - vmin) / (vmax - vmin)
atlas.loc[:, cols] = atlas.loc[:, cols].clip(0,1)

# Give every pixel the cell type matched to its lipizone; '0' marks "no matched cell type".
celltype_of_lipizone = theyrecelltypeterritories['celltype']
atlas['putative_celltype'] = atlas['lipizone_names'].map(celltype_of_lipizone)
atlas['putative_celltype'] = atlas['putative_celltype'].fillna('0')
atl = atlas.loc[atlas['putative_celltype'] != '0',:]
# Mean rescaled profile over all pixels of each cell type.
celltypelipidome = atl.iloc[:,:173].groupby(atl['putative_celltype']).mean()
celltypelipidome.to_csv("./zenodo/csv/celltypelipidome_viamatching_MERFISH.csv")


def _class_colour_table(level):
    """Return (lookup table, {label at `level`: colour of its class}) built from `merfish`."""
    table = merfish[[level, 'labels_class']].drop_duplicates().reset_index().iloc[:,1:]
    table['color'] = table['labels_class'].map(color_dict)
    return table, table.set_index(level)['color'].to_dict()


# Class colour for every cluster, supertype and subclass label.
convtab, color_dict2 = _class_colour_table('labels_cluster')
convtab, color_dict3 = _class_colour_table('labels_supertype')
convtab, color_dict4 = _class_colour_table('labels_subclass')
# Single lookup covering class, cluster, supertype and subclass labels (later dicts win on clashes).
color_dict2 = color_dict | color_dict2 | color_dict3 | color_dict4