In [None]:
import os
import pickle
import warnings
from collections import deque, Counter
import random
import itertools
import json
from datetime import datetime
import cProfile
import pstats

import joblib
import numpy as np
import pandas as pd
import anndata
import scanpy as sc
import squidpy as sq
import backSPIN
import leidenalg
import networkx as nx
import igraph as ig

from matplotlib import colors as mcolors
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.cm import ScalarMappable
from matplotlib.colors import Normalize

from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA, NMF
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler

from scipy.cluster.hierarchy import linkage, fcluster
from scipy.ndimage import gaussian_filter1d
from scipy.signal import find_peaks
from scipy.spatial.distance import squareform, pdist
from scipy.sparse import csr_matrix
from scipy.stats import mannwhitneyu, entropy

from statsmodels.stats.multitest import multipletests
from threadpoolctl import threadpool_limits, threadpool_info

from tqdm import tqdm
from kneed import KneeLocator
from PyPDF2 import PdfMerger

# Cap the native thread pools (BLAS/OpenMP) used by the numeric libraries loaded above.
threadpool_limits(limits=8)
# NOTE(review): this is set after numpy/scipy/sklearn were imported and disagrees with the
# limit of 8 above — runtimes that were already initialised may ignore it; confirm intent.
os.environ['OMP_NUM_THREADS'] = '6'
# Silences every warning for the rest of the notebook (including pandas copy/deprecation warnings).
warnings.filterwarnings('ignore')

In [None]:
# Load the feature-selected table: the first 23 columns are kept as `metdat`,
# the remaining columns are the lipid features.
data = pd.read_hdf("20241103_featsel_lba.h5", key="table")
# explicit copy so that the dtype cast below does not write into a slice of `data`
coordinates = data[['Section', 'xccf', 'yccf', 'zccf']].copy()
coordinates['Section'] = coordinates['Section'].astype(int)
metdat = data.iloc[:, :23]
data = data.iloc[:, 23:]
metadata = pd.read_hdf("metadata.h5ad", key="table")

# Rebuild the lipid matrix from the harmonized NMF factors (observations x factors)
# times the factor-to-lipid loadings, then shift it to be strictly positive.
harmonized_nmf_result = pd.read_hdf("corrected_nmfall_nochunking.h5ad", key="table")
factor_to_lipid = np.load("factor_to_lipid.npy")
reconstructed_data_df = pd.DataFrame(
    np.dot(harmonized_nmf_result.values, factor_to_lipid),
    index=harmonized_nmf_result.index,
    columns=data.columns,
)
# NOTE(review): np.min on a DataFrame may reduce per column or over the whole frame
# depending on the pandas version — confirm which shift the classifier was trained with.
reconstructed_data_df = reconstructed_data_df - np.min(reconstructed_data_df) + 1e-7
data = reconstructed_data_df

# Fit the standardization on the reference atlas only and apply it to every sample.
nmf_result = harmonized_nmf_result.loc[metadata['Sample'] == "ReferenceAtlas", :]
scalerglobal = StandardScaler()
standardized_embeddings_atlas = pd.DataFrame(
    scalerglobal.fit_transform(nmf_result),
    index=nmf_result.index,
    columns=nmf_result.columns,
)
standardized_embeddings_GLOBAL = pd.DataFrame(
    scalerglobal.transform(harmonized_nmf_result),
    index=harmonized_nmf_result.index,
    columns=harmonized_nmf_result.columns,
)
standardized_embeddings_GLOBAL

In [None]:
# Hyper-parameters the clustering tree was built with.

# Maximum number of successive splits; the biological plausibility criteria
# below are expected to stop the recursion earlier wherever possible.
max_depth = 15

K = 60                # large K for the k-means over-split that is then hierarchically re-aggregated
min_voxels = 150      # minimum number of observations per lipizone
min_diff_lipids = 2   # minimum number of differential lipids (out of 108 used for clustering) to accept a split
min_fc = 0.2          # minimal fold change for a lipid to count as differential
pthr = 0.05           # adjusted p-value threshold for the Wilcoxon test
thr_signal = 1e-10    # signal threshold for keeping a component for clustering
penalty1 = 1.5        # division factor penalizing the previous split's embeddings relative to the current one
penalty2 = 2          # division factor penalizing the global embeddings relative to the current one
ACCTHR = 0.6          # lower bound on classifier generalization to relabel and accept a split for going deeper
#top_loaders_general = []

# this class stores the tree of classifiers, it's the trained object that, given a lipidomic dataset, is able to return lipizones
class Node:
    """One split of the hierarchical lipizone classifier.

    Parameters
    ----------
    level : depth of this node in the tree.
    path : list describing the branch leading to this node. Defaults to a
        fresh empty list per instance (a literal ``[]`` default would be one
        list shared by every node created without an explicit path).
    """

    def __init__(self, level, path=None):
        self.level = level
        self.path = [] if path is None else path
        self.scaler = None               # scaler applied to the selected NMF factors
        self.nmf = None                  # NMF model applied at this node
        self.xgb_model = None            # classifier predicting the child label
        self.feature_importances = None  # store feature importances at each split to establish a minimal palette
        self.children = {}               # child label -> Node
        self.factors_to_use = None       # indices of the NMF factors kept at this node

## Deploy the lipizone classifier on the whole dataset

In [None]:
def traverse_tree(node, current_adata, embds, paths, level=0):
    """Recursively assign observations to the branches of a trained classifier tree.

    Parameters
    ----------
    node : tree node exposing ``nmf``, ``factors_to_use``, ``scaler``,
        ``xgb_model`` and ``children``; ``None`` or a childless node ends the recursion.
    current_adata : AnnData-like object with the observations reaching this node
        (``.X``, ``.obs_names``, ``.shape`` and boolean-mask indexing are used).
    embds : array with the parent-level embeddings of the same observations, row-aligned
        with ``current_adata``.
    paths : dict updated in place; maps each observation name to the list of child
        labels predicted along its path.
    level : current depth, used only for logging.

    Reads the module-level ``standardized_embeddings_GLOBAL``, ``penalty1`` and ``penalty2``.
    Returns ``None``.
    """
    print(level)

    if node is None or not node.children:
        return

    if current_adata.shape[0] == 0:
        print(f"Empty data at level {level}. Returning.")
        return

    # project onto this node's NMF, keep the factors used at this node, then scale them
    X_nmf = node.nmf.transform(current_adata.X)[:, node.factors_to_use]
    X_scaled = node.scaler.transform(X_nmf)

    # feature space = current embeddings + down-weighted parent and global embeddings
    globembds = standardized_embeddings_GLOBAL.loc[current_adata.obs_names].values / penalty2
    embspace = np.concatenate((X_scaled, embds / penalty1, globembds), axis=1)

    # predict the child cluster using the stored classifier
    child_labels = np.asarray(node.xgb_model.predict(embspace))

    for obs_name, label in zip(current_adata.obs_names, child_labels):
        paths.setdefault(obs_name, []).append(label)

    # The same positional masks subset both the data and the embeddings, so the
    # two stay row-aligned by construction.
    mask0 = child_labels == 0
    mask1 = child_labels == 1

    # NOTE: when one side is empty neither child is visited, so the non-empty
    # side is not refined any further.
    if not mask0.any() or not mask1.any():
        print(f"Warning: One child node has no data at level {level}. Skipping.")
        return

    # .get() tolerates a node that stores only one child; a missing child ends
    # that branch through the `node is None` guard above.
    traverse_tree(node.children.get(0), current_adata[mask0], X_scaled[mask0], paths, level + 1)
    traverse_tree(node.children.get(1), current_adata[mask1], X_scaled[mask1], paths, level + 1)


In [None]:
# Restore the trained classifier tree.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
filename = "rootnode_clustering_whole_clean.pkl"
with open(filename, "rb") as handle:
    root_node = pickle.load(handle)

# Wrap the reconstructed lipid matrix in an AnnData object and attach the
# per-observation coordinates taken from the metadata table.
spatial_columns = ['zccf', 'yccf', 'Section']
new_adata = sc.AnnData(X=reconstructed_data_df)
new_adata.obsm['spatial'] = metadata.loc[reconstructed_data_df.index, spatial_columns].values

new_adata

In [None]:
# deploy our tree with NMF and XGBC onto the full multi-brain dataset
DSFACT = 1  # keep every DSFACT-th observation (1 = all of them)
paths = {}
embds = standardized_embeddings_GLOBAL[::DSFACT].values
traverse_tree(root_node, new_adata[::DSFACT], embds, paths)

# One row per observation, one column per tree level. The number of levels is
# taken from the traversal result instead of being hard-coded to 11, so a tree
# of a different depth does not raise a length-mismatch error.
df_paths = pd.DataFrame.from_dict(paths, orient='index')
df_paths.columns = [f'level_{i}' for i in range(1, df_paths.shape[1] + 1)]
# shorter paths are padded with -1, then everything is shifted so that
# 0 = "no split at this level" and 1/2 = the two children
df_paths = df_paths.fillna(-1)
df_paths = df_paths.astype(int) + 1
df_paths.to_hdf("splithistory_allbrains.h5ad", key="table")
df_paths

## Prepare the transferred lipizones with colors and names

In [None]:
df_paths = pd.read_hdf("splithistory_allbrains.h5ad", key="table")
tree = df_paths.copy()

# Concatenate the per-level codes into one lipizone identifier string.
# The level columns are discovered from the table (ordered by level number)
# rather than assumed to be exactly level_1..level_11.
level_cols = sorted(
    (c for c in tree.columns if str(c).startswith('level_')),
    key=lambda c: int(str(c).split('_')[1]),
)
tree['lipizone'] = tree[level_cols[0]].astype(str)
for level_col in level_cols[1:]:
    tree['lipizone'] = tree['lipizone'] + tree[level_col].astype(str)
tree

In [None]:
# sanity check: number of observations assigned to each lipizone string (sorted, most abundant first)
tree['lipizone'].value_counts()

In [None]:
# Pair each observation listed in the color table with the lipizone string it
# received from the tree, then keep the most frequent color per lipizone.
colors = pd.read_hdf("colorzones.h5ad", key="table")
mapping = pd.DataFrame({
    'lipizone': tree.loc[colors.index, 'lipizone'],
    'lipizone_color': colors['lipizone_color'],
})


def _most_frequent(values):
    """Return the first mode of a Series."""
    return values.mode().iloc[0]


# named aggregation already yields a frame indexed by 'lipizone'
modal_mapping = mapping.groupby('lipizone').agg(
    lipizone_color=('lipizone_color', _most_frequent)
)

tree['lipizone_color'] = tree['lipizone'].map(modal_mapping['lipizone_color'])
modal_mapping

In [None]:
# how many lipizone strings share each modal color; a count above 1 means that color is ambiguous
modal_mapping['lipizone_color'].value_counts() # small note, there's one ambiguous color

In [None]:
# substitute the CCF coordinates
ccf = pd.read_hdf("allenotherbrains_full.h5ad", key="table")
ccf = ccf.loc[ccf.index.isin(metadata.index), :]
# explicit copy: the column overwrite below must act on an independent frame,
# not on a selection taken from `metadata`
metadata1 = metadata.loc[ccf.index, :].copy()
metadata1.loc[:, ccf.columns] = ccf
# the two atlases keep their original metadata rows
metadata2 = metadata.loc[metadata['Sample'].isin(['ReferenceAtlas', 'SecondAtlas']), :]

In [None]:
# stack the atlas rows (metadata2) on top of the rows whose CCF columns were replaced (metadata1)
# NOTE(review): this rebinds `metadata`, so re-running the previous cell afterwards starts from the merged table
metadata = pd.concat([metadata2, metadata1], axis=0)
metadata

In [None]:
# drop the observations whose Allen color is pure black ("#000000")
has_allen_color = metadata['allencolor'] != "#000000"
metadata = metadata[has_allen_color]
metadata.shape

In [None]:
# keep only the observations that received a path from the classifier tree
in_tree = metadata.index.isin(tree.index)
metadata = metadata[in_tree]
metadata

In [None]:
# restrict (and reorder) the tree table and the lipid matrix to the rows kept in `metadata`
tree = tree.loc[metadata.index,:]
data = data.loc[metadata.index,:]

## Assess the inter-brain consistency of lipizones

In [None]:
# Lipid matrix and metadata side by side (joined on the index), plus the
# transferred lipizone color, restricted to the rows present in `tree`.
predicted_lipizones = tree['lipizone_color']
allbrains = pd.concat([reconstructed_data_df, metadata], axis=1)
allbrains['lipizone_color'] = predicted_lipizones
allbrains = allbrains.loc[tree.index]

allbrains

In [None]:
# supertype = first 8 characters (levels) of the lipizone string
st = tree['lipizone'].str[:8].tolist()

In [None]:
# number of distinct supertypes (8-character lipizone prefixes)
len(np.unique(st)) ## ok

In [None]:
# `st` was built from `tree`, so this positional assignment relies on allbrains sharing tree's row order
allbrains['supertype'] = st

In [None]:
# lipizone proportions correlation across brains
import seaborn as sns

# rows = samples, columns = supertypes, values = fraction of the sample's observations
per_sample = allbrains.groupby(allbrains['Sample'])
result = per_sample['supertype'].value_counts(normalize=True).unstack(fill_value=0)

# sample-by-sample correlation of the proportion vectors, without the two excluded samples
excluded_samples = ['GBA1', 'CERT1']
cors = result.T.corr().drop(index=excluded_samples, columns=excluded_samples)

sns.clustermap(cors, vmin=0.5)
plt.savefig("corrofproportions_supertypes.pdf")

In [None]:
# Supertype proportions of Pregnant2 against Male2, restricted to supertypes
# below 10% in both samples. One shared mask keeps the two vectors aligned
# supertype-by-supertype (separate masks can select different supertypes and
# even different lengths).
x_props = result.loc["Pregnant2"]
y_props = result.loc["Male2"]
rare_in_both = (x_props < 0.1) & (y_props < 0.1)
plt.scatter(x_props[rare_in_both], y_props[rare_in_both], s=0.1)
plt.show() # even the furthest are not bad, good to hear

In [None]:
# Supertype proportions of Female1 against Female2, restricted to supertypes
# below 10% in both samples. The mask is built from the two plotted samples
# and shared, so the points stay aligned supertype-by-supertype.
x_props = result.loc["Female1"]
y_props = result.loc["Female2"]
rare_in_both = (x_props < 0.1) & (y_props < 0.1)
plt.scatter(x_props[rare_in_both], y_props[rare_in_both], s=0.1)
plt.show()

In [None]:
# Correlation of the Male1 and Male2 supertype proportions over the supertypes
# below 10% in both samples (a shared mask keeps the two vectors the same
# length and aligned).
x_props = result.loc["Male1"]
y_props = result.loc["Male2"]
rare_in_both = (x_props < 0.1) & (y_props < 0.1)
np.corrcoef(x_props[rare_in_both], y_props[rare_in_both])

In [None]:
# lipid x lipizone matrix correlation across brains

cols_to_group = ['Sample', 'lipizone_color']
# The lipid columns are exactly the columns of the reconstructed lipid matrix
# that `allbrains` was assembled from; taking them from there avoids relying
# on a hard-coded column count.
data_cols = reconstructed_data_df.columns
# mean lipid level per (sample, lipizone color)
result = allbrains[list(data_cols) + cols_to_group].groupby(by=cols_to_group).mean()
result

In [None]:
# One row per sample: the (lipid, lipizone color) means laid out as a single
# flat vector, with column labels of the form "<lipid>_<color>".

flattened = result.unstack(level='lipizone_color')
flat_labels = []
for lipid, color in flattened.columns:
    flat_labels.append(f'{lipid}_{color}')
flattened.columns = flat_labels
flattened

In [None]:
import seaborn as sns

# sample-by-sample correlation of the flattened lipid-by-lipizone vectors,
# without the two excluded samples
excluded_samples = ['GBA1', 'CERT1']
cors_lip = flattened.T.corr().drop(index=excluded_samples, columns=excluded_samples)

sns.clustermap(cors_lip, vmin=0.75) # all correlations are hyperhigh yet the pregnants group together!
plt.savefig("lipidometransferzones.pdf")
plt.show()

In [None]:
import pandas as pd

# For every lipizone color: the sample-by-sample correlation of the mean lipid
# profiles of that lipizone.
lipizone_colors = result.index.get_level_values('lipizone_color').unique()
color_corrs = {
    lipizone_color: result.xs(lipizone_color, level='lipizone_color').T.corr()
    for lipizone_color in lipizone_colors
}

print(color_corrs.get("#000003"))

In [None]:
# Element-wise minimum, over all lipizone colors, of the per-lipizone
# sample-by-sample correlation matrices (np.stack needs them to share one shape).
corr_matrices = list(color_corrs.values())
min_corr = np.min(np.stack([df.values for df in corr_matrices]), axis=0)

# Label rows/columns from the first matrix. The original passed the undefined
# name `mean_corr` here, which raised a NameError.
label_source = corr_matrices[0]
mincorracrosslipi = pd.DataFrame(min_corr,
                                 index=label_source.index,
                                 columns=label_source.columns)

mincorracrosslipi # ok but this is kinda obvious from how the self supervision works...
# might actually be even repurposed to scrna-seq...

## Do a fast visual check in space

In [None]:
## do a poster :)

# Read from `allbrains` directly: at this point of a top-to-bottom run `data`
# is still the lipid matrix (it is only rebound to `allbrains` in the next
# cell) and has no 'SectionID' / coordinate / color columns.
ddf = allbrains[allbrains['SectionID'] == 12.0]

# section 12 in CCF coordinates
plt.figure(figsize=(10, 8))
plt.scatter(ddf['zccf'], -ddf['yccf'], c=ddf['lipizone_color'], s=2.0, rasterized=True)
plt.axis('off')
plt.show()

# the same section in its x/y pixel coordinates
plt.figure(figsize=(10, 8))
plt.scatter(ddf['y'], -ddf['x'], c=ddf['lipizone_color'], s=2.0, rasterized=True)
plt.axis('off')
plt.show()

In [None]:
data = allbrains # note MALDI matrix is potentially messing up some checks for now
data['lipizone_color'] = tree['lipizone_color']

global_min_z = data['x'].min()
global_max_z = data['x'].max()
global_min_y = data['y'].min()
global_max_y = data['y'].max()

# One panel per section id. The grid keeps the original 28 x 5 layout and only
# grows when there are more than 140 sections (which used to raise IndexError).
n_sections = int(np.max(data['SectionID'].astype(int)))
n_cols = 5
n_rows = max(28, -(-n_sections // n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 75 * n_rows / 28))
axes = axes.flatten()

for section in range(1, n_sections + 1):
    ax = axes[section - 1]
    ax.axis('off')
    ddf = data[(data['SectionID'] == section)]
    # skip missing section ids explicitly instead of relying on a bare `except`
    if ddf.empty:
        continue

    ax.scatter(ddf['y'], -ddf['x'], c=ddf['lipizone_color'], s=0.05, rasterized=True)
    ax.set_aspect('equal')
    #ax.set_ylim(global_min_z, global_max_z)
    ax.set_xlim(global_min_y, global_max_y)
    # positional access: the frame is not indexed by integers, so ['Sample'][0]
    # is not a reliable way to take the first row
    ax.set_title(ddf['Sample'].iloc[0] + " - " + str(section))

# hide the panels left over beyond the last section
for ax in axes[n_sections:]:
    ax.axis('off')

plt.tight_layout(rect=[0, 0, 0.9, 1])
plt.savefig("allsectionsacross.pdf")
plt.show()

## Lipizones map consistently in space

In [None]:
# check some lipizones one by one to start to appreciate if the spatial patterns are maintained during label transfer - quite wow!

allbrains['bg'] = allbrains['lipizone_color'].astype("category").cat.codes

# Everything that does not depend on the drawn lipizone is computed once,
# outside the loop.
brains_to_show = ['Female1', 'Female2', 'Female3',
                  'Male1', 'Male2', 'Male3', 'Pregnant1', 'Pregnant2', 'Pregnant4']
sec1 = allbrains.loc[(allbrains['Section'] == 1.0) & (allbrains['Sample'].isin(brains_to_show)), :] # inspect one specific section
samples = sec1['Sample'].unique()
num_samples = len(samples)

# draw without replacement so the same lipizone is not plotted twice
lipizone_pool = allbrains['lipizone_color'].unique()
n_draws = min(8, len(lipizone_pool))

for alteredlipi in np.random.choice(lipizone_pool, n_draws, replace=False):

    fig, axes = plt.subplots(3, 3, figsize=(15, 15))
    axes = axes.flatten()

    for idx, samp in enumerate(samples):
        ax = axes[idx]
        xxx = sec1.loc[sec1['Sample'] == samp, :]

        # grey background: every pixel of the section, shaded by lipizone code
        ax.scatter(
            xxx['y'], -xxx['x'],
            c=xxx['bg'],
            s=0.05,
            alpha=0.7,
            cmap="Greys"
        )

        # red overlay: the pixels of the lipizone under inspection
        yyy = xxx.loc[xxx['lipizone_color'] == alteredlipi, :]
        ax.scatter(
            yyy['y'], -yyy['x'],
            c="red",
            s=1.0,
            alpha=0.7
        )

        ax.set_aspect('equal')
        ax.set_xticks([])
        ax.set_yticks([])
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_title(samp)

    # remove the panels that have no sample
    for idx in range(num_samples, len(axes)):
        fig.delaxes(axes[idx])
    plt.tight_layout()
    plt.show()

In [None]:
# check some lipizones one by one to start to appreciate if the spatial patterns are maintained during label transfer - quite wow!
# (this cell repeats the check above with a fresh random draw of lipizones)

allbrains['bg'] = allbrains['lipizone_color'].astype("category").cat.codes

# Everything that does not depend on the drawn lipizone is computed once,
# outside the loop.
brains_to_show = ['Female1', 'Female2', 'Female3',
                  'Male1', 'Male2', 'Male3', 'Pregnant1', 'Pregnant2', 'Pregnant4']
sec1 = allbrains.loc[(allbrains['Section'] == 1.0) & (allbrains['Sample'].isin(brains_to_show)), :] # inspect one specific section
samples = sec1['Sample'].unique()
num_samples = len(samples)

# draw without replacement so the same lipizone is not plotted twice
lipizone_pool = allbrains['lipizone_color'].unique()
n_draws = min(8, len(lipizone_pool))

for alteredlipi in np.random.choice(lipizone_pool, n_draws, replace=False):

    fig, axes = plt.subplots(3, 3, figsize=(15, 15))
    axes = axes.flatten()

    for idx, samp in enumerate(samples):
        ax = axes[idx]
        xxx = sec1.loc[sec1['Sample'] == samp, :]

        # grey background: every pixel of the section, shaded by lipizone code
        ax.scatter(
            xxx['y'], -xxx['x'],
            c=xxx['bg'],
            s=0.05,
            alpha=0.7,
            cmap="Greys"
        )

        # red overlay: the pixels of the lipizone under inspection
        yyy = xxx.loc[xxx['lipizone_color'] == alteredlipi, :]
        ax.scatter(
            yyy['y'], -yyy['x'],
            c="red",
            s=1.0,
            alpha=0.7
        )

        ax.set_aspect('equal')
        ax.set_xticks([])
        ax.set_yticks([])
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_title(samp)

    # remove the panels that have no sample
    for idx in range(num_samples, len(axes)):
        fig.delaxes(axes[idx])
    plt.tight_layout()
    plt.show()

## Do a fine-grained analysis of lipizone proportions by Allen region to compare the reference atlas and the second atlas, assessing the consistency and transferability of the clustering

In [None]:
# keep the observations that carry a non-missing, non-zero 'id' annotation
has_annotation = allbrains['id'].notna() & (allbrains['id'] != 0)
brains23 = allbrains.loc[has_annotation]
brains23

In [None]:
# check if the spatial patterns are respected in brain 3, i.e., if the region/lipizone relationship is the same as in brain 2

import pickle

file_path = 'allen_name_to_annots.pkl'

# lookup indexed by Allen structure name; its values are matched against the 'id' column below
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
with open(file_path, 'rb') as file:
    allen_name_to_annots = pickle.load(file)

# coarse Allen divisions (defined here but not used in this cell)
divisions = ['Olfactory areas', 'Isocortex', 'Hippocampal formation', 'Cortical subplate', 'Striatum', 'Pallidum', 'Thalamus', 'Hypothalamus', 'Midbrain', 'Hindbrain', 'Cerebellum', 'fiber tracts', 'ventricular systems']

# Fine-grained Allen region names used to label observations. Each entry is
# looked up as a key of `allen_name_to_annots` when the 'region' column is filled.
regions = [
    "Frontal pole, cerebral cortex",
    "Somatomotor areas",
    "Somatosensory areas",
    "Gustatory areas",
    "Visceral area",
    "Auditory areas",
    "Visual areas",
    "Anterior cingulate area",
    "Prelimbic area",
    "Infralimbic area",
    "Orbital area",
    "Agranular insular area",
    "Retrosplenial area",
    "Posterior parietal association areas",
    "Temporal association areas",
    "Perirhinal area",
    "Ectorhinal area",
    "Olfactory areas",
    "Ammon's horn",
    "Dentate gyrus",
    "Fasciola cinerea",
    "Induseum griseum",
    "Retrohippocampal region",
    "Layer 6b, isocortex",
    "Claustrum",
    "Endopiriform nucleus",
    "Lateral amygdalar nucleus",
    "Basolateral amygdalar nucleus",
    "Basomedial amygdalar nucleus",
    "Posterior amygdalar nucleus",
    "Striatum dorsal region",
    "Striatum ventral region",
    "Nucleus accumbens",
    "Fundus of striatum",
    "Olfactory tubercle",
    "Lateral strip of striatum",
    "Lateral septal complex",
    "Striatum-like amygdalar nuclei",
    "Pallidum",
    "Thalamus, sensory-motor cortex related",
    "Thalamus, polymodal association cortex related",
    "Periventricular zone",
    "Periventricular region",
    "Hypothalamic medial zone",
    "Hypothalamic lateral zone",
    "Median eminence",
    "Midbrain, sensory related",
    "Midbrain, motor related",
    "Midbrain, behavioral state related",
    "Pons, sensory related",
    "Pons, motor related",
    "Pons, behavioral state related",
    "Medulla, sensory related",
    "Medulla, motor related",
    "Medulla, behavioral state related",
    "Cerebellar cortex",
    "Cerebellar nuclei",
    "cranial nerves",
    "cerebellum related fiber tracts",
    "supra-callosal cerebral white matter",
    "lateral forebrain bundle system",
    "extrapyramidal fiber systems",
    "medial forebrain bundle system",
    "ventricular systems"
]

# Label every observation with its fine-grained Allen region ("General" when
# its id belongs to none of the listed regions). Later regions overwrite
# earlier ones if an id appears under several names.
brains23 = brains23.copy()  # work on an independent frame, not a selection of allbrains
brains23['region'] = "General"
for i in regions:
    # single .loc assignment: the chained form brains23['region'][mask] = i
    # writes to a temporary under pandas copy-on-write and is silently lost
    brains23.loc[brains23['id'].isin(allen_name_to_annots[i]), 'region'] = i

In [None]:
brains23_2 = brains23.loc[brains23['Sample'] == "ReferenceAtlas",:]
brains23_3 = brains23.loc[brains23['Sample'] == "SecondAtlas",:]


def _count_profile_correlation(counts_ref, counts_other):
    """Correlate two value_counts profiles after dropping categories with <= 40 hits.

    Categories surviving in only one profile are kept and filled with 0 in the
    other one, which conservatively penalizes dropout in one of the two brains.
    """
    kept_ref = counts_ref[counts_ref > 40]
    kept_other = counts_other[counts_other > 40]
    support = np.union1d(kept_other.index, kept_ref.index)
    kept_ref = kept_ref.reindex(support).fillna(0.0)
    kept_other = kept_other.reindex(support).fillna(0.0)
    return np.corrcoef(kept_ref, kept_other)[0, 1]


# per region: correlation of the lipizone composition between the two atlases
regioncors = [
    _count_profile_correlation(
        brains23_2.loc[brains23_2['region'] == region, 'lipizone_color'].value_counts(),
        brains23_3.loc[brains23_3['region'] == region, 'lipizone_color'].value_counts(),
    )
    for region in brains23_2['region'].unique()
]

# per lipizone: correlation of the regional distribution between the two atlases
lipizonecors = [
    _count_profile_correlation(
        brains23_2.loc[brains23_2['lipizone_color'] == lipizone, 'region'].value_counts(),
        brains23_3.loc[brains23_3['lipizone_color'] == lipizone, 'region'].value_counts(),
    )
    for lipizone in brains23_2['lipizone_color'].unique()
]

plt.hist(regioncors, bins=20) # regional patterns are fundamentally good
plt.show()

plt.hist(lipizonecors, bins=20) # most lipizones are very consistent, a few are definitely not. but it's really good since we are using fine-grained Allen regions!
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde


def _plot_correlation_density(values, xlabel, outfile):
    """Draw a density histogram with a Gaussian KDE overlay and save it.

    Non-finite correlations are dropped first. Returns the cleaned array.
    """
    clean = np.array(values)[np.isfinite(values)]

    fig, ax = plt.subplots()
    ax.hist(clean, bins=20, density=True, facecolor='none', edgecolor='black')

    density = gaussian_kde(clean)
    x_vals = np.linspace(np.min(clean), np.max(clean), 200)
    ax.plot(x_vals, density(x_vals), color='red', linewidth=2)

    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_color('black')
    ax.spines['bottom'].set_color('black')
    ax.set_xlabel(xlabel)
    # the histogram is drawn with density=True, so the axis shows a density, not counts
    ax.set_ylabel('Density')
    plt.savefig(outfile)
    plt.show()
    return clean


regioncors_clean = _plot_correlation_density(
    regioncors,
    'Region-region correlation across lipizones between two atlases',
    "regioregiotwoatlases.pdf",
)

lipizonecors_clean = _plot_correlation_density(
    lipizonecors,
    'Lipizone-lipizone correlation across regions between two atlases',
    "lipilipitwoatlases.pdf",
)

In [None]:
# Lipizones whose regional distribution correlates below 0.4 between the two
# atlases, and their share (in %) of all reference-atlas lipizones. NaN
# correlations compare as False and are therefore not counted as failures.
reference_lipizones = brains23_2['lipizone_color'].unique()
low_correlation = np.array(lipizonecors) < 0.4
failures = reference_lipizones[low_correlation]
len(failures) / len(reference_lipizones) * 100 # 2% lipizones fail to transfer efficiently

In [None]:
lipizone = '#fdbc6e' # to check the few failures... given cerebellum is hard to register and the olfactory parts are better in the second atlas, a few failure cases are almost tolerable

# regional pixel counts of this lipizone in the second (vcb3) and reference (vcb2) atlas
vcb3 = brains23_3.loc[brains23_3['lipizone_color'] == lipizone, 'region'].value_counts()
vcb2 = brains23_2.loc[brains23_2['lipizone_color'] == lipizone, 'region'].value_counts()

vcb3 = vcb3[vcb3 > 40]
vcb2 = vcb2[vcb2 > 40]

# only the regions that pass the count threshold in both atlases are shown
indexes = np.intersect1d(vcb3.index, vcb2.index)
vcb2 = vcb2.loc[indexes]
vcb3 = vcb3.loc[indexes]

labels = vcb2.index
plt.scatter(vcb2, vcb3)
# iterate over the values directly: the Series are indexed by region name, so
# integer lookups such as vcb2[i] are label lookups, not positions
for label, count2, count3 in zip(labels, vcb2.values, vcb3.values):
    plt.annotate(label, (count2, count3), xytext=(5, 5), textcoords='offset points')
plt.show()

## Check 'em on a tSNE

In [None]:
# load the precomputed t-SNE table; the cells below index it with the same row labels as `metadata`
tsne = pd.read_hdf("tsne_df.h5ad", key="table")
tsne

In [None]:
# restrict/reorder metadata to the t-SNE rows
# NOTE(review): .loc raises KeyError if tsne contains rows that were filtered out of metadata above — confirm
metadata = metadata.loc[tsne.index,:]

In [None]:
# attach the transferred lipizone color (aligned on the index; rows missing from `tree` become NaN)
metadata['lipizone_color'] = tree['lipizone_color']

In [None]:
# leave out the GBA1 and CERT1 samples
is_excluded = metadata['Sample'].isin(['GBA1', 'CERT1'])
metadata = metadata[~is_excluded]
metadata

In [None]:
# keep the t-SNE rows of the remaining observations, in metadata order
tsne = tsne.loc[metadata.index,:]
tsne

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42

samples = metadata['Sample'].unique()
num_samples = len(samples)
first_sample_idx = metadata[metadata['Sample'] == samples[0]].index
# lipizones present in the first sample; missing colors are dropped because
# they cannot be used to build a file name
unique_lipizones = metadata.loc[first_sample_idx, 'lipizone_color'].dropna().unique()

# Grid: 4 columns, at least the original 3 rows, more if there are more than 12 samples.
num_cols = 4
num_rows = max(3, -(-num_samples // num_cols))
total_axes = num_rows * num_cols

# The grey background of each panel does not depend on the chosen lipizone,
# so the (every 5th point) sub-sample is computed once per sample.
background_by_sample = {
    sample: tsne.loc[metadata[metadata['Sample'] == sample].index].iloc[::5, :]
    for sample in samples
}

# draw without replacement: a repeated draw would only overwrite the same PDF
n_draws = min(15, len(unique_lipizones))
for chosen_lipizone in np.random.choice(unique_lipizones, n_draws, replace=False):
    fig, axs = plt.subplots(num_rows, num_cols, figsize=(8 * num_cols, 6 * num_rows), squeeze=False)

    for i, sample in enumerate(samples):
        ax = axs[i // num_cols, i % num_cols]

        tsne_sample = background_by_sample[sample]
        ax.scatter(tsne_sample.iloc[:, 0], tsne_sample.iloc[:, 1],
                   color='gray', s=5.0, alpha=0.5, label='All points', rasterized=True)

        chosen_idx = metadata[(metadata['Sample'] == sample) &
                              (metadata['lipizone_color'] == chosen_lipizone)].index
        tsne_chosen = tsne.loc[chosen_idx].iloc[::5, :]
        ax.scatter(tsne_chosen.iloc[:, 0], tsne_chosen.iloc[:, 1],
                   color='red', s=10.0, alpha=1.0, label=f'Lipizone: {chosen_lipizone}', rasterized=True)

        for spine in ax.spines.values():
            spine.set_visible(False)

        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_title(sample, fontsize=36)

    # blank the panels that have no sample
    for j in range(num_samples, total_axes):
        axs[j // num_cols, j % num_cols].axis('off')

    plt.tight_layout()
    # NOTE(review): `sample` is the last sample of the loop above although the figure
    # shows every sample; the file name is kept unchanged for compatibility — confirm.
    plt.savefig(chosen_lipizone+"_"+sample+".pdf")
    plt.show()
    # release the figure; many large figures would otherwise stay in memory
    plt.close(fig)

## Reproduce core observations on the second atlas (boring code!)

In [None]:
# NOTE(review): `test` is not defined anywhere in the visible notebook, so this raises NameError
# on a fresh Restart & Run All — presumably the second-atlas table; define it explicitly.
brain3 = test

In [None]:
import pickle

file_path = 'allen_name_to_annots.pkl'

# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
with open(file_path, 'rb') as file:
    allen_name_to_annots = pickle.load(file)

divisions = ['Olfactory areas', 'Isocortex', 'Hippocampal formation', 'Cortical subplate', 'Striatum', 'Pallidum', 'Thalamus', 'Hypothalamus', 'Midbrain', 'Hindbrain', 'Cerebellum', 'fiber tracts', 'ventricular systems']

# Label every observation with its coarse Allen division ("General" when its
# id belongs to none of them).
brain3['division'] = "General"
for i in divisions:
    # single .loc assignment: the chained form brain3['division'][mask] = i
    # writes to a temporary under pandas copy-on-write and is silently lost
    brain3.loc[brain3['id'].isin(allen_name_to_annots[i]), 'division'] = i

brain3['division'].value_counts()

# keep the isocortex observations with zccf above the mean zccf
midp = brain3['zccf'].mean()

data = brain3.loc[(brain3['division'] == "Isocortex") & (brain3['zccf'] > midp),:]

data = data.loc[data['Section'] > 4,:] #### anterior sections are a bit their own world

unique_sections = data["Section"].unique()

In [None]:
# select AP positions and --> ALL layers
focus = data[data["SectionID"].isin([49, 50, 52])]

# keep only abundant lipizones (more than 150 observations in the selection)
color_counts = focus["lipizone_color"].value_counts()
unique_colors = color_counts.index[color_counts > 150]
focus = focus.loc[focus['lipizone_color'].isin(unique_colors),:]

# find clusters of colocalizing lipizones (organizational archetypes)
acronym_by_color = pd.crosstab(focus['acronym'], focus['lipizone_color'])

# fraction of each lipizone falling in each acronym, divided by the acronym's mean
normalized_df1 = acronym_by_color / acronym_by_color.sum()
normalized_df1 = (normalized_df1.T / normalized_df1.T.mean()).T

# fraction of each acronym covered by each lipizone, divided by the lipizone's mean
cmat = acronym_by_color.T
normalized_df2 = cmat / cmat.sum()
normalized_df2 = (normalized_df2.T / normalized_df2.T.mean())

# product of the two views (acronym x lipizone); lipizones become the observations
normalized_df = normalized_df1 * normalized_df2
tc = normalized_df.T
adata = anndata.AnnData(X=tc)
sc.pp.neighbors(adata, use_rep='X')
sc.tl.leiden(adata, resolution=2.0)
cluster_labels = adata.obs['leiden']

In [None]:
focus = data[data["SectionID"].isin([49, 50, 52])]

# keep only abundant lipizones (more than 150 observations in the selection)
color_counts = focus["lipizone_color"].value_counts()
unique_colors = color_counts.index[color_counts > 150]
focus = focus.loc[focus['lipizone_color'].isin(unique_colors),:]

# attach the Leiden cluster of each lipizone color, to plot in groups and eyeball patterns
color_to_cluster = cluster_labels.to_dict()
focus['leiden_cluster'] = focus['lipizone_color'].map(color_to_cluster)
unique_clusters = sorted(focus['leiden_cluster'].unique())
sections = focus["SectionID"].unique()

# only section 50 is plotted
focus = focus.loc[focus["SectionID"] == 50,:]
focus

In [None]:
# One panel per Leiden cluster of lipizones, drawn on the section kept in `focus`.
n_rows = 3
n_cols = 6
fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols * 4, n_rows * 4))
axs = axs.flatten()  # Flatten the 2D array of axes for easier indexing

# Loop-invariant work, hoisted out of the per-cluster loop:
# mean enrichment of every acronym per cluster, and the boundary pixels.
a1 = normalized_df.T
a2 = cluster_labels
findacro = a1.groupby(a2).mean()
filtered_section_contour = focus.loc[focus['boundary'] == 1, :]

# Plot clusters (clusters beyond the number of available panels are not drawn)
for cluster_idx, cluster in enumerate(unique_clusters[:len(axs)]):
    ax = axs[cluster_idx]

    cluster_colors = focus[focus['leiden_cluster'] == cluster]['lipizone_color'].unique()

    # acronyms most enriched in this cluster (mean score above 36), highest first;
    # computed for inspection only — the panel title below is the cluster label
    candidates = findacro.loc[cluster,:].sort_values()[::-1]
    candidates = candidates[candidates > 36].index.values.astype(str) #25
    titlenow = ' + '.join(candidates)

    # one scatter per lipizone of the cluster, drawn in the lipizone's own color
    for color in cluster_colors:
        color_section = focus[focus['lipizone_color'] == color]
        ax.scatter(
            color_section['z_index'],
            -color_section['y_index'],
            c=color,
            s=7,
            alpha=1,
            zorder=1,
            rasterized=True
        )

    # region boundaries on top
    ax.scatter(
        filtered_section_contour['z_index'],
        -filtered_section_contour['y_index'],
        c='black',
        s=0.5,
        rasterized=True,
        zorder=2,
        alpha=0.5
    )

    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title(cluster, fontsize=15)

# Remove empty subplots if there are fewer clusters than subplot spaces
for idx in range(len(unique_clusters), len(axs)):
    fig.delaxes(axs[idx])

plt.tight_layout()

plt.show()

In [None]:
# ok but i'd rather do on atlas and plot on brain 3 cmon obv

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42

atlas = pd.read_parquet("atlas.parquet")
atlas

import pickle

file_path = 'allen_name_to_annots.pkl'

# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
with open(file_path, 'rb') as file:
    allen_name_to_annots = pickle.load(file)

divisions = ['Olfactory areas', 'Isocortex', 'Hippocampal formation', 'Cortical subplate', 'Striatum', 'Pallidum', 'Thalamus', 'Hypothalamus', 'Midbrain', 'Hindbrain', 'Cerebellum', 'fiber tracts', 'ventricular systems']

# Label every atlas observation with its coarse Allen division ("General" when
# its id belongs to none of them).
atlas['division'] = "General"
for i in divisions:
    # single .loc assignment: the chained form atlas['division'][mask] = i
    # writes to a temporary under pandas copy-on-write and is silently lost
    atlas.loc[atlas['id'].isin(allen_name_to_annots[i]), 'division'] = i

atlas['division'].value_counts()

# Keep isocortex pixels whose zccf lies above the atlas-wide mean zccf
# (presumably one hemisphere -- confirm against the coordinate convention).
midp = atlas['zccf'].mean()

data = atlas.loc[(atlas['division'] == "Isocortex") & (atlas['zccf'] > midp),:]

data = data.loc[data['Section'] > 4,:] #### anterior sections are a bit their own world

unique_sections = data["Section"].unique()

# select AP positions and --> ALL layers
# Explicit copy: a new column is added to `focus` below, so it must not be a
# slice of `data`.
focus = data[data["Section"].isin([9, 11, 12])].copy()

# keep only abundant lipizones (more than 150 pixels in the selected sections)
color_counts = focus["lipizone_color"].value_counts()
unique_colors = color_counts.index[color_counts > 150]
focus = focus.loc[focus['lipizone_color'].isin(unique_colors),:].copy()

# find clusters of colocalizing lipizones (organizational archetypes)
# normalized_df1: acronym x colour. Each colour column is divided by its total,
# then each acronym row is divided by its mean over colours.
cmat = pd.crosstab(focus['acronym'], focus['lipizone_color'])
normalized_df1 = cmat / cmat.sum() # fraction
normalized_df1 = (normalized_df1.T / normalized_df1.T.mean()).T
# normalized_df2: acronym x colour. Each acronym is divided by its total,
# then each colour column is divided by its mean over acronyms.
cmat = cmat.T
normalized_df2 = cmat / cmat.sum() # fraction
normalized_df2 = (normalized_df2.T / normalized_df2.T.mean())
normalized_df = normalized_df1 * normalized_df2
tc = normalized_df.T
adata = anndata.AnnData(X=tc)
sc.pp.neighbors(adata, use_rep='X')
sc.tl.leiden(adata, resolution=2.0)
cluster_labels = adata.obs['leiden']

# plot in groups to eyeball patterns
color_to_cluster = pd.Series(cluster_labels.values, index=cluster_labels.index).to_dict()
focus['leiden_cluster'] = focus['lipizone_color'].map(color_to_cluster)
# Leiden labels are strings of integers: sort them numerically so panels come
# out as 0, 1, 2, ... rather than 0, 1, 10, 11, ...; drop unmapped (NaN) labels,
# which cannot be sorted together with strings.
unique_clusters = sorted(focus['leiden_cluster'].dropna().unique(), key=int)
sections = focus["Section"].unique()

focus = focus.loc[focus["Section"] == 11,:]

# One panel per Leiden cluster (at most n_rows * n_cols panels): the lipizones
# of that cluster drawn in their own colours on top of the section outline.
n_rows = 3
n_cols = 6
fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols * 4, n_rows * 4))
axs = axs.flatten()  # Flatten the 2D array of axes for easier indexing

# Mean score of every acronym within each Leiden cluster. This does not depend
# on the panel being drawn, so it is computed once instead of once per cluster.
findacro = normalized_df.T.groupby(cluster_labels).mean()

# Boundary pixels are the same on every panel.
filtered_section_contour = focus.loc[focus['boundary'] == 1, :]

# Plot clusters
for cluster_idx, cluster in enumerate(unique_clusters):
    if cluster_idx >= len(axs):  # no subplot left for further clusters
        break
    ax = axs[cluster_idx]

    cluster_colors = focus[focus['leiden_cluster'] == cluster]['lipizone_color'].unique()

    # Acronyms whose mean score in this cluster exceeds 36, highest first.
    # Not used for the panel title at the moment, but kept because a later cell
    # reads `candidates`.
    candidates = findacro.loc[cluster,:].sort_values()[::-1]
    candidates = candidates[candidates > 36].index.values.astype(str) #25
    titlenow = ' + '.join(candidates)

    # Plot for section 11
    for color in cluster_colors:
        color_section = focus[focus['lipizone_color'] == color]
        ax.scatter(
            color_section['z_index'],
            -color_section['y_index'],
            c=color,
            s=7,
            alpha=1,
            zorder=1,
            rasterized=True
        )

    ax.scatter(
        filtered_section_contour['z_index'],
        -filtered_section_contour['y_index'],
        c='black',
        s=0.5,
        rasterized=True,
        zorder=2,
        alpha=0.5
    )

    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title(cluster, fontsize=15)

# Remove empty subplots if there are fewer clusters than subplot spaces
for idx in range(len(unique_clusters), len(axs)):
    fig.delaxes(axs[idx])

plt.tight_layout()
plt.show()

In [None]:
import pickle

file_path = 'allen_name_to_annots.pkl'

# NOTE(review): pickle.load can execute arbitrary code; only load this file if it is trusted.
with open(file_path, 'rb') as file:
    allen_name_to_annots = pickle.load(file)

divisions = ['Olfactory areas', 'Isocortex', 'Hippocampal formation', 'Cortical subplate', 'Striatum', 'Pallidum', 'Thalamus', 'Hypothalamus', 'Midbrain', 'Hindbrain', 'Cerebellum', 'fiber tracts', 'ventricular systems']

# Label every brain3 pixel with its division ("General" when no division matches).
brain3['division'] = "General"
for i in divisions:
    # Single .loc assignment instead of chained indexing, which is not
    # guaranteed to write back into `brain3`.
    brain3.loc[brain3['id'].isin(allen_name_to_annots[i]), 'division'] = i

# Keep isocortex pixels whose zccf lies above the mean zccf of brain3.
midp = brain3['zccf'].mean()

data = brain3.loc[(brain3['division'] == "Isocortex") & (brain3['zccf'] > midp),:]

data = data.loc[data['Section'] > 4,:] #### anterior sections are a bit their own world

unique_sections = data["Section"].unique()

# select AP positions and --> ALL layers
# (this selection used to be written twice in a row; once is enough).
# Explicit copy because columns are added to `focus` below and in later cells.
focus = data[data["SectionID"].isin([49, 50, 52])].copy()

# keep only abundant lipizones (more than 150 pixels in the selected sections)
color_counts = focus["lipizone_color"].value_counts()
unique_colors = color_counts.index[color_counts > 150]
focus = focus.loc[focus['lipizone_color'].isin(unique_colors),:].copy()

# Transfer the cluster labels through the lipizone colour; `color_to_cluster`
# comes from an earlier cell. Colours missing from that mapping become NaN.
focus['leiden_cluster'] = focus['lipizone_color'].map(color_to_cluster)
#focus['leiden_cluster'] = focus['leiden_cluster'].fillna('17')

In [None]:
# Coarse lipizone label: the first five characters of the lipizone name.
focus['lev5'] = focus['lipizone'].map(lambda name: name[:5])
# Contingency table lev5 x leiden_cluster, with every row scaled to sum to 1.
lev5_by_cluster = pd.crosstab(focus['lev5'], focus['leiden_cluster'])
cmnow = lev5_by_cluster.div(lev5_by_cluster.sum(axis=1), axis=0)
cmnow

In [None]:
# For every lev5 group (row of cmnow), the leiden cluster holding the largest share.
cmnow.idxmax(axis=1)

In [None]:
# Fallback label per pixel: the dominant leiden cluster of the pixel's lev5 group.
focus['leiden_cluster2'] = focus['lev5'].map(cmnow.idxmax(axis=1))
focus['leiden_cluster2']

In [None]:
# Pixels without a directly transferred cluster take the lev5-based fallback.
missing_cluster = focus['leiden_cluster'].isna()
focus.loc[missing_cluster, 'leiden_cluster'] = focus.loc[missing_cluster, 'leiden_cluster2']
focus

In [None]:
# Restrict the brain3 cortex pixels to SectionID 50 for plotting.
# NOTE: this overwrites `focus`; the other selected sections are gone afterwards.
focus = focus.loc[focus["SectionID"] == 50,:]
# Drop labels that are still missing (NaN cannot be sorted together with
# strings) and sort the string labels numerically: 0, 1, 2, ... not 0, 1, 10, ...
unique_clusters = sorted(focus['leiden_cluster'].dropna().unique(), key=int)

# One panel per cluster (at most n_rows * n_cols panels).
n_rows = 3
n_cols = 6
fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols * 4, n_rows * 4))
axs = axs.flatten()  # Flatten the 2D array of axes for easier indexing

# Boundary pixels are the same on every panel.
filtered_section_contour = focus.loc[focus['boundary'] == 1, :]

# Plot clusters
# (The former `titlenow = ' + '.join(candidates)` line was removed: in this cell
# `candidates` was only defined inside a disabled string block, so the line
# depended on a variable left over from an earlier cell and its result was
# never used.)
for cluster_idx, cluster in enumerate(unique_clusters):
    if cluster_idx >= len(axs):  # no subplot left for further clusters
        break
    ax = axs[cluster_idx]

    cluster_colors = focus[focus['leiden_cluster'] == cluster]['lipizone_color'].unique()

    # Plot for SectionID 50: every lipizone of this cluster in its own colour
    for color in cluster_colors:
        color_section = focus[focus['lipizone_color'] == color]
        ax.scatter(
            color_section['z_index'],
            -color_section['y_index'],
            c=color,
            s=7,
            alpha=1,
            zorder=1,
            rasterized=True
        )

    ax.scatter(
        filtered_section_contour['z_index'],
        -filtered_section_contour['y_index'],
        c='black',
        s=0.5,
        rasterized=True,
        zorder=2,
        alpha=0.5
    )

    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title(cluster, fontsize=15)

# Remove empty subplots if there are fewer clusters than subplot spaces
for idx in range(len(unique_clusters), len(axs)):
    fig.delaxes(axs[idx])

plt.tight_layout()
plt.savefig("cortex_vignette1_SUPPL_BRAIN3.pdf")
plt.show()

In [None]:
# SectionIDs used for the cerebellum panels.
filtered_sections = [66,67,68,69,70,71,72,73]

# Explicit copy: the level_* columns of `cb` are rewritten in the next cell,
# which must neither touch `brain3` nor operate on an ambiguous slice.
cb = brain3.loc[(brain3['SectionID'].isin(filtered_sections)),:].copy()
cb

In [None]:
# Make the level labels cumulative: level_i becomes the string concatenation of
# the (already cumulative) level_{i-1} and the original level_i, for i = 2..11.
# NOTE: not idempotent -- running this cell a second time prepends the parent
# label again; re-create `cb` before re-running.
for i in range(2, 12):
    cb['level_'+str(i)] = cb['level_'+str(i-1)].astype(str) + cb['level_'+str(i)].astype(str)
    
cb

In [None]:
unique_sections = cb['Section'].unique()

# Fix the level_4 -> colour encoding once, over all plotted pixels. Deriving the
# category codes separately inside each section gave the same level_4 label a
# different colour from panel to panel whenever a section lacked some labels.
level4_categories = cb.loc[cb['level_2'] == "22", 'level_4'].astype("category").cat.categories
level4_code_max = max(len(level4_categories) - 1, 1)

fig, axs = plt.subplots(4, 8, figsize=(32, 16))
axs = axs.flatten()

for i, section_value in enumerate(unique_sections):
    if i >= len(axs):
        break
    ax = axs[i]
    section = cb[cb["Section"] == section_value]

    # Only pixels under the level_2 label "22" are coloured.
    filtered_section = section.loc[(section['level_2'] == "22"),:]
    level4_codes = pd.Categorical(filtered_section['level_4'], categories=level4_categories).codes

    # Same vmin/vmax on every panel so that a code always maps to the same tab20 colour.
    ax.scatter(filtered_section['z_index'], -filtered_section['y_index'],
                    c=level4_codes, cmap="tab20", vmin=0, vmax=level4_code_max, s=0.2,
                    alpha=1, zorder=1, rasterized=True)

    # Section outline.
    filtered_section_contour = section.loc[section['boundary'] == 1,:]
    ax.scatter(filtered_section_contour['z_index'], -filtered_section_contour['y_index'],
                    c='black', s=0.01, rasterized=True, zorder=2, alpha=0.9)

    ax.set_aspect('equal')

for ax in axs:
    ax.axis('off')

plt.tight_layout()
plt.savefig("cerebellumbrain3.pdf")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Named matplotlib colours, handed out to the level_4 labels in sorted order.
custom_colors = ['darkred', 'teal', 'brown', 'purple', 'orange', 'olive', 'navy', 'maroon', 'blue', 'green', 'pink', 'red', 'yellow', 'black', 'gray', 'lightblue']

unique_level4 = sorted(cb['level_4'].unique())

# Refuse to continue when the palette is too short for a one-to-one mapping.
if len(custom_colors) < len(unique_level4):
    raise ValueError("Not enough colors in custom_colors to ensure a one-to-one mapping for level_4 categories!")

# zip stops at the shorter sequence, which the check above guarantees is unique_level4.
color_mapping = dict(zip(unique_level4, custom_colors))

print("Custom color mapping for level_4:")
for level4_label, color_name in color_mapping.items():
    print(f"{level4_label}: {color_name}")


# One panel per section, coloured through color_mapping.
unique_sections = cb['Section'].unique()

fig, axs = plt.subplots(4, 8, figsize=(32, 16))
axs = axs.flatten()

# Sections beyond the number of available axes are not drawn.
for ax, section_value in zip(axs, unique_sections):
    section = cb[cb["Section"] == section_value]

    # Only pixels under the level_2 label "22" are coloured.
    filtered_section = section.loc[section['level_2'] == "22", :]
    colors = filtered_section['level_4'].map(color_mapping)

    ax.scatter(
        filtered_section['z_index'], -filtered_section['y_index'],
        color=colors, s=0.2, alpha=1, zorder=1, rasterized=True,
    )

    # Section outline in black.
    filtered_section_contour = section.loc[section['boundary'] == 1, :]
    ax.scatter(
        filtered_section_contour['z_index'], -filtered_section_contour['y_index'],
        color='black', s=0.01, rasterized=True, zorder=2, alpha=0.9,
    )

    ax.set_aspect('equal')

for ax in axs:
    ax.axis('off')

plt.tight_layout()
plt.show()


In [None]:
# Keep only the pixels whose level_4 label was assigned 'gray' or 'lightblue'
# in color_mapping. NOTE: this overwrites `cb`, so every later cell sees only
# this subset.
cb = cb.loc[cb['level_4'].map(color_mapping).isin(['gray', 'lightblue']),:]

fig, axs = plt.subplots(4, 8, figsize=(32, 16))
axs = axs.flatten()

# `unique_sections` is the array computed in an earlier cell, before `cb` was filtered.
for i, section_value in enumerate(unique_sections):
    if i >= len(axs):
        break
    ax = axs[i]
    section = cb[cb["Section"] == section_value]
    
    # Filter for the desired condition on level_2.
    filtered_section = section.loc[section['level_2'] == "22", :]
    
    # Map each level_4 value to its corresponding custom color.
    colors = filtered_section['level_4'].map(color_mapping)
    
    ax.scatter(filtered_section['z_index'], -filtered_section['y_index'],
               color=colors, s=0.2, alpha=1, zorder=1, rasterized=True)
    
    # Plot boundary contours in black.
    # (only boundary pixels that survived the filter on `cb` above are drawn)
    filtered_section_contour = section.loc[section['boundary'] == 1, :]
    ax.scatter(filtered_section_contour['z_index'], -filtered_section_contour['y_index'],
               color='black', s=0.01, rasterized=True, zorder=2, alpha=0.9)
    
    ax.set_aspect('equal')

for ax in axs:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Lipid columns to display, one figure per lipid.
tocolor = ['SM 42:3;O2', 'Cer 40:0;O2', 'HexCer 36:1:O2', 'Cer 40:2;O2',
       'PE O-34:2', 'PC 38:5', 'PC 32:1', 'PS 42:9', 'PC 31:0 PE 34:0',
       'SM 36:2;O2']

for currentLipid in tocolor:

    # Colour limits: median over sections of the per-section 5th and 95th
    # percentiles (the '2-perc' / '98-perc' column names are historical).
    results = []
    for section_label in cb['Section'].unique():
        intensities = cb.loc[cb['Section'] == section_label, currentLipid]
        results.append([section_label, intensities.quantile(0.05), intensities.quantile(0.95)])
    percentile_df = pd.DataFrame(results, columns=['Section', '2-perc', '98-perc'])
    med2p = percentile_df['2-perc'].median()
    med98p = percentile_df['98-perc'].median()

    fig, axs = plt.subplots(1, 6, figsize=(24, 4))  # Single row, 6 columns
    axs = axs.flatten()

    # Skip the first two entries of unique_sections.
    filtered_sections = unique_sections[2:]

    for i, section_value in enumerate(filtered_sections):
        ax = axs[i]
        section = brain3[brain3["Section"] == section_value]

        # Pixels of this section that are also present in cb.
        in_cb = section.index.isin(cb.index)
        filtered_section = section.loc[in_cb, :]

        # Lipid intensity on a shared colour scale.
        ax.scatter(
            filtered_section['z_index'], -filtered_section['y_index'],
            c=filtered_section[currentLipid], cmap="plasma", s=0.2, alpha=1,
            zorder=1, rasterized=True, vmin=med2p, vmax=med98p,
        )

        # Section outline.
        filtered_section_contour = section.loc[section['boundary'] == 1, :]
        ax.scatter(
            filtered_section_contour['z_index'], -filtered_section_contour['y_index'],
            c='black', s=0.01, rasterized=True, zorder=2, alpha=0.9,
        )

        ax.set_aspect('equal')

    # Remove unused axes
    for j in range(len(filtered_sections), len(axs)):
        fig.delaxes(axs[j])

    # Remove axes for clarity
    for ax in axs:
        ax.axis('off')

    plt.suptitle(currentLipid)
    plt.tight_layout()
    plt.show()

In [None]:
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
import pandas as pd

# Requires cb, brain3 and unique_sections from earlier cells.

# Lipid columns to export, one PDF page per lipid.
tocolor = ['SM 42:3;O2', 'Cer 40:0;O2', 'HexCer 36:1:O2', 'Cer 40:2;O2',
           'PE O-34:2', 'PC 38:5', 'PC 32:1', 'PS 42:9', 'PC 31:0 PE 34:0',
           'SM 36:2;O2']

# Create a PdfPages object to save multiple pages (figures)
with PdfPages('reproddoublelayergranules_brain3.pdf') as pdf:
    for currentLipid in tocolor:

        # Colour limits: median over sections of the per-section 5th and 95th
        # percentiles (the '2-perc' / '98-perc' names are historical).
        results = []

        for section in cb['Section'].unique():
            subset = cb[cb['Section'] == section]

            perc_2 = subset[currentLipid].quantile(0.05)  # try enhancing contrast a bit
            perc_98 = subset[currentLipid].quantile(0.95)

            results.append([section, perc_2, perc_98])
        percentile_df = pd.DataFrame(results, columns=['Section', '2-perc', '98-perc'])
        med2p = percentile_df['2-perc'].median()
        med98p = percentile_df['98-perc'].median()

        fig, axs = plt.subplots(1, 6, figsize=(24, 4))  # Single row, 6 columns
        axs = axs.flatten()

        # Skip the first two entries of unique_sections.
        filtered_sections = unique_sections[2:]  # Alternatively: [sec for sec in unique_sections if sec in plotnow][:6]

        for i, section_value in enumerate(filtered_sections):
            ax = axs[i]
            section = brain3[brain3["Section"] == section_value]

            # Filter specific level: ensure we're using the same indices as in cb
            filtered_section = section.loc[section.index.isin(cb.index), :]

            # The scatter handle is deliberately not stored: binding it to `sc`
            # replaced the `scanpy as sc` module alias for the rest of the notebook.
            ax.scatter(filtered_section['z_index'], -filtered_section['y_index'],
                       c=filtered_section[currentLipid], cmap="plasma", s=0.2, alpha=1,
                       zorder=1, rasterized=True, vmin=med2p, vmax=med98p)

            # Plot boundary
            filtered_section_contour = section.loc[section['boundary'] == 1, :]
            ax.scatter(filtered_section_contour['z_index'], -filtered_section_contour['y_index'],
                       c='black', s=0.01, rasterized=True, zorder=2, alpha=0.9)

            ax.set_aspect('equal')

        # Remove unused axes
        for j in range(len(filtered_sections), len(axs)):
            fig.delaxes(axs[j])

        # Remove axes for clarity
        for ax in axs:
            ax.axis('off')

        plt.suptitle(currentLipid)
        plt.tight_layout()

        # Save the current figure to the PDF file
        pdf.savefig(fig)
        plt.close(fig)  # Close the figure to free up memory


In [None]:
# Display the brain3 table for a quick look.
brain3

In [None]:
# All pixels of SectionID 52, drawn in their lipizone colour.
sec52 = brain3[brain3['SectionID'] == 52]
ddf = sec52
plt.figure(figsize=(10, 8))
plt.scatter(
    ddf['zccf'],
    -ddf['yccf'],
    c=ddf['lipizone_color'],
    s=0.4,
    rasterized=True,
)
plt.axis('off')
plt.savefig("transferred.pdf")
plt.show()

In [None]:
# Zoom on a window of section 52: 4 < zccf < 8, 1 < yccf < 2, level_1 equal to 1.
in_z_window = (sec52['zccf'] > 4) & (sec52['zccf'] < 8)
in_y_window = (sec52['yccf'] > 1) & (sec52['yccf'] < 2)
is_level1 = sec52['level_1'] == 1
ddf = sec52.loc[in_z_window & in_y_window & is_level1,:]

plt.figure(figsize=(10, 8))
plt.scatter(
    ddf['zccf'],
    -ddf['yccf'],
    c=ddf['lipizone_color'],
    s=40,
    rasterized=True,
)
#plt.axis('off')
plt.show()

In [None]:
# Lipid column names plotted panel by panel in the next cell (18 entries).
toplot = [
    "Cer 42:1;O2",
    "PE O-36:0 PE O-38:3",
    "LPC 16:0",
    "Cer 40:1;O2",
    "PE O-38:2",
    "PE O-36:1",
    "PS 40:2",
    "LPE 22:1",
    "PG 39:0",
    "PS 36:3",
    "PC 38:5",
    "PG 34:1",
    "PI 34:1",
    "PC 38:7",
    "PE O-38:7",
    "PA 38:5",
    "Cer 36:1;O2",
    "PS 38:5"
]


In [None]:
# 6 x 3 grid: one panel per lipid in `toplot`, drawn on the pixels in `ddf`.
dot_size = 0.2
num_plots = 18
rows, cols = 6, 3
fig, axes = plt.subplots(rows, cols, figsize=(5, 8))
axes = axes.flatten()
wm_voxels_sub = ddf

for iii in range(num_plots):
    ax = axes[iii]

    # Panels without a lipid are simply blanked.
    if iii >= len(toplot):
        ax.axis('off')
        continue

    lipid_name = toplot[iii]
    intensities = wm_voxels_sub[lipid_name]

    # The colour scale is capped at the 90th percentile of this lipid.
    sc1 = ax.scatter(
        wm_voxels_sub['z_index'],
        -wm_voxels_sub['y_index'],
        c=intensities,
        cmap='inferno',
        s=2,
        alpha=0.8,
        vmax=np.percentile(intensities, 90),
        rasterized=True,
    )

    ax.set_title(lipid_name, fontsize=8)
    ax.set_aspect('equal')
    ax.autoscale()  # let matplotlib choose the axis limits
    ax.tick_params(axis='both', which='both', length=0, labelbottom=False, labelleft=False)
    ax.axis('off')

plt.tight_layout()
plt.savefig("difflipids_axons.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# Make the level labels of brain3 cumulative: level_i becomes the string
# concatenation of the (already cumulative) level_{i-1} and the original level_i.
# NOTE: this mutates brain3 in place and is not idempotent -- running the cell
# twice prepends the parent label a second time.
for i in range(2, 12):
    brain3['level_'+str(i)] = brain3['level_'+str(i-1)].astype(str) + brain3['level_'+str(i)].astype(str)

In [None]:
# Pixels whose cumulative level_3 label is '111' (the name suggests callosal white matter -- confirm).
callosalwm = brain3.loc[brain3['level_3'] == '111',:]

In [None]:
# Keep the pixels whose zccf is below the mean zccf of callosalwm.
# Explicit copy: a 'centroidcolor' column is added to this frame in the next
# cell, which must not operate on a slice of `callosalwm`.
callosalwm2 = callosalwm.loc[callosalwm['zccf'] < callosalwm['zccf'].mean(),:].copy()
callosalwm2

In [None]:
# Flag each lipizone by its mean zccf: "red" when the mean exceeds 4, "black" otherwise.
callosalwm2['centroidcolor'] = "black"
meanz = callosalwm2.groupby('lipizone_color')['zccf'].mean()
medialcolors = meanz[meanz > 4].index
is_medial = callosalwm2['lipizone_color'].isin(medialcolors)
callosalwm2.loc[is_medial, 'centroidcolor'] = "red"
callosalwm2['centroidcolor'].value_counts()

In [None]:
# ML axis - nice, it really seems there's a medial and there's a lateral cluster overall

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 6))
ax = plt.gca()

# One density curve of zccf per lipizone, drawn in the lipizone's centroidcolor.
for zone, group in callosalwm2.groupby('lipizone_color'):
    # centroidcolor is constant within a lipizone, so the first row is enough.
    curve_color = group['centroidcolor'].iloc[0]
    sns.kdeplot(
        data=group,
        x='zccf',
        multiple='layer',
        fill=False,
        linewidth=0.4,
        color=curve_color,
        label=zone,
        ax=ax,
    )

ax.set_xlabel('Value')
ax.set_ylabel('Density')
ax.set_title('Overlaid Histograms by Zone')

# Strip all spines and tick marks.
sns.despine(ax=ax, top=True, right=True, left=True, bottom=True)
ax.tick_params(left=False, bottom=False)

plt.savefig("KERNELDENSITY_mediolateral_REPROB3.pdf", format="pdf", bbox_inches="tight")
plt.show()


In [None]:
# Sorted list of the SectionID values present in brain3.
np.sort(brain3['SectionID'].unique())

In [None]:
# Subset of brain3 restricted to six hand-picked SectionIDs.
br3sub = brain3.loc[brain3['SectionID'].isin([46, 47,50,59,68,71]),:]
br3sub

In [None]:
# One row of panels (one per SectionID in br3sub) for every lipid in `tocolor`.
tocolor = ['SM 34:1;O2']
filtered_sections = br3sub['SectionID'].unique()

for currentLipid in tocolor:

    # Colour limits: median over sections of the per-section 5th and 95th
    # percentiles (the '2-perc' / '98-perc' column names are historical).
    results = []
    for section_id in br3sub['SectionID'].unique():
        intensities = br3sub.loc[br3sub['SectionID'] == section_id, currentLipid]
        results.append([section_id, intensities.quantile(0.05), intensities.quantile(0.95)])
    percentile_df = pd.DataFrame(results, columns=['SectionID', '2-perc', '98-perc'])
    med2p = percentile_df['2-perc'].median()
    med98p = percentile_df['98-perc'].median()

    fig, axs = plt.subplots(1, 6, figsize=(24, 4))  # Single row, 6 columns
    axs = axs.flatten()

    for i, section_value in enumerate(filtered_sections):
        ax = axs[i]
        section = brain3[brain3["SectionID"] == section_value]

        # Pixels of this section that are also present in br3sub.
        in_subset = section.index.isin(br3sub.index)
        filtered_section = section.loc[in_subset, :]

        # Lipid intensity on a shared colour scale.
        ax.scatter(
            filtered_section['z_index'], -filtered_section['y_index'],
            c=filtered_section[currentLipid], cmap="plasma", s=0.2, alpha=1,
            zorder=1, rasterized=True, vmin=med2p, vmax=med98p,
        )

        ax.set_aspect('equal')

    # Remove unused axes
    for j in range(len(filtered_sections), len(axs)):
        fig.delaxes(axs[j])

    # Remove axes for clarity
    for ax in axs:
        ax.axis('off')

    plt.suptitle(currentLipid)
    plt.tight_layout()
    plt.savefig("SM341.pdf")
    plt.show()

In [None]:
# Load the ventricle table stored under key "table" in ventricles.h5ad (read as HDF via pandas).
ventatl = pd.read_hdf("ventricles.h5ad", key="table")
ventatl

In [None]:
# Human-readable cluster name for each putative-cluster colour (hex string).
color_dict = {
    "#002657": "Thalamic and mid/hindbrain WM / CSF subclass",
    "#1f77b4": "Dorsal VLMCs and canals",
    "#21366b": "Mid/hindbrain WM / CSF subclass",
    "#2a3f6d": "Boundary WM / CSF subclass",
    "#2ca02c": "IIIv VLMCs / Astro-TE",
    "#3e4b6c": "Mixed WM / CSF subclass",
    "#8c564b": "Ependymal dorsomedial linining of LVs",
    "#9467bd": "Ventral ependymal / Astro-TE NN 5",
    "#98df8a": "IIIv dorsal lining / VLMCs",
    "#aec7e8": "ChP \"core wave 1\"",
    "#c49c94": "Dorsal IIIv lining and canals",
    "#c5b0d5": "Dorsal lining of LVs, canal and IVv lining",
    "#e377c2": "Ventral Ependymal IIIv and LVs",
    "#f7b6d2": "Lining of IVv",
    "#ffbb78": "ChP \"core wave 2\""
}
# Colours missing from color_dict map to NaN.
ventatl['cluster'] = ventatl['putativecluster_color'].map(color_dict)
ventatl['cluster']

In [None]:
# Quick look at Section 8 of the ventricle atlas in putative-cluster colours.
sec = ventatl[ventatl['Section'] == 8.0]
plt.scatter(
    sec['zccf'],
    sec['yccf'],
    c=sec['putativecluster_color'],
    s=0.2,
) # okok look at the ChP
plt.show()

In [None]:
# Subclass label: the first five characters of the cluster_x label.
ventatl['subclass'] = [x[:5] for x in ventatl['cluster_x']]

# Keep only subclass '12111'. Explicit copy: the 'cluster' column of this frame
# is reassigned in a later cell, which must not operate on a slice of the
# original table.
ventatl = ventatl.loc[ventatl['subclass'].isin(['12111']),:].copy()
ventatl

In [None]:
# Pixel counts per (subclass, cluster) pair.
pd.crosstab(ventatl['subclass'], ventatl['cluster'])

In [None]:
# Features: rows of reconstructed_data_df matching the ventricle pixels.
X = reconstructed_data_df.loc[ventatl.index,:]

# Binary target: "Good" for the two ChP core clusters, "Others" for everything
# else (including pixels without a cluster name).
# The labels are built as a NEW Series. The previous code relabelled
# `y = ventatl['cluster']` in place, which could overwrite the cluster names
# stored in `ventatl` itself.
is_chp_core = ventatl['cluster'].isin(["ChP \"core wave 1\"", "ChP \"core wave 2\""])
y = pd.Series(np.where(is_chp_core, "Good", "Others"), index=ventatl.index, name='cluster')
y.value_counts()

In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Trains a binary XGBoost classifier on (X, y) from the previous cell and
# reports overall and per-class accuracy on a 20% stratified hold-out.
# -------------------------------
# 1. Encode the string labels (binary)
# -------------------------------
# Assume X is your DataFrame of continuous features and y is your Series of string labels.
le = LabelEncoder()
y_encoded = le.fit_transform(y)  # Converts strings to integers (0, 1)

# -------------------------------
# 2. Split into overall training and validation sets
# -------------------------------
# 80/20 split, stratified on the encoded label, fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# -------------------------------
# 3. Compute sample weights for the training data to handle class imbalance
# -------------------------------
class_weights = compute_class_weight(
    class_weight='balanced', classes=np.unique(y_train), y=y_train
)
weight_dict = dict(zip(np.unique(y_train), class_weights))
# One weight per training sample, looked up from its class.
sample_weight_train = np.array([weight_dict[label] for label in y_train])

# -------------------------------
# 4. Train the model with default parameters
# -------------------------------
model = XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss'
)

# Use early stopping with the validation set to prevent overfitting
# NOTE(review): the same (X_val, y_val) is scored in steps 5-6 below, so the
# reported accuracies do not come from data unseen during model selection.
# NOTE(review): passing early_stopping_rounds to fit() depends on the installed
# xgboost version -- confirm it is still accepted there.
model.fit(
    X_train, y_train,
    sample_weight=sample_weight_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=10,
    verbose=True
)

# -------------------------------
# 5. Evaluate performance on the validation set (the one used for early stopping)
# -------------------------------
y_pred = model.predict(X_val)
overall_accuracy = accuracy_score(y_val, y_pred)
print("\nOverall Validation Accuracy: {:.2%}".format(overall_accuracy))

# -------------------------------
# 6. Compute per-class accuracy on the validation set
# -------------------------------
# For each true class: the fraction of its validation samples predicted correctly.
print("\nPer-class accuracy:")
unique_classes = np.unique(y_val)
for cl in unique_classes:
    idx = (y_val == cl)
    class_accuracy = accuracy_score(y_val[idx], y_pred[idx])
    print(f"Accuracy for class '{le.inverse_transform([cl])[0]}': {class_accuracy:.2%}")


In [None]:
import copy

# Keep an independent copy of the classifier just trained, because `model` is
# reassigned by the next training cell.
model1 = copy.deepcopy(model)

In [None]:
# Human-readable cluster name for each putative-cluster colour (hex string).
color_dict = {
    "#002657": "Thalamic and mid/hindbrain WM / CSF subclass",
    "#1f77b4": "Dorsal VLMCs and canals",
    "#21366b": "Mid/hindbrain WM / CSF subclass",
    "#2a3f6d": "Boundary WM / CSF subclass",
    "#2ca02c": "IIIv VLMCs / Astro-TE",
    "#3e4b6c": "Mixed WM / CSF subclass",
    "#8c564b": "Ependymal dorsomedial linining of LVs",
    "#9467bd": "Ventral ependymal / Astro-TE NN 5",
    "#98df8a": "IIIv dorsal lining / VLMCs",
    "#aec7e8": "ChP \"core wave 1\"",
    "#c49c94": "Dorsal IIIv lining and canals",
    "#c5b0d5": "Dorsal lining of LVs, canal and IVv lining",
    "#e377c2": "Ventral Ependymal IIIv and LVs",
    "#f7b6d2": "Lining of IVv",
    "#ffbb78": "ChP \"core wave 2\""
}
# Rebuild the cluster names from the colour column.
ventatl['cluster'] = ventatl['putativecluster_color'].map(color_dict)

# Second classification task: only the two ChP core clusters, one against the other.
X = reconstructed_data_df.loc[ventatl.index,:]
y = ventatl['cluster']
y = y[y.isin(["ChP \"core wave 1\"", "ChP \"core wave 2\""])]
# Keep the feature rows that still have a label.
X = X.loc[y.index,:]
y.value_counts()

In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Same training recipe as the earlier training cell, applied to the (X, y)
# defined in the previous cell (the two ChP core clusters).
# -------------------------------
# 1. Encode the string labels (binary)
# -------------------------------
# Assume X is your DataFrame of continuous features and y is your Series of string labels.
le = LabelEncoder()
y_encoded = le.fit_transform(y)  # Converts strings to integers (0, 1)

# -------------------------------
# 2. Split into overall training and validation sets
# -------------------------------
# 80/20 split, stratified on the encoded label, fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# -------------------------------
# 3. Compute sample weights for the training data to handle class imbalance
# -------------------------------
class_weights = compute_class_weight(
    class_weight='balanced', classes=np.unique(y_train), y=y_train
)
weight_dict = dict(zip(np.unique(y_train), class_weights))
# One weight per training sample, looked up from its class.
sample_weight_train = np.array([weight_dict[label] for label in y_train])

# -------------------------------
# 4. Train the model with default parameters
# -------------------------------
model = XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss'
)

# Use early stopping with the validation set to prevent overfitting
# NOTE(review): the same (X_val, y_val) is scored in steps 5-6 below, so the
# reported accuracies do not come from data unseen during model selection.
# NOTE(review): passing early_stopping_rounds to fit() depends on the installed
# xgboost version -- confirm it is still accepted there.
model.fit(
    X_train, y_train,
    sample_weight=sample_weight_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=10,
    verbose=True
)

# -------------------------------
# 5. Evaluate performance on the validation set (the one used for early stopping)
# -------------------------------
# y_pred and X_val are read again by a later cell to map predictions to colours.
y_pred = model.predict(X_val)
overall_accuracy = accuracy_score(y_val, y_pred)
print("\nOverall Validation Accuracy: {:.2%}".format(overall_accuracy))

# -------------------------------
# 6. Compute per-class accuracy on the validation set
# -------------------------------
# For each true class: the fraction of its validation samples predicted correctly.
print("\nPer-class accuracy:")
unique_classes = np.unique(y_val)
for cl in unique_classes:
    idx = (y_val == cl)
    class_accuracy = accuracy_score(y_val[idx], y_pred[idx])
    print(f"Accuracy for class '{le.inverse_transform([cl])[0]}': {class_accuracy:.2%}")


In [None]:
import copy

# Independent copy of the second classifier (the one trained in the previous cell).
model2 = copy.deepcopy(model)

In [None]:
# Subclass label: the first five characters of the lipizone name.
brain3['subclass'] = [x[:5] for x in brain3['lipizone']]

# Pixels of subclass '12111'. Explicit copy: prediction columns are added to
# this frame in later cells, which must not operate on a slice of `brain3`.
brain3VENT = brain3.loc[brain3['subclass'].isin(['12111']),:].copy()
brain3VENT

In [None]:
# Apply the first classifier (model1) to the brain3 pixels of the selected subclass.
subclusters_ventricles_b3 = model1.predict(reconstructed_data_df.loc[brain3VENT.index,:])
subclusters_ventricles_b3

In [None]:
# Recode the integer predictions as colour names in a single step: 1 -> "red",
# 0 -> "blue". Writing strings into the integer column through .loc is an
# incompatible-dtype assignment; mapping before assignment avoids it.
# NOTE(review): which original label is 0 and which is 1 is decided by the
# LabelEncoder of the first training cell -- confirm before interpreting colours.
brain3VENT['subclusters_ventricles_b3'] = pd.Series(
    subclusters_ventricles_b3, index=brain3VENT.index
).map({1: "red", 0: "blue"})
#brain3VENT['subclusters_ventricles_b3'] = brain3VENT['subclusters_ventricles_b3'].map(pd.crosstab(y_pred, ventatl.loc[X_val.index, "putativecluster_color"]).idxmax(axis=1))
brain3VENT['subclusters_ventricles_b3'].value_counts()

In [None]:
# Show the current value of filtered_sections (set by an earlier cell).
filtered_sections

In [None]:
# SectionIDs shown in the ventricle sub-cluster figure.
filtered_sections = [47,49,50,52,57,69,70,71]

fig, axs = plt.subplots(2, 4, figsize=(6, 8))  # 2 rows x 4 columns, one panel per section
axs = axs.flatten()

for i, section_value in enumerate(filtered_sections):
    ax = axs[i]
    section = brain3VENT[brain3VENT["SectionID"] == section_value]

    # Colours are explicit colour names, so no vmin/vmax is passed. The former
    # vmin=med2p, vmax=med98p were leftovers from a lipid-intensity cell and do
    # not apply to colour names. The extra filter on brain3VENT.index was also
    # dropped: `section` is already a subset of brain3VENT.
    ax.scatter(section['z_index'], -section['y_index'],
               c=section['subclusters_ventricles_b3'], s=0.2, alpha=1, zorder=1, rasterized=True)

    ax.set_aspect('equal')
    ax.set_title(section_value)

# Remove unused axes
for j in range(len(filtered_sections), len(axs)):
    fig.delaxes(axs[j])

# Remove axes for clarity
for ax in axs:
    ax.axis('off')

plt.tight_layout()
plt.savefig("subclusters_ventriclesBRAIN3.pdf")
plt.show()

In [None]:
# Keep only the pixels labelled "blue" by the first classifier.
# NOTE: this overwrites brain3VENT, and the next cell overwrites the label
# column; re-running this cell afterwards selects nothing.
brain3VENT = brain3VENT.loc[brain3VENT['subclusters_ventricles_b3'] == "blue",:]
brain3VENT

In [None]:
# Apply the second classifier (model2) to the remaining pixels.
subclusters_ventricles_b3 = model2.predict(reconstructed_data_df.loc[brain3VENT.index,:])
brain3VENT['subclusters_ventricles_b3'] = subclusters_ventricles_b3
# Translate each predicted class into the atlas colour it co-occurs with most
# often on the validation set. NOTE: this relies on y_pred and X_val still being
# the ones left by the most recent training cell -- run the cells in order.
brain3VENT['subclusters_ventricles_b3'] = brain3VENT['subclusters_ventricles_b3'].map(pd.crosstab(y_pred, ventatl.loc[X_val.index, "putativecluster_color"]).idxmax(axis=1))
#brain3VENT.loc[brain3VENT['subclusters_ventricles_b3'] == 1, 'subclusters_ventricles_b3'] = "orange"
#brain3VENT.loc[brain3VENT['subclusters_ventricles_b3'] == 0, 'subclusters_ventricles_b3'] = "green"
brain3VENT['subclusters_ventricles_b3'].value_counts()

In [None]:
# Interactive scatter of section 47, coloured by the values stored in
# subclusters_ventricles_b3; hovering a point shows that value.
datavent = brain3VENT.loc[brain3VENT['SectionID'] == 47,:]
xx = datavent

import plotly.graph_objects as go

z_values = xx['z_index']
neg_y_values = -xx['y_index']  # y is negated, as in the matplotlib panels of this notebook
point_colors = xx['subclusters_ventricles_b3']

# Both axes are hidden: no grid, no zero line, no ticks.
hidden_axis = dict(showgrid=False, zeroline=False, visible=False)

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=z_values,
        y=neg_y_values,
        mode='markers',
        marker=dict(size=6, color=point_colors, showscale=False),
        text=point_colors,  # to see the color value on hover
        hoverinfo='text',
    )
)

fig.update_layout(
    xaxis=dict(range=[z_values.min(), z_values.max()], **hidden_axis),
    yaxis=dict(range=[neg_y_values.min(), neg_y_values.max()], **hidden_axis),
    showlegend=False,
    width=800,
    height=600,
    plot_bgcolor='rgba(0,0,0,0)',
)

# Lock the y scale to the x scale so the section is not distorted.
fig.update_yaxes(scaleanchor="x", scaleratio=1)

fig.show()

In [None]:
# Compare subcluster proportions between two groups of sections.
annotated_ventricles = brain3VENT.loc[brain3VENT['SectionID'].isin([47,49,50,52,57,69,70,71]),:]

# Section IDs below 58 go to the `lv` group, IDs above 68 to the `iv` group.
is_lv_section = annotated_ventricles['SectionID'] < 58
is_iv_section = annotated_ventricles['SectionID'] > 68
indLV = annotated_ventricles.loc[is_lv_section, :].index
indIV = annotated_ventricles.loc[is_iv_section, :].index

lv = annotated_ventricles.loc[indLV,:]
iv = annotated_ventricles.loc[indIV,:]

# Restrict both groups to the two colours being compared.
compared_colors = ["#aec7e8", "#ffbb78"]
lv = lv.loc[lv['subclusters_ventricles_b3'].isin(compared_colors), :]
iv = iv.loc[iv['subclusters_ventricles_b3'].isin(compared_colors), :]

# Relative frequency of each colour in the `lv` group.
lv['subclusters_ventricles_b3'].value_counts(normalize=True)

In [None]:
# Relative frequency of each colour in the `iv` group.
# EXPECTATION: with a full-brain analysis, #ffbb78 is enriched in the IV ventricle,
# while #aec7e8 is enriched in the LVs.
iv['subclusters_ventricles_b3'].value_counts(normalize=True)

In [None]:
# NOTE: the expected proportions are not confirmed here. The sectioning scheme for this brain was poor, which may explain it, but another cause cannot be ruled out.

In [None]:
# Plot the colours stored in subclusters_ventricles_b3 for each selected section,
# one panel per section.
filtered_sections = [47,49,50,52,57,69,70,71]

fig, axs = plt.subplots(2, 4, figsize=(6, 8))  # 2 rows x 4 columns
axs = axs.flatten()

for i, section_value in enumerate(filtered_sections):
    ax = axs[i]
    section = brain3VENT[brain3VENT["SectionID"] == section_value]

    # The column already holds colour strings, so they are passed straight to `c`.
    # vmin/vmax are intentionally not given: matplotlib ignores them when `c` is a
    # sequence of colours rather than numbers to be colour-mapped.
    ax.scatter(section['z_index'], -section['y_index'],
               c=section['subclusters_ventricles_b3'], s=0.2, alpha=1, zorder=1, rasterized=True)

    ax.set_aspect('equal')
    ax.set_title(section_value)
    ax.axis('off')  # hide ticks and frame; the title stays visible

# Remove any panels that did not receive a section
for j in range(len(filtered_sections), len(axs)):
    fig.delaxes(axs[j])

plt.tight_layout()
# NOTE(review): an earlier cell saves a different figure under this same file name,
# so this call overwrites that PDF — confirm that is intended.
plt.savefig("subclusters_ventriclesBRAIN3.pdf")
plt.show()

In [None]:
# The subclass is the first five characters of the lipizone label
# (vectorised string slice instead of a Python-level loop over every row).
brain3['subclass'] = brain3['lipizone'].str[:5]

# Select the two subclasses of interest. .copy() makes brain3VENT an independent
# frame, because the next cell adds a column to it.
brain3VENT = brain3.loc[brain3['subclass'].isin(['12111', '12112']),:].copy()
brain3VENT

In [None]:
# Run model1 on the newly selected brain3VENT rows and store the result as colour
# names (1 -> "red", 0 -> "blue").
subclusters_ventricles_b3 = model1.predict(reconstructed_data_df.loc[brain3VENT.index,:])

# Translate the labels in one step before writing them, so strings are never
# assigned element-wise into an integer column (deprecated dtype upcasting in
# recent pandas). Any prediction other than 0/1 is left unchanged.
prediction_colors = pd.Series(subclusters_ventricles_b3).replace({1: "red", 0: "blue"})
# .to_numpy() keeps the assignment positional, exactly like assigning the raw array.
brain3VENT['subclusters_ventricles_b3'] = prediction_colors.to_numpy()
brain3VENT['subclusters_ventricles_b3'].value_counts()

In [None]:
# Inspect the yccf column of the current brain3VENT selection.
brain3VENT['yccf']

In [None]:
# Keep only the "red" rows that fall strictly inside a rectangular window:
# 75 < y_index < 210 and 170 < z_index < 300.
is_red = brain3VENT['subclusters_ventricles_b3'] == "red"
in_y_window = (brain3VENT['y_index'] > 75) & (brain3VENT['y_index'] < 210)
in_z_window = (brain3VENT['z_index'] > 170) & (brain3VENT['z_index'] < 300)
brain3VENT = brain3VENT.loc[is_red & in_y_window & in_z_window, :]

In [None]:
# Display the frame left after the "red" + y/z window filter of the previous cell.
brain3VENT

In [None]:
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
# Write text into the PDF as Type 42 (TrueType) fonts.
matplotlib.rcParams['pdf.fonttype'] = 42

# Build a multi-page PDF: one page per lipid, one panel per section on each page.
with PdfPages("output.pdf") as pdf:
    filtered_sections = [46, 47]

    for LIPIDPLOT in ["PI 42:8", "HexCer 42:2;O2", "Hex2Cer 40:1;O2", "Hex2Cer 40:0;O2", "SM 36:1;O2", "Cer 36:1;O2"]:
        # One panel per section; squeeze=False always returns a 2-D array of axes,
        # so the layout follows filtered_sections instead of a hard-coded count.
        fig, axs = plt.subplots(1, len(filtered_sections), figsize=(12, 8), squeeze=False)
        axs = axs.flatten()

        for ax, section_value in zip(axs, filtered_sections):
            ax.axis('off')  # hide ticks and frame on every panel

            section = brain3VENT[brain3VENT["SectionID"] == section_value]
            if section.empty:
                # No rows for this section: there is nothing to compute percentiles
                # on, so the panel is left blank.
                continue

            intensities = section[LIPIDPLOT]

            # Colour scale is clipped to the 5th-95th percentile of this section.
            ax.scatter(section['z_index'], -section['y_index'],
                       c=intensities,
                       cmap="plasma",
                       vmin=np.percentile(intensities, 5),
                       vmax=np.percentile(intensities, 95),
                       s=40, alpha=1, zorder=1, rasterized=True)

            ax.set_aspect('equal')

        # Address the figure explicitly rather than relying on pyplot's "current figure".
        fig.suptitle(LIPIDPLOT, fontsize=70)
        fig.tight_layout()

        # Save the current figure as a new page in the PDF.
        pdf.savefig(fig)
        plt.close(fig)  # Close the figure after saving it to free up memory.
