In [None]:
import pandas as pd

# Load the full pixel table and keep only the reference-atlas sample.
datavignettes = pd.read_parquet("./zenodo/maindata_2.parquet")
datavignettes = datavignettes.loc[datavignettes['Sample'] == "ReferenceAtlas",:]

# subselect the two orange subclasses
# `.copy()` gives `ventricles` its own data: later cells add columns to it, and
# assigning into a bare `.loc` slice triggers pandas' SettingWithCopyWarning.
ventricles = datavignettes.loc[datavignettes['subclass'].isin(['12111', '12112']),:].copy()
# NOTE(review): the first 173 columns are presumably the lipid intensities — confirm.
lipvent = ventricles.iloc[:, :173]
lipvent

## Subcluster the ventricular and ventricular linings subclasses

In [None]:
# reduce dimensionality

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise every column of `lipvent` (zero mean, unit variance) before PCA.
scaler = StandardScaler()
lip_scaled = scaler.fit_transform(lipvent)

# Keep the first 10 principal components as the clustering space.
pca = PCA(n_components=10)
lip_pca = pca.fit_transform(lip_scaled)

# Re-attach the original row index so PCA rows can be matched back to `lipvent`.
pca_df = pd.DataFrame(data=lip_pca, index=lipvent.index)

pca_df

In [None]:
# do Leiden clustering

import networkx as nx
import igraph as ig
import leidenalg
import numpy as np
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from matplotlib.colors import to_hex

def leidenalg_clustering(inputdata, Nneigh=40, Niter=5):
    """
    Cluster the rows of `inputdata` with the Leiden algorithm on a k-NN graph.

    Parameters
    ----------
    inputdata : array-like
        Matrix passed to `NearestNeighbors.fit` / `kneighbors_graph`.
    Nneigh : int
        Number of neighbours used to build the k-NN graph.
    Niter : int
        `n_iterations` forwarded to `leidenalg.find_partition`.

    Returns
    -------
    numpy.ndarray
        The partition membership, one cluster label per graph vertex.
    """
    # Build the k-NN adjacency matrix of the input points.
    neighbor_index = NearestNeighbors(n_neighbors=Nneigh, n_jobs=4).fit(inputdata)
    adjacency = neighbor_index.kneighbors_graph(inputdata)

    # The adjacency matrix goes through networkx before being handed to igraph.
    graph = ig.Graph.from_networkx(nx.Graph(adjacency))

    # Modularity-based Leiden partition with a fixed seed.
    result = leidenalg.find_partition(
        graph,
        leidenalg.ModularityVertexPartition,
        n_iterations=Niter,
        seed=230598,
    )
    return np.array(result.membership)

# Cluster the PCA embedding and store the integer cluster label per pixel.
cl = leidenalg_clustering(pca_df.values)
ventricles['putativecluster_color'] = cl
# Turn the labels into category codes, then into hex colours from the 'tab20' colormap.
# From here on the column holds a hex colour string, not the integer label.
cat = ventricles['putativecluster_color'].astype('category')
codes = cat.cat.codes
# NOTE(review): 'tab20' presumably provides 20 distinct colours; with more clusters
# several of them would share a colour — confirm the cluster count.
cmap = plt.get_cmap('tab20')
ventricles['putativecluster_color'] = [to_hex(cmap(code)) for code in codes]

In [None]:
# remove the non ventricular clusters

# Reload a saved table under the same name; this overwrites the `ventricles`
# frame built by the clustering cells above.
ventricles = pd.read_hdf("./zenodo/csv/ventricles.h5ad", key="table") ## for consistency of naming downstream

# Drop the three clusters (identified by their colour) that are not ventricular.
ventricles = ventricles.loc[~ventricles['putativecluster_color'].isin(["#ff9896", "#d62728", "#ff7f0e"]),:]

# Add the pixels of four other subclasses that fall in the "ventricular systems" division.
# NOTE(review): `.iloc[:,:-1]` drops the last column, presumably so the columns
# line up with `ventricles` before the concat — confirm.
ventricle_innerwm = datavignettes.loc[(datavignettes['subclass'].isin(['11222', '11221', '11212', '11211'])) & (datavignettes['division'] == "ventricular systems"),:].iloc[:,:-1]
# Fixed colour per added subclass (overwritten by a different `color_dict` in a later cell).
color_dict = {
    '11222': "#3e4b6c",
    '11221': "#21366b",
    '11212': "#002657",
    '11211': "#2a3f6d"
}
ventricle_innerwm['putativecluster_color'] = ventricle_innerwm['subclass'].map(color_dict)
ventricles = pd.concat([ventricles, ventricle_innerwm])
# Refresh the feature matrix (first 173 columns) for the extended pixel set.
lipvent = ventricles.iloc[:, :173]
lipvent

In [None]:
# check on a t-SNE

from openTSNE import TSNEEmbedding
from openTSNE import affinity
from openTSNE import initialization
import numpy as np

# Recompute scaling + 10-component PCA on the updated `lipvent`
# (this overwrites `scaler`, `pca`, `lip_pca` and `pca_df` from the earlier cell).
scaler = StandardScaler()
lip_scaled = scaler.fit_transform(lipvent)

pca = PCA(n_components=10)
lip_pca = pca.fit_transform(lip_scaled)

pca_df = pd.DataFrame(data=lip_pca, index=lipvent.index)

x_train = pca_df.values

# Perplexity-based nearest-neighbour affinities in PCA space (seeded).
affinities_train = affinity.PerplexityBasedNN(
    x_train,
    perplexity=30,
    metric="euclidean",
    n_jobs=8,
    random_state=42,
    verbose=True,
)


# Initialise the 2-D embedding with the first two PCA columns.
init_train = x_train[:,[0,1]] 

embedding_train = TSNEEmbedding(
    init_train,
    affinities_train,
    negative_gradient_method="fft",
    n_jobs=8,
    verbose=True,
)

# Two optimisation stages: 500 iterations at exaggeration 1, then 100 at exaggeration 1.4.
embedding_train_1 = embedding_train.optimize(n_iter=500, exaggeration=1) # tunable parameters

embedding_train_N = embedding_train_1.optimize(n_iter=100, exaggeration=1.4) # tunable parameters

tsne = embedding_train_N

In [None]:
# Scatter of the t-SNE embedding, one dot per pixel, coloured by its putative cluster colour.
# Relies on `embedding_train_N` rows being in the same order as `ventricles`.
fig = plt.figure(figsize=(20, 20))
plt.scatter(embedding_train_N[:, 0], embedding_train_N[:, 1], c=ventricles['putativecluster_color'], s=50, alpha=1, rasterized=True)
plt.title('Putative clusters', fontsize=18)
# Hide ticks, tick labels and the axes frame.
plt.tick_params(axis='both', which='both', bottom=False, top=False, left=False, right=False, labelbottom=False, labelleft=False)
for spine in plt.gca().spines.values():
    spine.set_visible(False)
plt.tight_layout()
plt.show()

## Annotate clusters with the help of an interactive Plotly view

In [None]:
## select the region to zoom in
"""
datavent = ventricles.loc[ventricles['Section'] == 8,:]
xx = datavent.loc[(datavent['z_index'] > 150) & (datavent['z_index'] < 230) & (-datavent['y_index'] > -190) & (-datavent['y_index'] < -75),:]

datavent = ventricles.loc[ventricles['Section'] == 7,:]
xx = datavent

datavent = ventricles.loc[ventricles['Section'] == 9,:]
xx = datavent.loc[(datavent['z_index'] > 120) & (datavent['z_index'] < 160) & (-datavent['y_index'] > -140) & (-datavent['y_index'] < -75),:]

datavent = ventricles.loc[ventricles['Section'] == 5,:]
xx = datavent.loc[(datavent['z_index'] > 175) & (datavent['z_index'] < 215) & (-datavent['y_index'] > -210) & (-datavent['y_index'] < -110),:]

datavent = ventricles.loc[ventricles['Section'] == 12,:]
xx = datavent.loc[(datavent['z_index'] > 215) & (datavent['z_index'] < 240) & (-datavent['y_index'] > -140) & (-datavent['y_index'] < -75),:]

datavent = ventricles.loc[ventricles['Section'] == 15,:]
xx = datavent.loc[(datavent['z_index'] > 310) & (datavent['z_index'] < 400) & (-datavent['y_index'] > -235) & (-datavent['y_index'] < -160),:]

datavent = ventricles.loc[ventricles['Section'] == 27,:]
xx = datavent.loc[(datavent['z_index'] > 160) & (datavent['z_index'] < 290) & (-datavent['y_index'] > -205) & (-datavent['y_index'] < -160),:]

datavent = ventricles.loc[ventricles['Section'] == 28,:]
xx = datavent#.loc[(datavent['z_index'] > 160) & (datavent['z_index'] < 290) & (-datavent['y_index'] > -205) & (-datavent['y_index'] < -160),:]

datavent = ventricles.loc[ventricles['Section'] == 11,:]
xx = datavent#.loc[(datavent['z_index'] > 160) & (datavent['z_index'] < 290) & (-datavent['y_index'] > -205) & (-datavent['y_index'] < -160),:]

datavent = ventricles.loc[ventricles['Section'] == 14,:]
xx = datavent#.loc[(datavent['z_index'] > 160) & (datavent['z_index'] < 290) & (-datavent['y_index'] > -205) & (-datavent['y_index'] < -160),:]
"""

# Active selection: all pixels of section 14. The alternatives are kept in the string above.
datavent = ventricles.loc[ventricles['Section'] == 14,:]
xx = datavent#.loc[(datavent['z_index'] > 160) & (datavent['z_index'] < 290) & (-datavent['y_index'] > -205) & (-datavent['y_index'] < -160),:]

import plotly.graph_objects as go
fig = go.Figure()

# One marker per pixel at (z_index, -y_index), coloured by cluster;
# hovering shows the hex colour so clusters can be identified by hand.
fig.add_trace(go.Scatter(
    x=xx['z_index'],
    y=-xx['y_index'],
    mode='markers',
    marker=dict(
        size=6,
        color=xx['putativecluster_color'],
        showscale=False
    ),
    text=xx['putativecluster_color'],  # to see the color value on hover
    hoverinfo='text'
))

# Fit both axes tightly to the selected pixels and hide all axis decoration.
fig.update_layout(
    xaxis=dict(range=[xx['z_index'].min(), xx['z_index'].max()], showgrid=False, zeroline=False, visible=False),
    yaxis=dict(range=[-xx['y_index'].max(), -xx['y_index'].min()], showgrid=False, zeroline=False, visible=False),
    showlegend=False,
    width=800, 
    height=600,
    plot_bgcolor='rgba(0,0,0,0)'
)

# Lock the y axis to the x axis at a 1:1 scale so the section is not distorted.
fig.update_yaxes(
    scaleanchor="x",
    scaleratio=1,
)

fig.show()

In [None]:
# Lookup table with three parallel lists: cluster code, zone name, and display colour.
# Entry i of each list describes the same cluster.
namingtable = {
    "cluster": [
        11111, 11112, 11121, 11122, 11211, 11212, 11221, 11222, 12111, 12112, 
        12121, 12122, 12211, 12212, 12221, 12222, 21111, 21112, 21120, 21211, 
        21212, 21221, 21222, 22111, 22112, 22121, 22122, 22211, 22212, 22221, 22222
    ],
    "zone": [
        "Mixed and hindbrain white matter", "Core callosal white matter", 
        "Callosal and cerebellar white matter", "Ventral white matter", 
        "Boundary white matter", "Thalamic and mid/hindbrain white matter", 
        "Mid/hindbrain white matter", "Mixed white matter", 
        "Choroid plexus and ventricles", "Ventricular linings", 
        "Thalamic and midbrain regions", "White and gray matter boundary", 
        "Thalamic mixed gray and white matter", "Thalamic mixed gray and white matter #2", 
        "Neuron-rich lateral white matter", "Neuron-rich lateral white matter #2", 
        "Pallidum and projections", "Cortical layer 4", 
        "Subcortical plate, hippocampus and hypothalamus", 
        "GABA-ergic Purkinje cells of the cerebellum", "Cortical layers 2-3 and 4", 
        "Piriform cortex", "Cortical layers 1 and 2-3", "Cortical layer 5", 
        "Cortical layer 6, dentate gyrus", "Striatum, hypothalamus and hippocampus", 
        "Striatum, hypothalamus and hippocampus #2", 
        "Retrosplenial, cortical, cerebellar", "Cortical layer 6 and cerebellar Y", 
        "Cerebellar glutamatergic neurons", "Cortical layer 6 and thalamic"
    ],
    "color": [
        "#360064", "#980053", "#170b3b", "#ac2f5c", "#2a3f6d", "#002657", 
        "#21366b", "#3e4b6c", "#f75400", "#ef633e", "#a5d4e6", "#6399c6", 
        "#853a00", "#edeef4", "#fdbf71", "#ce710e", "#940457", "#a2d36c", 
        "#d5edb5", "#0065d6", "#bcf18b", "#a68d68", "#79e47e", "#2f0097", 
        "#47029f", "#7500a8", "#d70021", "#ca99c9", "#d4b9da", "#e00085", 
        "#f6f3f8"
    ]
}

namingtable = pd.DataFrame(namingtable)
# Keep only the four clusters that were added to `ventricles` from other subclasses.
namingtable = namingtable.loc[namingtable['cluster'].isin([11222, 11221, 11212, 11211]),:]
namingtable

In [None]:
# Manual annotation: hex colour of each putative cluster -> human-readable cluster name.
# This rebinds `color_dict` (an earlier cell used the name for a subclass -> colour map).
# Colours missing from this dict map to NaN in the 'cluster' column.
# NOTE(review): "linining" below looks like a typo for "lining"; it is left as-is
# because the string is used as the cluster label at runtime.
color_dict = {
    "#002657": "Thalamic and mid/hindbrain WM / CSF subclass",
    "#1f77b4": "Dorsal VLMCs and canals",
    "#21366b": "Mid/hindbrain WM / CSF subclass",
    "#2a3f6d": "Boundary WM / CSF subclass",
    "#2ca02c": "IIIv VLMCs / Astro-TE",
    "#3e4b6c": "Mixed WM / CSF subclass",
    "#8c564b": "Ependymal dorsomedial linining of LVs",
    "#9467bd": "Ventral ependymal / Astro-TE NN 5",
    "#98df8a": "IIIv dorsal lining / VLMCs",
    "#aec7e8": "ChP \"core wave 1\"",
    "#c49c94": "Dorsal IIIv lining and canals",
    "#c5b0d5": "Dorsal lining of LVs, canal and IVv lining",
    "#e377c2": "Ventral Ependymal IIIv and LVs",
    "#f7b6d2": "Lining of IVv",
    "#ffbb78": "ChP \"core wave 2\""
}
ventricles['cluster'] = ventricles['putativecluster_color'].map(color_dict)
ventricles['cluster']

In [None]:
# plot the clusters one by one

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

# Plot settings shared with the per-cluster plots in the next cell.
dot_size = 0.3
sections_to_plot = range(1, 33)

# `dd2` is an alias of `ventricles`, not a copy.
dd2 = ventricles

# Common axis limits so every section panel uses the same coordinate frame.
# y is negated when plotting, hence min/max are swapped.
global_min_z = dd2['zccf'].min()
global_max_z = dd2['zccf'].max()
global_min_y = -dd2['yccf'].max()
global_max_y = -dd2['yccf'].min()

# 4 x 8 grid: one panel per section, pixels coloured by their cluster colour.
fig, axes = plt.subplots(4, 8, figsize=(40, 20))
axes = axes.flatten()
for i, section_num in enumerate(sections_to_plot):
    ax = axes[i]
    xx = dd2[dd2["Section"] == section_num]
    sc2 = ax.scatter(xx['zccf'], -xx['yccf'],
                     c=xx['putativecluster_color'], s=dot_size, alpha=1, rasterized=True)
    ax.axis('off')
    ax.set_aspect('equal')
    ax.set_xlim(global_min_z, global_max_z)
    ax.set_ylim(global_min_y, global_max_y)
# Remove any grid panels left unused after the last section.
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])
plt.tight_layout()
plt.show()

In [None]:
# One figure per cluster colour: all pixels in grey shades, the current cluster overlaid in red.
# Uses `dd2`, `sections_to_plot`, `dot_size` and the axis limits from the previous cell.
for cl in dd2['putativecluster_color'].unique():
    
    fig, axes = plt.subplots(4, 8, figsize=(40, 20))
    axes = axes.flatten()
    for i, section_num in enumerate(sections_to_plot):
        ax = axes[i]
        xx = dd2[dd2["Section"] == section_num]
        # Background layer. The category codes are computed per section, so a given
        # cluster does not necessarily get the same grey level in every panel.
        sc2 = ax.scatter(xx['zccf'], -xx['yccf'],
                         c=xx['putativecluster_color'].astype("category").cat.codes, cmap="Greys", s=dot_size, alpha=1, rasterized=True)
        
        # Highlight layer: only the pixels of the current cluster.
        yy = xx[xx['putativecluster_color'] == cl]
        ax.scatter(yy['zccf'], -yy['yccf'],
                         c="red", s=dot_size, alpha=1, rasterized=True)
        
        ax.axis('off')
        ax.set_aspect('equal')
        ax.set_xlim(global_min_z, global_max_z)
        ax.set_ylim(global_min_y, global_max_y)
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])
    # The title lookup raises KeyError for a colour that has no entry in `color_dict`.
    plt.suptitle(color_dict[cl])
    plt.tight_layout()
    plt.show()

## Define a hierarchy of ventricular lipizone centroid lipidomes

In [None]:
import pandas as pd
# Per-column robust min-max scaling of the feature matrix:
# clip each column to its 0.5th-99.5th percentile range, then rescale to [0, 1].
df = lipvent.copy()
lower = df.quantile(0.005)
upper = df.quantile(0.995)
df_clipped = df.clip(lower=lower, upper=upper, axis=1)
min_vals = df_clipped.min()
max_vals = df_clipped.max()
# A column that is constant after clipping has max == min, giving 0/0 (NaN) here.
df_normalized = (df_clipped - min_vals) / (max_vals - min_vals)
df_normalized

In [None]:
# Mean normalised profile per named cluster (rows: clusters, columns: features).
centroids = df_normalized.groupby(ventricles['cluster']).mean()
# fraction of the max...

# `toshow`: each column divided by its maximum across clusters.
toshow = centroids / centroids.max()
# `toward`: each row divided by its maximum across columns (used for the Ward clustering).
toward = (centroids.T / centroids.T.max()).T

import matplotlib
import scipy.cluster.hierarchy as sch
import seaborn as sns
matplotlib.rcParams['pdf.fonttype'] = 42

# Collect the 6 highest-valued columns of every cluster as marker candidates.
markers = []

for index, row in toshow.iterrows():
    top_cols = row.nlargest(6).index.tolist()
    markers.extend(top_cols)
    
# De-duplicate the markers (np.unique also sorts them) and subset the centroid table.
toplot = toshow.loc[:, np.unique(markers)]

toplot

In [None]:
import scipy.cluster.hierarchy as sch
import seaborn as sns

# Ward clustering of the cluster centroids, restricted to the marker columns.
# `tmp1` has markers as rows and clusters as columns, so pdist(tmp1.T) measures
# distances between clusters.
tmp1 = toward.loc[:, np.unique(markers)].T
linkage = sch.linkage(sch.distance.pdist(tmp1.T), method='ward', optimal_ordering=True)

Z = linkage
# `dendrogram` was never imported as a bare name; it must be reached through the
# `sch` alias, otherwise this cell fails with a NameError.
dendro = sch.dendrogram(
    Z,
    labels=tmp1.columns,
    orientation='left',
    leaf_font_size=8,
    color_threshold=0,
    above_threshold_color='black'
)
plt.savefig("dendro_ventr.pdf")

# Leaf labels reversed, so the legend lists clusters in the opposite order to `dendro['ivl']`.
leaf_sequence = dendro['ivl'][::-1]
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42

from matplotlib.patches import Patch

# One (colour, name) pair per cluster, indexed by cluster name for reordering.
legend = ventricles[['putativecluster_color', 'cluster']].drop_duplicates()
legend.index = legend['cluster']

df_sorted = legend.loc[leaf_sequence,:]

patches = [Patch(color=row['putativecluster_color'], label=row['cluster']) for _, row in df_sorted.iterrows()]

# Stand-alone legend figure (axes hidden), saved separately from the dendrogram.
plt.figure(figsize=(10, 8))
plt.legend(handles=patches, title='Legend', bbox_to_anchor=(1, 1), loc='upper left', fontsize='small', title_fontsize='medium', borderpad=1)
plt.axis('off')
plt.tight_layout()
plt.savefig("legend_ventr.pdf")
plt.show()

In [None]:
# Reorder the cluster columns of `tmp1` by the dendrogram leaf order.
# NOTE: this rebinds `tmp1`, so re-running the cell applies the permutation again.
order = sch.leaves_list(linkage)
tmp1 = tmp1.iloc[:, order]

# Rows of `toplot` (clusters) rearranged into that same order for the heatmap.
normalized_df = toplot.T.loc[:,tmp1.columns].T

In [None]:
# Heatmap of the marker values per cluster (rows follow the dendrogram order).
# The colour scale is limited to 0.5-1.0.
# "Greys" is the canonical colormap name; the "Grays" spelling is only accepted by
# recent matplotlib releases, and the per-cluster plots above already use "Greys".
fig, ax1 = plt.subplots(figsize=(10, 10))
sns.heatmap(normalized_df, cmap="Greys", ax=ax1, cbar_kws={'label': 'centroids'},
            xticklabels=True, yticklabels=True, vmin=0.5, vmax=1.0)

# Hide tick marks but keep the labels; pad the y labels away from the heatmap.
ax1.tick_params(axis='x', which='both', bottom=False, top=False)
ax1.tick_params(axis='y', which='both', left=False, right=False, pad=20)

plt.savefig("biochempattern.pdf")
plt.show()

## Characterize the two ChP waves and the differences between LVs and IVv

In [None]:
import numpy as np
import pandas as pd

# Pixels whose acronym is in this list act as labelled reference points;
# all other pixels will inherit the acronym of a nearby reference point.
interest_acronyms = ["VL", "V3", "V4", "AQ", "c"]
reference_points = ventricles.loc[
    ventricles['acronym'].isin(interest_acronyms),
    ['Section', 'yccf', 'zccf', 'acronym']
]
other_points = ventricles.loc[
    ~ventricles['acronym'].isin(interest_acronyms),
    ['Section', 'yccf', 'zccf', 'acronym']
]
# Assign each pixel to a 'left' or 'right' side, splitting at the mean zccf of the
# reference points in its section.
zccf_mean_per_section = reference_points.groupby('Section')['zccf'].mean().rename('zccf_mean')
reference_points = reference_points.join(zccf_mean_per_section, on='Section')
reference_points['side'] = np.where(reference_points['zccf'] < reference_points['zccf_mean'], 'left', 'right')
# Sections without reference points get a NaN mean; the comparison is then False -> 'right'.
other_points = other_points.join(zccf_mean_per_section, on='Section', how='left')
other_points['side'] = np.where(other_points['zccf'] < other_points['zccf_mean'], 'left', 'right')
# Maximum distance (in yccf/zccf units) at which a reference acronym is inherited.
radius = 0.5

def find_closest_interest_acronym(row, reference_data, radius):
    """
    Return the acronym of the reference point nearest to `row`, if close enough.

    Only reference points in the same 'Section' and on the same 'side' as `row`
    are considered. Distance is Euclidean in the (yccf, zccf) plane.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide 'Section', 'yccf', 'zccf' and 'side'.
    reference_data : pandas.DataFrame
        Must provide columns 'Section', 'side', 'yccf', 'zccf' and 'acronym'.
    radius : float
        Maximum distance (inclusive) at which a reference point is accepted.

    Returns
    -------
    The 'acronym' value of the nearest reference point (always a scalar), or
    `np.nan` when there is no candidate or the nearest one is farther than `radius`.
    """
    candidates = reference_data[
        (reference_data['Section'] == row['Section'])
        & (reference_data['side'] == row['side'])
    ]
    if candidates.empty:
        return np.nan

    dy = (candidates['yccf'] - row['yccf']).to_numpy(dtype=float)
    dz = (candidates['zccf'] - row['zccf']).to_numpy(dtype=float)
    distances = np.sqrt(dy**2 + dz**2)
    if np.isnan(distances).all():
        return np.nan

    # Pick the nearest candidate by POSITION, not by index label: with duplicate
    # index labels a label-based lookup returns a Series instead of one acronym.
    nearest_pos = int(np.nanargmin(distances))
    if distances[nearest_pos] <= radius:
        return candidates['acronym'].iloc[nearest_pos]
    return np.nan

# Row-by-row lookup of the nearest reference acronym for every unlabelled pixel
# (each row filters `reference_points` again, so this scales with rows x references).
other_points['closest_acronym'] = other_points.apply(
    find_closest_interest_acronym, axis=1, args=(reference_points, radius)
)
# Replace the original acronym by the inherited one (NaN where nothing is within `radius`).
other_points['acronym'] = other_points['closest_acronym']

def fix_series_values(df, column):
    """
    Collapse cells of `df[column]` that hold a pandas Series into a single value.

    Each such cell is replaced by the first element of the Series it holds, or by
    `np.nan` when that Series is empty. `df` is modified in place and returned.
    """
    holds_series = df[column].map(lambda cell: isinstance(cell, pd.Series))
    if not holds_series.any():
        return df

    # Iterate over a snapshot of the offending cells while writing back into `df`.
    offenders = df.loc[holds_series, column]
    for label, nested in offenders.items():
        df.at[label, column] = nested.iloc[0] if len(nested) > 0 else np.nan

    return df

# Make sure both acronym columns hold scalars, then drop the last column
# ('closest_acronym', added above) so `other_points` matches `reference_points`.
other_points = fix_series_values(other_points, 'closest_acronym')
other_points = fix_series_values(other_points, 'acronym')
other_points = other_points.iloc[:, :-1]
annotated_ventricles = pd.concat([reference_points, other_points])
# Unlabelled pixels become "tbd"; the acronym strings are then replaced by integer
# category codes. Later cells select pixels by these codes.
annotated_ventricles['acronym'] = annotated_ventricles['acronym'].fillna("tbd").astype("category").cat.codes
unique_codes = annotated_ventricles['acronym'].unique()
# One random RGB triple per code; the generator is not seeded here, so colours
# change on every run.
random_colors = {code: np.random.rand(3,) for code in unique_codes}
annotated_ventricles['color'] = annotated_ventricles['acronym'].map(random_colors)

In [None]:
import matplotlib.pyplot as plt
# Convert the RGB arrays to tuples so they are hashable for drop_duplicates().
annotated_ventricles['color'] = annotated_ventricles['color'].apply(tuple)

legend_data = annotated_ventricles[['acronym', 'color']].drop_duplicates()

# Legend-only figure: empty scatters carry the colour/label pairs, the axes are hidden.
fig, ax = plt.subplots(figsize=(8, 6))

for acronym, color in zip(legend_data['acronym'], legend_data['color']):
    ax.scatter([], [], color=color, label=acronym)
ax.legend(loc='upper left', title="Acronyms", frameon=True, fontsize=10, title_fontsize=12)
ax.set_axis_off()
plt.show()

In [None]:
# Same 4 x 8 section grid as for the clusters above, now coloured by the acronym code.
# This rebinds `dd2` and the global axis limits to `annotated_ventricles`.
dot_size = 0.3
sections_to_plot = range(1, 33)

dd2 = annotated_ventricles

global_min_z = dd2['zccf'].min()
global_max_z = dd2['zccf'].max()
global_min_y = -dd2['yccf'].max()
global_max_y = -dd2['yccf'].min()

fig, axes = plt.subplots(4, 8, figsize=(40, 20))
axes = axes.flatten()
for i, section_num in enumerate(sections_to_plot):
    ax = axes[i]
    xx = dd2[dd2["Section"] == section_num]
    sc2 = ax.scatter(xx['zccf'], -xx['yccf'],
                     c=xx['color'], s=dot_size, alpha=1, rasterized=True)
    ax.axis('off')
    ax.set_aspect('equal')
    ax.set_xlim(global_min_z, global_max_z)
    ax.set_ylim(global_min_y, global_max_y)
# Remove any grid panels left unused after the last section.
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])
plt.tight_layout()
plt.show()

In [None]:
# Acronym codes selected below:
# 2 --> IVv
# 3 --> LV
# NOTE(review): these codes come from `.astype("category").cat.codes`, so they
# depend on which acronyms are present in the data — confirm the mapping after any change.

# Index labels of lateral-ventricle pixels (sections > 6) and fourth-ventricle pixels (sections > 24).
indLV = annotated_ventricles.loc[(annotated_ventricles['acronym'] == 3) & (annotated_ventricles['Section'] > 6),:].index
indIV = annotated_ventricles.loc[(annotated_ventricles['acronym'] == 2) & (annotated_ventricles['Section'] > 24),:].index
indIV

In [None]:
# Pull the full rows for the selected lateral (lv) and fourth (iv) ventricle pixels.
lv = ventricles.loc[indLV,:]
iv = ventricles.loc[indIV,:]

In [None]:
# Keep only the two ChP "core wave" clusters (#aec7e8 = wave 1, #ffbb78 = wave 2).
lv = lv.loc[lv['putativecluster_color'].isin(["#aec7e8", "#ffbb78"]),:]
iv = iv.loc[iv['putativecluster_color'].isin(["#aec7e8", "#ffbb78"]),:]
lv

In [None]:
# Restrict each ventricle to a hand-picked list of sections.
# NOTE(review): this filters on 'SectionID' while earlier cells use 'Section' —
# confirm the two columns are interchangeable here.
lv = lv.loc[lv['SectionID'].isin([7.0,8.0,9.0,12.0]),:]
iv = iv.loc[iv['SectionID'].isin([27.0, 26.0, 28.0]),:]

In [None]:
# Share of each ChP cluster colour among the lateral-ventricle pixels.
lv['putativecluster_color'].value_counts(normalize=True)

In [None]:
# Share of each ChP cluster colour among the fourth-ventricle pixels.
# Author's observation: #ffbb78 is enriched in the IV ventricle, while #aec7e8 is
# enriched in the LVs, with a full-brain analysis.
iv['putativecluster_color'].value_counts(normalize=True)

In [None]:
# Stacked bars of the two ChP wave proportions: left panel uses `proportions1`, right `proportions2`.
# NOTE(review): the proportions are typed in by hand, presumably copied from the
# value_counts outputs above — they will not update if the data changes.
colors = ['#aec7e8', '#ffbb78']
proportions1 = [0.607272, 0.392728]
labels = ['ChP - core wave 1', 'ChP - core wave 2']
proportions2 = [0.43956, 0.56044]

fig, axes = plt.subplots(1, 2, figsize=(10, 3), sharey=True)

# The second bar of each panel is stacked on top of the first via `bottom=`.
axes[0].bar([0], [proportions1[0]], color=colors[0], label=labels[0])
axes[0].bar([0], [proportions1[1]], bottom=[proportions1[0]], color=colors[1], label=labels[1])
axes[0].set_ylabel('Proportion')
axes[0].set_xticks([])
axes[1].bar([0], [proportions2[0]], color=colors[0], label=labels[0])
axes[1].bar([0], [proportions2[1]], bottom=[proportions2[0]], color=colors[1], label=labels[1])
axes[1].set_xticks([])
axes[1].legend(title='Colors', loc='center left', bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

def nested_bootstrap_prop(df, color_hex, B=2000, random_state=None):
    """
    Two-level bootstrap of the share of rows whose colour equals `color_hex`.

    Each replicate first resamples the sections (`SectionID` values) with
    replacement, then resamples the rows inside every drawn section with
    replacement, keeping each section's original size.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'SectionID' and 'putativecluster_color'.
    color_hex : str
        Colour value counted as a "hit".
    B : int
        Number of bootstrap replicates.
    random_state : None, int or other seed accepted by `np.random.default_rng`
        Seed for the random generator.

    Returns
    -------
    numpy.ndarray of length B
        Proportion of hits in each replicate (NaN when a replicate has no rows).
    """
    rng = np.random.default_rng(random_state)
    secs = df['SectionID'].unique()
    nsec = len(secs)
    props = np.empty(B)

    # Loop-invariant work done once: the hit indicator and its per-section split.
    # Only this boolean column is resampled, not the whole frame; the row draws
    # still go through pandas' `.sample` with the same generator.
    is_hit = df['putativecluster_color'] == color_hex
    hits_by_sec = {sec: is_hit[df['SectionID'] == sec] for sec in secs}
    no_rows = is_hit.iloc[:0]

    for i in tqdm(range(B)):
        # sample sections with replacement
        sampled_secs = rng.choice(secs, size=nsec, replace=True)

        n_hits = 0
        n_rows = 0
        for sec in sampled_secs:
            # A NaN section id matches no rows, exactly like `df['SectionID'] == sec`.
            sec_hits = hits_by_sec.get(sec, no_rows)
            if len(sec_hits) == 0:
                continue
            # sample rows within this section, up to its original size
            resampled = sec_hits.sample(n=len(sec_hits), replace=True, random_state=rng)
            n_hits += int(resampled.sum())
            n_rows += len(resampled)

        props[i] = n_hits / n_rows if n_rows else np.nan

    return props

# Colour of the cluster whose proportion is bootstrapped (ChP "core wave 1").
# The printed labels call this proportion "CHPI".
chp1_hex = "#aec7e8"

# number of bootstrap replicates
B = 2000

# run the nested bootstrap on each ventricle
# NOTE(review): both calls use the same seed (42), so the two replicate streams are
# not independent — confirm this is intended before interpreting `diff_props`.
lat_props = nested_bootstrap_prop(lv, chp1_hex, B=B, random_state=42)
iv_props  = nested_bootstrap_prop(iv, chp1_hex, B=B, random_state=42)

# difference in each replicate
diff_props = lat_props - iv_props

# empirical 95% CIs (2.5th and 97.5th percentiles)
lat_ci  = np.percentile(lat_props, [2.5, 97.5])
iv_ci   = np.percentile(iv_props, [2.5, 97.5])
diff_ci = np.percentile(diff_props, [2.5, 97.5])

# observed proportions
obs_lat  = (lv['putativecluster_color'] == chp1_hex).mean()
obs_iv   = (iv['putativecluster_color'] == chp1_hex).mean()
obs_diff = obs_lat - obs_iv

# print results
print(f"Lateral   CHPI prop: {obs_lat:.3f}  95% CI = [{lat_ci[0]:.3f}, {lat_ci[1]:.3f}]")
print(f"Fourth    CHPI prop: {obs_iv:.3f}   95% CI = [{iv_ci[0]:.3f}, {iv_ci[1]:.3f}]")
print(f"Difference (Lat–4th): {obs_diff:.3f}  95% CI = [{diff_ci[0]:.3f}, {diff_ci[1]:.3f}]")


In [None]:
# empirical 90% CIs (5th and 95th percentiles) from the replicates computed above.
# The labels previously said "95% CI" although these percentiles give a 90% interval.
lat_ci  = np.percentile(lat_props, [5, 95])
iv_ci   = np.percentile(iv_props, [5, 95])
diff_ci = np.percentile(diff_props, [5, 95])

# observed proportions
obs_lat  = (lv['putativecluster_color'] == chp1_hex).mean()
obs_iv   = (iv['putativecluster_color'] == chp1_hex).mean()
obs_diff = obs_lat - obs_iv

# print results
print(f"Lateral   CHPI prop: {obs_lat:.3f}  90% CI = [{lat_ci[0]:.3f}, {lat_ci[1]:.3f}]")
print(f"Fourth    CHPI prop: {obs_iv:.3f}   90% CI = [{iv_ci[0]:.3f}, {iv_ci[1]:.3f}]")
print(f"Difference (Lat–4th): {obs_diff:.3f}  90% CI = [{diff_ci[0]:.3f}, {diff_ci[1]:.3f}]")


In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm


def bootstrap_prop(df, color_hex, B=2000, random_state=None):
    """
    Simple (pooled) bootstrap of the share of rows whose colour equals `color_hex`.

    Rows are resampled with replacement across the whole of `df`, ignoring sections.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the column 'putativecluster_color'.
    color_hex : str
        Colour value counted as a "hit".
    B : int
        Number of bootstrap replicates.
    random_state : None, int or other seed accepted by `np.random.default_rng`
        Seed for the random generator.

    Returns
    -------
    numpy.ndarray of length B
        Proportion of hits in each replicate.
    """
    rng = np.random.default_rng(random_state)
    n = len(df)
    props = np.empty(B)

    # The hit indicator does not change between replicates: compute it once and
    # resample only this boolean column instead of the whole frame.
    is_hit = df['putativecluster_color'] == color_hex

    for i in tqdm(range(B), desc="Bootstrapping pixels"):
        props[i] = is_hit.sample(n=n, replace=True, random_state=rng).mean()

    return props


# Colour of the cluster whose proportion is bootstrapped (ChP "core wave 1").
# The printed labels call this proportion "CHPI".
chp1_hex = "#aec7e8"

# number of bootstrap replicates
B = 2000

# run the simple bootstrap on each ventricle (pooled pixels)
# This overwrites `lat_props` / `iv_props` from the nested bootstrap above.
lat_props = bootstrap_prop(lv, chp1_hex, B=B, random_state=42)
iv_props  = bootstrap_prop(iv, chp1_hex, B=B, random_state=42)

# difference in each replicate
diff_props = lat_props - iv_props

# empirical 95% CIs (2.5th and 97.5th percentiles)
lat_ci  = np.percentile(lat_props, [2.5, 97.5])
iv_ci   = np.percentile(iv_props, [2.5, 97.5])
diff_ci = np.percentile(diff_props, [2.5, 97.5])

# observed proportions
obs_lat  = (lv['putativecluster_color'] == chp1_hex).mean()
obs_iv   = (iv['putativecluster_color'] == chp1_hex).mean()
obs_diff = obs_lat - obs_iv

# print results
print(f"Lateral   CHPI prop: {obs_lat:.3f}  95% CI = [{lat_ci[0]:.3f}, {lat_ci[1]:.3f}]")
print(f"Fourth    CHPI prop: {obs_iv:.3f}   95% CI = [{iv_ci[0]:.3f}, {iv_ci[1]:.3f}]")
print(f"Difference (Lat–4th): {obs_diff:.3f}  95% CI = [{diff_ci[0]:.3f}, {diff_ci[1]:.3f}]")


In [None]:
import matplotlib.pyplot as plt

# Hand-entered bootstrap CI endpoints for the wave-1 proportion in each ventricle.
# NOTE(review): these assignments overwrite the `lat_ci` / `iv_ci` arrays computed
# above, and the numbers are presumably copied from a printed bootstrap output —
# confirm they still match the current data.
lat_ci = [0.610, 0.656]
iv_ci  = [0.405, 0.476]

colors = ['#aec7e8', '#ffbb78']
proportions1 = [0.633, 0.366]  # Lateral: CHP1, CHP2
proportions2 = [0.43956, 0.56044]    # Fourth: CHP1, CHP2
labels = ['ChP - core wave 1', 'ChP - core wave 2']

fig, axes = plt.subplots(1, 2, figsize=(10, 3), sharey=True)

# Lateral ventricle
axes[0].bar(0, proportions1[0], color=colors[0])
axes[0].bar(0, proportions1[1], bottom=proportions1[0], color=colors[1])
# Asymmetric error bar drawn at the top of the wave-1 bar: [distance below, distance above].
axes[0].errorbar(
    0,
    proportions1[0],
    yerr=[
        [proportions1[0] - lat_ci[0]],
        [lat_ci[1] - proportions1[0]]
    ],
    fmt='none',
    ecolor='black',
    capsize=5
)
axes[0].set_xticks([0])
axes[0].set_xticklabels(['Lateral'])
axes[0].set_ylabel('Proportion')
axes[0].set_title('Lateral Ventricle')

# Fourth ventricle
axes[1].bar(0, proportions2[0], color=colors[0])
axes[1].bar(0, proportions2[1], bottom=proportions2[0], color=colors[1])
axes[1].errorbar(
    0,
    proportions2[0],
    yerr=[
        [proportions2[0] - iv_ci[0]],
        [iv_ci[1] - proportions2[0]]
    ],
    fmt='none',
    ecolor='black',
    capsize=5
)
axes[1].set_xticks([0])
axes[1].set_xticklabels(['Fourth'])
axes[1].set_title('Fourth Ventricle')

axes[1].legend(labels, title='Cell Type', loc='center left', bbox_to_anchor=(1, 0.5))

plt.tight_layout()
plt.savefig("propsstacked.pdf")
plt.show()


In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# NOTE(review): this repeats the `bootstrap_prop` definition from an earlier cell
# and shadows it; one definition could be dropped once the cells are consolidated.
def bootstrap_prop(df, color_hex, B=2000, random_state=None):
    """
    Simple (pooled) bootstrap of the share of rows whose colour equals `color_hex`.

    Rows are resampled with replacement across the whole of `df`, ignoring sections.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the column 'putativecluster_color'.
    color_hex : str
        Colour value counted as a "hit".
    B : int
        Number of bootstrap replicates.
    random_state : None, int or other seed accepted by `np.random.default_rng`
        Seed for the random generator.

    Returns
    -------
    numpy.ndarray of length B
        Proportion of hits in each replicate.
    """
    rng = np.random.default_rng(random_state)
    n = len(df)
    props = np.empty(B)

    # The hit indicator does not change between replicates: compute it once and
    # resample only this boolean column instead of the whole frame.
    is_hit = df['putativecluster_color'] == color_hex

    for i in tqdm(range(B), desc="Bootstrapping pixels"):
        props[i] = is_hit.sample(n=n, replace=True, random_state=rng).mean()

    return props

# Colour of the cluster whose proportion is bootstrapped (ChP "core wave 1").
# The printed labels call this proportion "CHPI".
chp1_hex = "#aec7e8"

# number of bootstrap replicates
B = 2000

# run the simple bootstrap on each ventricle (pooled pixels)
lat_props = bootstrap_prop(lv, chp1_hex, B=B, random_state=42)
iv_props  = bootstrap_prop(iv, chp1_hex, B=B, random_state=42)

# difference in each replicate
diff_props = lat_props - iv_props

# empirical 95% CIs (2.5th and 97.5th percentiles)
lat_ci  = np.percentile(lat_props, [2.5, 97.5])
iv_ci   = np.percentile(iv_props, [2.5, 97.5])
diff_ci = np.percentile(diff_props, [2.5, 97.5])

# observed proportions
obs_lat  = (lv['putativecluster_color'] == chp1_hex).mean()
obs_iv   = (iv['putativecluster_color'] == chp1_hex).mean()
obs_diff = obs_lat - obs_iv

# print results
print(f"Lateral   CHPI prop: {obs_lat:.3f}  95% CI = [{lat_ci[0]:.3f}, {lat_ci[1]:.3f}]")
print(f"Fourth    CHPI prop: {obs_iv:.3f}   95% CI = [{iv_ci[0]:.3f}, {iv_ci[1]:.3f}]")
print(f"Difference (Lat–4th): {obs_diff:.3f}  95% CI = [{diff_ci[0]:.3f}, {diff_ci[1]:.3f}]")

# two-sided bootstrap p-value for the difference: twice the smaller tail share.
# Replicates equal to 0 count in both tails, so the doubled value can exceed 1;
# it is capped at 1 to remain a valid p-value.
pval = min(1.0, 2 * min((diff_props <= 0).mean(), (diff_props >= 0).mean()))
print(f"P‐value (two‐sided bootstrap) = {pval:.3f}")


In [None]:
# Per-section share of each ChP cluster colour in the lateral ventricle.
lv['putativecluster_color'].groupby(lv['Section']).value_counts(normalize=True)

In [None]:
# Per-section proportions of the two ChP waves, first for the lateral ventricle...
# NOTE(review): the 'proportion' column name is presumably what the installed
# pandas assigns to normalized value_counts after reset_index — confirm the version.
# `df` is reused as a scratch name and overwrites the earlier `df` frame.
df = lv['putativecluster_color'].groupby(lv['Section']).value_counts(normalize=True).reset_index()
chpw1_lv = df[df['putativecluster_color'] == '#aec7e8']['proportion'].values
chpw2_lv = df[df['putativecluster_color'] == '#ffbb78']['proportion'].values

# ...then for the fourth ventricle.
df = iv['putativecluster_color'].groupby(iv['Section']).value_counts(normalize=True).reset_index()
chpw1_iv = df[df['putativecluster_color'] == '#aec7e8']['proportion'].values
chpw2_iv = df[df['putativecluster_color'] == '#ffbb78']['proportion'].values
chpw1_iv

In [None]:
# a function to check for differential lipids between two groups

from scipy.stats import mannwhitneyu, entropy
import matplotlib.pyplot as plt
from tqdm import tqdm
from statsmodels.stats.multitest import multipletests
from tqdm import tqdm

def differential_lipids(lipidata, kmeans_labels, min_fc=0.2, pthr=0.05):
    """
    Compare every column of `lipidata` between the rows labelled 0 and the rows labelled 1.

    For each column the function computes the log2 ratio of the group means
    (label 1 over label 0) and a two-sided Mann-Whitney U test, then applies a
    Benjamini-Hochberg correction across columns.

    Parameters
    ----------
    lipidata : pandas.DataFrame
        One row per observation, one column per feature.
    kmeans_labels : array-like aligned with the rows of `lipidata`
        Group label per row; rows equal to 0 form group A, rows equal to 1 group B.
    min_fc : float
        Currently unused; kept so existing calls keep working.
    pthr : float
        Significance level passed to the multiple-testing correction.

    Returns
    -------
    pandas.DataFrame
        Columns 'lipid' (column position), 'log2fold_change', 'p_value' and
        'p_value_corrected'; no filtering is applied.
    """
    results = []

    group_a = lipidata.loc[kmeans_labels == 0, :]
    group_b = lipidata.loc[kmeans_labels == 1, :]

    for col_pos in range(lipidata.shape[1]):
        values_a = group_a.iloc[:, col_pos]
        values_b = group_b.iloc[:, col_pos]

        # log2 fold change; the small offset avoids dividing by an exact zero
        mean_a = np.mean(values_a) + 0.00000000001
        mean_b = np.mean(values_b) + 0.00000000001
        log2fold_change = np.log2(mean_b / mean_a) if mean_a > 0 and mean_b > 0 else np.nan

        # Mann-Whitney U (Wilcoxon rank-sum) test
        try:
            _, p_value = mannwhitneyu(values_a, values_b, alternative='two-sided')
        except ValueError:
            p_value = np.nan

        results.append({'lipid': col_pos, 'log2fold_change': log2fold_change, 'p_value': p_value})

    # Explicit columns keep the frame well-formed even when there is nothing to test.
    results_df = pd.DataFrame(results, columns=['lipid', 'log2fold_change', 'p_value'])

    # Correct for multiple testing on the finite p-values only; tests that could not
    # be computed stay NaN instead of being fed into the correction.
    pvals = results_df['p_value'].to_numpy(dtype=float)
    pvals_corrected = np.full(pvals.shape, np.nan)
    testable = np.isfinite(pvals)
    if testable.any():
        pvals_corrected[testable] = multipletests(pvals[testable], alpha=pthr, method='fdr_bh')[1]
    results_df['p_value_corrected'] = pvals_corrected

    return results_df

In [None]:
# Feature matrix (first 173 columns) restricted to the two ChP wave clusters.
lipidata = ventricles.iloc[:, :173]
l1 = '#aec7e8'
l2 = '#ffbb78'

is_chp = ventricles['putativecluster_color'].isin([l1, l2])
lipidata = lipidata.loc[is_chp, :]

# Binary group label per pixel: 1 for `l1`, 0 for `l2`. Built in one expression
# instead of overwriting colour strings in place on a `.loc` result.
labels = (ventricles.loc[is_chp, 'putativecluster_color'] == l1).astype(int)

In [None]:
# Run the two-group comparison; the fold change is label 1 (`l1`) over label 0 (`l2`).
difflips = differential_lipids(lipidata, labels)
# Replace the positional 'lipid' ids by the actual column names.
difflips.lipid = lipidata.columns

# 20 most negative and 20 most positive log2 fold changes.
print(difflips.sort_values('log2fold_change')[:20])
print(difflips.sort_values('log2fold_change')[-20:])

In [None]:
# Rows to plot: the 10 lowest and the 10 highest log2 fold changes, in ascending order.
difflips_toplot = pd.concat([difflips.sort_values('log2fold_change')[:10], difflips.sort_values('log2fold_change')[-10:]], axis=0)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42

def get_color(lipid):
    """
    Map a lipid name to a bar colour based on its name prefix.

    Rules are checked in order: 'TG' -> lightblue, 'PE O' -> purple,
    'HexCer'/'Hex2Cer' -> lightgreen, then 'PC'/'PE'/'PG'/'PS'/'PI' that are
    not 'PE O' or 'PC O' -> yellow. Anything else is gray.
    """
    ordered_rules = (
        (('TG',), 'lightblue'),
        (('PE O',), 'purple'),
        (('HexCer', 'Hex2Cer'), 'lightgreen'),
    )
    for prefixes, colour in ordered_rules:
        if lipid.startswith(prefixes):
            return colour

    is_ether = lipid.startswith(('PE O', 'PC O'))
    if lipid.startswith(('PC', 'PE', 'PG', 'PS', 'PI')) and not is_ether:
        return 'yellow'
    return 'gray'

# One colour per plotted lipid, chosen from its name prefix.
colors = difflips_toplot['lipid'].apply(get_color)

# Bar chart of the selected log2 fold changes, in the row order of `difflips_toplot`.
plt.figure(figsize=(6, 6))
bars = plt.bar(range(len(difflips_toplot)), difflips_toplot['log2fold_change'], color=colors)

plt.xlabel('Lipids')
plt.ylabel('Log2 Fold Change')
plt.title('Log2 Fold Change of Lipids')
plt.xticks(range(len(difflips_toplot)), difflips_toplot['lipid'], rotation=90)

plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# preprocess and extract top differential lipids
# Keep lipids with corrected p < 0.05 and |log2 fold change| > 0.4, ordered by fold change.
differentiallipids = difflips.loc[(difflips['p_value_corrected'] < 0.05) & (np.abs(difflips['log2fold_change']) > 0.4),:].sort_values('log2fold_change')['lipid'].values
# Drop every pixel whose lipizone name contains General, Periventricular or ventricular.
filtered_datavignettes = datavignettes.loc[
    ~datavignettes['lipizone_names'].str.contains('General|Periventricular|ventricular', na=False)
]
# Use the ChP wave colour as the lipizone name for the selected LV / IVv pixels.
iv['lipizone_names'] = iv['putativecluster_color']
lv['lipizone_names'] = lv['putativecluster_color']

# NOTE(review): `.iloc[:, :-3]` drops the last three columns of `iv` / `lv`, presumably
# to align them with `filtered_datavignettes`; this depends on the exact column order
# and on 'lipizone_names' not being among those three — confirm.
filtered_datavignettes = pd.concat([filtered_datavignettes, iv.iloc[:, :-3], lv.iloc[:, :-3]])

# normalize and clip
# Mean profile per lipizone, rescaled per column between its 1st and 99th percentile
# across lipizones and clipped to [0, 1].
datemp = filtered_datavignettes.iloc[:,:173].groupby(filtered_datavignettes['lipizone_names']).mean()
p2 = datemp.quantile(0.01)
p98 = datemp.quantile(0.99)
datemp_values = datemp.values
p2_values = p2.values
p98_values = p98.values
normalized_values = (datemp_values - p2_values) / (p98_values - p2_values)
clipped_values = np.clip(normalized_values, 0, 1)

# separate ChP vs other (nonventricular) lipizones
# Both tables are restricted to the differential lipids selected above.
centroids = pd.DataFrame(clipped_values, columns=datemp.columns, index=datemp.index)
allcentroids = centroids.loc[~centroids.index.isin(['#aec7e8', '#ffbb78']),differentiallipids]
centroids = centroids.loc[centroids.index.isin(['#aec7e8', '#ffbb78']),differentiallipids]
centroids

In [None]:
# calculate the relative lipidomic distance of lipizones to each of the two ChP waves
def relative_distance(c, a, b):
    """Return how far `c` lies from `a` relative to its distances to `a` and `b`.

    All three arguments must expose `.values` (e.g. pandas Series) of equal
    length. The result is d(c, a) / (d(c, a) + d(c, b)) with Euclidean
    distances: 0 when `c` equals `a`, 1 when `c` equals `b`.
    """
    point, ref_a, ref_b = c.values, a.values, b.values
    to_a = np.linalg.norm(point - ref_a)
    to_b = np.linalg.norm(point - ref_b)
    total = to_a + to_b
    return to_a / total
# Score every non-ChP lipizone by its relative distance to the two ChP wave
# centroids. Both sides are restricted to the differential-lipid columns so the
# cell stays re-runnable: on a second run `allcentroids` already carries the
# 'relative_distance' / 'color' columns, and a later cell rebinds `centroids`
# to a frame with all lipid columns.
wave1_centroid = centroids.loc['#aec7e8', differentiallipids]
wave2_centroid = centroids.loc['#ffbb78', differentiallipids]
allcentroids['relative_distance'] = allcentroids.loc[:, differentiallipids].apply(
    lambda row: relative_distance(row, wave1_centroid, wave2_centroid), axis=1
)

# colour each lipizone by the wave it is closer to (ties and NaN fall to '#aec7e8')
allcentroids['color'] = "#aec7e8"
allcentroids.loc[allcentroids['relative_distance'] > 0.5, 'color'] = "#ffbb78"

# calculate average AP position of lipizones
avepos = filtered_datavignettes['xccf'].groupby(filtered_datavignettes['lipizone_names']).mean().loc[allcentroids.index]

In [None]:
from scipy.stats import pearsonr
# Pearson correlation between each lipizone's relative distance to the two ChP
# wave centroids and its mean 'xccf' position; only the p-value is displayed.
corr_coef, p_val = pearsonr(allcentroids['relative_distance'], avepos)
p_val

In [None]:
# one dot per lipizone: relative distance (x) against mean 'xccf' position (y)
fig, ax = plt.subplots()
ax.scatter(allcentroids['relative_distance'], avepos, c=allcentroids['color'])

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.set_xlabel('Relative Euclidean distance from ChP wave to brain-wide lipizones')
ax.set_ylabel('Mean anteroposterior position of lipizone')
fig.tight_layout()
plt.show()

In [None]:
# Lipizones with relative distance < 0.5 (nearer the '#aec7e8' centroid) whose mean 'xccf' exceeds 10.2.
allcentroids.index[(allcentroids['relative_distance']<0.5) & (avepos > 10.2)] # the upper bubble is overwhelmingly cerebellar cortex (check in space)

In [None]:
# same selection as the previous cell, kept under a name for later use
is_near_wave1 = allcentroids['relative_distance'] < 0.5
is_high_ap = avepos > 10.2
outlierbubble = allcentroids.index[is_near_wave1 & is_high_ap]
outlierbubble

In [None]:
# Rebuild the normalized lipizone centroids directly from `datemp` (same
# 0.01 / 0.99 quantile scaling and [0, 1] clipping as the preprocessing cell).
# The last line of this cell rebinds `clipped_values` to a dict, so reading the
# array of that name here would make the cell fail when it is run a second time.
lower_quantile = datemp.quantile(0.01)
upper_quantile = datemp.quantile(0.99)
centroids = ((datemp - lower_quantile) / (upper_quantile - lower_quantile)).clip(0, 1)

# ether-linked PE / PC species only
selected_columns = centroids.filter(regex=r'^(PE O-|PC O-)').columns

# NOTE(review): the two rows are taken by position; presumably they are the
# '#aec7e8' and '#ffbb78' centroids (they sort first only if no other index
# label sorts before them) - confirm.
# The 1e-7 offset keeps the ratio finite when a clipped value is exactly 0.
ccc = centroids.loc[:, selected_columns].iloc[:2,:] + 1e-7

# log2 ratio of the first row over the second, per lipid
plasmalogens = dict(np.log2(ccc.iloc[0,:] / ccc.iloc[1,:]))

# largest ratio first; cap at 2.0 for display (the next cell annotates capped bars)
sorted_plasmalogens = dict(sorted(plasmalogens.items(), key=lambda x: x[1], reverse=True))
clipped_values = {k: min(v, 2.0) for k, v in sorted_plasmalogens.items()}

In [None]:
fig, ax = plt.subplots(figsize=(2, 6))
bars = ax.bar(clipped_values.keys(), clipped_values.values(), color='skyblue')

# bars that hit the 2.0 cap are labelled with their uncapped value
capped_view = zip(bars, clipped_values.values(), sorted_plasmalogens.values())
for bar, value, orig_value in capped_view:
    if value != 2.0:
        continue
    ax.text(bar.get_x() + bar.get_width() / 2, value, f'{orig_value:.2e}',
            ha='center', va='bottom', fontsize=8, color='black')

ax.set_xlabel('Lipids')
ax.set_ylabel('Intensity (Clipped at 2.0)')
ax.set_title('Clipped Intensity of Plasmalogens')
plt.xticks(rotation=90)

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

fig.tight_layout()
plt.show()

## Characterize the sphingomyelins, from brain to ventricle

In [None]:
# there are sphingomyelins that have high measurement dropout & did not reach acceptable recovery scores, but they are worth exploring...
embeddings = pd.read_hdf("./zenodo/multimodal/corrected_nmfall_nochunking.h5ad", key="table")
# explicit copy: columns are added to `coordinates` below
coordinates = datavignettes.loc[datavignettes.index.isin(embeddings.index), ['SectionID', 'x', 'y']].copy()
embeddings = embeddings.loc[coordinates.index,:]

# we will let XGB recover them now even if the generalization of the model was not satisfying... and append them to our dataset
import joblib

for model_path in ['./zenodo/mixed/807.633871_xgb_model.joblib', './zenodo/mixed/809.650753_xgb_model.joblib']:
    # the prediction column is named after the model file without its suffix
    column_name = model_path.replace('_xgb_model.joblib', '')
    try:
        # NOTE: joblib.load unpickles the file; only load model files from a trusted source
        model = joblib.load(model_path)
        coordinates[column_name] = model.predict(embeddings)
    except Exception as err:
        # report which model failed and why, then continue with the next one
        print(model_path + " was not imputable: " + repr(err))
coordinates

In [None]:
# copy the two XGB prediction columns into datavignettes under lipid names
# (assignment aligns on the index)
imputed_sm_columns = {
    'SM 40:2;O2': './zenodo/mixed/807.633871',
    'SM 40:1;O2': './zenodo/mixed/809.650753',
}
for lipid_name, source_column in imputed_sm_columns.items():
    datavignettes[lipid_name] = coordinates[source_column]

In [None]:
# every column whose name starts with "SM"
sm_mask = datavignettes.columns.astype(str).str.startswith("SM")
columns_starting_with_SM = datavignettes.columns[sm_mask].tolist()

# every 50th pixel of the t-SNE embedding, as a plain array
tsne = datavignettes[['tsne1', 'tsne2']]
tsne_downsampled = tsne.iloc[::50].values

def plot_grid(datavignettes, columns, tsne, grid_columns=6, step=50):
    """Draw one small t-SNE scatter per column, coloured by that column.

    Parameters
    ----------
    datavignettes : DataFrame holding the columns to plot.
    columns : iterable of column names; nothing is drawn when it is empty.
    tsne : 2-column array of embedding coordinates, already subsampled with
        the same stride as `step`.
    grid_columns : number of panels per row.
    step : row stride used to subsample `datavignettes` (default 50, the
        value that was previously hard-coded).

    Colours are clipped to the 10th-98th percentile of each column. The
    percentiles ignore NaN, so a column with missing values still gets
    finite limits instead of an all-NaN colour vector.
    """
    columns = list(columns)
    if not columns:
        return

    num_plots = len(columns)
    grid_rows = (num_plots + grid_columns - 1) // grid_columns

    # squeeze=False always yields a 2-D Axes array, so flatten() also works for a 1x1 grid
    fig, axes = plt.subplots(grid_rows, grid_columns,
                             figsize=(grid_columns * 2, grid_rows * 2), squeeze=False)
    axes = axes.flatten()

    subsampled = datavignettes.iloc[::step]

    for ax, col in zip(axes, columns):
        values = subsampled[col].values
        low, high = np.nanpercentile(values, [10, 98])
        c_data = np.clip(values, low, high)

        ax.scatter(tsne[:, 0], tsne[:, 1], c=c_data, cmap="inferno", alpha=0.7, s=0.01, rasterized=True)
        ax.set_title(col, fontsize=6)
        ax.set_xticks([])
        ax.set_yticks([])
        for side in ('top', 'right', 'left', 'bottom'):
            ax.spines[side].set_visible(False)

    # remove the unused panels of the last row
    for unused_ax in axes[num_plots:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

# tsne_downsampled was built with the same [::50] stride that plot_grid applies to datavignettes
plot_grid(datavignettes, columns_starting_with_SM, tsne_downsampled)

In [None]:
import seaborn as sns

# pairwise correlation between the SM columns
smdf = datavignettes[columns_starting_with_SM].corr()

cluster_map = sns.clustermap(smdf, cmap="inferno", center=0)
plt.show()

# redraw the matrix as a plain heatmap in the clustered row / column order
row_order = cluster_map.dendrogram_row.reordered_ind
col_order = cluster_map.dendrogram_col.reordered_ind
reordered_data = smdf.iloc[row_order, col_order]

sns.heatmap(reordered_data, cmap="inferno", center=0, xticklabels=True, yticklabels=True)
for axis_name in ('x', 'y'):
    plt.tick_params(axis=axis_name, labelsize=8)
plt.show()

In [None]:
# relabel 'Ventricular linings' so it is averaged together with the choroid-plexus subclass
is_lining = datavignettes['subclass_name'] == 'Ventricular linings'
datavignettes.loc[is_lining, 'subclass_name'] = 'Choroid plexus and ventricles'

# per-subclass mean of each SM, divided by the mean of those subclass means
sm_values = datavignettes.loc[:, reordered_data.index]
means = sm_values.groupby(datavignettes['subclass_name']).mean()
enr = means / means.mean()
ser = enr.loc["Choroid plexus and ventricles", :]

fig, ax = plt.subplots()
ser.plot(kind='bar', color='black', ax=ax)

# reference line at ratio 1
ax.axhline(1, color='darkred', linestyle='--')

fig.savefig("bars_sm.pdf")
plt.show()

In [None]:
# put the two imputed sphingomyelins first, keeping all other columns in their current order
columns_to_move = ['SM 40:2;O2', 'SM 40:1;O2']
remaining_columns = datavignettes.columns.drop(columns_to_move).tolist()
new_column_order = [*columns_to_move, *remaining_columns]

datavignettes = datavignettes[new_column_order]

In [None]:
import pickle

# Lookup used below as allen_name_to_annots[region_name] -> collection of values
# matched against datavignettes['id'].
# NOTE: pickle.load executes arbitrary code from the file; only load this from a trusted source.
file_path = './zenodo/mixed/allen_name_to_annots.pkl'

with open(file_path, 'rb') as file:
    allen_name_to_annots = pickle.load(file)
# Region names to assign, in the order they are applied: when an id belongs to
# more than one entry, the entry that comes later in this list wins.
regions = [
    "Frontal pole, cerebral cortex",
    "Somatomotor areas",
    "Somatosensory areas",
    "Gustatory areas",
    "Visceral area",
    "Auditory areas",
    "Visual areas",
    "Anterior cingulate area",
    "Prelimbic area",
    "Infralimbic area",
    "Orbital area",
    "Agranular insular area",
    "Retrosplenial area",
    "Posterior parietal association areas",
    "Temporal association areas",
    "Perirhinal area",
    "Ectorhinal area",
    "Olfactory areas",
    "Ammon's horn",
    "Dentate gyrus",
    "Fasciola cinerea",
    "Induseum griseum",
    "Retrohippocampal region",
    "Layer 6b, isocortex",
    "Claustrum",
    "Endopiriform nucleus",
    "Lateral amygdalar nucleus",
    "Basolateral amygdalar nucleus",
    "Basomedial amygdalar nucleus",
    "Posterior amygdalar nucleus",
    "Striatum dorsal region",
    "Striatum ventral region",
    "Nucleus accumbens",
    "Fundus of striatum",
    "Olfactory tubercle",
    "Lateral strip of striatum",
    "Lateral septal complex",
    "Striatum-like amygdalar nuclei",
    "Pallidum",
    "Thalamus, sensory-motor cortex related",
    "Thalamus, polymodal association cortex related",
    "Periventricular zone",
    "Periventricular region",
    "Hypothalamic medial zone",
    "Hypothalamic lateral zone",
    "Median eminence",
    "Midbrain, sensory related",
    "Midbrain, motor related",
    "Midbrain, behavioral state related",
    "Pons, sensory related",
    "Pons, motor related",
    "Pons, behavioral state related",
    "Medulla, sensory related",
    "Medulla, motor related",
    "Medulla, behavioral state related",
    "Cerebellar cortex",
    "Cerebellar nuclei",
    "cranial nerves",
    "cerebellum related fiber tracts",
    "supra-callosal cerebral white matter",
    "lateral forebrain bundle system",
    "extrapyramidal fiber systems",
    "medial forebrain bundle system",
    "ventricular systems"
]

# Label each pixel with its region; pixels matching no listed region keep "General".
datavignettes['region'] = "General"
for region_name in regions:
    in_region = datavignettes['id'].isin(allen_name_to_annots[region_name])
    # single .loc write: the chained form df['region'][mask] = ... may assign to a
    # temporary copy (SettingWithCopyWarning; no effect under copy-on-write)
    datavignettes.loc[in_region, 'region'] = region_name

In [None]:
# two-level label: "Ventricular" for the 'ventricular systems' region, otherwise "Non-ventricular"
is_ventricular_region = datavignettes['region'] == "ventricular systems"
datavignettes['ventricles'] = np.where(is_ventricular_region, "Ventricular", "Non-ventricular")

datavignettes['ventricles'].value_counts()

In [None]:
# binary labels for the differential test: 1 = ventricular pixel, 0 = everything else
# (kept as an object array, matching what in-place assignment into the string array produced)
is_ventricular = (datavignettes['ventricles'] == "Ventricular").to_numpy()
kmeans_labels = np.where(is_ventricular, 1, 0).astype(object)

# the first 175 columns are passed as the lipid matrix
lipid_matrix = datavignettes.iloc[:, :175]
difflips = differential_lipids(lipid_matrix, kmeans_labels)
difflips

In [None]:
# name each result row after the column it was computed from
difflips.index = datavignettes.columns[:175]
difflips

In [None]:
# keep significant lipids only, ordered by effect size
difflips = difflips.sort_values('log2fold_change')
difflips = difflips.loc[difflips['p_value_corrected'] < 0.01,:]

# SM species with log2 fold change above 0.2 are the ventricular markers
ventricular_markers = np.intersect1d(difflips.loc[difflips['log2fold_change'] > 0.2,:].index.values, columns_starting_with_SM)

df = difflips
# red: ventricular marker, orange: any other SM, gray: everything else
marker_set = set(ventricular_markers)
colors = [
    'red' if lipid in marker_set else 'orange' if lipid.startswith('SM') else 'gray'
    for lipid in df.index
]

plt.figure(figsize=(10, 6))
bars = plt.bar(df.index, df['log2fold_change'], color=colors)
# label only the SM bars; positions are the integer bar indices
sm_indices = [i for i, lipid in enumerate(df.index) if lipid.startswith("SM")]
plt.xticks(sm_indices, [df.index[i] for i in sm_indices], rotation=90, ha='center')

plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

plt.ylabel('logFC between ventricles and all brain')
plt.xlabel("Sorted lipid species")
plt.title('Lipid log2fold change')
plt.tight_layout()
plt.show()

In [None]:
datavent = datavignettes.loc[datavignettes['Section'] == 8,:]# similarly for the other ventricles / sections
# crop to the window 150 < z_index < 230, 75 < y_index < 190
in_window = (
    (datavent['z_index'] > 150) & (datavent['z_index'] < 230)
    & (datavent['y_index'] < 190) & (datavent['y_index'] > 75)
)
xx = datavent.loc[in_window,:]

dot_size = 0.2
num_plots = len(ventricular_markers)
rows, cols = 1, num_plots

# squeeze=False always returns a 2-D array of Axes, so flatten() also works
# when there is exactly one marker (plt.subplots would otherwise return a bare Axes)
fig, axes = plt.subplots(rows, cols, figsize=(6, 15), squeeze=False)
axes = axes.flatten()

# one panel per marker, coloured by that marker's intensity
for ax, marker in zip(axes, ventricular_markers):
    sc1 = ax.scatter(xx['z_index'], -xx['y_index'],
                     c=xx[marker], cmap='inferno', s=dot_size*30, alpha=0.7)

    ax.axis('off')
    ax.set_title(marker)
    ax.set_aspect('equal')

for ax in axes[num_plots:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from matplotlib.cm import ScalarMappable

# shared axis limits so every section is drawn in the same frame
global_min_z, global_max_z = datavignettes['zccf'].min(), datavignettes['zccf'].max()
global_min_y = -(datavignettes['yccf'].max())
global_max_y = -(datavignettes['yccf'].min())

for currentPC in ventricular_markers:
    marker = str(currentPC)
    filtered_data = pd.concat([datavignettes[['yccf', 'zccf', 'Section']], datavignettes.loc[:, marker]], axis=1)

    # per-section colour limits; the median across sections is used for every panel
    # NOTE(review): the lower limit is the 0.20 quantile although it is stored
    # under the name '2-perc' - confirm which one is intended.
    results = []
    for section in filtered_data['Section'].unique():
        section_values = filtered_data.loc[filtered_data['Section'] == section, marker]
        results.append([section, section_values.quantile(0.20), section_values.quantile(0.98)])

    percentile_df = pd.DataFrame(results, columns=['Section', '2-perc', '98-perc'])
    med2p = percentile_df['2-perc'].median()
    med98p = percentile_df['98-perc'].median()

    cmap = plt.cm.inferno

    unique_sections = np.sort(filtered_data['Section'].unique())
    num_sections = len(unique_sections)

    fig, axes = plt.subplots(1, num_sections, figsize=(num_sections * 5, 5))
    if num_sections == 1:
        axes = [axes]  # a single panel comes back as a bare Axes

    # one panel per section, all sharing the same colour scale and axis limits
    for ax, section in zip(axes, unique_sections):
        ddf = filtered_data[filtered_data['Section'] == section]

        ax.scatter(ddf['zccf'], -ddf['yccf'], c=ddf[marker], cmap="inferno", s=0.5, rasterized=True, vmin=med2p, vmax=med98p)
        ax.axis('off')
        ax.set_aspect('equal')
        ax.set_xlim(global_min_z, global_max_z)
        ax.set_ylim(global_min_y, global_max_y)

    plt.tight_layout(rect=[0, 0, 0.9, 1])
    plt.show()


## Describe the geometry of the lipidome of the ventricular walls

In [None]:
# Maps each cluster colour (the hex string stored in 'putativecluster_color')
# to a descriptive cluster name.
# NOTE(review): "linining" below looks like a typo for "lining"; it is left
# untouched because the string is a runtime label that other cells may match on.
color_dict = {
    "#002657": "Thalamic and mid/hindbrain WM / CSF subclass",
    "#1f77b4": "Dorsal VLMCs and canals",
    "#21366b": "Mid/hindbrain WM / CSF subclass",
    "#2a3f6d": "Boundary WM / CSF subclass", 
    "#2ca02c": "IIIv VLMCs / Astro-TE",
    "#3e4b6c": "Mixed WM / CSF subclass",
    "#8c564b": "Ependymal dorsomedial linining of LVs",
    "#9467bd": "Ventral ependymal / Astro-TE NN 5",
    "#98df8a": "IIIv dorsal lining / VLMCs",
    "#aec7e8": "ChP \"core wave 1\"",
    "#c49c94": "Dorsal IIIv lining and canals",
    "#c5b0d5": "Dorsal lining of LVs, canal and IVv lining",
    "#e377c2": "Ventral Ependymal IIIv and LVs",
    "#f7b6d2": "Lining of IVv",
    "#ffbb78": "ChP \"core wave 2\""
}
# colours missing from color_dict map to NaN
ventricles['cluster'] = ventricles['putativecluster_color'].map(color_dict)
ventricles['cluster']

In [None]:
# isolate the pixels belonging to ventricular walls lipizones

# every named cluster except the two ChP waves and the four WM / CSF subclasses
non_lining_clusters = [
    "ChP \"core wave 2\"",
    "ChP \"core wave 1\"",
    "Mixed WM / CSF subclass",
    "Boundary WM / CSF subclass",
    "Mid/hindbrain WM / CSF subclass",
    "Thalamic and mid/hindbrain WM / CSF subclass",
]
lining_clusters = np.setdiff1d(list(color_dict.values()), np.array(non_lining_clusters))

# lining pixels of sections 4, 5, 7 and 8 ...
in_lining = ventricles['cluster'].isin(lining_clusters)
in_sections = ventricles['Section'].isin([4,5,7,8])
perivent = ventricles.loc[in_lining & in_sections,:] #,9,11,12  #(ventricles['subclass']=='12112') &
# ... further restricted to pixels whose 'acronym' in annotated_ventricles equals 3
acronym_3_index = annotated_ventricles.loc[(annotated_ventricles['acronym'] == 3),:].index
perivent = perivent.loc[perivent.index.isin(acronym_3_index),:]

dot_size = 0.3
sections_to_plot = range(1, 33)

dd2 = perivent

# shared axis limits across all panels
global_min_z, global_max_z = dd2['zccf'].min(), dd2['zccf'].max()
global_min_y = -dd2['yccf'].max()
global_max_y = -dd2['yccf'].min()

fig, axes = plt.subplots(4, 8, figsize=(40, 20))
axes = axes.flatten()
for ax, section_num in zip(axes, sections_to_plot):
    xx = dd2[dd2["Section"] == section_num]
    sc2 = ax.scatter(xx['zccf'], -xx['yccf'],
                     c="red", s=dot_size, alpha=1, rasterized=True)
    ax.axis('off')
    ax.set_aspect('equal')
    ax.set_xlim(global_min_z, global_max_z)
    ax.set_ylim(global_min_y, global_max_y)
# drop any panel beyond the plotted sections
for unused_ax in axes[len(sections_to_plot):]:
    fig.delaxes(unused_ax)
plt.tight_layout()
plt.show()

In [None]:
from scipy import stats
from statsmodels.stats.multitest import fdrcorrection

rs = []      # per section: lipids that pass the significance and effect-size filters
rs_all = []  # per section: the unfiltered correlation table for every lipid

target_column = 'yccf'

# Sections are visited in perivent['Section'].unique() order; rs and rs_all follow that order.
for SEC in perivent['Section'].unique():

    sub = perivent.loc[perivent['Section'] == SEC,:]
    # first 173 columns minus 'Section'; np.setdiff1d returns them sorted and unique
    lipid_columns = np.setdiff1d(sub.columns[:173], ['Section'])

    correlations = {}
    p_values = []
    for column in lipid_columns:
        correlation, p_value = stats.spearmanr(sub[column], sub[target_column])
        correlations[column] = {
            'correlation': correlation,
            'p_value': p_value
        }
        p_values.append(p_value)

    # Correct only the finite p-values. spearmanr returns NaN for a constant
    # column, and a single NaN passed to fdrcorrection turns every adjusted
    # p-value of the section into NaN, silently discarding the whole section.
    raw_p = np.asarray(p_values, dtype=float)
    testable = np.isfinite(raw_p)
    p_values_adjusted = np.full(raw_p.shape, np.nan)
    if testable.any():
        p_values_adjusted[testable] = fdrcorrection(raw_p[testable])[1]

    for (column, p_adjusted) in zip(lipid_columns, p_values_adjusted):
        correlations[column]['p_value_adjusted'] = p_adjusted

    result_df = pd.DataFrame.from_dict(correlations, orient='index')
    rs_all.append(result_df)
    # strongest correlations first, then keep adjusted p < 0.1 and |rho| > 0.4
    result_df = result_df.reindex(result_df['correlation'].abs().sort_values(ascending=False).index)
    result_df = result_df.loc[(result_df['p_value_adjusted'] < 0.1) & (np.abs(result_df['correlation']) > 0.4),:]

    rs.append(np.array(result_df.index))


rs

In [None]:
# count in how many sections each lipid passed the filters
uni, vc = np.unique(np.concatenate(rs), return_counts=True)
# keep lipids that passed in more than 3 sections
hits = uni[vc > 3] ### these are promising lining gradients
hits

In [None]:
# one column of correlations per section
allcors = pd.concat([pd.DataFrame(r['correlation']) for r in rs_all], axis=1)
# rs_all was filled in perivent['Section'].unique() order, which is order of
# appearance and not necessarily sorted; derive the labels from that same
# order instead of hard-coding them, then sort the columns numerically.
allcors.columns = [f"{section:g}" for section in perivent['Section'].unique()]
allcors = allcors[sorted(allcors.columns, key=float)]
allcors = allcors.loc[hits,:]

fig, ax = plt.subplots(figsize=(2, 6))
width = 0.2
x = np.arange(len(allcors.index))

# grouped bars: one bar per section for every lipid, blue for positive and red for negative
for i, column in enumerate(allcors.columns):
    colors = ['blue' if val >= 0 else 'red' for val in allcors[column]]
    bars = ax.bar(x + i*width, allcors[column], width, color=colors, edgecolor='black')

    # make the fill translucent while keeping the edge opaque
    for bar in bars:
        bar.set_facecolor(bar.get_facecolor()[:-1] + (0.3,))

ax.set_ylabel("Spearman's correlation with DV axis")
# centre each tick under its group of bars (equals x + 1.5 * width for four sections)
ax.set_xticks(x + width * (len(allcors.columns) - 1) / 2)
ax.set_xticklabels(allcors.index, rotation=90, ha='center')
ax.yaxis.set_ticks_position('left')

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.xaxis.set_ticks_position('none')
ax.axhline(y=0, color='k', linestyle='-', linewidth=0.5)
plt.tight_layout()
plt.savefig("sectionwise_gradients.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
from matplotlib.gridspec import GridSpec

# rows: sections 4, 5 and 7; columns: the lipids in `hits`
sections = [4.0, 5.0, 7.0]

fig = plt.figure(figsize=(8, 3))
gs = GridSpec(3, len(hits), figure=fig)

for row_idx, section in enumerate(sections):
    xx = dd2[dd2["Section"] == section]
    is_top_row = row_idx == 0
    for col_idx, lipid in enumerate(hits):
        ax = fig.add_subplot(gs[row_idx, col_idx])
        sc1 = ax.scatter(xx['z_index'], -xx['y_index'],
                         c=xx[lipid], s=dot_size*5, alpha=0.7, cmap="inferno")

        ax.set_aspect('equal')
        ax.set_xticks([])
        ax.set_yticks([])

        # lipid names only above the first row
        if is_top_row:
            ax.set_title(lipid, fontsize=8, wrap=True)

plt.tight_layout()
plt.show()