## Reference atlas data - lipizone centroid profiles

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42

# Load the full dataset and keep only the reference-atlas pixels.
# .copy() makes `atlas` an independent frame: later cells add and overwrite
# columns on it, which on a filtered slice triggers SettingWithCopyWarning.
atlas = pd.read_parquet("./zenodo/maindata_2.parquet")
atlas = atlas.loc[atlas['Sample'] == "ReferenceAtlas", :].copy()
atlas

In [None]:
# Rescale every lipid column to [0, 1]: the 1st and 99th percentiles map to
# 0 and 1, and values beyond them are clipped.

data = atlas.iloc[:, :173]
datemp = data.copy()
p2, p98 = datemp.quantile(0.01), datemp.quantile(0.99)

datemp_values = datemp.to_numpy()
p2_values = p2.to_numpy()
p98_values = p98.to_numpy()

percentile_span = p98_values - p2_values
normalized_values = (datemp_values - p2_values) / percentile_span
clipped_values = normalized_values.clip(0, 1)

reference = pd.DataFrame(
    clipped_values,
    index=data.index,
    columns=data.columns,
)
reference['lipizone_names'] = atlas['lipizone_names']

# Lipizone centroid = mean normalized profile of the pixels of that lipizone.
centroids = reference.groupby('lipizone_names').mean()
centroids

## Lipidomic correlation between pairs of lipizones

In [None]:
# Pairwise correlation between lipizone centroid profiles; transposing puts
# one lipizone per column, which is what DataFrame.corr() compares.
centroids_by_column = centroids.transpose()
lipcor = centroids_by_column.corr()
lipcor

## Prevalent Allen Brain color

In [None]:
# Annotate every pixel with the Allen Brain division it belongs to.

import pickle

file_path = './zenodo/mixed/allen_name_to_annots.pkl'

# NOTE: unpickling can execute arbitrary code; only load this file when it
# comes from the trusted dataset archive.
with open(file_path, 'rb') as file:
    allen_name_to_annots = pickle.load(file)

divisions = ['Olfactory areas', 'Isocortex', 'Hippocampal formation', 'Cortical subplate', 'Striatum', 'Pallidum', 'Thalamus', 'Hypothalamus', 'Midbrain', 'Hindbrain', 'Cerebellum', 'fiber tracts', 'ventricular systems']

# Pixels whose id is listed under none of the divisions keep "General".
atlas['division'] = "General"
for division in divisions:
    in_division = atlas['id'].isin(allen_name_to_annots[division])
    # One .loc call instead of chained indexing (atlas['division'][mask] = ...),
    # which writes to a temporary object under pandas copy-on-write.
    atlas.loc[in_division, 'division'] = division

atlas['division'].value_counts()

In [None]:
# For each lipizone, pick the Allen division in which it is most enriched,
# i.e. where its pixel count is largest relative to the average number of
# pixels of that division per lipizone.
atlas['lipizone'] = atlas['lipizone_names']
lba = atlas[atlas['division'] != 'General']  # unannotated pixels are ignored

grouped = lba.groupby('lipizone')['division'].value_counts()
total_counts = lba['division'].value_counts()
n_lipizones = len(lba['lipizone'].unique())
average_occurrence = total_counts / n_lipizones

most_enriched_acronyms = {}
for zone, zone_counts in grouped.groupby(level=0):
    zone_divisions = zone_counts.index.get_level_values('division')
    enrichment = zone_counts / average_occurrence[zone_divisions]
    # idxmax returns a (lipizone, division) pair; keep the division
    most_enriched_acronyms[zone] = enrichment.idxmax()[1]

allendata = pd.DataFrame(
    most_enriched_acronyms.values(),
    index=most_enriched_acronyms.keys(),
    columns=['top_allen_division'],
)
allendata

## Number of overlapping divisions (regionalized vs boundary-crossing)

In [None]:
# Count, per lipizone, the number of annotated divisions that contain at
# least 50 of its pixels.
lipixacro = atlas[['lipizone_names', 'division']]

co_occurrence_counts = (
    lipixacro
    .groupby(["lipizone_names", "division"])
    .size()
    .reset_index(name="count")
)

has_enough_pixels = co_occurrence_counts["count"] >= 50
is_annotated = co_occurrence_counts["division"] != "General"
filtered_counts = co_occurrence_counts[has_enough_pixels & is_annotated]

diversity_ofacronynms = (
    filtered_counts
    .groupby("lipizone_names")["division"]
    .nunique()
    .reset_index(name="unique_acronym_count")
)

diversity_ofacronynms

In [None]:
# Lipizone x division table of pixel counts, each row rescaled to sum to 1.
division_counts = filtered_counts.pivot(
    index="lipizone_names", columns="division", values="count"
).fillna(0)
pivot_df = division_counts.div(division_counts.sum(axis=1), axis=0)
pivot_df

In [None]:
# Distribution of the number of divisions spanned by each lipizone.
division_counts_per_zone = diversity_ofacronynms['unique_acronym_count']
plt.hist(division_counts_per_zone, bins=100)
plt.show()

## X, Y, Z

In [None]:
# Annotate each lipizone with its average xccf, yccf, zccf, computed on one
# half of the brain only (pixels below the zccf midpoint).

# The midpoint of the zccf extent is (min + max) / 2; (max - min) / 2 is only
# the half-width and is off by `min` whenever the data do not start at 0.
z_min, z_max = atlas['zccf'].min(), atlas['zccf'].max()
z_midpoint = (z_min + z_max) / 2
half = atlas.loc[atlas['zccf'] < z_midpoint, :]

lipizone_mean_coordinates = half[['xccf', 'yccf', 'zccf']].groupby(half['lipizone']).mean()
lipizone_mean_coordinates

In [None]:
# Annotate each lipizone with the variance of its xccf, yccf, zccf, computed
# on one half of the brain only (pixels below the zccf midpoint).

# The midpoint of the zccf extent is (min + max) / 2; (max - min) / 2 is only
# the half-width and is off by `min` whenever the data do not start at 0.
z_min, z_max = atlas['zccf'].min(), atlas['zccf'].max()
z_midpoint = (z_min + z_max) / 2
half = atlas.loc[atlas['zccf'] < z_midpoint, :]

lipizone_var_coordinates = half[['xccf', 'yccf', 'zccf']].groupby(half['lipizone_names']).var()
lipizone_var_coordinates

## Lipizone neighborhood

In [None]:
from tqdm import tqdm

atlas['SectionID'] = atlas['SectionID'].astype(int)
atlas['x'] = atlas['x'].astype(int)
atlas['y'] = atlas['y'].astype(int)

# Offsets of the 8 surrounding pixels, in the order in which the neighbours
# of a pixel are listed.
NEIGHBOR_OFFSETS = [
    (-1, -1), (-1, 0), (-1, 1),
    (0, -1),           (0, 1),
    (1, -1),  (1, 0),  (1, 1),
]

# All occupied (section, x, y) positions, for O(1) membership tests.
occupied = set(zip(atlas['SectionID'], atlas['x'], atlas['y']))

# One pass over the coordinate columns instead of DataFrame.iterrows() plus a
# per-row .at assignment, which is very slow on pixel-level tables.
neighbor_lists = [
    [
        f'section{section_id}_pixel{x0 + dx}_{y0 + dy}'
        for dx, dy in NEIGHBOR_OFFSETS
        if (section_id, x0 + dx, y0 + dy) in occupied
    ]
    for section_id, x0, y0 in tqdm(
        zip(atlas['SectionID'], atlas['x'], atlas['y']), total=len(atlas)
    )
]
atlas['neighbors'] = pd.Series(neighbor_lists, index=atlas.index, dtype=object)

# Pixel identifier in the same format as the neighbour identifiers above
# (vectorized string concatenation instead of apply(axis=1)).
atlas['idd'] = (
    'section' + atlas['SectionID'].astype(str)
    + '_pixel' + atlas['x'].astype(str)
    + '_' + atlas['y'].astype(str)
)

# Separate lookups for the two annotations, so that `id_to_lipizone` is not
# silently rebound to the class mapping.
id_to_lipizone = pd.Series(atlas.lipizone_names.values, index=atlas.idd).to_dict()
id_to_class = pd.Series(atlas['class'].values, index=atlas.idd).to_dict()


def map_neighbors_to_names(neighbors, id_to_label=None):
    """Translate a list of pixel ids into their labels.

    `id_to_label` maps pixel id -> label and defaults to `id_to_lipizone`.
    Ids missing from the mapping yield None.
    """
    lookup = id_to_lipizone if id_to_label is None else id_to_label
    return [lookup.get(neighbor_id, None) for neighbor_id in neighbors]


atlas['neighbor_names'] = atlas['neighbors'].apply(map_neighbors_to_names)
atlas['neighbor_classnames'] = atlas['neighbors'].apply(
    lambda neighbor_ids: map_neighbors_to_names(neighbor_ids, id_to_class)
)
atlas['neighbor_classnames']

In [None]:
from collections import Counter


def calculate_proportions(neighbor_list):
    """Return {class name: fraction of entries} for a flat list of class names."""
    total = len(neighbor_list)
    return {
        classname: count / total
        for classname, count in Counter(neighbor_list).items()
    }


def _flatten(list_of_lists):
    """Concatenate the per-pixel neighbour lists into one flat list."""
    return [neighbor for sublist in list_of_lists for neighbor in sublist]


# All neighbour class names seen around the pixels of each lipizone
grouped = atlas.groupby('lipizone_names')['neighbor_classnames'].apply(_flatten)

proportion_df = grouped.apply(calculate_proportions).reset_index()
# Expand the per-lipizone dicts into one column per class; absent classes -> 0
proportion_expanded = (
    proportion_df
    .set_index('lipizone_names')['neighbor_classnames']
    .apply(pd.Series)
    .fillna(0)
)
proportion_expanded

## Allen density map

In [None]:
from bg_atlasapi import BrainGlobeAtlas

# Reference volume of the "allen_mouse_25um" BrainGlobe atlas
allen = BrainGlobeAtlas("allen_mouse_25um")
reference_image = allen.reference
print(reference_image.shape)

# Quick look at one slice (index 100 along the second axis)
example_slice = reference_image[:, 100, :]
plt.imshow(example_slice)
plt.show()

In [None]:
# Read the reference-image value at every pixel's voxel index and average it
# per lipizone. Pixels with a missing or out-of-volume index keep NaN.
index_cols = ['x_index', 'y_index', 'z_index']
nan_mask = atlas[index_cols].isna().any(axis=1)

atlas['density'] = np.nan
valid_pixels = atlas.loc[~nan_mask, index_cols]
valid_indices = valid_pixels.astype(int)
X_MAX, Y_MAX, Z_MAX = reference_image.shape

x_ok = valid_indices['x_index'].between(0, X_MAX - 1)
y_ok = valid_indices['y_index'].between(0, Y_MAX - 1)
z_ok = valid_indices['z_index'].between(0, Z_MAX - 1)
in_bounds_mask = x_ok & y_ok & z_ok

valid_and_inbounds_indices = valid_indices.loc[in_bounds_mask]

voxel_x = valid_and_inbounds_indices['x_index'].values
voxel_y = valid_and_inbounds_indices['y_index'].values
voxel_z = valid_and_inbounds_indices['z_index'].values
atlas.loc[valid_and_inbounds_indices.index, 'density'] = reference_image[voxel_x, voxel_y, voxel_z]

lipizone_density = atlas[['density']].groupby(atlas['lipizone']).mean()
lipizone_density

In [None]:
# Distribution of the mean reference-image value across lipizones.
mean_density_per_zone = lipizone_density['density']
plt.hist(mean_density_per_zone, bins=100)
plt.show()

## Cell type from lipidome deconvolution

In [None]:
# Deconvolve each lipizone centroid into an oligodendrocyte and a neuron
# component (NNLS), as a dual grey- vs white-matter score.

# Build the average lipid profiles of the two cell types used here
# (oligodendrocytes and neurons) from the cell-type lipidome table.
ctl = pd.read_csv("./zenodo/csv/celltype_lipidomes.csv", index_col=0)
ctl = ctl.iloc[2:,:]  # drop the first two rows

fitznerlips = ctl.index

# Mean over the oligodendrocyte sample columns, ignoring NaNs
oli = np.nanmean(ctl[['OligodendrocytesDIV11', 'OligodendrocytesDIV12',
       'OligodendrocytesDIV13', 'OligodendrocytesDIV251',
       'OligodendrocytesDIV252', 'OligodendrocytesDIV253',
       'OligodendrocytesDIV41', 'OligodendrocytesDIV42',
       'OligodendrocytesDIV43']].astype(float),axis=1)

# Mean over the neuron sample columns, ignoring NaNs
neu = np.nanmean(ctl[['NeuronsDIV101', 'NeuronsDIV102', 'NeuronsDIV103', 'NeuronsDIV161',
       'NeuronsDIV162', 'NeuronsDIV163', 'NeuronsDIV51', 'NeuronsDIV52',
       'NeuronsDIV53']].astype(float),axis=1)

# Filtering and rescaling
aveprof = pd.DataFrame([oli, neu], columns=ctl.index, index = ["oli", "neu"]).T
aveprof.fillna(0, inplace=True)
names = pd.read_csv("./zenodo/csv/goslinfitzner_celltyoes.tsv", sep="\t")
namesgoslin = names[['Original Name', 'Species Name']]
namesgoslin.index = namesgoslin['Original Name']
# NOTE(review): the species names are assigned positionally; the
# 'Original Name' index set on the previous line is not used for a lookup.
# This presumably relies on the TSV rows being in the same order as the rows
# of `aveprof` - confirm.
aveprof.index = namesgoslin.loc[:,'Species Name']
aveprof = aveprof.T
aveprof = aveprof.loc[:, aveprof.sum() > 0]  # drop lipids that are zero in both profiles
# Keep only the lipids present in both the cell-type profiles and the centroids
incommon = np.intersect1d(aveprof.columns.astype(str), centroids.columns.astype(str))
aveprof = aveprof.loc[:, incommon]
# `centroids` is overwritten here: restricted to the shared lipids and then
# min-max scaled per lipid across lipizones.
centroids = centroids.loc[:, aveprof.columns]
centroids = (centroids - centroids.min()) / (centroids.max() - centroids.min())

# Pseudoinverse-based (unconstrained) deconvolution; each row is rescaled to
# sum to 1. `matrix_X` is overwritten by the NNLS result below.
aveprof_pinv = np.linalg.pinv(aveprof.values)
matrix_X = np.dot(centroids.values, aveprof_pinv)
matrix_X_df = pd.DataFrame(matrix_X, index=centroids.index, columns=aveprof.index)
matrix_X_df = (matrix_X_df.T / matrix_X_df.sum(axis=1)).T

# Non-negative least squares, one fit per lipizone centroid
from scipy.optimize import nnls
matrix_X = np.column_stack([nnls(aveprof.T.values, centroids.values[i, :])[0] for i in range(centroids.values.shape[0])])
oligioneurocompo = pd.DataFrame(matrix_X, index = aveprof.index, columns = centroids.index).T
# Rescale the two coefficients of each lipizone to sum to 1
oligioneurocompo = (oligioneurocompo.T/oligioneurocompo.sum(axis=1)).T
plt.hist(oligioneurocompo['neu'])
plt.hist(oligioneurocompo['oli'])

## Abundance

In [None]:
# Fraction of reference-atlas pixels assigned to each lipizone.
lipizone_fractions = atlas['lipizone_names'].value_counts(normalize=True)
abundance = lipizone_fractions.to_frame()
abundance

## M vs F proportions

In [None]:
# Reload the complete table (all values of 'Sample', not only "ReferenceAtlas").
alldata = pd.read_parquet("./zenodo/maindata_2.parquet")

In [None]:
# Sample x lipizone-color table: within each sample, the fraction of its
# pixels that carries each lipizone color (absent combinations -> 0).
color_fraction_by_sample = alldata.groupby("Sample")["lipizone_color"].value_counts(normalize=True)
sample_x_lipiz_counts = color_fraction_by_sample.unstack(fill_value=0)
sample_x_lipiz_counts

In [None]:
# Average the per-sample lipizone fractions within each sex.
female_samples = ["Female1", "Female2", "Female3"]
male_samples = ["Male1", "Male2", "Male3"]
propfemales = sample_x_lipiz_counts.loc[female_samples, :].mean()
propmales = sample_x_lipiz_counts.loc[male_samples, :].mean()

# Same ordering for both sexes
propfemales = propfemales.loc[propmales.index]

# Share of each lipizone contributed by males, and its complement
propmales_vs_females = propmales / (propmales + propfemales)
propfemales_vs_males = 1 - propmales_vs_females

# Re-label the result from lipizone color to lipizone name
lipizonetocolor2 = (
    alldata[['lipizone_color', 'lipizone_names']]
    .drop_duplicates()
    .reset_index(drop=True)
)
mapping_dict = dict(zip(lipizonetocolor2['lipizone_color'], lipizonetocolor2['lipizone_names']))
propfemales_vs_males.index = propfemales_vs_males.index.map(mapping_dict)
propfemales_vs_males

## Reproducibility

In [None]:
# Recompute the reference-atlas centroids on all 173 leading columns:
# 1st/99th-percentile rescaling to [0, 1] with clipping, then per-lipizone mean.
data = atlas.iloc[:, :173]
datemp = data.copy()
p2, p98 = datemp.quantile(0.01), datemp.quantile(0.99)
datemp_values = datemp.to_numpy()
p2_values = p2.to_numpy()
p98_values = p98.to_numpy()

percentile_span = p98_values - p2_values
normalized_values = (datemp_values - p2_values) / percentile_span
clipped_values = normalized_values.clip(0, 1)

reference = pd.DataFrame(
    clipped_values,
    index=data.index,
    columns=data.columns,
)
reference['lipizone_names'] = atlas['lipizone_names']

# Lipizone centroid = mean normalized profile of the pixels of that lipizone.
centroids = reference.groupby('lipizone_names').mean()
centroids

In [None]:
# Centroids of the second atlas, normalized with the percentiles of the
# reference atlas (p2_values / p98_values) so both are on the same scale.
second = alldata.loc[alldata['Sample'] == "SecondAtlas", :]

# Select the 173 leading columns *before* converting to an array: `.values`
# on the whole mixed-type frame gives an object array, which makes the
# arithmetic slow and the resulting columns object-typed.
second_lipids = second.iloc[:, :173]
datemp_values = second_lipids.to_numpy(dtype=float)
normalized_values = (datemp_values - p2_values) / (p98_values - p2_values)
clipped_values = np.clip(normalized_values, 0, 1)

r2 = pd.DataFrame(clipped_values, index=second.index, columns=second_lipids.columns)
r2['lipizone_names'] = second['lipizone_names']

# calculate the lipizone centroids
centroids_a2 = r2.groupby('lipizone_names').mean()
centroids_a2

In [None]:
# Pearson correlation between the centroid of each lipizone in the reference
# atlas and the centroid of the same lipizone in the second atlas.
# corrwith(axis=1) pairs rows by lipizone name rather than by position, so a
# lipizone missing from (or ordered differently in) the second atlas yields
# NaN instead of being compared with the wrong lipizone.
reproducibility_twoatlases = (
    centroids.corrwith(centroids_a2, axis=1)
    .reindex(centroids.index)
    .rename("pearson_r")
)
reproducibility_twoatlases

In [None]:
# Distribution of the between-atlas correlation across lipizones.
# `reproducibility_twoatlases` is a Series, so `[0]` would select a single
# element rather than the values; plot the whole Series (NaNs dropped).
plt.hist(reproducibility_twoatlases.dropna(), bins=50)
plt.show()

## Plot the heatmaps that accompany the dataset overview dendrogram

In [None]:
# Assemble one table with all per-lipizone annotations, indexed by lipizone name.
abundance.columns = ['count']

# `propfemales_vs_males` holds the FEMALE share (1 - male share), so it must
# be labelled 'female'; the male share is its complement.
propfemales_vs_males = propfemales_vs_males.rename('female').to_frame()
propfemales_vs_males['male'] = 1 - propfemales_vs_males['female']

reproducibility_twoatlases = pd.DataFrame(reproducibility_twoatlases)
reproducibility_twoatlases.columns = ['reproducibility']
celltypesdeconvo = oligioneurocompo
proportion_expanded.columns = proportion_expanded.columns.astype(str)

# Lipizone name -> lipizone color, indexed like the other tables
# (assumes one color per lipizone name).
lipizonetocolor = (
    atlas[['lipizone_names', 'lipizone_color']]
    .drop_duplicates()
    .set_index('lipizone_names')
)

# `diversity_ofacronynms` carries the lipizone name as a column; move it to
# the index so that concat aligns it with the other tables.
diversity_by_zone = diversity_ofacronynms.set_index('lipizone_names')

toplot = pd.concat(
    [
        allendata,
        lipizonetocolor,
        proportion_expanded,
        celltypesdeconvo,
        pivot_df,
        lipizone_mean_coordinates,
        diversity_by_zone,
        lipizone_density,
        abundance,
        reproducibility_twoatlases,
        propfemales_vs_males,
    ],
    axis=1,
)
toplot

In [None]:
# Unique (lipizone name, lipizone color) pairs, indexed by lipizone name
# while keeping the name as a column.
clusterbyzone = (
    atlas[['lipizone_names', 'lipizone_color']]
    .drop_duplicates()
    .reset_index(drop=True)
    .set_index('lipizone_names', drop=False)
)
clusterbyzone

In [None]:
def _most_common(values):
    """Return the most frequent entry of a Series."""
    return values.value_counts().idxmax()


# Most frequent lipizone color within each class, indexed by class while
# keeping the class as a column.
mostabundant = (
    atlas.groupby('class')['lipizone_color']
    .agg(_most_common)
    .reset_index(name='lipizone_color')
    .set_index('class', drop=False)
)
mostabundant

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import scipy.cluster.hierarchy as sch

def create_lipid_visualization(toplot, width_ratios=None):
    """Draw the per-lipizone annotation panels that accompany the dendrogram.

    Twelve panels are drawn side by side with one row per row of ``toplot``:
    lipizone color (from ``clusterbyzone`` and from ``toplot``),
    oligodendrocyte/neuron scores, Allen division composition, a blank
    spacer, normalized xccf/yccf/zccf, neighborhood composition, male/female
    frequency, reproducibility and abundance.

    Parameters
    ----------
    toplot : pandas.DataFrame
        One row per lipizone. Must contain the columns read below:
        'lipizone_color', 'oli', 'neu', the Allen division names, 'xccf',
        'yccf', 'zccf', the class codes, 'male', 'female',
        'reproducibility' and 'count'.
    width_ratios : list of float, optional
        Relative widths of the 12 panels; a default layout is used if None.

    Raises
    ------
    ValueError
        If ``width_ratios`` does not have exactly 12 entries.

    Notes
    -----
    Reads the module-level tables ``clusterbyzone`` (indexed by lipizone
    name) and ``mostabundant`` (indexed by class) for colors.
    """
    n_panels = 12
    if width_ratios is None:
        width_ratios = [0.5, 0.5, 1, 5,   # clusterbyzone, lipizone, deconv, Allen
                        0.5,               # blank column
                        1, 1, 1,           # xccf, yccf, zccf
                        3, 2, 0.5,         # neighborhood, male/female, reproducibility
                        3]                 # abundance
    if len(width_ratios) != n_panels:
        raise ValueError(
            f"width_ratios must have {n_panels} entries, got {len(width_ratios)}"
        )

    n_rows = len(toplot)
    rows = np.arange(n_rows)

    # Min-max normalization for the continuous color maps
    def normalize_column(column):
        return (column - column.min()) / (column.max() - column.min())

    # Percentile-based normalization, clipped to [0, 1]
    def percentile_normalize(data, low=2, high=98):
        low_val, high_val = np.percentile(data, [low, high])
        return np.clip((data - low_val) / (high_val - low_val), 0, 1)

    # One colored cell per row; returns the RGBA array that was drawn
    def draw_color_strip(ax, color_names):
        rgba = np.array([mcolors.to_rgba(color) for color in color_names])
        ax.imshow(rgba.reshape(-1, 1, 4), aspect='auto', interpolation='nearest')
        return rgba

    # Horizontal stacked bars: one segment per column, one color per column
    def draw_stacked_bars(ax, columns, palette):
        cumulative = np.zeros(n_rows)
        for col, color in zip(columns, palette):
            widths = toplot[col].to_numpy(dtype=float)
            ax.barh(rows, widths, left=cumulative, color=color, label=col)
            # plain ndarray arithmetic instead of in-place `ndarray += Series`
            cumulative = cumulative + widths

    plt.close('all')
    fig, axes = plt.subplots(
        1, n_panels,
        figsize=(20, 30),
        gridspec_kw={
            'width_ratios': width_ratios,
            'wspace': 0.1,
            'hspace': 0
        },
        dpi=300,
        sharey=True
    )

    # hide spines and ticks
    for ax in axes:
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xticks([])
        ax.set_yticks([])

    # 1. Lipizone color looked up in clusterbyzone
    draw_color_strip(axes[0], clusterbyzone.loc[toplot.index, 'lipizone_color'])

    # 2. Lipizone color stored in toplot
    lipizone_colors = draw_color_strip(axes[1], toplot['lipizone_color'])

    # 3. Oligodendrocyte vs Neuron deconvolution scores
    draw_stacked_bars(axes[2], ['oli', 'neu'], ['#E4E4E4', 'darkgreen'])
    axes[2].set_title('Oligodendrocyte vs Neuron', fontsize=8)

    # 4. Allen Brain divisions
    acronym_cols = [
        'fiber tracts', 'Hindbrain', 'Midbrain', 'Thalamus', 'Hypothalamus',
        'Pallidum', 'Striatum', 'Isocortex', 'Cerebellum', 'Cortical subplate',
        'Olfactory areas', 'Hippocampal formation', 'ventricular systems'
    ]
    division_palette = ['#E4E4E4', '#F8AEA1', '#D38EBD', '#F48D99', '#EC6961',
                        '#9FAFD5', '#AEDFF8', '#C6E3BC', '#F0F09C', '#A5D49C',
                        '#ADDCCA', '#9BCD6F', '#808080']
    draw_stacked_bars(axes[3], acronym_cols, division_palette)
    axes[3].set_title('Allen Brain divisions', fontsize=8)

    # (axes[4] left blank)

    # 5-7. Min-max normalized xccf, yccf, zccf (axes[5], [6], [7]);
    # one barh call per panel with a per-bar color array
    for idx_ax, col in enumerate(['xccf', 'yccf', 'zccf'], start=5):
        norm_values = normalize_column(toplot[col]).to_numpy(dtype=float)
        axes[idx_ax].barh(rows, norm_values, color=plt.cm.coolwarm(norm_values))
        axes[idx_ax].set_title(col.upper(), fontsize=8)
        axes[idx_ax].set_xlim(0, 1)

    # 8. Neighborhood (axes[8])
    category_cols = ['221', '212', '211', '121', '222', '112', '122', '111']
    # .tolist() gives a positional palette; integer lookups on the
    # label-indexed Series rely on a deprecated positional fallback.
    # NOTE(review): assumes mostabundant is indexed by these string codes.
    class_palette = mostabundant.loc[category_cols, 'lipizone_color'].tolist()
    draw_stacked_bars(axes[8], category_cols, class_palette)
    axes[8].set_title('Neighborhood', fontsize=8)

    # 9. Male/Female frequency (axes[9])
    axes[9].barh(rows, toplot['male'], color='darkblue', label='Male')
    axes[9].barh(rows, toplot['female'], left=toplot['male'], color='pink', label='Female')
    axes[9].set_title('Male/Female frequency', fontsize=8)

    # 10. Reproducibility between atlases (axes[10])
    axes[10].barh(rows, toplot['reproducibility'], color='lightblue')
    axes[10].set_title('Reproducibility between atlases', fontsize=8)

    # 11. Abundance (axes[11]), colored by lipizone color
    normalized_count = percentile_normalize(toplot['count'], low=5, high=95)
    axes[11].barh(rows, np.asarray(normalized_count, dtype=float), color=lipizone_colors)
    axes[11].set_title('Abundance', fontsize=8)

    plt.tight_layout()
    plt.show()

# Call the function with the default layout (or pass your own 12-entry list)
create_lipid_visualization(toplot)

## Draw the dendrogram (ex novo, easier)

In [None]:
import numpy as np
from scipy.cluster.hierarchy import dendrogram
import matplotlib.pyplot as plt

def generate_custom_atlas(depth=5):
    """Build the linkage matrix of a perfectly balanced binary tree.

    The tree has ``2 ** depth`` leaves. Neighbouring nodes are merged pairwise
    level by level; merges at level ``k`` (1-based) have height
    ``2 ** (k - 1)`` and produce clusters of ``2 ** k`` leaves. Rows follow
    the SciPy linkage convention ``[left id, right id, height, cluster size]``,
    where the cluster created by row ``i`` gets id ``n_leaves + i``.

    Parameters
    ----------
    depth : int, optional
        Number of bisection levels. The default of 5 gives the 32-leaf,
        31-row matrix.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(2 ** depth - 1, 4)``.

    Raises
    ------
    ValueError
        If ``depth`` is smaller than 1.
    """
    if depth < 1:
        raise ValueError("depth must be at least 1")

    rows = []
    level_start = 0            # id of the first node of the current level
    level_size = 2 ** depth    # number of nodes in the current level
    for level in range(1, depth + 1):
        height = float(2 ** (level - 1))
        cluster_size = 2 ** level
        # Merge neighbouring nodes (start, start + 1), (start + 2, start + 3), ...
        for left in range(level_start, level_start + level_size, 2):
            rows.append([left, left + 1, height, cluster_size])
        # The clusters just created are numbered right after this level
        level_start += level_size
        level_size //= 2
    return np.array(rows, dtype=float)

# Draw the balanced tree as a left-oriented, all-black dendrogram without
# leaf labels, spines or tick marks.
linkage_matrix = generate_custom_atlas()

plt.figure(figsize=(10, 7))
dendrogram(
    linkage_matrix,
    orientation='left',
    color_threshold=0,
    above_threshold_color='black',
    no_labels=True,
)

dendro_ax = plt.gca()
for side in ('top', 'right', 'bottom', 'left'):
    dendro_ax.spines[side].set_visible(False)

plt.tick_params(axis='both', which='both', length=0)
plt.xticks([])
plt.yticks(fontsize=12)
plt.tight_layout()

plt.show()

## Check visually how the split behaves to assign manually names to the main branches

## Some basic tSNE and spatial colorings

In [None]:
# t-SNE embedding of all pixels, colored by seven annotations.
# Every label is taken from `alldata`, the frame the embedding comes from:
# the reference-only `atlas` does not contain all rows of `tsne`, and
# `labels_allencolor`, `splits` and `atlasTMP` are not defined in this notebook.
# NOTE(review): assumes those names referred to the same pixels as `alldata`.
tsne = alldata[['tsne1', 'tsne2']]
tsne_x = tsne['tsne1']
tsne_y = tsne['tsne2']

DS = 10      # plot every DS-th pixel
S = 0.0005   # marker size

n_rows, n_cols = 2, 4

fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 10))
axes = axes.flatten()

# Panels 0-1: categorical annotations, colored by category code
categorical_panels = [
    ("Sample", "tab20", 'Sample'),
    ('Sex', "viridis", 'Sex'),
]
for ax, (column, cmap_name, title) in zip(axes[:2], categorical_panels):
    labels = alldata[column]
    scatter = ax.scatter(
        tsne_x[::DS],
        tsne_y[::DS],
        c=labels.astype("category").cat.codes[::DS],
        cmap=cmap_name,
        s=S,
        alpha=0.3,
        rasterized=True
    )
    ax.set_title(title)

# Panel 2: Allen color, skipping pixels that are black or have no color
labels = alldata['allencolor']
has_allen_color = (labels.notna() & (labels != "#000000")).to_numpy()
scatter = axes[2].scatter(
    tsne_x[has_allen_color][::DS],
    tsne_y[has_allen_color][::DS],
    c=labels[has_allen_color][::DS].to_numpy(),
    s=S*10,
    alpha=0.3,
    rasterized=True
)
axes[2].set_title('Allen color')

# Panel 3: lipizone color
labels = alldata['lipizone_color']
scatter = axes[3].scatter(
    tsne_x[::DS],
    tsne_y[::DS],
    c=labels[::DS].to_numpy(),
    s=S,
    alpha=0.3,
    rasterized=True
)
axes[3].set_title('Lipizone Color')

# Panels 4-6: CCF coordinates, restricted to pixels that have one.
# The embedding columns are addressed by name ('tsne1'/'tsne2'); the integer
# column labels 0 and 1 do not exist in `tsne`.
for ax, column in zip(axes[4:7], ['xccf', 'yccf', 'zccf']):
    has_coordinate = alldata[column].notna()
    labels = alldata.loc[has_coordinate, column]
    scatter = ax.scatter(
        tsne_x[has_coordinate][::DS],
        tsne_y[has_coordinate][::DS],
        c=labels[::DS],
        cmap="coolwarm",
        s=S,
        alpha=0.8,
        rasterized=True
    )
    ax.set_title(column.upper())

# Strip spines, ticks and tick labels from every panel (including the unused one)
for ax in axes:
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)

    ax.set_xticks([])
    ax.set_yticks([])

    ax.set_xticklabels([])
    ax.set_yticklabels([])

plt.tight_layout()

plt.show()

In [None]:
# `datavignettes` is another name for `alldata` (same object, not a copy), so
# columns added to one are visible on the other.
datavignettes = alldata
datavignettes

In [None]:
# Give every hierarchy level a representative color: the most frequent color
# among its members. The order matters, because each coarser level is derived
# from the color column computed just before it.
datavignettes['wmvsgm'] = datavignettes['level_1']

color_derivations = [
    # (grouping column, source color column, column to write)
    ('supertype', 'lipizone_color', 'supertype_color'),
    ('subclass', 'lipizone_color', 'subclass_color'),
    ('class', 'subclass_color', 'class_color'),
    ('wmvsgm', 'class_color', 'wmvsgm'),
]
for level_col, source_col, target_col in color_derivations:
    most_frequent_colors = (
        datavignettes
        .groupby(datavignettes[level_col])[source_col]
        .agg(lambda x: x.mode().iloc[0])
    )
    datavignettes[target_col] = datavignettes[level_col].map(most_frequent_colors)

# Oligodendrocyte score of the lipizone each pixel belongs to
datavignettes['oligo'] = datavignettes['lipizone_names'].map(celltypesdeconvo['oli'])

import matplotlib.pyplot as plt

fig, axs = plt.subplots(1, 6, figsize=(12, 3), squeeze=False)
axs_flat = axs.flatten()

panel_columns = ['wmvsgm', 'supertype_color', 'subclass_color', 'class_color', 'oligo']
for i, currentLipid in enumerate(panel_columns):
    ax = axs_flat[i]
    ax.scatter(
        tsne.iloc[:, 0][::10],
        tsne.iloc[:, 1][::10],
        s=0.1,
        c=datavignettes[currentLipid][::10],
        alpha=0.5,
        rasterized=True
    )
    ax.set_aspect('equal')
    ax.axis('off')
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title(f'{currentLipid}', fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# t-SNE of the reference atlas colored by one column (the i-th), with color
# limits set to the median across sections of the per-section 2nd and 98th
# percentiles.
i = 0
datavignettes = atlas
currentLipid = datavignettes.columns[i]

tsne = datavignettes[['tsne1', 'tsne2']]

results = []
for section in datavignettes['Section'].unique():
    section_values = datavignettes.loc[datavignettes['Section'] == section, currentLipid]
    results.append([
        section,
        section_values.quantile(0.02),
        section_values.quantile(0.98),
    ])

percentile_df = pd.DataFrame(results, columns=['Section', '2-perc', '98-perc'])
med2p = percentile_df['2-perc'].median()
med98p = percentile_df['98-perc'].median()

plt.scatter(
    tsne.iloc[:, 0][::10],
    tsne.iloc[:, 1][::10],
    s=0.05,
    c=datavignettes.iloc[:, i][::10],
    vmin=med2p,
    vmax=med98p,
    alpha=0.5,
    cmap="plasma",
    rasterized=True
)

ax_tsne = plt.gca()
ax_tsne.set_aspect('equal')
ax_tsne.axis('off')
for spine in ax_tsne.spines.values():
    spine.set_visible(False)
ax_tsne.set_xticks([])
ax_tsne.set_yticks([])
ax_tsne.set_title(f'{currentLipid}', fontsize=10)

plt.show()

In [None]:
# Plot every section colored by lipizone, with the pixels flagged in the
# "eroded_annot" volume overlaid in black.
dd2 = atlas

conto = np.load("./zenodo/mixed/eroded_annot.npy")

# Missing or infinite coordinates are replaced by 0
coordinates = (
    dd2[['xccf', 'yccf', 'zccf']]
    .fillna(0)
    .replace([np.inf, -np.inf], 0)
)

# Coordinates scaled by 40 and truncated to integers give the indices into
# `conto`; each axis is capped at a fixed upper index.
xs = (coordinates['xccf'] * 40).astype(int).clip(upper=527)
ys = (coordinates['yccf'] * 40).astype(int).clip(upper=319)
zs = (coordinates['zccf'] * 40).astype(int).clip(upper=455)
coordinates['border'] = conto[xs, ys, zs]
print(np.sum(coordinates['border']))

unique_sections = sorted(dd2['Section'].dropna().unique())
n_sections = len(unique_sections)

n_rows, n_cols = 9, 3
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 30))
axes = axes.flatten()

# Shared axis limits, so all sections are drawn at the same scale
global_min_z, global_max_z = dd2['zccf'].min(), dd2['zccf'].max()
global_min_y, global_max_y = -dd2['yccf'].max(), -dd2['yccf'].min()

for i, section_num in enumerate(unique_sections):
    ax = axes[i]
    xx = dd2[dd2["Section"] == section_num]

    sc1 = ax.scatter(
        xx['zccf'],
        -xx['yccf'],
        c=xx['lipizone_color'].values,
        s=0.05,
        alpha=1,
        rasterized=True
    )

    # Pixels of this section flagged with border == 1
    section_coords = coordinates.loc[xx.index]
    cont = section_coords[section_coords['border'] == 1]
    ax.scatter(
        cont['zccf'],
        -cont['yccf'],
        c='black',
        s=0.2,
        alpha=0.2,
        rasterized=True
    )

    ax.axis('off')
    ax.set_aspect('equal')
    ax.set_xlim(global_min_z, global_max_z)
    ax.set_ylim(global_min_y, global_max_y)

# Remove the panels that have no section
for j in range(n_sections, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()

## Summarize the dataset lipid content

In [None]:
import re

# Patterns are raw strings, so "\(" and "\d" reach the regex engine unchanged
# (in a normal string literal "\(" is an invalid escape sequence), and they
# are compiled once instead of on every row.
_CLASS_SEPARATOR = re.compile(r' |\(')
_CARBONS = re.compile(r'(\d+):')
_INSATURATIONS = re.compile(r':(\d+)')


def _first_int(pattern, text):
    """Return group 1 of the first match of `pattern` in `text` as int, or NaN."""
    match = pattern.search(text)
    return int(match.group(1)) if match else np.nan


df = pd.DataFrame(atlas.columns[:173])
df.columns = ["lipid_name"]

# extract the "class" etc from the lipid_name
# class = text before the first space or "("
df["class"] = df["lipid_name"].apply(lambda x: _CLASS_SEPARATOR.split(x)[0])
# carbons = first integer followed by ":"; insaturations = first integer after ":"
df["carbons"] = df["lipid_name"].apply(lambda x: _first_int(_CARBONS, x))
df["insaturations"] = df["lipid_name"].apply(lambda x: _first_int(_INSATURATIONS, x))
df["insaturations_per_Catom"] = df["insaturations"] / df["carbons"]

colors = pd.read_hdf("./zenodo/mixed/lipidclasscolors.h5ad", key="table")
colors

In [None]:
# Number of lipids per class, most frequent first.
# Renaming the Series before converting it makes the 'count' column
# independent of the pandas version (the name given to value_counts()
# results differs between versions). The earlier reversed-order assignment
# was overwritten immediately and has been removed.
lipid_class_counts = df['class'].value_counts().rename('count').to_frame()
# Classes without an entry in the color table are drawn in black
lipid_class_counts['classcolors'] = (
    colors['classcolors']
      .reindex(lipid_class_counts.index, fill_value="#000000")
)

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.barh(
    y=lipid_class_counts.index,
    width=lipid_class_counts['count'],
    color=lipid_class_counts['classcolors']
)

plt.title('Lipid Class Distribution', fontsize=14)
plt.xlabel('Counts', fontsize=12)
plt.ylabel('Lipid Class', fontsize=12)
for spine in ['top', 'right', 'left', 'bottom']:
    plt.gca().spines[spine].set_visible(False)
plt.tick_params(axis='x', which='both', bottom=False, top=False)
plt.tick_params(axis='y', which='both', left=False, right=False)

plt.tight_layout()

plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Stacked histogram of the total chain length (30 bins), one stack segment
# per number of insaturations, shaded from light to dark red.
cmap = plt.cm.Reds
vmin = df['insaturations'].min()
vmax = df['insaturations'].max()

bins = np.linspace(df['carbons'].min(), df['carbons'].max(), 31)
bin_centers = 0.5 * (bins[:-1] + bins[1:])

# With right=False the last bin is [a, max) and would drop the lipids with the
# largest carbon count; nudge only the last edge used for binning so that the
# maximum is included. `bins` itself is kept unchanged for drawing.
cut_edges = bins.copy()
cut_edges[-1] = np.nextafter(cut_edges[-1], np.inf)
df['carbon_bins'] = pd.cut(df['carbons'], cut_edges, right=False)

# observed=False keeps empty bins, so the table always has one row per bin
# and lines up with `bin_centers`.
grouped = (
    df.groupby(['carbon_bins', 'insaturations'], observed=False)
    .size()
    .unstack(fill_value=0)
)

grouped_normalized = grouped

plt.figure(figsize=(10, 6))
bottoms = np.zeros(len(bin_centers))

for insaturation in grouped_normalized.columns:
    color = cmap((insaturation - vmin) / (vmax - vmin))
    heights = grouped_normalized[insaturation].to_numpy(dtype=float)
    plt.bar(
        bin_centers,
        heights,
        width=np.diff(bins),
        bottom=bottoms,
        color=color,
        label=f"Insaturation {insaturation:.1f}",
        edgecolor="none",
    )
    # plain ndarray arithmetic instead of in-place `ndarray += Series`
    bottoms = bottoms + heights

plt.title('Total Chain Length')

for spine in ['top', 'right', 'left', 'bottom']:
    plt.gca().spines[spine].set_visible(False)
plt.tick_params(axis='x', which='both', bottom=False, top=False)
plt.tick_params(axis='y', which='both', left=False, right=False)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import numpy as np

# Color legend for the insaturation shading: a colormap interpolated through
# the colors of the insaturation values present in `grouped`.
# Note: `colors` is rebound here to a list of RGBA tuples.
insaturation_values = grouped.columns.to_list()

colors = []
for value in insaturation_values:
    colors.append(cmap((value - vmin) / (vmax - vmin)))

custom_cmap = LinearSegmentedColormap.from_list("custom_insaturation", colors, N=256)

# A single-row image spanning vmin..vmax, used only to obtain a colorbar
gradient_row = np.linspace(vmin, vmax, 256)

plt.figure(figsize=(8, 3))
plt.imshow([gradient_row], aspect='auto', cmap=custom_cmap)
plt.colorbar(label="Insaturation")
plt.title("Custom Colormap for Insaturation")
plt.axis("off")

plt.tight_layout()
plt.show()

## Heatmaps to show combinatorial brain partitioning by lipizones

In [None]:
import scipy.cluster.hierarchy as sch

# Enrichment of every lipid in every lipizone relative to the lipid's mean
# across lipizones.
normalized_df = centroids / centroids.mean()

# Order the lipids (columns) by hierarchical clustering
lipid_distances = sch.distance.pdist(normalized_df.T)
linkage = sch.linkage(lipid_distances, method='weighted', optimal_ordering=True)
order = sch.leaves_list(linkage)
normalized_df = normalized_df.iloc[:, order]

# Order the lipizones (rows) by the position of their most enriched lipid
peak_position = np.argmax(normalized_df.values, axis=1)
order = np.argsort(peak_position)
normalized_df = normalized_df.iloc[order, :]

# Drop rows containing NaN or infinite values
normalized_df = normalized_df.dropna().replace([np.inf, -np.inf], np.nan).dropna()

plt.figure(figsize=(20, 5))
sns.heatmap(
    normalized_df.T,
    cmap="inferno",
    cbar_kws={'label': 'Enrichment'},
    xticklabels=True,
    yticklabels=True,
    vmin=1,
    vmax=2,
)

plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.tick_params(axis='y', which='both', left=False, right=False)

plt.yticks(rotation=0)
plt.title('Enrichment of lipids across major brain divisions')

plt.tight_layout()
plt.show()

In [None]:
import scipy.cluster.hierarchy as sch

# Enrichment of every lipizone in every division (reference atlas): row-wise
# fractions divided by the mean fraction of each division.
atlas1 = alldata.loc[alldata['Sample'] == "ReferenceAtlas", :]
mtdft = atlas1[['lipizone_names', 'division']]
counts = mtdft.groupby(['lipizone_names', 'division']).size().unstack(fill_value=0)
percentages = counts.div(counts.sum(axis=1), axis=0)
percentages = percentages / percentages.mean()
normalized_df = percentages.loc[:, percentages.columns != "General"]

# Order the divisions (columns) by hierarchical clustering
division_distances = sch.distance.pdist(normalized_df.T)
linkage = sch.linkage(division_distances, method='weighted', optimal_ordering=True)
order = sch.leaves_list(linkage)
normalized_df = normalized_df.iloc[:, order]

# Order the lipizones (rows) by the position of their most enriched division
peak_position = np.argmax(normalized_df.values, axis=1)
order = np.argsort(peak_position)
normalized_df = normalized_df.iloc[order, :]

# Drop rows containing NaN or infinite values
normalized_df = normalized_df.dropna().replace([np.inf, -np.inf], np.nan).dropna()

# NOTE(review): this rebinds `atlas` - from here on it is this enrichment
# table (its row/column order is reused by the next cell), no longer the
# reference-atlas pixel table.
atlas = normalized_df.copy()

# plot
fig, ax1 = plt.subplots(figsize=(20, 5))
sns.heatmap(
    normalized_df.T,
    cmap="Reds",
    ax=ax1,
    cbar_kws={'label': 'Enrichment'},
    xticklabels=True,
    yticklabels=True,
    vmin=1.5,
    vmax=4,
)
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax1.tick_params(axis='y', which='both', left=False, right=False, pad=20)
ax1.set_title('Enrichment of lipizones across major brain divisions')

plt.tight_layout()
plt.show()

In [None]:
# Division-enrichment table for the "SecondAtlas" sample, computed like the
# reference one and then aligned to the row/column order stored in `atlas`.
atlasb3 = alldata.loc[alldata['Sample'] == "SecondAtlas", :]
mtdft3 = atlasb3[['lipizone_names', 'division']]

counts = (
    mtdft3
    .groupby(['lipizone_names', 'division'])
    .size()
    .unstack(fill_value=0)
)
row_totals = counts.sum(axis=1)
percentages = counts.div(row_totals, axis=0)
percentages = percentages / percentages.mean()

not_general = percentages.columns != "General"
normalized_df = percentages.loc[:, not_general]
# Label-based selection: raises KeyError if a reference lipizone or division
# is absent from this sample.
normalized_df = normalized_df.loc[atlas.index, atlas.columns]
normalized_df

In [None]:
# Heatmap of the enrichment table currently held in `normalized_df`,
# transposed so that its columns become the heatmap rows.
heatmap_options = dict(
    cmap="Reds",
    cbar_kws={'label': 'Enrichment'},
    xticklabels=True,
    yticklabels=True,
    vmin=1.5,
    vmax=4,
)

fig, ax1 = plt.subplots(figsize=(20, 5))
sns.heatmap(normalized_df.T, ax=ax1, **heatmap_options)

# Hide tick marks on both axes; x labels are hidden too, y labels get padding.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax1.tick_params(axis='y', which='both', left=False, right=False, pad=20)
ax1.set_title('Enrichment of lipizones across major brain divisions')

plt.tight_layout()
plt.show()

In [None]:
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist

# Mean of the first 173 columns of `alldata` within every division.
datamean = alldata.iloc[:,:173].groupby(alldata['division']).mean()

# Enrichment: division mean divided by the column's mean over divisions; the
# "General" division is dropped and the table is transposed so that the
# original columns become rows and divisions become columns.
normalized_df = datamean / datamean.mean()
normalized_df = normalized_df.loc[normalized_df.index != "General",:]
normalized_df = normalized_df.T

# Remove non-finite entries BEFORE clustering: they would otherwise make the
# linkage step fail and distort the argmax-based row ordering.
normalized_df = normalized_df.replace([np.inf, -np.inf], np.nan).dropna()

# Column (division) order from hierarchical clustering.
linkage = sch.linkage(pdist(normalized_df.T), method='weighted', optimal_ordering=True)
order = sch.leaves_list(linkage)
normalized_df = normalized_df.iloc[:, order]

# Row order: group rows by the division in which they peak.
order = np.argmax(normalized_df.values, axis=1)
order = np.argsort(order)
normalized_df = normalized_df.iloc[order,:]

plt.figure(figsize=(20, 5))
sns.heatmap(normalized_df.T, cmap="Blues", cbar_kws={'label': 'Enrichment'}, xticklabels=False, yticklabels=True, vmin = 1, vmax = 2)

plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.tick_params(axis='y', which='both', left=False, right=False)

plt.yticks(rotation=0)
plt.title('Enrichment of lipids across major brain divisions')

plt.tight_layout()
plt.show()

## Prepare the subclass naming table crafted by manual curation

In [None]:
# Manually curated subclass table, written row-wise so that each cluster code
# stays visibly paired with its zone name and display colour.
namingtable = pd.DataFrame(
    [
        (11111, "Mixed and hindbrain white matter", "#360064"),
        (11112, "Core callosal white matter", "#980053"),
        (11121, "Callosal and cerebellar white matter", "#170b3b"),
        (11122, "Ventral white matter", "#ac2f5c"),
        (11211, "Boundary white matter", "#2a3f6d"),
        (11212, "Thalamic and mid/hindbrain white matter", "#002657"),
        (11221, "Mid/hindbrain white matter", "#21366b"),
        (11222, "Mixed white matter", "#3e4b6c"),
        (12111, "Choroid plexus and ventricles", "#f75400"),
        (12112, "Ventricular linings", "#ef633e"),
        (12121, "Thalamic and midbrain regions", "#a5d4e6"),
        (12122, "White and gray matter boundary", "#6399c6"),
        (12211, "Thalamic mixed gray and white matter", "#853a00"),
        (12212, "Thalamic mixed gray and white matter #2", "#edeef4"),
        (12221, "Neuron-rich lateral white matter", "#fdbf71"),
        (12222, "Neuron-rich lateral white matter #2", "#ce710e"),
        (21111, "Pallidum and projections", "#940457"),
        (21112, "Cortical layer 4", "#a2d36c"),
        (21120, "Subcortical plate, hippocampus and hypothalamus", "#d5edb5"),
        (21211, "GABA-ergic Purkinje cells of the cerebellum", "#0065d6"),
        (21212, "Cortical layers 2-3 and 4", "#bcf18b"),
        (21221, "Piriform cortex", "#a68d68"),
        (21222, "Cortical layers 1 and 2-3", "#79e47e"),
        (22111, "Cortical layer 5", "#2f0097"),
        (22112, "Cortical layer 6, dentate gyrus", "#47029f"),
        (22121, "Striatum, hypothalamus and hippocampus", "#7500a8"),
        (22122, "Striatum, hypothalamus and hippocampus #2", "#d70021"),
        (22211, "Retrosplenial, cortical, cerebellar", "#ca99c9"),
        (22212, "Cortical layer 6 and cerebellar Y", "#d4b9da"),
        (22221, "Cerebellar glutamatergic neurons", "#e00085"),
        (22222, "Cortical layer 6 and thalamic", "#f6f3f8"),
    ],
    columns=["cluster", "zone", "color"],
)
namingtable

In [None]:
# Zone name -> display colour, in the same order as the naming table above.
legend = dict((
    ("Mixed and hindbrain white matter", "#360064"),
    ("Core callosal white matter", "#980053"),
    ("Callosal and cerebellar white matter", "#170b3b"),
    ("Ventral white matter", "#ac2f5c"),
    ("Boundary white matter", "#2a3f6d"),
    ("Thalamic and mid/hindbrain white matter", "#002657"),
    ("Mid/hindbrain white matter", "#21366b"),
    ("Mixed white matter", "#3e4b6c"),
    ("Choroid plexus and ventricles", "#f75400"),
    ("Ventricular linings", "#ef633e"),
    ("Thalamic and midbrain regions", "#a5d4e6"),
    ("White and gray matter boundary", "#6399c6"),
    ("Thalamic mixed gray and white matter", "#853a00"),
    ("Thalamic mixed gray and white matter #2", "#edeef4"),
    ("Neuron-rich lateral white matter", "#fdbf71"),
    ("Neuron-rich lateral white matter #2", "#ce710e"),
    ("Pallidum and projections", "#940457"),
    ("Cortical layer 4", "#a2d36c"),
    ("Subcortical plate, hippocampus and hypothalamus", "#d5edb5"),
    ("GABA-ergic Purkinje cells of the cerebellum", "#0065d6"),
    ("Cortical layers 2-3 and 4", "#bcf18b"),
    ("Piriform cortex", "#a68d68"),
    ("Cortical layers 1 and 2-3", "#79e47e"),
    ("Cortical layer 5", "#2f0097"),
    ("Cortical layer 6, dentate gyrus", "#47029f"),
    ("Striatum, hypothalamus and hippocampus", "#7500a8"),
    ("Striatum, hypothalamus and hippocampus #2", "#d70021"),
    ("Retrosplenial, cortical, cerebellar", "#ca99c9"),
    ("Cortical layer 6 and cerebellar Y", "#d4b9da"),
    ("Cerebellar glutamatergic neurons", "#e00085"),
    ("Cortical layer 6 and thalamic", "#f6f3f8"),
))


In [None]:
def plot_allen_division_legend(legend):
    """Render a stand-alone colour legend as its own figure.

    Parameters
    ----------
    legend : dict
        Maps a label string to the colour used for its legend marker.

    The figure height scales with the number of entries. Labels are wrapped
    at 15 characters, and the axes are hidden so only the legend is visible.
    """
    from textwrap import fill

    # Keyed by the wrapped text, so labels that wrap to the same string
    # collapse into a single entry.
    entries = {}
    for label, color in legend.items():
        entries[fill(label, width=15)] = color

    plt.figure(figsize=(3, len(legend) * 0.4), dpi=300)

    # Empty scatter calls create legend handles without drawing any point.
    for wrapped_label, swatch in entries.items():
        plt.scatter([], [], color=swatch, label=wrapped_label, s=100)

    legend_style = dict(
        loc='center left',
        bbox_to_anchor=(1, 0.5),
        title="Allen Division",
        fontsize=8,
        title_fontsize=10,
        frameon=False,
    )
    plt.legend(**legend_style)
    plt.axis('off')
    plt.tight_layout()
    plt.show()


plot_allen_division_legend(legend)

In [None]:
# `datavignettes` is an alias of `atlas1` (no copy is made), so columns added
# to one are visible through the other.
datavignettes = atlas1

# Column names of the lipid programs used by the cells below.
programs = [
    'HexCer + hexosylceramides',
    'PC',
    'PA + diacylglycerophosphates [GP1001]',
    'PE',
    'PS + diacylglycerophosphoserines [GP0301]',
    'PI + diacylglycerophosphoinositols [GP0601]',
    'LPC + monoacylglycerophosphocholines [GP0105]',
    'diacylglycerophosphocholines [GP0101]',
    '1-alkyl-2-acylglycerophosphocholines [GP0102]',
    'diacylglycerophosphoethanolamines [GP0201]',
    '1-alkyl-2-acylglycerophosphoethanolamines [GP0202]',
    'monoacylglycerophosphoethanolamines [GP0205] + LPE',
    'diacylglycerophosphoglycerols [GP0401] + PG',
    'headgroup with negative charge',
    'headgroup with neutral charge',
    'headgroup with positive charge / zwitter-ion',
    'simple glc series [SP0501]',
    'negative intrinsic curvature',
    'neutral intrinsic curvature',
    'positive intrinsic curvature',
    'contains ether-bond',
    'lysoglycerophospholipids',
    'very low transition temperature',
    'low transition temperature',
    'average transition temperature',
    'high transition temperature',
    'very high transition temperature',
    'fatty acid with 18 carbons',
    'fatty acid with 20 carbons',
    'fatty acid with 22 carbons',
    'saturated fatty acid',
    'monounsaturated fatty acid',
    'fatty acid with 4 double bonds',
    'lipid-mediated signalling',
    'endoplasmic reticulum (ER)',
    'mitochondrion',
    'plasma membrane',
    'endosome/lysosome + SM + ceramide phosphocholines (sphingomyelins) [SP0301] + golgi apparatus',
    'N-acylsphingosines (ceramides) [SP0201] + Cer',
    'very low bilayer thickness',
    'low bilayer thickness',
    'average bilayer thickness',
    'high bilayer thickness',
    'very high bilayer thickness',
    'very low lateral diffusion',
    'low lateral diffusion',
    'average lateral diffusion',
    'high lateral diffusion',
    'very high lateral diffusion',
]

## UMAP of lipizone centroids

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import umap.umap_ as umap

# Rescale every column of `centroids` to [0, 1].
# NOTE: this rebinds the global `centroids` to the rescaled table.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(centroids)
centroids = pd.DataFrame(
    scaled_values,
    index=centroids.index,
    columns=centroids.columns,
)

# No random_state is given, so the embedding differs between runs.
reducer = umap.UMAP(n_neighbors=4, min_dist=0.05, n_jobs=1)
umap_result = reducer.fit_transform(centroids)

# One colour per lipizone, re-ordered to follow the rows of `centroids`.
cols = datavignettes[['lipizone_names', 'lipizone_color']].drop_duplicates()
cols.index = cols['lipizone_names']
cols = cols.loc[centroids.index, :]

plt.figure(figsize=(8, 6))
plt.scatter(
    umap_result[:, 0],
    umap_result[:, 1],
    c=cols['lipizone_color'],
    s=12,
)
embedding_ax = plt.gca()
embedding_ax.set_aspect('equal')
plt.axis('off')
for spine in embedding_ax.spines.values():
    spine.set_visible(False)
plt.xticks([])
plt.yticks([])

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap.umap_ as umap

# Compare four 2-D embeddings (PCA, t-SNE, two UMAP settings) of the lipizone
# centroids, each column rescaled to [0, 1] first.
scaler = MinMaxScaler()
centroids_scaled = pd.DataFrame(scaler.fit_transform(centroids), 
                               index=centroids.index, 
                               columns=centroids.columns)

# One colour per lipizone, re-ordered to follow the rows of `centroids`.
cols = datavignettes[['lipizone_names', 'lipizone_color']].drop_duplicates()
cols.index = cols['lipizone_names']
cols = cols.loc[centroids.index,:]

# Create subplots
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Dimensionality Reduction Comparison', fontsize=16)

# 1. PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(centroids_scaled)

axes[0,0].scatter(pca_result[:,0], pca_result[:,1], 
                  c=cols['lipizone_color'], s=12, alpha=0.7)
axes[0,0].set_title(f'PCA\nExplained Variance: {pca.explained_variance_ratio_.sum():.3f}')
axes[0,0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.3f})')
axes[0,0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.3f})')

# 2. t-SNE
# The iteration count is left at the library default (1000, the value that
# was passed explicitly before). The keyword for it was renamed from `n_iter`
# to `max_iter` in recent scikit-learn releases, so omitting it keeps this
# cell working on both old and new versions.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
tsne_result = tsne.fit_transform(centroids_scaled)

axes[0,1].scatter(tsne_result[:,0], tsne_result[:,1], 
                  c=cols['lipizone_color'], s=12, alpha=0.7)
axes[0,1].set_title('t-SNE (perplexity=30)')
axes[0,1].set_xlabel('t-SNE 1')
axes[0,1].set_ylabel('t-SNE 2')

# 3. UMAP (original parameters)
reducer_orig = umap.UMAP(n_neighbors=4, min_dist=0.05, n_jobs=1, random_state=42)
umap_orig_result = reducer_orig.fit_transform(centroids_scaled)

axes[1,0].scatter(umap_orig_result[:,0], umap_orig_result[:,1], 
                  c=cols['lipizone_color'], s=12, alpha=0.7)
axes[1,0].set_title('UMAP (original: n_neighbors=4)')
axes[1,0].set_xlabel('UMAP 1')
axes[1,0].set_ylabel('UMAP 2')

# 4. UMAP (adjusted parameters)
reducer_adj = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
umap_adj_result = reducer_adj.fit_transform(centroids_scaled)

axes[1,1].scatter(umap_adj_result[:,0], umap_adj_result[:,1], 
                  c=cols['lipizone_color'], s=12, alpha=0.7)
axes[1,1].set_title('UMAP (adjusted: n_neighbors=15)')
axes[1,1].set_xlabel('UMAP 1')
axes[1,1].set_ylabel('UMAP 2')

# Clean up all subplots
for ax in axes.flat:
    ax.set_aspect('equal')
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.tick_params(left=False, bottom=False, labelleft=False, labelbottom=False)

plt.tight_layout()
plt.show()

# NOTE(review): `plot_single` is defined in the NEXT cell, so on a fresh
# top-to-bottom run these calls raise NameError until that cell has been
# executed once. Consider moving the definition above this cell.
plot_single(pca_result, 'PCA', cols['lipizone_color'])
plot_single(tsne_result, 't-SNE', cols['lipizone_color'])
plot_single(umap_adj_result, 'UMAP (adjusted)', cols['lipizone_color'])

In [None]:
# Individual high-quality plots
def plot_single(result, title, colors, figsize=(8, 8), filename="centroidstsne.pdf"):
    """Draw one 2-D embedding as a borderless scatter plot and optionally save it.

    Parameters
    ----------
    result : array-like, indexable as ``result[:, 0]`` / ``result[:, 1]``
        Embedding coordinates; the first two columns are plotted.
    title : str
        Figure title.
    colors : sequence
        Per-point colours passed to ``plt.scatter`` as ``c``.
    figsize : tuple
        Figure size in inches.
    filename : str or None
        Path the figure is saved to. The default keeps the historical
        behaviour of always writing ``centroidstsne.pdf``; pass a different
        name to avoid overwriting that file when plotting other embeddings,
        or ``None`` to skip saving.
    """
    plt.figure(figsize=figsize)
    plt.scatter(result[:,0], result[:,1], c=colors, s=50, alpha=1.0)#edgecolor="black",
    plt.title(title)
    plt.axis('off')
    for spine in plt.gca().spines.values():
        spine.set_visible(False)
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
    if filename is not None:
        plt.savefig(filename)
    plt.show()

plot_single(tsne_result, 't-SNE', cols['lipizone_color'])

In [None]:
import matplotlib.pyplot as plt

# Mean of every program column per lipizone, each column rescaled to [0, 1].
scaler = MinMaxScaler()
program_means = datavignettes.loc[:, programs].groupby(datavignettes['lipizone_names']).mean()
centroidsP = pd.DataFrame(
    scaler.fit_transform(program_means),
    index=program_means.index,
    columns=program_means.columns,
)

# Full program column name -> short panel title.
short_names = {
    'HexCer + hexosylceramides': 'HexCer',
    'PC': 'PC',
    'PE': 'PE',
    'PS + diacylglycerophosphoserines [GP0301]': 'PS',
    'PI + diacylglycerophosphoinositols [GP0601]': 'PI',
    'LPC + monoacylglycerophosphocholines [GP0105]': 'LPC',
    'diacylglycerophosphoglycerols [GP0401] + PG': 'PG',
    'endosome/lysosome + SM + ceramide phosphocholines (sphingomyelins) [SP0301] + golgi apparatus': 'SM',
    'N-acylsphingosines (ceramides) [SP0201] + Cer': 'Cer',
}

# Programs to draw, one panel each, in the insertion order of `short_names`.
# (Despite its name, `colors` holds program column names.)
colors = list(short_names)

plt.figure(figsize=(15, 15))

# 3 x 3 grid: the t-SNE embedding coloured by each program's rescaled mean.
for panel_index, program in enumerate(colors, start=1):
    plt.subplot(3, 3, panel_index)
    plt.scatter(
        tsne_result[:, 0],
        tsne_result[:, 1],
        c=centroidsP[program],
        s=30,
        cmap="PuOr",
    )
    plt.axis('off')
    plt.xticks([])
    plt.yticks([])
    plt.title(short_names[program], fontsize=12)

plt.tight_layout()
plt.savefig("lipidprogramtsne.pdf")
plt.show()

## Some useful spatial plots

In [None]:
# Show lipizones that are found in several anatomical districts: every section
# is drawn as a faint grey background with three selected lipizones overlaid
# in colour.

import os
dot_size = 0.3
sections_to_plot = range(1, 33)
dd2 = datavignettes
# Shared axis limits so that all panels use the same frame (y is flipped).
global_min_z = dd2['zccf'].min()
global_max_z = dd2['zccf'].max()
global_min_y = -dd2['yccf'].max()
global_max_y = -dd2['yccf'].min()
unique_lev4cols = np.sort(dd2['lipizone_names'].unique())

# (value of 'old_lipizone_names', marker colour, marker-size multiplier),
# drawn in this order so later entries sit on top of earlier ones.
highlighted_lipizones = [
    ("Periventricular zone 4", "red", 3),
    ("Midbrain, behavioral state related 1", "purple", 5),
    ("Infralimbic area 4", "blue", 5),
]

fig, axes = plt.subplots(4, 8, figsize=(40, 20))
axes = axes.flatten()
for i, section_num in enumerate(sections_to_plot):
    ax = axes[i]
    xx = dd2[dd2["Section"] == section_num]
    # 'Greys' is the canonical colormap name (also used by the other cells);
    # the 'Grays' spelling is not accepted by every matplotlib version.
    sc1 = ax.scatter(xx['zccf'], -xx['yccf'], c=xx['old_lipizone_names'].astype("category").cat.codes,
                     cmap='Greys', s=dot_size * 2, alpha=0.2, rasterized=True)
    for zone_name, zone_color, size_factor in highlighted_lipizones:
        xx_highlight = xx[xx['old_lipizone_names'] == zone_name]
        sc2 = ax.scatter(xx_highlight['zccf'], -xx_highlight['yccf'],
                         c=zone_color, s=dot_size * size_factor, alpha=1, rasterized=True)
    ax.axis('off')
    ax.set_aspect('equal')
    ax.set_xlim(global_min_z, global_max_z)
    ax.set_ylim(global_min_y, global_max_y)
# Remove any grid cells left without a section.
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])
plt.tight_layout()

plt.show()
plt.close(fig)

In [None]:
# Highlight lipizones one by one within a single section (Section == 12):
# each panel shows the whole section in faint grey plus one lipizone in its
# own colour.
dd_sub = dd2[dd2['Section'] == 12]

# choose the annotation level of interest
# (the 128 most frequent values, matching the 8 x 16 panel grid)
unique_levels = dd_sub['lipizone_names'].value_counts().iloc[:128].index

fig, axes = plt.subplots(8, 16, figsize=(22, 8))
axes = axes.flatten()

# The grey background is identical in every panel, so compute it once.
background_z = dd_sub['zccf']
background_y = -dd_sub['yccf']
background_codes = dd_sub['lipizone_names'].astype("category").cat.codes

for i, level in enumerate(unique_levels):
    ax = axes[i]
    data = dd_sub[dd_sub['lipizone_names'] == level]

    ax.scatter(
        background_z,
        background_y,
        s=0.005,
        c=background_codes,
        cmap='Greys',
        alpha=0.1,
        rasterized=True,
    )
    ax.scatter(
        data['zccf'],
        -data['yccf'],
        s=0.05,
        c=data['lipizone_color'].unique(),
        rasterized=True,
    )
    ax.axis('off')

# Blank out the panels that received no lipizone.
for spare_ax in axes[i + 1:]:
    spare_ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

# 3-D stack of sections: each selected section is drawn as a plane at its own
# z position, pixels in grey, with the "Dentate gyrus 2" lipizone in red.

# Sections to draw: all but the first 8 (sorted), in reverse order.
secnow = np.sort(datavignettes['Section'].unique())[8:][::-1]
# Flag column: 1 for the highlighted lipizone, 0 otherwise.
# NOTE: this adds/overwrites a 'color' column on `datavignettes` in place.
datavignettes['color'] = 0
datavignettes.loc[datavignettes['old_lipizone_names'] == "Dentate gyrus 2", 'color'] = 1
dd2 = datavignettes.loc[datavignettes['Section'].isin(secnow), :]
dot_size = 1.0

# Shared x/y limits across all planes (y is flipped).
global_min_z = dd2['zccf'].min()
global_max_z = dd2['zccf'].max()
global_min_y = -dd2['yccf'].max()
global_max_y = -dd2['yccf'].min()

# A single figure is created here; the setup used to be repeated, which left
# an extra empty figure behind.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
z_offset = 0.5
grey_cmap = plt.cm.Greys

for i, section_num in enumerate(secnow):
    section_data = dd2[dd2["Section"] == section_num]
    z_pos = i * z_offset
    cluster_codes = section_data['lipizone_names'].astype("category").cat.codes
    # Scale category codes to [0, 1] for the grey colormap; guard against a
    # zero maximum (single category) to avoid a 0/0 division.
    max_code = cluster_codes.max()
    if max_code > 0:
        norm_clusters = cluster_codes / max_code
    else:
        norm_clusters = cluster_codes.astype(float)
    
    colors = ['red' if x == 1 else matplotlib.colors.rgb2hex(grey_cmap(y)) 
              for x, y in zip(section_data['color'], norm_clusters)]
    sizes = [dot_size * 5 if x == 1 else dot_size for x in section_data['color']]
    
    ax.scatter(
        section_data['zccf'], 
        -section_data['yccf'], 
        z_pos,
        c=colors,
        rasterized=True,
        s=sizes,
        alpha=0.5,
        edgecolors='none'
    )

ax.set_axis_off()
ax.set_xlim(global_min_z, global_max_z)
ax.set_ylim(global_min_y, global_max_y)
ax.set_zlim(-z_offset, (len(secnow)) * z_offset)
ax.set_box_aspect([1,1,0.5])
ax.view_init(elev=30, azim=300)
plt.tight_layout()
plt.show()

## UMAPs of lipids in the space of lipizone centroids

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import fcluster
import matplotlib.patches as mpatches


# Pairwise correlation between the columns of `centroids`.
lipidlipidcorr = centroids.corr()

# The clustermap is built only to obtain its column dendrogram; the figure
# itself is closed right away.
initial_g = sns.clustermap(lipidlipidcorr, cmap='vlag', figsize=(12, 12))
plt.close()

# Cut the column dendrogram into two clusters.
linkage = initial_g.dendrogram_col.linkage
cluster_labels = fcluster(linkage, 2, criterion='maxclust')
lipids = lipidlipidcorr.columns
lipid_clusters = pd.Series(cluster_labels, index=lipids)

cluster1 = [name for name, label in lipid_clusters.items() if label == 1]
cluster2 = [name for name, label in lipid_clusters.items() if label == 2]

print("Cluster 1:", cluster1)
print("Cluster 2:", cluster2)

# NOTE(review): the names suggest white matter / grey matter, but nothing
# here fixes which fcluster label corresponds to which -- verify against the
# printed lists.
wm = centroids[cluster1]
gm = centroids[cluster2]

In [None]:
import re
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import umap.umap_ as umap

from adjustText import adjust_text

# Attach a colour to every row of `df` from its class, then split the table
# (indexed by lipid name) into the two lipid clusters.
df['color'] = df['class'].map(lipid_class_counts['classcolors'])
dfc = df.copy()
dfc.index = df['lipid_name']
df1 = dfc.loc[cluster1,:]
df2 = dfc.loc[cluster2,:]

# One labelled UMAP per cluster. Each lipid is embedded using its values
# across the rows of `wm` / `gm` (hence the transpose).
for lipid_matrix, lipid_meta, pdf_name in (
    (wm, df1, "lipidumap1.pdf"),
    (gm, df2, "lipidumap2.pdf"),
):
    reducer = umap.UMAP(n_neighbors=5, min_dist=0.05, n_jobs=1)
    umap_result = reducer.fit_transform(lipid_matrix.T)

    fig, ax = plt.subplots(figsize=(14, 10))

    scatter = ax.scatter(
        umap_result[:, 0],
        umap_result[:, 1],
        c=lipid_meta['color'],
        s=50,
    )

    # One text label per point, placed at the point and then nudged apart.
    texts = [
        ax.text(umap_result[row, 0], umap_result[row, 1], lipid_label,
                fontsize=10, alpha=0.9)
        for row, lipid_label in enumerate(lipid_meta['lipid_name'])
    ]

    adjust_text(
        texts,
        ax=ax,
        arrowprops=dict(arrowstyle='-', color='gray', alpha=0.5),
        expand_points=(1.2, 1.4),
        force_points=0.2,
        force_text=0.2,
        lim=1000,
    )

    # Strip the frame, ticks and tick labels.
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xticklabels([])
    ax.set_yticklabels([])

    plt.tight_layout()
    plt.savefig(pdf_name)
    plt.show()

## Export the differential lipids at each split

In [None]:
# Work on a copy of `datavignettes`: the first 173 columns are taken as the
# lipid measurements, and `dat` is reduced to the columns whose name starts
# with 'level' (the hierarchical labels).
dat = datavignettes.copy()
lipid_data = dat.iloc[:, :173]

is_level_column = dat.columns.str.startswith('level')
dat = dat.loc[:, dat.columns[is_level_column]]


from scipy.stats import mannwhitneyu, entropy
import matplotlib.pyplot as plt
from tqdm import tqdm
from statsmodels.stats.multitest import multipletests
from tqdm import tqdm

import os
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests

def differential_lipids(lipid_data, bool_mask, min_fc=0.2, pthr=0.05):
    """Compare every lipid between the two groups defined by ``bool_mask``.

    Parameters
    ----------
    lipid_data : pandas.DataFrame
        One row per sample, one column per lipid.
    bool_mask : boolean Series or array aligned with the rows of ``lipid_data``
        ``True`` rows form group A, ``False`` rows form group B.
    min_fc : float
        Accepted for interface compatibility; not used by this function.
    pthr : float
        Passed to ``multipletests`` as ``alpha``.

    Returns
    -------
    pandas.DataFrame
        One row per lipid with the columns ``lipid``, ``meanA``, ``meanB``,
        ``log2fold_change``, ``p_value`` and ``p_value_corrected``.
        ``log2fold_change`` is ``log2(meanB / meanA)``, i.e. positive when the
        lipid is higher in group B (the ``False`` rows). ``p_value`` comes
        from a two-sided Mann-Whitney U test and ``p_value_corrected`` is its
        Benjamini-Hochberg adjustment. Lipids whose test could not be
        computed have NaN in both p-value columns.
    """
    result_columns = ['lipid', 'meanA', 'meanB', 'log2fold_change', 'p_value']
    results = []

    # Subset the data into two groups
    groupA = lipid_data.loc[bool_mask]
    groupB = lipid_data.loc[~bool_mask]

    for col_name in lipid_data.columns:
        dataA = groupA[col_name].dropna()
        dataB = groupB[col_name].dropna()

        # Compute group means and log2 fold change
        meanA = np.mean(dataA) + 1e-11  # avoid division by zero
        meanB = np.mean(dataB) + 1e-11
        log2fc = np.log2(meanB / meanA)

        # Mann-Whitney U test
        try:
            _, pval = mannwhitneyu(dataA, dataB, alternative='two-sided')
        except ValueError:
            # The test could not be computed for this lipid (e.g. an empty
            # or degenerate group); record it as missing.
            pval = np.nan

        results.append({
            'lipid': col_name,
            'meanA': meanA,
            'meanB': meanB,
            'log2fold_change': log2fc,
            'p_value': pval
        })

    # Explicit columns keep the frame well-formed when there are no lipids.
    results_df = pd.DataFrame(results, columns=result_columns)

    # Multiple-testing correction on the finite p-values only: a single NaN
    # passed to the Benjamini-Hochberg step would turn every corrected value
    # into NaN.
    pvals = results_df['p_value'].to_numpy(dtype=float)
    pvals_corrected = np.full(pvals.shape, np.nan)
    testable = np.isfinite(pvals)
    if testable.any():
        pvals_corrected[testable] = multipletests(
            pvals[testable],
            alpha=pthr,
            method='fdr_bh'
        )[1]
    results_df['p_value_corrected'] = pvals_corrected

    return results_df


def traverse_and_diff(
    dat,
    lipid_data,
    levels,
    current_level=0,
    branch_path=None,
    min_fc=0.2,
    pthr=0.05,
    output_dir="diff_results"
):
    """
    Recursively walk the hierarchical labels in `dat`, run a two-group
    differential analysis (one label value vs. all other rows of the current
    subset) at every level, and write one CSV per comparison.

    - dat: DataFrame containing the hierarchical annotations, one column per
           entry of `levels`. Rows must align with `lipid_data`.
    - lipid_data: DataFrame with lipid measurements (same rows as `dat`,
           columns = lipids).
    - levels: list of the column names describing the hierarchy, in the order
           they are traversed.
    - current_level: integer index into `levels`.
    - branch_path: label values chosen at the previous levels (used for file
           naming); `None` means the root.
    - min_fc, pthr: forwarded to `differential_lipids`.
    - output_dir: directory the CSV files are written to (created if needed).

    Each file is named `<level>=<value>_..._<level>=<value>.csv` after the
    path taken, and holds the `differential_lipids` table sorted by
    `log2fold_change` in descending order. A branch is not explored further
    once a level has fewer than two distinct values. Missing (NaN) label
    values are skipped: they match no row, so they define no group.
    """
    if branch_path is None:
        branch_path = []

    # Stop if we've consumed all hierarchical levels
    if current_level >= len(levels):
        return

    level_col = levels[current_level]
    unique_vals = dat[level_col].unique()

    # If there's no real split at this level, just exit
    if len(unique_vals) < 2:
        return

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Filename prefix describing the path taken so far; identical for every
    # value at this level, so it is built once.
    parent_labels = [
        f"{lvl_name}={lvl_val}"
        for lvl_name, lvl_val in zip(levels[:current_level], branch_path)
    ]

    # For each unique group at the current level
    for val in unique_vals:
        # `== NaN` is False for every row, which would give an empty group,
        # a meaningless CSV and a pointless recursion.
        if pd.isna(val):
            continue

        # labs is a boolean mask for the current subset of `dat`
        labs = (dat[level_col] == val)

        # 1) Perform differential analysis: val vs. not val
        diff = differential_lipids(lipid_data, labs, min_fc=min_fc, pthr=pthr)
        diff = diff.sort_values(by="log2fold_change", ascending=False)

        # 2) Save the results under a filename reflecting the path taken
        filename = "_".join(parent_labels + [f"{level_col}={val}"]) + ".csv"
        out_path = os.path.join(output_dir, filename)
        diff.to_csv(out_path, index=False)

        # 3) Recurse deeper on the rows of this group only; `dat` and
        #    `lipid_data` are subset with the same mask so they stay aligned.
        traverse_and_diff(
            dat=dat.loc[labs],
            lipid_data=lipid_data.loc[labs],
            levels=levels,
            current_level=current_level + 1,
            branch_path=branch_path + [val],
            min_fc=min_fc,
            pthr=pthr,
            output_dir=output_dir
        )


# Hierarchy columns level_1 ... level_14, traversed in this order; results go
# to the default output directory of `traverse_and_diff`.
hierarchy = [f"level_{depth}" for depth in range(1, 15)]
traverse_and_diff(dat, lipid_data, levels=hierarchy, min_fc=0.2, pthr=0.05)