In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42

# Read the full pixel table, then keep only the rows whose 'Sample' is
# "ReferenceAtlas". The name `atlas` is rebound on purpose so the unfiltered
# table is not kept alive under a second name.
atlas = pd.read_parquet("./zenodo/maindata_2.parquet")
is_reference = atlas['Sample'] == "ReferenceAtlas"
atlas = atlas.loc[is_reference, :]
atlas

## Clean cell types vs lipizones

In [None]:
import scipy.cluster.hierarchy as sch

# --- Lipizone x cell-type colocalisation score -------------------------------
# The original cell loaded and filtered the two HDF tables twice, verbatim, to
# build the same contingency table in both orientations. Everything is now
# computed once; the numbers produced are unchanged.

# Thresholds (values unchanged from the original analysis).
MIN_LABEL_COUNT = 40          # a label in `ctnow` must occur more than this many times
MIN_PIXELS_PER_PAIR = 20      # pairs with fewer co-occurring pixels get a score of 0
MIN_COLUMN_SCORE = 200        # a cell-type column is kept only if its summed score exceeds this

# `pixelclosestcells` is indexed like `atlas`; its values are looked up in
# `ctnow.index` below (presumably pixel -> closest-cell id; confirm upstream).
pixelclosestcells = pd.read_hdf("./zenodo/multimodal/pixelclosestcells.h5ad", key="table")
pixelclosestcells = pixelclosestcells.loc[pixelclosestcells.index.isin(atlas.index)]
pixelclosestcells = pixelclosestcells.dropna()
lipizoneZ = atlas.loc[pixelclosestcells.index, 'lipizone_names']
ctnow = pd.read_hdf("./zenodo/multimodal/celltypesnow.h5ad", key="table")

# Keep only labels that are frequent enough, then only the pixels whose
# matched entry survived that filter.
label_counts = ctnow.value_counts()
ctnow = ctnow[ctnow.isin(label_counts[label_counts > MIN_LABEL_COUNT].index)]
pixelclosestcells = pixelclosestcells[pixelclosestcells.isin(ctnow.index)]
lipizoneZ = lipizoneZ.loc[pixelclosestcells.index]

# One cell-type label per retained pixel, re-indexed like the lipizone labels
# so the two series can be cross-tabulated.
celltypeZ = ctnow.loc[pixelclosestcells.values]
celltypeZ.index = lipizoneZ.index

# Contingency table, cell types (rows) x lipizones (columns).
cmat = pd.crosstab(lipizoneZ, celltypeZ).T

# Drop cell types whose name matches any of these substrings
# (case-insensitive regex alternation).
substrings = ['Micro', 'Lymphocyte', 'Endo', 'Macro', 'NG', 'Ng', 'Pit']
rows_to_keep = ~cmat.index.to_series().str.contains('|'.join(substrings), case=False, na=False)
cmat = cmat.loc[rows_to_keep, :]

# Score 1: share of each cell type's pixels falling in each lipizone, divided
# by the lipizone's mean share across cell types. Shape: lipizones x cell types.
pixels_lipizone_by_type = cmat.T
share_within_type = pixels_lipizone_by_type / pixels_lipizone_by_type.sum()
normalized_df1 = (share_within_type.T / share_within_type.T.mean()).T

# Score 2: share of each lipizone's pixels falling in each cell type, divided
# by the cell type's mean share across lipizones. Shape: lipizones x cell types.
share_within_lipizone = cmat / cmat.sum()
normalized_df2 = share_within_lipizone.T / share_within_lipizone.T.mean()

# Combined score; poorly supported pairs are zeroed and weak columns removed.
normalized_df = normalized_df2 * normalized_df1
normalized_df[cmat.T < MIN_PIXELS_PER_PAIR] = 0
normalized_df = normalized_df.loc[:, normalized_df.sum() > MIN_COLUMN_SCORE]

# Order columns by hierarchical clustering ...
linkage = sch.linkage(sch.distance.pdist(normalized_df.T), method='weighted', optimal_ordering=True)
order = sch.leaves_list(linkage)
normalized_df = normalized_df.iloc[:, order]

# ... and rows by the position of their best-scoring column.
order = np.argmax(normalized_df.values, axis=1)
order = np.argsort(order)
normalized_df = normalized_df.iloc[order,:]

In [None]:
# Keep only the columns whose summed score is positive.
has_signal = normalized_df.sum() > 0
normalized_df = normalized_df.loc[:, has_signal]
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.colors import LinearSegmentedColormap

def process_column_names(df):
    """Shorten the column names of ``df`` and derive one category per column.

    A column name is reduced to the text after its first '=' (a name without
    '=' is kept whole). The category of a column is the part of the shortened
    name that precedes its first '_' (the whole shortened name if there is no
    '_').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    processed_df : pandas.DataFrame
        Copy of ``df`` with the shortened column names; ``df`` is left as is.
    categories : list of str
        Category of every column, in column order.
    """
    processed_df = df.copy()

    # "prefix=name" -> "name"
    processed_df.columns = [
        name.partition('=')[2] if '=' in name else name
        for name in processed_df.columns
    ]

    # "Category_rest" -> "Category"
    categories = [name.partition('_')[0] for name in processed_df.columns]

    return processed_df, categories

# Shortened column names and one category label per column of the score matrix.
processed_df, categories = process_column_names(normalized_df)

# Categories ranked from most to least frequent (value_counts order). The
# sequence is then engineered: the top-ranked entry is dropped and 'Ex' is
# appended last.
# NOTE(review): this presumably relies on 'Ex' being the most frequent
# category; a different top category other than 'Inh' would be missing from
# color_dict below -- confirm.
freq_series = pd.Series(categories).value_counts()
unique_categories = np.concatenate(
    (freq_series.index.values[1:], np.array(['Ex']))
).tolist()

import seaborn as sns

# One "tab20" colour per category ...
n_categories = len(unique_categories)
beautiful_colors = sns.color_palette("tab20", n_categories).as_hex()
color_dict = dict(zip(unique_categories, beautiful_colors))

# ... except for the two fixed colours.
color_dict.update({'Ex': "#000000", 'Inh': "#d3d3d3"})

# Colour of every column, in column order.
col_colors = [color_dict[cat] for cat in categories]

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import to_rgb

# Render the score matrix as an RGB image: each column fades from white
# (score <= vmin) to its category colour (score >= vmax).
vmin, vmax = 0, np.percentile(normalized_df, 98)
data = normalized_df.to_numpy()
nrows, ncols = data.shape

# Scale the scores to [0, 1]. When the 98th percentile equals vmin (possible
# because many entries are forced to 0 upstream) a plain division would turn
# every zero into NaN; in that case any score above vmin is drawn saturated
# and the rest white.
color_range = vmax - vmin
if color_range > 0:
    intensity = np.clip((data - vmin) / color_range, 0, 1)
else:
    intensity = (data > vmin).astype(float)

# Base colour of every column, shape (ncols, 3).
base_colors = np.array([to_rgb(col_colors[j]) for j in range(ncols)]).reshape(ncols, 3)

# Blend white with the base colour for the whole matrix at once (broadcasting
# over rows and RGB channels) instead of a per-pixel Python double loop.
img = (1 - intensity)[:, :, None] + intensity[:, :, None] * base_colors[None, :, :]

plt.figure(figsize=(20, 20))
plt.imshow(img, aspect='auto')
# Label every column with its category, leaving 'Ex' and 'Inh' unlabelled.
xtick_labels = [
    categories[j] if categories[j] not in ['Ex', 'Inh'] else ''
    for j in range(ncols)
]
plt.xticks(ticks=np.arange(ncols), labels=xtick_labels, rotation=90)
plt.yticks([])
plt.title('Enrichment of lipids across major brain divisions')
plt.tight_layout()
plt.show()

from matplotlib.patches import Patch

# One legend patch per entry of color_dict, in dictionary order.
legend_elements = []
for label, color in color_dict.items():
    legend_elements.append(Patch(facecolor=color, edgecolor='black', label=label))

# Stand-alone figure that carries only the legend.
plt.figure(figsize=(8, 6))
plt.legend(
    handles=legend_elements,
    loc='center left',
    bbox_to_anchor=(1, 0.5),
    title="Categories",
)
plt.axis('off')  # no axes, the legend is the whole figure
plt.title("Color Legend")
plt.tight_layout()
plt.show()

## A briefing on neurotransmitters and neuromodulators

In [None]:
# Columns whose category is "Nor", rows sorted (ascending) by one of them.
is_nor = np.array(categories) == "Nor"
normalized_df.loc[:, is_nor].sort_values(by="300-0-0-0-0-0-0-12-1-0-1-0-0=Nor_Phox2b_Pla2g4d_1")

In [None]:
# Columns whose category is "Chol", rows sorted (ascending) by the Ecel1/Crabp2
# column -- probably not a very clean match.
is_chol = np.array(categories) == "Chol"
normalized_df.loc[:, is_chol].sort_values(by="300-0-0-0-0-0-1-2-0-1-0=Chol_Ecel1_Crabp2_1")

In [None]:
# Columns whose category is "Chol", rows sorted (ascending) by the Tbx20/A4galt
# column -- this one seems clean.
is_chol = np.array(categories) == "Chol"
normalized_df.loc[:, is_chol].sort_values(by="300-0-0-0-6-0-0-0-0=Chol_Tbx20_A4galt_1")

In [None]:
# Columns whose category is "CholEx", rows sorted (ascending) by the
# Irx2/Gm5741 column -- a promising one.
is_cholex = np.array(categories) == "CholEx"
normalized_df.loc[:, is_cholex].sort_values(by="300-0-0-0-3-0-0=CholEx_Irx2_Gm5741")

In [None]:
data = atlas
unique_sections = data["Section"].unique()

# The panel grid holds N_PANEL_ROWS * N_PANEL_COLS sections; sections beyond
# that are not drawn.
N_PANEL_ROWS, N_PANEL_COLS = 4, 8

# The per-section slices and their boundary pixels are identical for every
# lipizone, so they are extracted once (restricted to the columns the plots
# read) instead of re-filtering the whole atlas for each lipizone x section.
plot_table = data[["Section", "z_index", "y_index", "lipizone_names", "boundary"]]
section_tables = [
    plot_table[plot_table["Section"] == section_value]
    for section_value in unique_sections[:N_PANEL_ROWS * N_PANEL_COLS]
]
contour_tables = [section.loc[section['boundary'] == 1, :] for section in section_tables]

for XXX in ["B-LC-SG-PDTg-PB", "Ventrolateral preoptic nucleus_2", "IPDM-IAM-IPN-IPDL-mp"]:
    fig, axs = plt.subplots(N_PANEL_ROWS, N_PANEL_COLS, figsize=(32, 16))
    axs = axs.flatten()

    # Last five entries of this lipizone's row after an ascending sort.
    print(normalized_df.loc[XXX,:].sort_values()[-5:])

    for ax, section, filtered_section_contour in zip(axs, section_tables, contour_tables):
        # Pixels of the current lipizone in red ...
        filtered_section = section.loc[section['lipizone_names'].isin([XXX]),:]
        ax.scatter(filtered_section['z_index'], -filtered_section['y_index'],
                        c="red", s=0.2,
                        alpha=1, zorder=1, rasterized=True)

        # ... on top of the section outline (boundary == 1) in black.
        ax.scatter(filtered_section_contour['z_index'], -filtered_section_contour['y_index'],
                        c='black', s=0.01, rasterized=True, zorder=2, alpha=0.9)

        ax.set_aspect('equal')

    # Hide the axes of every panel, including unused ones.
    for ax in axs:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

## How many lipizones "are cell types"?

In [None]:
# Show the score matrix built above (rows are indexed by lipizone name).
normalized_df # note: only a minority of cell types (the reliable ones) is considered here

In [None]:
# Per row, the number of columns whose score exceeds 200, sorted ascending.
exceeds_cutoff = normalized_df > 200
mappers = exceeds_cutoff.sum(axis=1).sort_values()
mappers

In [None]:
# Histogram of the per-row counts computed in `mappers`.
mapper_counts = mappers.values
plt.hist(mapper_counts)

In [None]:
# Best score of every row, sorted from lowest to highest.
row_maxima = normalized_df.max(axis=1)
maxcoloc = row_maxima.sort_values()
maxcoloc

In [None]:
# Distribution of the per-row best scores, 50 bins.
best_scores = maxcoloc.values
plt.hist(best_scores, bins=50)
plt.show()

In [None]:
# The ten lowest best-scores (maxcoloc is sorted ascending).
maxcoloc.head(10)

In [None]:
# The five lipizones with the lowest best-score: by this measure, the ones
# that are clearly NOT cell types.

data = atlas
unique_sections = data["Section"].unique()

# The panel grid holds N_PANEL_ROWS * N_PANEL_COLS sections; sections beyond
# that are not drawn.
N_PANEL_ROWS, N_PANEL_COLS = 4, 8

# The per-section slices and their boundary pixels are identical for every
# lipizone, so they are extracted once (restricted to the columns the plots
# read) instead of re-filtering the whole atlas for each lipizone x section.
plot_table = data[["Section", "z_index", "y_index", "lipizone_names", "boundary"]]
section_tables = [
    plot_table[plot_table["Section"] == section_value]
    for section_value in unique_sections[:N_PANEL_ROWS * N_PANEL_COLS]
]
contour_tables = [section.loc[section['boundary'] == 1, :] for section in section_tables]

for XXX in maxcoloc[:5].index:
    fig, axs = plt.subplots(N_PANEL_ROWS, N_PANEL_COLS, figsize=(32, 16))
    axs = axs.flatten()

    # Last five entries of this lipizone's row after an ascending sort.
    print(normalized_df.loc[XXX,:].sort_values()[-5:])

    for ax, section, filtered_section_contour in zip(axs, section_tables, contour_tables):
        # Pixels of the current lipizone in red ...
        filtered_section = section.loc[section['lipizone_names'].isin([XXX]),:]
        ax.scatter(filtered_section['z_index'], -filtered_section['y_index'],
                        c="red", s=0.2,
                        alpha=1, zorder=1, rasterized=True)

        # ... on top of the section outline (boundary == 1) in black.
        ax.scatter(filtered_section_contour['z_index'], -filtered_section_contour['y_index'],
                        c='black', s=0.01, rasterized=True, zorder=2, alpha=0.9)

        ax.set_aspect('equal')

    # Hide the axes of every panel, including unused ones.
    for ax in axs:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# The ten lipizones with the highest best-score: by this measure, the ones
# that DEFINITELY are cell types (in theory - and in practice, perfectly).

data = atlas
unique_sections = data["Section"].unique()

# The panel grid holds N_PANEL_ROWS * N_PANEL_COLS sections; sections beyond
# that are not drawn.
N_PANEL_ROWS, N_PANEL_COLS = 4, 8

# The per-section slices and their boundary pixels are identical for every
# lipizone, so they are extracted once (restricted to the columns the plots
# read) instead of re-filtering the whole atlas for each lipizone x section.
plot_table = data[["Section", "z_index", "y_index", "lipizone_names", "boundary"]]
section_tables = [
    plot_table[plot_table["Section"] == section_value]
    for section_value in unique_sections[:N_PANEL_ROWS * N_PANEL_COLS]
]
contour_tables = [section.loc[section['boundary'] == 1, :] for section in section_tables]

for XXX in maxcoloc[-10:].index:
    fig, axs = plt.subplots(N_PANEL_ROWS, N_PANEL_COLS, figsize=(32, 16))
    axs = axs.flatten()

    # Last five entries of this lipizone's row after an ascending sort.
    print(normalized_df.loc[XXX,:].sort_values()[-5:])

    for ax, section, filtered_section_contour in zip(axs, section_tables, contour_tables):
        # Pixels of the current lipizone in red ...
        filtered_section = section.loc[section['lipizone_names'].isin([XXX]),:]
        ax.scatter(filtered_section['z_index'], -filtered_section['y_index'],
                        c="red", s=0.2,
                        alpha=1, zorder=1, rasterized=True)

        # ... on top of the section outline (boundary == 1) in black.
        ax.scatter(filtered_section_contour['z_index'], -filtered_section_contour['y_index'],
                        c='black', s=0.01, rasterized=True, zorder=2, alpha=0.9)

        ax.set_aspect('equal')

    # Hide the axes of every panel, including unused ones.
    for ax in axs:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# Where is the threshold? Is it around 200?
# Show the ten lowest best-scores that still exceed 200.
above_cutoff = maxcoloc > 200
maxcoloc.loc[above_cutoff].head(10)

In [None]:
# The five lipizones whose best-score is lowest among those above 200, i.e.
# the ones sitting just over the candidate threshold.
data = atlas
unique_sections = data["Section"].unique()

# The panel grid holds N_PANEL_ROWS * N_PANEL_COLS sections; sections beyond
# that are not drawn.
N_PANEL_ROWS, N_PANEL_COLS = 4, 8

# The per-section slices and their boundary pixels are identical for every
# lipizone, so they are extracted once (restricted to the columns the plots
# read) instead of re-filtering the whole atlas for each lipizone x section.
plot_table = data[["Section", "z_index", "y_index", "lipizone_names", "boundary"]]
section_tables = [
    plot_table[plot_table["Section"] == section_value]
    for section_value in unique_sections[:N_PANEL_ROWS * N_PANEL_COLS]
]
contour_tables = [section.loc[section['boundary'] == 1, :] for section in section_tables]

for XXX in maxcoloc[maxcoloc > 200][:5].index:
    fig, axs = plt.subplots(N_PANEL_ROWS, N_PANEL_COLS, figsize=(32, 16))
    axs = axs.flatten()

    # Last five entries of this lipizone's row after an ascending sort.
    print(normalized_df.loc[XXX,:].sort_values()[-5:])

    for ax, section, filtered_section_contour in zip(axs, section_tables, contour_tables):
        # Pixels of the current lipizone in red ...
        filtered_section = section.loc[section['lipizone_names'].isin([XXX]),:]
        ax.scatter(filtered_section['z_index'], -filtered_section['y_index'],
                        c="red", s=0.2,
                        alpha=1, zorder=1, rasterized=True)

        # ... on top of the section outline (boundary == 1) in black.
        ax.scatter(filtered_section_contour['z_index'], -filtered_section_contour['y_index'],
                        c='black', s=0.01, rasterized=True, zorder=2, alpha=0.9)

        ax.set_aspect('equal')

    # Hide the axes of every panel, including unused ones.
    for ax in axs:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# Number of lipizones whose best score exceeds 200.
(maxcoloc > 200).sum() # author's note: this would mean about 30% of lipizones are cell types; the figure emerges consistently, so 200 is kept as the cut-off

In [None]:
# Lipizones whose best score exceeds 200, listed from highest to lowest.
above_cutoff = maxcoloc > 200
lipizonesthatarecelltypes = maxcoloc.loc[above_cutoff].iloc[::-1]
lipizonesthatarecelltypes

## Lipizones that are not cell types: what are you?

In [None]:
# Lipizones whose best score is at most 200.
maxcoloc[maxcoloc <= 200] # these do not map to a cell type (conditional on the cell-type reference; the original note says "macoscko", presumably the Macosko atlas -- confirm)

In [None]:
# Names of the lipizones whose best score is at most 200, as a NumPy array.
at_or_below_cutoff = (maxcoloc <= 200).to_numpy()
noncelltypes = maxcoloc.index[at_or_below_cutoff].values

In [None]:
# 1) Which lipizones are anatomical but not cell types?
#
# NOTE: this cell rebinds `normalized_df` (until here the lipizone x cell-type
# matrix) to an acronym x lipizone matrix; the cells below rely on that.

acronyms = atlas['acronym'].copy()
lipizones = atlas['lipizone_names'].copy()

# Keep only acronyms that label more than 50 pixels.
acronym_counts = acronyms.value_counts()
acronyms = acronyms.loc[acronyms.isin(acronym_counts.index[acronym_counts > 50])]
lipizones = lipizones.loc[acronyms.index]

# Contingency table, lipizones (rows) x acronyms (columns). The original built
# it twice, once per orientation; one table and its transpose are equivalent.
cmat = pd.crosstab(lipizones, acronyms)

# Score 1: share of each lipizone's pixels per acronym, divided by the
# acronym's mean share across lipizones. Shape: acronyms x lipizones.
share_within_lipizone = cmat.T / cmat.T.sum()
normalized_df1 = (share_within_lipizone.T / share_within_lipizone.T.mean()).T

# Score 2: share of each acronym's pixels per lipizone, divided by the
# lipizone's mean share across acronyms. Shape: acronyms x lipizones.
share_within_acronym = cmat / cmat.sum()
normalized_df2 = share_within_acronym.T / share_within_acronym.T.mean()

# Combined score; pairs backed by fewer than 20 pixels are zeroed.
normalized_df = normalized_df2 * normalized_df1
normalized_df[cmat.T < 20] = 0

# Order columns by hierarchical clustering ...
linkage = sch.linkage(sch.distance.pdist(normalized_df.T), method='weighted', optimal_ordering=True)
order = sch.leaves_list(linkage)
normalized_df = normalized_df.iloc[:, order]

# ... and rows by the position of their best-scoring column.
order = np.argmax(normalized_df.values, axis=1)
order = np.argsort(order)
normalized_df = normalized_df.iloc[order,:]

plt.figure(figsize=(10, 10))
# "Greys" is the colormap name accepted by every Matplotlib release; the
# "Grays" spelling is only an alias in recent versions and draws the same map.
sns.heatmap(normalized_df, cmap="Greys", cbar_kws={'label': 'Enrichment'}, xticklabels=True, yticklabels=False, vmin = np.percentile(normalized_df, 2), vmax = np.percentile(normalized_df, 98))

# Hide tick marks and the x tick labels.
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.tick_params(axis='y', which='both', left=False, right=False)

plt.yticks(rotation=0)

plt.tight_layout()
plt.show()

In [None]:
# Pixels on the branch level_1 == level_2 == level_3 == 1.0 (presumably the
# callosal white matter, going by the variable name -- confirm), and the
# lipizones found there.
on_level_1 = atlas['level_1'] == 1.0
on_level_2 = atlas['level_2'] == 1.0
on_level_3 = atlas['level_3'] == 1.0
callosalwm = atlas.loc[on_level_1 & on_level_2 & on_level_3, :]
myelin = callosalwm['lipizone_names'].unique()

In [None]:
# Per-lipizone table read from CSV; its first column is the index and holds
# what are matched below against 'old_lipizone_names'.
inout_related = pd.read_csv("./zenodo/csv/islipizoneinoutconnection.csv", index_col=0)

# Old-name -> current-name lookup taken from the atlas.
name_pairs = atlas[['old_lipizone_names', 'lipizone_names']].drop_duplicates()
mapper = name_pairs.reset_index().iloc[:, 1:]
mapper.index = mapper['old_lipizone_names']

# Re-label the table with the current names, then keep the lipizones whose
# value in column "0" is below 0.05.
inout_related.index = inout_related.index.map(mapper['lipizone_names'])
below_cutoff = inout_related["0"] < 0.05
inout = inout_related.index[below_cutoff].values
inout

In [None]:
# Keep only the rows whose summed score is positive.
row_totals = normalized_df.sum(axis=1)
normalized_df = normalized_df.loc[row_totals > 0, :]
normalized_df

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

# Render the score matrix as an RGB image: each column fades from white
# (score <= vmin) to the colour of the group its lipizone belongs to.
vmin, vmax = 0, np.percentile(normalized_df, 98)
data = normalized_df.to_numpy()
nrows, ncols = data.shape

# Group colour of every column, first match wins:
# yellow = in `myelin`, red = not in `noncelltypes`, blue = in `inout`,
# black = everything else.
# NOTE(review): "not in noncelltypes" also covers lipizones that never entered
# the cell-type matrix at all -- confirm that counting them as red is intended.
base_colors = np.empty((ncols, 3))
for j, col_name in enumerate(normalized_df.columns):
    if col_name in myelin:
        base_colors[j] = [1.0, 1.0, 0.0]
    elif col_name not in noncelltypes:
        base_colors[j] = [1, 0, 0]
    elif col_name in inout:
        base_colors[j] = [0, 0, 1]
    else:
        base_colors[j] = [0, 0, 0]

# Scale the scores to [0, 1]. When the 98th percentile equals vmin (possible
# because many entries are forced to 0 upstream) a plain division would turn
# every zero into NaN; in that case any score above vmin is drawn saturated
# and the rest white.
color_range = vmax - vmin
if color_range > 0:
    intensity = np.clip((data - vmin) / color_range, 0, 1)
else:
    intensity = (data > vmin).astype(float)

# Blend white with the group colour for the whole matrix at once (broadcasting
# over rows and RGB channels) instead of a per-pixel Python double loop.
img = (1 - intensity)[:, :, None] + intensity[:, :, None] * base_colors[None, :, :]

fig, ax = plt.subplots(figsize=(20, 20))
im = ax.imshow(img, aspect='auto')
ax.set_xticks([])
ax.set_yticks([])
ax.set_title('Enrichment of lipids across major brain divisions')
fig.tight_layout()
im.set_rasterized(True)
plt.show()

legend_elements = [
    Patch(facecolor=[1.0, 1.0, 0.0], edgecolor='black', label='Myelin'),
    Patch(facecolor='red',       edgecolor='black', label='Cell Types'),
    Patch(facecolor='blue',      edgecolor='black', label='InOut'),
    Patch(facecolor='black',     edgecolor='black', label='Non-Cell Types')
]

# Stand-alone figure that carries only the legend.
fig2, ax2 = plt.subplots(figsize=(8, 6))
ax2.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1, 0.5), title="Categories")
ax2.axis('off')
ax2.set_title("Color Legend")
fig2.tight_layout()
plt.show()

In [None]:
# Count how many columns fall in each colour group, using the same
# first-match-wins order as the heatmap colouring.
counts_by_color = dict.fromkeys(("yellow", "red", "blue", "black"), 0)

for col in normalized_df.columns:
    if col in myelin:
        group = "yellow"
    elif col not in noncelltypes:
        group = "red"
    elif col in inout:
        group = "blue"
    else:
        group = "black"
    counts_by_color[group] += 1

yellow_count = counts_by_color["yellow"]
red_count = counts_by_color["red"]
blue_count = counts_by_color["blue"]
black_count = counts_by_color["black"]

print("Yellow (Myelin):", yellow_count)
print("Red (Cell Types):", red_count)
print("Blue (InOut):", blue_count)
print("Black (Non-Cell Types):", black_count)

## Assess unexplained intra-cluster variability

In [None]:
import pandas as pd

# Second name for the reference atlas (same object, not a copy).
datavignettes = atlas

# Rows belonging to the two ventricular subclasses.
ventricular_subclasses = ['12111', '12112']
in_ventricles = datavignettes['subclass'].isin(ventricular_subclasses)
ventricles = datavignettes.loc[in_ventricles, :]

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the first 173 columns in place (assumes these are the lipid
# columns -- TODO confirm). `datavignettes` is the same object as `atlas`,
# so the atlas itself is modified by this cell.
lipid_cols = datavignettes.columns[:173]
scaler = StandardScaler().fit(datavignettes.loc[:, lipid_cols])
datavignettes.loc[:, lipid_cols] = scaler.transform(datavignettes.loc[:, lipid_cols])

In [None]:
from tqdm import tqdm
# For every lipizone: log-determinant of the covariance matrix of its pixels
# over the lipid columns.
results = []
zone_labels = datavignettes['lipizone_names']

for zone in tqdm(zone_labels.unique()):
    in_zone = zone_labels == zone
    zone_cov = np.cov(datavignettes.loc[in_zone, lipid_cols], rowvar=False)

    # slogdet returns (sign, log|det|); working in log space avoids numerical
    # under/overflow of the plain determinant. Only the log part is kept.
    logdet = np.linalg.slogdet(zone_cov)[1]

    results.append({'lipizone': zone, 'determinant': logdet})

# One row per lipizone, smallest log-determinant first.
results_df = pd.DataFrame(results).sort_values(by='determinant', ascending=True)

results_df

In [None]:
import matplotlib.pyplot as plt

# Shifted copy of the log-determinant, used as the bar height.
results_df['determinant2'] = results_df['determinant'] + 1000

# Flag the lipizones that occur among the ventricular pixels.
ventricle_zones = ventricles['lipizone_names'].unique()
is_ventricle = results_df['lipizone'].isin(ventricle_zones)

# Red bars for flagged lipizones, gray for the rest.
colors = np.where(is_ventricle, 'red', 'gray').tolist()

bar_positions = range(len(results_df))
bar_heights = results_df['determinant2'].values / 100000

plt.figure(figsize=(5, 6))
plt.bar(bar_positions, bar_heights, color=colors)

plt.ylabel('Unexplained variability after clustering')
plt.xlabel('Sorted lipizones')
plt.tight_layout()
plt.show()