In [None]:
import pandas as pd
import numpy as np
import matplotlib

matplotlib.rcParams['pdf.fonttype'] = 42

# Load the main data table and keep only the six male/female samples.
# .copy() makes the filtered frame independent of the full table: later cells
# write columns into it in place (lipid rescaling, 'division'), which on a bare
# .loc slice raises SettingWithCopyWarning.
sub_alldata = pd.read_parquet("./zenodo/maindata_2.parquet")
sub_alldata = sub_alldata.loc[sub_alldata["Sample"].isin(['Male1','Male2','Male3','Female1','Female2','Female3']),:].copy()

## Characterize interindividual variation first

In [None]:
# q0: how many lipids does a lipizone contain?
import matplotlib.pyplot as plt

atlas = sub_alldata  # alias, not a copy: both names refer to the same frame
lipid_block = atlas.iloc[:, :173]
atlasmeans = lipid_block.groupby(atlas['lipizone_names']).mean()

# Per lipizone, count the lipids whose mean exceeds the background cutoff.
above_background = atlasmeans > 0.00015
lipids_per_lipizone = above_background.sum(axis=1)

plt.hist(lipids_per_lipizone, bins=20, color="black")
plt.title("Number of lipids contained in a lipizone above background")
plt.show()
lipids_per_lipizone.sort_values()

In [None]:
# The first 173 columns are treated as the lipid measurements.
lipids = sub_alldata.columns.values[:173]

# Rescale every lipid column in place to [0, 1] between its 0.1th and 99.9th
# percentiles (values outside that range are clipped).
# NOTE(review): a column with hi == lo divides 0 by 0 here and becomes NaN.
for col in lipids:
    vals = sub_alldata[col].astype(np.float32)
    lo = np.percentile(vals, 0.1)
    hi = np.percentile(vals, 99.9)
    sub_alldata[col] = (np.clip(vals, lo, hi) - lo) / (hi - lo)
    
coords = sub_alldata[['xccf','yccf','zccf','SectionID', 'Sample', 'SectionPlot', 'x', 'y']]
shift = pd.read_parquet("./zenodo/bayes/shift_MF.parquet")
baseline = pd.read_parquet("./zenodo/bayes/baseline_MF.parquet")
significance = pd.read_parquet("./zenodo/bayes/sign_significance_MF.parquet")
# Reorder significance to the row/column layout of shift before using it as a mask.
significance = significance.loc[shift.index, shift.columns]
# Zero every shift that is not flagged significant (modifies shift in place).
# assumes significance holds booleans — TODO confirm
shift[~significance] = 0.0
# Shift relative to baseline; computed after the masking above.
relshift = shift/baseline
# susc_df is an alias of shift here, not a copy.
susc_df = shift
relshift

In [None]:
import pandas as pd
import numpy as np

def compute_icc(df, subject_col, measurement_col):
    """
    Computes the intra-class correlation (ICC) for a one-way random effects model
    using the ANOVA (method-of-moments) estimator.

    Rows with a missing subject or measurement are ignored, and only subjects
    that actually occur in the data count as groups (unused categorical levels
    are not counted). For unequal group sizes the between-subject variance is
    scaled by the unbalanced-design coefficient
    n0 = (N - sum(n_i**2) / N) / (J - 1), which equals the common group size
    when the design is balanced.

    Parameters:
    - df: pandas DataFrame containing the data
    - subject_col: name of the column identifying individuals
    - measurement_col: name of the column with the continuous measurements

    Returns:
    - icc: estimated ICC (fraction of variance due to between-subject differences);
      NaN when it cannot be estimated (fewer than two subjects, no replicate
      observations, or zero total variance). May be negative when the
      between-subject mean square is smaller than the within-subject one.
    - sigma_b2: estimated between-subject variance component
    - sigma_w2: estimated within-subject variance component
    """
    data = df[[subject_col, measurement_col]].dropna()
    # observed=True keeps unobserved categorical levels from inflating J.
    grouped = data.groupby(subject_col, observed=True)[measurement_col]
    counts = grouped.count()
    group_means = grouped.mean()
    grand_mean = data[measurement_col].mean()

    J = len(counts)
    N = int(counts.sum())
    df_between = J - 1
    df_within = N - J

    # Sums of squares between and within subjects
    ss_between = (counts * (group_means - grand_mean) ** 2).sum()
    ss_within = ((data[measurement_col] - grouped.transform('mean')) ** 2).sum()

    # Mean squares; NaN when the corresponding degrees of freedom are missing
    ms_between = ss_between / df_between if df_between > 0 else np.nan
    ms_within = ss_within / df_within if df_within > 0 else np.nan

    # Effective group size for (possibly) unbalanced designs
    n0 = (N - (counts ** 2).sum() / N) / df_between if df_between > 0 else np.nan

    # Estimate variance components
    sigma_b2 = (ms_between - ms_within) / n0
    sigma_w2 = ms_within

    # ICC
    total = sigma_b2 + sigma_w2
    icc = sigma_b2 / total if total != 0 else np.nan
    return icc, sigma_b2, sigma_w2

In [None]:
# ICC per (subclass, lipid), computed separately for the male and the female samples
from tqdm import tqdm


def icc_by_subclass(data, samples, lipid_columns):
    """
    ICC of every lipid within every subclass, restricted to the given samples.

    Parameters:
    - data: DataFrame with 'Sample' and 'subclass' columns plus the lipid columns
    - samples: sample names to keep; 'Sample' is the grouping factor of the ICC
    - lipid_columns: names of the measurement columns

    Returns:
    - DataFrame indexed by data['subclass'].unique() with one column per lipid
    """
    # Filter rows once per sample set / subclass instead of once per lipid.
    in_samples = data.loc[data['Sample'].isin(samples), :]
    subclasses = data['subclass'].unique()

    rows = []
    for subclass in tqdm(subclasses):
        subset = in_samples.loc[in_samples['subclass'] == subclass, :]
        rows.append([compute_icc(subset, 'Sample', lipid)[0] for lipid in lipid_columns])

    return pd.DataFrame(rows, index=subclasses, columns=lipid_columns)


lipid_columns = atlas.columns[:173].values
males = icc_by_subclass(atlas, ['Male1', 'Male2', 'Male3'], lipid_columns)
females = icc_by_subclass(atlas, ['Female1', 'Female2', 'Female3'], lipid_columns)

In [None]:
# Distribution of the per-lipid ICC (averaged over subclasses), females then males.
plt.hist(females.mean(), bins=20)
plt.show()
plt.hist(males.mean(), bins=20)
plt.show()

# Average the male and female ICC tables cell by cell, then average over subclasses.
meanmf = (males.loc[females.index, females.columns] + females) / 2
interindvariance_perlipid = meanmf.mean()

# A cell only echoes its last expression, so the intermediate summaries are
# printed explicitly instead of being silently discarded.
print("mean:", np.mean(interindvariance_perlipid))
print("5th percentile:", np.percentile(interindvariance_perlipid, 5))
print("95th percentile:", np.percentile(interindvariance_perlipid, 95))
print(interindvariance_perlipid.sort_values()[:15])
malesint = males.mean()
print(malesint.sort_values()[:15])
femalesint = females.mean()
femalesint.sort_values()[:15]

## Analyze sex variation

In [None]:
import matplotlib.pyplot as plt

# Histograms of the per-lipid max / min shift and of all shift values.
# `shift` is plotted directly: susc_df gains a string 'supertype' column below,
# so plotting susc_df would break when this cell is run a second time.
plt.hist(shift.max(), bins=10, color="black")
plt.show()
plt.hist(shift.min(), bins=10, color="black")
plt.show()
plt.hist(shift.values.flatten(), bins=100, color="black")
plt.show()

# Shift table with the supertype label as an extra, LAST column; later cells
# strip it again with .iloc[:, :-1].
susc_df = shift.copy()
susc_df['supertype'] = susc_df.index

# Broadcast the per-supertype shifts onto every row of sub_alldata.
coeffmap = pd.merge(
    sub_alldata[['supertype']], 
    susc_df,
    on='supertype',
    how='left'
)

# Drop the merge key (first column) and restore the row index of sub_alldata.
coeffmap = coeffmap.iloc[:, 1:]
coeffmap.index = sub_alldata.index

tocheck = relshift

plt.hist(tocheck.values.flatten(), bins=100, color="black")
plt.show()

# Work on a copy so the 'supertype' column is not added to relshift itself.
scores = relshift.copy()
scores['supertype'] = scores.index
relativesuscmap = pd.merge(
    sub_alldata[['supertype']], 
    scores,
    on='supertype',
    how='left'
)
relativesuscmap.index = sub_alldata.index
relativesuscmap = relativesuscmap.iloc[:, 1:]
relativesuscmap

In [None]:
centroids = baseline
# Drop the trailing 'supertype' label column so only numeric relative shifts remain.
tocheck = scores.iloc[:,:-1]
mask = (tocheck.abs() >= 0.3) & (centroids > 0.1) # clean up background at least in part
# Collect the (supertype, lipid) labels of the True cells. Selecting them
# explicitly does not depend on stack() dropping NaN entries, which the newer
# stack implementation no longer does.
stacked_mask = mask.stack()
indices = stacked_mask[stacked_mask.eq(True)].index.tolist()
indices # there are a few cases of relative high difference, and they seem mostly in WM lipids

In [None]:
from collections import Counter


def _rank_counts(labels):
    """Count labels and order the Counter by descending count, then by label."""
    tally = Counter(labels)
    ordered = sorted(tally.items(), key=lambda item: (-item[1], item[0]))
    return Counter(dict(ordered))


# indices holds (row label, column label) pairs; tally each side separately.
first_elements = [pair[0] for pair in indices]
second_elements = [pair[1] for pair in indices]

first_counts = _rank_counts(first_elements)
second_counts = _rank_counts(second_elements)

first_counts

In [None]:
from collections import Counter

# Keep the lipids that were flagged in more than one row of the mask.
filtered_lipids = dict(
    (lipid, count) for lipid, count in second_counts.items() if count > 1
)

sexsusc_lipids = [lipid for lipid in filtered_lipids]
sexsusc_lipids

In [None]:
# Lipid -> number of flagged rows, for lipids flagged more than once.
filtered_lipids

In [None]:
# winner: 11211222
altered = pd.DataFrame(indices)

# Lipids flagged for this supertype, ranked by relative shift (largest first).
winner_lipids = altered.loc[altered[0] == "11211222", 1].values
winner_ranked = tocheck.loc["11211222", winner_lipids].sort_values()[::-1]

# The ten largest are the ones plotted further down.
firstlipids2check = winner_ranked[:10].index.values

winner_ranked

In [None]:
# The 20 largest relative shifts among the lipids flagged for supertype 11211222.
tocheck.loc["11211222", altered.loc[altered[0] == "11211222", 1].values].sort_values()[::-1][:20]

In [None]:
# The 15 smallest relative shifts among the lipids flagged for supertype 11211222 (descending order).
tocheck.loc["11211222", altered.loc[altered[0] == "11211222", 1].values].sort_values()[::-1][-15:]

In [None]:
# second winner 11221111

In [None]:
# The 15 smallest relative shifts among the lipids flagged for supertype 11221111 (descending order).
tocheck.loc["11221111", altered.loc[altered[0] == "11221111", 1].values].sort_values()[::-1][-15:]

In [None]:
# The 20 largest relative shifts among the lipids flagged for supertype 11221111.
tocheck.loc["11221111", altered.loc[altered[0] == "11221111", 1].values].sort_values()[::-1][:20]

In [None]:
# For SectionPlot values 1..6, draw one panel per sample: every row in grey
# (shaded by supertype code) with supertype 11211222 overlaid in red.
for aaa in range(1, 7):
    sec1 = sub_alldata.loc[sub_alldata['SectionPlot'] == aaa,:]

    samples = sec1['Sample'].unique()
    num_samples = len(samples)
    # Fixed 2x3 grid, so at most six samples per section can be drawn.
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))

    axes = axes.flatten()

    for idx, samp in enumerate(samples):
        ax = axes[idx]
        xxx = sec1.loc[sec1['Sample'] == samp, :]

        # Background layer: all rows of this sample, plotted as (y, -x).
        ax.scatter(
            xxx['y'], -xxx['x'],
            c=xxx['supertype'].astype("category").cat.codes,
            s=0.05,
            alpha=0.7,
            cmap="Greys"
        )

        # Highlight layer: only the rows assigned to supertype 11211222.
        yyy = xxx.loc[xxx['supertype'] == "11211222", :]

        ax.scatter(
            yyy['y'], -yyy['x'],
            c="red",
            s=0.05,
            alpha=0.7
        )

        ax.set_aspect('equal')

        ax.set_xticks([])
        ax.set_yticks([])

        for spine in ax.spines.values():
            spine.set_visible(False)

        ax.set_title(samp)

    # Remove the grid cells that received no sample.
    for idx in range(num_samples, len(axes)):
        fig.delaxes(axes[idx])
    plt.tight_layout()
    plt.show()

In [None]:
## i.e., ventricular systems, hindbrain

In [None]:
# The ten lipids with the largest relative shift in supertype 11211222.
firstlipids2check

In [None]:
import matplotlib.pyplot as plt
from matplotlib.cm import ScalarMappable
from matplotlib.colors import Normalize

# Alias (not a copy) of sub_alldata; passed to plot_lipid_distribution below.
filtered_data = sub_alldata

def plot_lipid_distribution(filtered_data, currentPC, tocheck, samples=('Male3', 'Female3'), section=2):
    """
    Plot one column of `filtered_data` side by side for several samples,
    restricted to a set of supertypes.

    Each panel shows all rows of one sample in the chosen section as a faint
    grey background (shaded by supertype code) and, on top, the rows whose
    supertype is in `tocheck`, coloured by `currentPC`. All panels share one
    colour scale: the medians, over SectionID, of the per-section 2nd and 98th
    percentiles of `currentPC` within the selected supertypes.

    Parameters:
    - filtered_data: DataFrame with 'SectionID', 'SectionPlot', 'Sample',
      'supertype', 'x', 'y' and the `currentPC` column
    - currentPC: name of the column used for colouring; also the figure title
    - tocheck: collection of supertype labels to highlight
    - samples: sample names, one panel each (any number; default two)
    - section: 'SectionPlot' value to draw

    Returns:
    - fig: the matplotlib Figure (not shown or closed here)

    Side effect: sets plt.rcParams['figure.dpi'] to 100.
    """
    samples = list(samples)
    # Computed once; reused for the colour limits and the highlight layer.
    in_supertypes = filtered_data['supertype'].isin(tocheck)

    # Per-section percentiles of the highlighted rows; sections without such
    # rows give NaN, which the medians below skip.
    results = []
    for section_id in filtered_data['SectionID'].unique():
        subset = filtered_data.loc[(filtered_data['SectionID'] == section_id) & in_supertypes, currentPC]
        results.append([section_id, subset.quantile(0.02), subset.quantile(0.98)])

    percentile_df = pd.DataFrame(results, columns=['SectionID', '2-perc', '98-perc'])
    med2p = percentile_df['2-perc'].median()
    med98p = percentile_df['98-perc'].median()

    plt.rcParams['figure.dpi'] = 100 
    # One panel per sample; 3.25 in per panel reproduces the 6.5 in width for two samples.
    n_panels = max(len(samples), 1)
    fig, axes = plt.subplots(1, n_panels, figsize=(3.25 * n_panels, 2), squeeze=False)
    axes = axes.ravel()
    cmap = plt.cm.viridis

    in_section = filtered_data['SectionPlot'] == section
    for ax, sample in zip(axes, samples):
        in_panel = (filtered_data['Sample'] == sample) & in_section

        # Background: every row of this sample/section, plotted as (y, -x).
        background = filtered_data[in_panel]
        ax.scatter(
            background['y'],
            -background['x'],
            c=background["supertype"].astype("category").cat.codes,
            cmap="Greys",
            s=0.5,
            alpha=0.02,
            rasterized=True
        )

        # Foreground: only the selected supertypes, coloured by currentPC.
        highlighted = filtered_data[in_panel & in_supertypes]
        ax.scatter(
            highlighted['y'],
            -highlighted['x'],
            c=highlighted[currentPC],
            cmap=cmap,
            s=0.5,
            alpha=0.5,
            rasterized=True,
            vmin=med2p,
            vmax=med98p
        )

        ax.axis('off')
        ax.set_aspect('equal')

    # Shared colourbar in its own axes on the right edge of the figure.
    cbar_ax = fig.add_axes([0.92, 0.15, 0.02, 0.7])
    norm = Normalize(vmin=med2p, vmax=med98p)
    sm = ScalarMappable(norm=norm, cmap=cmap)
    fig.colorbar(sm, cax=cbar_ax)

    fig.tight_layout(rect=[0, 0, 0.9, 1])
    fig.suptitle(currentPC)

    return fig


# NOTE(review): PdfPages is imported but not used in this cell.
from matplotlib.backends.backend_pdf import PdfPages

# One figure per lipid in firstlipids2check, highlighting supertype 11211222
# with the function's default samples and section.
for currentPC in firstlipids2check:
    fig = plot_lipid_distribution(filtered_data, currentPC, ["11211222"])
    plt.show()

In [None]:
# Plot the ten lipids with the largest relative shift in supertype 11221111,
# comparing Male3 and Female2 on SectionPlot 6.
for currentPC in tocheck.loc["11221111", altered.loc[altered[0] == "11221111", 1].values].sort_values()[::-1][:10].index.values:
    fig = plot_lipid_distribution(filtered_data, currentPC, ["11221111"], samples=['Male3', 'Female2'], section=6) 
    plt.show()

## Which lipids, classes, features are globally affected by sex?

In [None]:
# All shift values flattened into one vector; the last column of susc_df
# (the 'supertype' label) is dropped first.
allsusc = susc_df.iloc[:,:-1].values.flatten()

plt.hist(allsusc, bins=20, color="black")
plt.show()

In [None]:
def permutation_test_categorical(
    test_labels, 
    other_labels, 
    n_permutations=10_000, 
    alternative='two-sided', 
    random_state=None
):
    """
    Perform a permutation test to assess whether each category in test_labels 
    is over- or under-represented compared to what we would expect by chance.

    The pooled labels (test + other) are shuffled n_permutations times; each
    time the first len(test_labels) entries form a random "test" set whose
    per-category counts build the null distribution.
    
    Parameters
    ----------
    test_labels : 1D array-like of categorical labels (the "test" set)
    other_labels : 1D array-like of categorical labels (all non-test elements)
    n_permutations : int, optional
        Number of random permutations
    alternative : {'two-sided', 'greater', 'less'}, optional
        - 'two-sided': tests if the proportion differs in either direction
        - 'greater': tests if test_labels has a higher proportion of the category
        - 'less': tests if test_labels has a lower proportion of the category
    random_state : int, optional
        If provided, a private RandomState seeded with this value drives the
        shuffles, so the global NumPy random state is left untouched. If None,
        the global NumPy random state is used.
    
    Returns
    -------
    results : pd.DataFrame
        A DataFrame with columns: 'category', 'observed_count', 'expected_count',
        'observed_proportion', 'expected_proportion', 'p_value'.
        p_value is the fraction of permutations at least as extreme as the
        observed count (it can be exactly 0).

    Raises
    ------
    ValueError
        If `alternative` is not one of the three supported values.
    """
    if alternative not in ('two-sided', 'greater', 'less'):
        raise ValueError(
            f"alternative must be 'two-sided', 'greater' or 'less', got {alternative!r}"
        )

    # Seeded runs get their own generator instead of reseeding the global one.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    test_labels = np.array(test_labels)
    other_labels = np.array(other_labels)

    # Combine everything; the first n_test entries are the observed test set.
    all_labels = np.concatenate([test_labels, other_labels])
    n_test = len(test_labels)
    n_total = len(all_labels)

    # Encode labels as integer codes so counting is a single bincount.
    unique_categories, codes = np.unique(all_labels, return_inverse=True)
    codes = codes.ravel()
    n_categories = len(unique_categories)

    # Expected proportions from the full dataset, observed ones from the test set
    total_counts = np.bincount(codes, minlength=n_categories)
    observed_counts = np.bincount(codes[:n_test], minlength=n_categories)
    with np.errstate(divide='ignore', invalid='ignore'):
        expected_props = total_counts / n_total
        # An empty test set yields NaN proportions rather than an exception.
        observed_props = observed_counts / n_test

    # Null distribution: one row of per-category counts per permutation.
    # Shuffles are cumulative: each one starts from the previous arrangement.
    perm_counts = np.empty((n_permutations, n_categories))
    shuffled = codes.copy()
    for i in range(n_permutations):
        rng.shuffle(shuffled)
        perm_counts[i] = np.bincount(shuffled[:n_test], minlength=n_categories)

    # Compute p-values and prepare results
    results = []
    for k, cat in enumerate(unique_categories):
        observed = observed_counts[k]
        distribution = perm_counts[:, k]
        expected = expected_props[k] * n_test

        if alternative == 'two-sided':
            # Permutations that deviate from expected as much as or more than observed
            observed_dev = abs(observed - expected)
            p_value = np.mean(abs(distribution - expected) >= observed_dev)
        elif alternative == 'greater':
            # Permutations where count >= observed
            p_value = np.mean(distribution >= observed)
        else:
            # 'less': permutations where count <= observed
            p_value = np.mean(distribution <= observed)

        results.append({
            'category': cat,
            'observed_count': observed,
            'expected_count': expected,
            'observed_proportion': observed_props[k],
            'expected_proportion': expected_props[k],
            'p_value': p_value
        })

    return pd.DataFrame(results)

import re


def _first_int(pattern, text):
    """Return capture group 1 of the first match of `pattern` in `text` as int, or NaN if there is no match."""
    match = re.search(pattern, text)
    return int(match.group(1)) if match else np.nan


# One row per lipid column, with features parsed from the lipid name.
df = pd.DataFrame(sub_alldata.columns[:173]).fillna('')
df.columns = ["lipid_name"]

# extract the "class" etc from the lipid_name
# Ether lipids ("PC O", "PE O") keep their two-token class; otherwise the class
# is the text before the first space or "(". The pattern is a raw string
# because "\(" is an invalid escape in a normal string literal.
df["class"] = df["lipid_name"].apply(lambda x: 
    "PC O" if x.startswith("PC O") else
    "PE O" if x.startswith("PE O") else
    re.split(r' |\(', x)[0]
)
# Digits before the first ":" are the carbon count, digits after it the unsaturations.
df["carbons"] = df["lipid_name"].apply(lambda x: _first_int(r'(\d+):', x))
df["insaturations"] = df["lipid_name"].apply(lambda x: _first_int(r':(\d+)', x))
df["insaturations_per_Catom"] = df["insaturations"] / df["carbons"]
# Names ending in "_uncertain" get no parsed features and a gray colour.
df["broken"] = df["lipid_name"].str.endswith('_uncertain')
df.loc[df["broken"], 'carbons'] = np.nan
df.loc[df["broken"], 'class'] = np.nan
df.loc[df["broken"], 'insaturations'] = np.nan
df.loc[df["broken"], 'insaturations_per_Catom'] = np.nan
colors = pd.read_hdf("./zenodo/mixed/lipidclasscolors.h5ad", key="table")
df['color'] = df['class'].map(colors['classcolors'])
df.loc[df["broken"], 'color'] = "gray"
df.index = df['lipid_name']
df = df.drop_duplicates()

In [None]:
from adjustText import adjust_text

# Mean shift per lipid over all rows of the map, sorted ascending.
meansus = coeffmap.mean()
meansus = meansus.sort_values()
dfff = pd.DataFrame(meansus)
# Bar colour per lipid from the lipid metadata table; missing colours fall back to black.
colors = df.loc[dfff.index, 'color'].fillna("black")

plt.figure(figsize=(10, 6))
bars = plt.bar(range(len(dfff)), dfff.iloc[:,0], color=colors)

# Label the 5 lowest, the 5 highest and 5 randomly chosen bars in between.
n_items = len(dfff)
bottom_5 = list(range(5))
top_5 = list(range(n_items-5, n_items))
middle_start = 5
middle_end = n_items - 5
# Fixed-seed local generator: the same middle labels are drawn on every run,
# and the global NumPy random state is not touched.
label_rng = np.random.default_rng(0)
middle_5 = [int(i) for i in label_rng.choice(np.arange(middle_start, middle_end), 5, replace=False)]
indices_to_label = sorted(bottom_5 + middle_5 + top_5)

texts = []
for idx in indices_to_label:
    x = idx
    y = dfff.iloc[idx, 0]
    label = dfff.index[idx]
    texts.append(plt.text(x, y, label, ha='center', va='bottom'))

# Nudge the labels apart and connect them to their bars with arrows.
adjust_text(texts, 
           arrowprops=dict(arrowstyle='->', color='gray', lw=0.5),
           expand_points=(1.5, 1.5))

plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.ylabel('Mean susceptibility across the whole brain')
plt.xlabel("Sorted lipid species")
plt.xticks([])
plt.tight_layout()
plt.show()

In [None]:
# The eight lipids with the lowest mean susceptibility (dfff is sorted ascending).
dfff[:8] # globally, the differences are really weak!

In [None]:
# The eight lipids with the highest mean susceptibility (dfff is sorted ascending).
dfff[-8:] 

In [None]:
# Lipid classes among lipids with mean susceptibility > 0.01, tested against
# the classes of all remaining lipids.
increased_lipids = meansus.index[meansus > 0.01]
remaining_lipids = np.setdiff1d(df.index.values, increased_lipids)

class_enrichments = permutation_test_categorical(
    df.loc[increased_lipids.values, 'class'],
    df.loc[remaining_lipids, 'class'],
    n_permutations=5000,
    alternative='two-sided',
    #random_state=42
)

# Keep only the classes with p < 0.1.
class_enrichments = class_enrichments[class_enrichments['p_value'] < 0.1]
class_enrichments

In [None]:
# Lipid classes among lipids with mean susceptibility < -0.01, tested against
# the classes of all remaining lipids.
decreased_lipids = meansus.index[-meansus > 0.01]
remaining_lipids = np.setdiff1d(df.index.values, decreased_lipids)

class_enrichments = permutation_test_categorical(
    df.loc[decreased_lipids.values, 'class'],
    df.loc[remaining_lipids, 'class'],
    n_permutations=5000,
    alternative='two-sided',
    #random_state=42
)

# Keep only the classes with p < 0.1.
class_enrichments = class_enrichments[class_enrichments['p_value'] < 0.1]
class_enrichments

In [None]:
import matplotlib.pyplot as plt
from matplotlib.cm import ScalarMappable
from matplotlib.colors import Normalize

checklip = ['Cer 40:2;O2'] 

# NOTE(review): `alldata` is not defined anywhere in the visible notebook, so
# this line fails on a fresh kernel; confirm which table it should be loaded from.
filtered_data = pd.concat([sub_alldata, alldata.loc[:, checklip]],axis=1)

# For each lipid: a 6x6 grid (samples x sections) coloured on one shared scale.
for currentPC in checklip:
    
    print(currentPC)
    results = []

    # Colour limits: medians over SectionID of the per-section 2nd / 98th percentiles.
    for section in filtered_data['SectionID'].unique():
        subset = filtered_data[filtered_data['SectionID'] == section]

        perc_2 = subset[currentPC].quantile(0.02)
        perc_98 = subset[currentPC].quantile(0.98)

        results.append([section, perc_2, perc_98])
    percentile_df = pd.DataFrame(results, columns=['SectionID', '2-perc', '98-perc'])
    med2p = percentile_df['2-perc'].median()
    med98p = percentile_df['98-perc'].median()

    cmap = plt.cm.plasma

    unique_samples = sorted(filtered_data['Sample'].unique())
    unique_sections = sorted(filtered_data['SectionPlot'].unique())

    fig, axes = plt.subplots(6, 6, figsize=(20, 12))

    for sample_idx, sample in enumerate(unique_samples[:6]):
        for section_idx, section in enumerate(unique_sections[:6]):
            ax = axes[sample_idx, section_idx]

            try:
                ddf = filtered_data[
                    (filtered_data['Sample'] == sample) & 
                    (filtered_data['SectionPlot'] == section)
                ]

                ax.scatter(
                    ddf['y'], 
                    -ddf['x'], 
                    c=ddf[currentPC], 
                    cmap="plasma", 
                    s=0.5, 
                    rasterized=True, 
                    vmin=med2p, 
                    vmax=med98p
                )

                ax.axis('off')
                ax.set_aspect('equal')

                ax.set_title(f'Sample {sample}, Section {section}', fontsize=8)

            except Exception as e:
                # Report the failing panel and move on; a bare `except:` would
                # hide the error and also swallow KeyboardInterrupt.
                print(f"Error with sample {sample}, section {section}: {e}")
                continue

    # Shared colourbar in its own axes on the right edge of the figure.
    cbar_ax = fig.add_axes([0.92, 0.15, 0.02, 0.7])
    norm = Normalize(vmin=med2p, vmax=med98p)
    sm = ScalarMappable(norm=norm, cmap=cmap)
    fig.colorbar(sm, cax=cbar_ax)

    plt.tight_layout(rect=[0, 0, 0.9, 1])
    plt.show() # this outlier is a bit burnt

In [None]:
import pickle

file_path = './zenodo/mixed/allen_name_to_annots.pkl'

# NOTE: pickle.load can execute arbitrary code from the file; only load this
# pickle from a trusted source.
with open(file_path, 'rb') as file:
    allen_name_to_annots = pickle.load(file)

divisions = ['Olfactory areas', 'Isocortex', 'Hippocampal formation', 'Cortical subplate', 'Striatum', 'Pallidum', 'Thalamus', 'Hypothalamus', 'Midbrain', 'Hindbrain', 'Cerebellum', 'fiber tracts', 'ventricular systems']

# Label each row with the division whose id list contains its 'id'; rows
# matching none keep "General". Divisions later in the list overwrite earlier ones.
sub_alldata['division'] = "General"
for i in divisions:
    # One .loc call instead of chained indexing (df['division'][mask] = i),
    # which writes to a temporary and does nothing under Copy-on-Write.
    sub_alldata.loc[sub_alldata['id'].isin(allen_name_to_annots[i]), 'division'] = i
    
sub_alldata['division'].value_counts()

## Assess which regions are overall changed the most in positive / negative lipid content, or "membrane score"

In [None]:
# look at total production and total consumption and absolute balance
# Numeric shift table: the last column of susc_df ('supertype') is dropped.
activity = susc_df.iloc[:,:-1].copy()

# Sum of the shifts over all lipids for each supertype, sorted ascending.
activitypersupertype = activity.sum(axis=1).sort_values()
plt.hist(activitypersupertype, bins = 50)
plt.show()

In [None]:
# Broadcast the per-supertype summed shift onto every row of sub_alldata.
activitypersupertype = pd.DataFrame(activitypersupertype)
activitypersupertype['supertype'] = activitypersupertype.index
actmap = pd.merge(
    sub_alldata[['supertype']], 
    activitypersupertype,
    on='supertype',
    how='left'
)
# Restore the row index, drop the merge key (first column), name the value column.
actmap.index = sub_alldata.index
actmap = actmap.iloc[:, 1:]
actmap.columns = ['actmap']
filtered_data2 = pd.concat([sub_alldata.loc[actmap.index,:], actmap],axis=1)

# 6x6 grid (samples x sections) of the activity map on a fixed [-4, 4] colour scale.
for currentPC in ["actmap"]:
    
    print(currentPC)

    unique_samples = sorted(filtered_data2['Sample'].unique())
    unique_sections = sorted(filtered_data2['SectionPlot'].unique())

    fig, axes = plt.subplots(6, 6, figsize=(20, 12))

    for sample_idx, sample in enumerate(unique_samples[:6]):
        for section_idx, section in enumerate(unique_sections[:6]):
            ax = axes[sample_idx, section_idx]

            try:
                ddf = filtered_data2[
                    (filtered_data2['Sample'] == sample) & 
                    (filtered_data2['SectionPlot'] == section)
                ]

                ax.scatter(
                    ddf['y'], 
                    -ddf['x'], 
                    c=ddf[currentPC], 
                    cmap="coolwarm", 
                    s=0.5, 
                    rasterized=True, 
                    vmin= -4.0, ########################
                    vmax= 4.0 ########################
                )

                ax.axis('off')
                ax.set_aspect('equal')

                ax.set_title(f'Sample {sample}, Section {section}', fontsize=8)

            except Exception as e:
                # Report the failing panel and move on; a bare `except:` would
                # hide the error and also swallow KeyboardInterrupt.
                print(f"Error with sample {sample}, section {section}: {e}")
                continue

    plt.tight_layout(rect=[0, 0, 0.9, 1])
    plt.show()

In [None]:
# Lookup table with three parallel lists of 31 entries each: a numeric cluster
# code, a descriptive zone name, and a hex display colour.
namingtable = {
    "cluster": [
        11111, 11112, 11121, 11122, 11211, 11212, 11221, 11222, 12111, 12112, 
        12121, 12122, 12211, 12212, 12221, 12222, 21111, 21112, 21120, 21211, 
        21212, 21221, 21222, 22111, 22112, 22121, 22122, 22211, 22212, 22221, 22222
    ],
    "zone": [
        "Mixed and hindbrain white matter", "Core callosal white matter", 
        "Callosal and cerebellar white matter", "Ventral white matter", 
        "Boundary white matter", "Thalamic and mid/hindbrain white matter", 
        "Mid/hindbrain white matter", "Mixed white matter", 
        "Choroid plexus and ventricles", "Ventricular linings", 
        "Thalamic and midbrain regions", "White and gray matter boundary", 
        "Thalamic mixed gray and white matter", "Thalamic mixed gray and white matter #2", 
        "Neuron-rich lateral white matter", "Neuron-rich lateral white matter #2", 
        "Pallidum and projections", "Cortical layer 4", 
        "Subcortical plate, hippocampus and hypothalamus", 
        "GABA-ergic Purkinje cells of the cerebellum", "Cortical layers 2-3 and 4", 
        "Piriform cortex", "Cortical layers 1 and 2-3", "Cortical layer 5", 
        "Cortical layer 6, dentate gyrus", "Striatum, hypothalamus and hippocampus", 
        "Striatum, hypothalamus and hippocampus #2", 
        "Retrosplenial, cortical, cerebellar", "Cortical layer 6 and cerebellar Y", 
        "Cerebellar glutamatergic neurons", "Cortical layer 6 and thalamic"
    ],
    "color": [
        "#360064", "#980053", "#170b3b", "#ac2f5c", "#2a3f6d", "#002657", 
        "#21366b", "#3e4b6c", "#f75400", "#ef633e", "#a5d4e6", "#6399c6", 
        "#853a00", "#edeef4", "#fdbf71", "#ce710e", "#940457", "#a2d36c", 
        "#d5edb5", "#0065d6", "#bcf18b", "#a68d68", "#79e47e", "#2f0097", 
        "#47029f", "#7500a8", "#d70021", "#ca99c9", "#d4b9da", "#e00085", 
        "#f6f3f8"
    ]
}

# Index the table by the cluster code as a string so it can be looked up with
# string prefixes of supertype labels.
namingtable = pd.DataFrame(namingtable)
namingtable.index = namingtable['cluster'].astype(str)

In [None]:
# similar results as doing abs prod and abs degr independently
# Numeric shift table: the last column of susc_df ('supertype') is dropped.
activity = susc_df.iloc[:,:-1].copy()

# Summed shift per supertype, ascending.
overallproduction = activity.sum(axis=1).sort_values()
# The first five characters of each supertype label are the key into namingtable.
subclasses_tocheck = [x[:5] for x in list(overallproduction.index)]
# Zone names of the 20 supertypes with the lowest summed shift.
namingtable.loc[subclasses_tocheck, "zone"][:20]

In [None]:
# Zone names of the 20 supertypes with the highest summed shift.
namingtable.loc[subclasses_tocheck, "zone"][-20:]

In [None]:
# Draw the activity map of sample Male3 as one horizontal strip of up to six
# overlapping sections (reverse section order) with a single colourbar.
for currentPC in ["actmap"]:
    print(currentPC)

    unique_sections = sorted(filtered_data2['SectionPlot'].unique())
    
    # Create a figure
    fig = plt.figure(figsize=(20, 4))
    scatter_plots = []  # Collect scatter plots for colorbar

    # Define starting position and width
    left_start = 0.05  # Starting position
    width = 0.2        # Width of each subplot
    overlap = 0.1      # Amount of lateral overlap (50% of width)

    for section_idx, section in enumerate(unique_sections[:6][::-1]):
        left = left_start + section_idx * (width - overlap)  # Overlap each subplot

        # Create an axis with adjusted position
        ax = fig.add_axes([left, 0.2, width, 0.6])  # [left, bottom, width, height]

        try:
            ddf = filtered_data2[
                (filtered_data2['Sample'] == 'Male3') & 
                (filtered_data2['SectionPlot'] == section)
            ]

            scatter = ax.scatter(
                ddf['y'], 
                -ddf['x'], 
                c=ddf[currentPC], 
                cmap="coolwarm", 
                s=0.5, 
                rasterized=True, 
                vmin=-4.0,  # Minimum value for color scale
                vmax=4.0    # Maximum value for color scale
            )
            scatter_plots.append(scatter)

            ax.axis('off')  # Hide axes
            ax.set_aspect('equal')

        except Exception as e:
            print(f"Error with section {section}: {e}")
            continue

    # Add a colorbar
    # NOTE(review): scatter_plots[0] raises IndexError if every section failed above.
    cbar = fig.colorbar(scatter_plots[0], ax=fig.axes, orientation='vertical', fraction=0.02, pad=0.04)
    cbar.set_label('Activity Map', fontsize=10)
    plt.show()


## Pregnancy variation vs sex variation

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42

# Reload the main table, this time keeping the male, female and pregnant samples.
# .copy() makes the filtered frame independent: a later cell adds a 'colors'
# column to it, which on a bare .loc slice raises SettingWithCopyWarning.
sub_alldata = pd.read_parquet("./zenodo/maindata_2.parquet")
sub_alldata = sub_alldata.loc[(sub_alldata['Sample'].isin(["Male1", "Male2", "Male3", "Female3", "Female2", "Female1", "Pregnant1", "Pregnant2", "Pregnant4"])) ,:].copy()
sub_alldata

In [None]:
# The first 173 columns, i.e. the block treated as lipid measurements elsewhere in this notebook.
lips = sub_alldata.iloc[:,:173]

In [None]:
# NOTE(review): this list is overwritten by mdnow['colors'] before the PCA plot
# and is not read by any visible cell in between.
colors = ["pink", "pink", "pink", "blue", "blue", "blue", "purple", "purple", "purple"]

In [None]:
# Rescale each lipid column to [0, 1] between its 0.5th and 99.5th percentiles.
# (The p2 / p98 names are historical; the quantiles used are 0.005 and 0.995.)
datemp = lips.copy()
p2, p98 = datemp.quantile(0.005), datemp.quantile(0.995)

datemp_values = datemp.values
p2_values, p98_values = p2.values, p98.values

# Column-wise min-max scaling via broadcasting, then clip values outside the range.
span_values = p98_values - p2_values
normalized_values = (datemp_values - p2_values) / span_values
clipped_values = np.clip(normalized_values, 0, 1)

normalized_datemp = pd.DataFrame(
    clipped_values,
    index=datemp.index,
    columns=datemp.columns,
)
normalized_datemp

In [None]:
# Mean normalised lipid profile per (Sample, supertype), one row per Sample.
sample_keys = [sub_alldata['Sample'], sub_alldata['supertype']]
centroidsOLD = normalized_datemp.groupby(sample_keys).mean().unstack()

# Same aggregation per (SectionID, supertype), one row per section.
section_keys = [sub_alldata['SectionID'], sub_alldata['supertype']]
centroids = normalized_datemp.groupby(section_keys).mean().unstack()
centroids

In [None]:
# Section/supertype combinations that do not occur get a centroid of 0.
centroids = centroids.fillna(0.0)

# Group label = sample name without its last character ("Male1" -> "Male").
sub_alldata['colors'] = [name[:-1] for name in sub_alldata['Sample']]

# One row per (SectionID, group), indexed by SectionID and ordered like centroids.
section_groups = sub_alldata[['SectionID', 'colors']].drop_duplicates().reset_index()
section_groups.index = section_groups['SectionID']
mdnow = section_groups.loc[centroids.index, :]

# Translate the group labels into plot colours; any other value is kept as is.
group_to_color = {"Female": "pink", "Male": "blue", "Pregnant": "purple"}
mdnow = mdnow.assign(colors=mdnow['colors'].replace(group_to_color))
mdnow

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Standardise every centroid feature (column) before the PCA.
scaler = StandardScaler()
scaled_data = pd.DataFrame(scaler.fit_transform(centroids), 
                          index=centroids.index, 
                          columns=centroids.columns)

# Project the rows of centroids onto the first two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

# Per-row plot colours (mdnow is ordered like centroids) and % variance explained per PC.
colors = mdnow['colors']
var_explained = pca.explained_variance_ratio_ * 100

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms

# PCA coordinates with their group colour; the frame takes its index from the
# `colors` Series, while the PC arrays are matched to it by position.
df_pca = pd.DataFrame({
    'PC1': pca_result[:, 0],
    'PC2': pca_result[:, 1],
    'Color': colors
})

def plot_confidence_ellipse(x, y, ax, color, n_std=2.0, **kwargs):
    """
    Draw a covariance ellipse of the points (x, y) on `ax`.

    The ellipse is built in normalised space from the Pearson correlation,
    rotated by 45 degrees, scaled by n_std standard deviations per axis and
    centred on the mean of the points.

    Parameters:
    - x, y: equal-length sequences of coordinates
    - ax: matplotlib Axes to draw on
    - color: edge colour of the ellipse (the face is left unfilled)
    - n_std: half-size of the ellipse in standard deviations
    - **kwargs: forwarded to matplotlib.patches.Ellipse

    Returns:
    - the added Ellipse patch, or None when nothing is drawn (fewer than three
      points, or a degenerate covariance matrix)
    """
    if len(x) < 3:
        return None

    cov = np.cov(x, y)
    # Zero variance on either axis or perfectly collinear points: no ellipse.
    if cov[0, 0] <= 0 or cov[1, 1] <= 0 or np.linalg.det(cov) == 0:
        return None

    # Clip guards against |r| creeping past 1 through rounding, which would
    # turn one of the radii below into NaN.
    pearson = np.clip(cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1]), -1.0, 1.0)
    ell_radius_x = np.sqrt(1 + pearson)
    ell_radius_y = np.sqrt(1 - pearson)

    mean_x = np.mean(x)
    mean_y = np.mean(y)

    scale_x = np.sqrt(cov[0, 0]) * n_std
    scale_y = np.sqrt(cov[1, 1]) * n_std

    ellipse = Ellipse((0, 0),
                      width=ell_radius_x * 2,
                      height=ell_radius_y * 2,
                      facecolor='none',
                      edgecolor=color,
                      linewidth=2,
                      alpha=0.6,
                      **kwargs)

    # Unit ellipse -> rotate 45 degrees -> scale per axis -> move to the data mean.
    transf = transforms.Affine2D().rotate_deg(45).scale(scale_x, scale_y).translate(mean_x, mean_y)
    ellipse.set_transform(transf + ax.transData)
    return ax.add_patch(ellipse)

# Scatter the rows of df_pca by group colour and outline each group with a
# 2-standard-deviation covariance ellipse.
fig, ax = plt.subplots(figsize=(10, 10))

for color in np.unique(colors):
    group_df = df_pca[df_pca['Color'] == color]
    ax.scatter(group_df['PC1'], group_df['PC2'], color=color, edgecolor='black', s=100, alpha=0.9)
    plot_confidence_ellipse(group_df['PC1'], group_df['PC2'], ax, color=color, n_std=2.0)

ax.set_xlabel(f'PC1 ({var_explained[0]:.1f}% variance)')
ax.set_ylabel(f'PC2 ({var_explained[1]:.1f}% variance)')
ax.set_title('2D PCA of Centroids with Confidence Ellipses')
ax.set_xticks([])
ax.set_yticks([])
plt.tight_layout()
# Written to the current working directory.
plt.savefig("pca_mfpreg_2d.pdf")
plt.show()


In [None]:
import seaborn as sns
# Clustered heatmap of the pairwise correlations between the rows of
# centroidsOLD (one row per Sample).
sns.clustermap(centroidsOLD.T.corr())
plt.show()