In [None]:
import pandas as pd
import numpy as np

# Load the quantitative LC-MS table; the first CSV column becomes the row index.
quantlcms = pd.read_csv("QuantitativeLCMS.csv", index_col=0)

# The four male replicates serve as the "reference atlas" (the atlas is mostly male).
male_columns = ['Male', 'Male.1', 'Male.2', 'Male.3']
atlas = quantlcms.loc[:, male_columns]
# Skip the first row, cast the rest to float and average across the replicates.
ref = atlas.iloc[1:].astype(float).mean(axis=1)

"""
annots = pd.read_csv("goslin_output.tsv",sep='\t')
convt = annots[['Original Name', 'Species Name']]
convt.index = convt['Original Name'].astype(str)
refvals = pd.DataFrame(ref.values, index = ref.index, columns=["nmol_fraction_LCMS"])
refvals.index = refvals.index.str.replace('Hex1Cer', 'HexCer')
tmp = pd.read_csv("manuallyannotated_addlcms.csv", index_col=0).dropna()
refvalstmp = refvals.loc[refvals.index.isin(tmp.iloc[:,0]),:]
rvl = np.array(refvals.index)
convl = np.array(convt.index)
annots.index = annots['Original Name']
annots = annots.loc[np.intersect1d(rvl, convl),:]
refvals = refvals.loc[np.intersect1d(rvl, convl),:]
indivannots = annots[['Species Name']]
indivannots = indivannots.groupby('Original Name').first()
refvals['Species Name'] = refvals.index.map(indivannots['Species Name'])
tmp.index = tmp.iloc[:,0]
tmp = tmp.loc[refvalstmp.index,:]
refvalstmp['Species Name'] = tmp['Unnamed: 2']
quantlcms = pd.concat([refvals, refvalstmp], axis=0)
quantlcms.index = quantlcms['Species Name']
quantlcms = quantlcms[['nmol_fraction_LCMS']]
quantlcms = pd.DataFrame(quantlcms['nmol_fraction_LCMS'].groupby(quantlcms.index).sum()) # merge lipids that are distinguished in LCMS but undistinguishable in IMS
"""

# Wrap the per-species reference means in a one-column frame and display it.
quantlcms = ref.to_frame(name="nmol_fraction_LCMS")
quantlcms

In [None]:
# Shell command: recursively and forcibly delete the local `experiment_100140756` directory.
! rm -rf experiment_100140756

In [None]:
 rsync -avz fusar@cajal.epfl.ch:/data/luca/lipidatlas/ManuscriptAnalysisRound3/atlas.parquet /home/fusar/lba/atlas.parquet 

In [None]:
# Order species by ascending reference abundance.
quantlcms = quantlcms.sort_values(by="nmol_fraction_LCMS")

# Harmonise species names to the space-separated convention
# (e.g. "TG(46:0)" -> "TG 46:0", "SM39:2;O2" -> "SM 39:2;O2").
# The new labels are built in a fresh list: assigning into
# `quantlcms.index.values` mutates the Index buffer in place, and raises
# when pandas hands back a read-only array (Copy-on-Write).
special_names = {"SM39:2;O2": "SM 39:2;O2", "TG(46:0)": "TG 46:0"}
ind = [special_names.get(s, s).replace("(", " ").replace(")", "") for s in quantlcms.index]
quantlcms.index = ind
quantlcms

In [None]:
# Order species by ascending reference abundance.
quantlcms = quantlcms.sort_values(by="nmol_fraction_LCMS")

# Harmonise species names to the space-separated convention
# (e.g. "TG(46:0)" -> "TG 46:0", "SM39:2;O2" -> "SM 39:2;O2").
# The new labels are built in a fresh list: assigning into
# `quantlcms.index.values` mutates the Index buffer in place, and raises
# when pandas hands back a read-only array (Copy-on-Write).
special_names = {"SM39:2;O2": "SM 39:2;O2", "TG(46:0)": "TG 46:0"}
ind = [special_names.get(s, s).replace("(", " ").replace(")", "") for s in quantlcms.index]
quantlcms.index = ind
quantlcms

import re
import numpy as np
import pandas as pd

# One row per LC-MS species name taken from quantlcms.index; missing names become ''.
df = pd.DataFrame({"lipid_name": quantlcms.index}).fillna('')

def extract_class(lipid_name):
    """
    Extract the lipid class from a lipid name.

    The class is the leading alphanumeric token, optionally followed by an
    "O" marker (with or without a separating space), and it must be
    terminated by whitespace or a dash: "PC O-36:4" -> "PC O",
    "PC-36:4" -> "PC". When that pattern does not match, the first
    whitespace-separated token of the name is returned instead.

    Parameters
    ----------
    lipid_name : str
        Lipid species name.

    Returns
    -------
    str
        The extracted class. Empty or whitespace-only names yield "" rather
        than raising, since callers replace missing names with '' via
        ``fillna('')`` before applying this function.
    """
    m = re.match(r'^([A-Za-z0-9]+(?:\s?O)?)[\s-]', lipid_name)
    if m:
        return m.group(1)
    tokens = lipid_name.split()
    # "".split() is an empty list, so indexing it unconditionally raises IndexError.
    return tokens[0] if tokens else ""

# Smoke-test the class parser on a few representative names.
test_lipids = ["PC O-36:4", "PC-36:4", "PE O-38:6", "PE-38:6"]
for example in test_lipids:
    print(f"{example} -> {extract_class(example)}")


def _captured_int(pattern, text):
    """Return the first group of `pattern` found in `text` as an int, or NaN if there is no match."""
    found = re.search(pattern, text)
    return int(found.group(1)) if found else np.nan


lipid_names = df["lipid_name"]
df["class"] = lipid_names.apply(extract_class)

# Carbons: the integer right before a colon; insaturations: the integer right after one.
df["carbons"] = lipid_names.apply(lambda name: _captured_int(r'(\d+):', name))
df["insaturations"] = lipid_names.apply(lambda name: _captured_int(r':(\d+)', name))
df["insaturations_per_Catom"] = df["insaturations"] / df["carbons"]

# Names ending in '_uncertain' are flagged and their parsed fields blanked.
df["broken"] = lipid_names.str.endswith('_uncertain')
is_broken = df["broken"]
df.loc[is_broken, ['carbons', 'class', 'insaturations', 'insaturations_per_Catom']] = np.nan

# Look up a colour per class in the external table; flagged rows are gray.
colors = pd.read_hdf("lipidclasscolors.h5ad", key="table")
df['color'] = df['class'].map(colors['classcolors'])
df.loc[is_broken, 'color'] = "gray"

# Index by lipid name, drop fully duplicated rows, default unmapped colours to black.
df.index = df['lipid_name']
df = df.drop_duplicates()
df['color'] = df['color'].fillna("black")

df

In [None]:
# Count how many rows of `df` fall into each lipid class, then display the counts.
classvalues = df['class'].value_counts()
classvalues

In [None]:
# Unique (class, color) pairs of `df`, renumbered 0..n-1, used to build the class palette.
colormapp = df[['class', 'color']].drop_duplicates()
colormapp.index = np.arange(colormapp.shape[0])
colormapp

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors

# Classes still marked "black" each receive a distinct hue from the husl palette.
mask_black = colormapp['color'].eq('black')
num_black = mask_black.sum()
palette = sns.color_palette("husl", num_black)
palette_hex = list(map(mcolors.rgb2hex, palette))
colormapp.loc[mask_black, 'color'] = palette_hex

# Final class -> colour lookup shared by the plots.
color_dict = dict(zip(colormapp['class'], colormapp['color']))

color_dict

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font type 42 for PDF output, set before saving the figure.
mpl.rcParams['pdf.fonttype'] = 42

# One colour per class, in the order of the counts; unknown classes fall back to black.
pie_colors = [color_dict.get(lipid_class, 'black') for lipid_class in classvalues.index]

fig, ax = plt.subplots(figsize=(8, 8))
pie_style = dict(
    colors=pie_colors,
    autopct='%1.1f%%',
    startangle=90,
    textprops={'fontsize': 10},
)
classvalues.plot.pie(ax=ax, **pie_style)
ax.set_ylabel('')  # no y-label on a pie chart
ax.set_title("Lipid class cardinality of species in whole-brain LCMS")
fig.savefig("lcms_prop.pdf")
plt.show()

In [None]:
# Attach each species' reference abundance (looked up by lipid name) and sum it per class.
df['quant'] = quantlcms.loc[df.index, 'nmol_fraction_LCMS']
classabundance = df['quant'].groupby(df['class']).sum()
classabundance

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font type 42 for PDF output, set before saving the figure.
mpl.rcParams['pdf.fonttype'] = 42

# One colour per class, in the order of the abundances; unknown classes fall back to black.
pie_colors = [color_dict.get(lipid_class, 'black') for lipid_class in classabundance.index]

fig, ax = plt.subplots(figsize=(8, 8))
pie_style = dict(
    colors=pie_colors,
    autopct='%1.1f%%',
    startangle=90,
    textprops={'fontsize': 10},
)
classabundance.plot.pie(ax=ax, **pie_style)
ax.set_ylabel('')
ax.set_title("Lipid class total abundance in whole-brain LCMS")
fig.savefig("lcms_abund.pdf")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from adjustText import adjust_text

# Alphabetical class order for the x-axis (rows whose class is missing are left out).
order = sorted(df['class'].dropna().unique())

# y value of the axis break: points above it are drawn in the top panel,
# points at or below it in the bottom panel.
y_break = 2
# Only points above this abundance receive a text label.
label_min = 1

# Two stacked panels sharing the x-axis; the bottom panel is three times taller.
fig, (ax_top, ax_bottom) = plt.subplots(
    2, 1, sharex=True,
    gridspec_kw={'height_ratios': [1, 3]},
    figsize=(12, 8)
)

# ------------------------------
# Draw the same boxplots on both panels; each panel's y-limits select the visible part.
sns.boxplot(data=df, x='class', y='quant', order=order,
            palette=color_dict, showfliers=False, ax=ax_top)
sns.boxplot(data=df, x='class', y='quant', order=order,
            palette=color_dict, showfliers=False, ax=ax_bottom)

# ------------------------------
# Overlay the individual data points, split at the break.
sns.stripplot(data=df[df['quant'] > y_break], x='class', y='quant', order=order,
              color='black', alpha=0.5, size=3, jitter=True, ax=ax_top)
sns.stripplot(data=df[df['quant'] <= y_break], x='class', y='quant', order=order,
              color='black', alpha=0.5, size=3, jitter=True, ax=ax_bottom)

# ------------------------------
# y-limits of the two panels.
ax_bottom.set_ylim(0, y_break)
# The top panel starts exactly at the break: every point routed to it has
# quant > y_break, so a higher lower limit (previously 2.2) would silently
# hide points and labels lying just above the break.
# The 1.05 factor leaves some headroom above the largest value.
ax_top.set_ylim(y_break, df['quant'].max() * 1.05)

# Remove the spines between the panels for a cleaner look.
sns.despine(ax=ax_top, bottom=True)
sns.despine(ax=ax_bottom, top=True)

# ------------------------------
# Diagonal marks on the y-axis indicate the break.
d = .015  # size of the diagonal lines in axes coordinates
# Top panel: marks at the bottom corners.
kwargs = dict(transform=ax_top.transAxes, color='k', clip_on=False)
ax_top.plot((-d, +d), (-d, +d), **kwargs)
ax_top.plot((1-d, 1+d), (-d, +d), **kwargs)
# Bottom panel: marks at the top corners.
kwargs.update(transform=ax_bottom.transAxes)
ax_bottom.plot((-d, +d), (1-d, 1+d), **kwargs)
ax_bottom.plot((1-d, 1+d), (1-d, 1+d), **kwargs)

# ------------------------------
# Label every point above `label_min`, on the panel that displays it.
texts_top = []
texts_bottom = []
# The x-axis is categorical, so a label's x position is the index of its class
# in `order` plus a small random jitter.
for idx, row in df[df['quant'] > label_min].iterrows():
    cat = row['class']
    try:
        x_center = order.index(cat)
    except ValueError:
        continue  # class missing from `order` (e.g. NaN): nothing to label
    jitter = np.random.uniform(-0.1, 0.1)
    x = x_center + jitter
    y = row['quant']
    # Choose the panel on which the point itself is drawn.
    if y <= y_break:
        current_ax = ax_bottom
        t = current_ax.text(x, y, row['lipid_name'], fontsize=8, ha='center', va='bottom')
        texts_bottom.append(t)
    else:
        current_ax = ax_top
        t = current_ax.text(x, y, row['lipid_name'], fontsize=8, ha='center', va='bottom')
        texts_top.append(t)

# Let adjust_text move the labels apart, separately per panel.
adjust_text(texts_top, ax=ax_top, expand_text=(1.05, 1.2),
            arrowprops=dict(arrowstyle='->', color='gray', lw=0.5))
adjust_text(texts_bottom, ax=ax_bottom, expand_text=(1.05, 1.2),
            arrowprops=dict(arrowstyle='->', color='gray', lw=0.5))

# ------------------------------
# Final touches: rotate the x tick labels, add axis labels and title.
plt.setp(ax_bottom.get_xticklabels(), rotation=45, ha='right')
ax_bottom.set_xlabel("Lipid Class")
ax_bottom.set_ylabel("Quant (nmol_fraction_LCMS)")
ax_top.set_title("Lipid Class Quant Distribution (Broken Axis at y=2)")

plt.tight_layout()
plt.savefig("abundance_in_LCMS.pdf")
plt.show()


In [None]:
# Load the atlas table from the local parquet file.
atlas = pd.read_parquet("atlas.parquet")
import re
import numpy as np
import pandas as pd

# Lipid names are the first 173 atlas column labels
# (presumably the remaining columns are not lipids -- TODO confirm the 173 cutoff).
df = pd.DataFrame(atlas.columns[:173]).fillna('')
df.columns = ["lipid_name"]
df

In [None]:
def extract_class(lipid_name):
    """
    Extract the lipid class from a lipid name.

    The class is the leading alphanumeric token, optionally followed by an
    "O" marker (with or without a separating space), and it must be
    terminated by whitespace or a dash: "PC O-36:4" -> "PC O",
    "PC-36:4" -> "PC". When that pattern does not match, the first
    whitespace-separated token of the name is returned instead.

    Parameters
    ----------
    lipid_name : str
        Lipid species name.

    Returns
    -------
    str
        The extracted class. Empty or whitespace-only names yield "" rather
        than raising, since callers replace missing names with '' via
        ``fillna('')`` before applying this function.
    """
    m = re.match(r'^([A-Za-z0-9]+(?:\s?O)?)[\s-]', lipid_name)
    if m:
        return m.group(1)
    tokens = lipid_name.split()
    # "".split() is an empty list, so indexing it unconditionally raises IndexError.
    return tokens[0] if tokens else ""

# Smoke-test the class parser on a few representative names.
test_lipids = ["PC O-36:4", "PC-36:4", "PE O-38:6", "PE-38:6"]
for example in test_lipids:
    print(f"{example} -> {extract_class(example)}")


def _captured_int(pattern, text):
    """Return the first group of `pattern` found in `text` as an int, or NaN if there is no match."""
    found = re.search(pattern, text)
    return int(found.group(1)) if found else np.nan


lipid_names = df["lipid_name"]
df["class"] = lipid_names.apply(extract_class)

# Carbons: the integer right before a colon; insaturations: the integer right after one.
df["carbons"] = lipid_names.apply(lambda name: _captured_int(r'(\d+):', name))
df["insaturations"] = lipid_names.apply(lambda name: _captured_int(r':(\d+)', name))
df["insaturations_per_Catom"] = df["insaturations"] / df["carbons"]
# Relabel "HexCer" as "Hex1Cer".
df.loc[df['class'].eq("HexCer"), 'class'] = "Hex1Cer"

# Names ending in '_uncertain' are flagged and their parsed fields blanked.
df["broken"] = lipid_names.str.endswith('_uncertain')
is_broken = df["broken"]
df.loc[is_broken, ['carbons', 'class', 'insaturations', 'insaturations_per_Catom']] = np.nan

# Look up a colour per class in the external table; flagged rows are gray.
colors = pd.read_hdf("lipidclasscolors.h5ad", key="table")
df['color'] = df['class'].map(colors['classcolors'])
df.loc[is_broken, 'color'] = "gray"

# Index by lipid name, drop fully duplicated rows, default unmapped colours to black.
df.index = df['lipid_name']
df = df.drop_duplicates()
df['color'] = df['color'].fillna("black")

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font type 42 for PDF output, set before saving the figure.
mpl.rcParams['pdf.fonttype'] = 42

# Species count per class; colours come from the shared class palette (black if unknown).
classvalues = df['class'].value_counts()
pie_colors = [color_dict.get(lipid_class, 'black') for lipid_class in classvalues.index]

fig, ax = plt.subplots(figsize=(8, 8))
pie_style = dict(
    colors=pie_colors,
    autopct='%1.1f%%',
    startangle=90,
    textprops={'fontsize': 10},
)
classvalues.plot.pie(ax=ax, **pie_style)
ax.set_ylabel('')  # no y-label on a pie chart
ax.set_title("Lipid class cardinality of species in whole-brain MSI")
fig.savefig("msi_prop.pdf")
plt.show()

In [None]:
# Load the cleaned annotation table; the first CSV column becomes the row index.
annowmoran = pd.read_csv("cleanedANNOTATIONS_20250215.csv", index_col=0)
annowmoran

In [None]:
# Row-wise mean over the last 138 columns of the annotation table
# (presumably one Moran's I column per section -- TODO confirm), labelled by annotation.
meanmoranperpeak = annowmoran.iloc[:, -138:].mean(axis=1)
meanmoranperpeak.index = annowmoran['Annotation']
# Rows without any value get 0 instead of NaN.
meanmoranperpeak = meanmoranperpeak.fillna(0)
#meanmoranperpeak = meanmoranperpeak.loc[~pd.isna(meanmoranperpeak)] ##### drop the nans
#meanmoranperpeak = meanmoranperpeak.loc[meanmoranperpeak.index != "_db"]# drop the unannotated

# Not the last expression of the cell, so this sorted view is not displayed.
meanmoranperpeak.sort_values()[::-1] # 391 got an annotation and above zero Moran's, also including those with naming that is not confident.

atlas = pd.read_parquet("atlas.parquet")
import re
import numpy as np
import pandas as pd

# One row per peak: annotation name (missing -> ''), annotation score and the
# annotation table's index, stored as `mz`.
df = pd.DataFrame(meanmoranperpeak.index).fillna('')
df.columns = ["lipid_name"]
df['Score'] = annowmoran['Score'].values
df['mz'] = annowmoran.index.values
df

def extract_class(lipid_name):
    """
    Extract the lipid class from a lipid name.

    The class is the leading alphanumeric token, optionally followed by an
    "O" marker (with or without a separating space), and it must be
    terminated by whitespace or a dash: "PC O-36:4" -> "PC O",
    "PC-36:4" -> "PC". When that pattern does not match, the first
    whitespace-separated token of the name is returned instead.

    Parameters
    ----------
    lipid_name : str
        Lipid species name.

    Returns
    -------
    str
        The extracted class. Empty or whitespace-only names yield "" rather
        than raising, since callers replace missing names with '' via
        ``fillna('')`` before applying this function.
    """
    m = re.match(r'^([A-Za-z0-9]+(?:\s?O)?)[\s-]', lipid_name)
    if m:
        return m.group(1)
    tokens = lipid_name.split()
    # "".split() is an empty list, so indexing it unconditionally raises IndexError.
    return tokens[0] if tokens else ""

# Smoke-test the class parser on a few representative names.
test_lipids = ["PC O-36:4", "PC-36:4", "PE O-38:6", "PE-38:6"]
for example in test_lipids:
    print(f"{example} -> {extract_class(example)}")


def _captured_int(pattern, text):
    """Return the first group of `pattern` found in `text` as an int, or NaN if there is no match."""
    found = re.search(pattern, text)
    return int(found.group(1)) if found else np.nan


lipid_names = df["lipid_name"]
df["class"] = lipid_names.apply(extract_class)

# Carbons: the integer right before a colon; insaturations: the integer right after one.
df["carbons"] = lipid_names.apply(lambda name: _captured_int(r'(\d+):', name))
df["insaturations"] = lipid_names.apply(lambda name: _captured_int(r':(\d+)', name))
df["insaturations_per_Catom"] = df["insaturations"] / df["carbons"]
# Relabel "HexCer" as "Hex1Cer".
df.loc[df['class'].eq("HexCer"), 'class'] = "Hex1Cer"

# Names ending in '_uncertain' are flagged and their parsed fields blanked.
df["broken"] = lipid_names.str.endswith('_uncertain')
is_broken = df["broken"]
df.loc[is_broken, ['carbons', 'class', 'insaturations', 'insaturations_per_Catom']] = np.nan

# Look up a colour per class in the external table; flagged rows are gray, unmapped ones black.
colors = pd.read_hdf("lipidclasscolors.h5ad", key="table")
df['color'] = df['class'].map(colors['classcolors'])
df.loc[is_broken, 'color'] = "gray"
df['color'] = df['color'].fillna("black")

# Attach the per-peak statistic positionally (row order matches `meanmoranperpeak`).
mean_series = meanmoranperpeak.rename('quant')
df['quant'] = mean_series.values

# Classes drawn in gray.
extra_classes = {'CerP', 'LPA', 'PIP O', 'PGP', 'PA', 'CAR', 'ST', 'PA O', 'CoA', 'MG', 'SHexCer', 'LPE O'}
color_dict.update(dict.fromkeys(extra_classes, "gray"))

# Pool the gray classes and all "_db" names into "others".
# NOTE(review): 'LPE O' is coloured gray but kept as its own class -- confirm this is intended.
pooled_classes = extra_classes - {'LPE O'}
df.loc[df['class'].isin(pooled_classes), 'class'] = "others"
df.loc[df['lipid_name'].str.contains("_db"), 'class'] = "others"
color_dict["others"] = "gray"

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from adjustText import adjust_text

# Classes ordered by ascending median of `quant`.
class_medians = df.groupby('class')['quant'].median()
order = class_medians.sort_values().index.tolist()

fig, ax = plt.subplots(figsize=(12, 8))

# Boxplot per class with the individual points overlaid.
shared_args = dict(data=df, x='class', y='quant', order=order, ax=ax)
sns.boxplot(palette=color_dict, showfliers=False, **shared_args)
sns.stripplot(color='black', alpha=0.5, size=3, jitter=True, **shared_args)

# Dashed dark-red reference line at y = 0.4.
ax.axhline(y=0.4, color='darkred', linestyle='--')

# Restrict the y-axis to [0, 1].
ax.set_ylim(0, 1)

sns.despine(ax=ax)

# Label every point whose value exceeds 1 (the labels are only collected in
# `texts`; no overlap adjustment is applied to them).
texts = []
for idx, row in df[df['quant'] > 1].iterrows():
    cat = row['class']
    if cat not in order:
        continue  # class absent from the ordering (e.g. NaN): nothing to label
    x_center = order.index(cat)
    jitter = np.random.uniform(-0.1, 0.1)
    x = x_center + jitter
    y = row['quant']
    t = ax.text(x, y, row['lipid_name'], fontsize=20, ha='center', va='bottom')
    texts.append(t)

# Rotate the class labels and add axis labels and title.
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
    tick_label.set_ha('right')
ax.set_xlabel("Lipid Class")
ax.set_ylabel("Lipid-wise mean Moran's I across sections")
ax.set_title("Spatial evaluation of lipids by average Moran's I")

fig.tight_layout()
fig.savefig("moranbyclass.pdf")
plt.show()


In [None]:
# Row-wise MINIMUM over the last 138 columns of the annotation table
# (presumably one Moran's I column per section -- TODO confirm), labelled by annotation.
# NOTE: despite its name, `meanmoranperpeak` holds the minimum in this cell.
meanmoranperpeak = annowmoran.iloc[:, -138:].min(axis=1)
meanmoranperpeak.index = annowmoran['Annotation']
# Rows without any value get 0 instead of NaN.
meanmoranperpeak = meanmoranperpeak.fillna(0)
#meanmoranperpeak = meanmoranperpeak.loc[~pd.isna(meanmoranperpeak)] ##### drop the nans
#meanmoranperpeak = meanmoranperpeak.loc[meanmoranperpeak.index != "_db"]# drop the unannotated

# Not the last expression of the cell, so this sorted view is not displayed.
meanmoranperpeak.sort_values()[::-1] # 391 got an annotation and above zero Moran's, also including those with naming that is not confident.

atlas = pd.read_parquet("atlas.parquet")
import re
import numpy as np
import pandas as pd

# One row per peak: annotation name (missing -> ''), annotation score and the
# annotation table's index, stored as `mz`.
df = pd.DataFrame(meanmoranperpeak.index).fillna('')
df.columns = ["lipid_name"]
df['Score'] = annowmoran['Score'].values
df['mz'] = annowmoran.index.values
df

def extract_class(lipid_name):
    """
    Extract the lipid class from a lipid name.

    The class is the leading alphanumeric token, optionally followed by an
    "O" marker (with or without a separating space), and it must be
    terminated by whitespace or a dash: "PC O-36:4" -> "PC O",
    "PC-36:4" -> "PC". When that pattern does not match, the first
    whitespace-separated token of the name is returned instead.

    Parameters
    ----------
    lipid_name : str
        Lipid species name.

    Returns
    -------
    str
        The extracted class. Empty or whitespace-only names yield "" rather
        than raising, since callers replace missing names with '' via
        ``fillna('')`` before applying this function.
    """
    m = re.match(r'^([A-Za-z0-9]+(?:\s?O)?)[\s-]', lipid_name)
    if m:
        return m.group(1)
    tokens = lipid_name.split()
    # "".split() is an empty list, so indexing it unconditionally raises IndexError.
    return tokens[0] if tokens else ""

# Smoke-test the class parser on a few representative names.
test_lipids = ["PC O-36:4", "PC-36:4", "PE O-38:6", "PE-38:6"]
for example in test_lipids:
    print(f"{example} -> {extract_class(example)}")


def _captured_int(pattern, text):
    """Return the first group of `pattern` found in `text` as an int, or NaN if there is no match."""
    found = re.search(pattern, text)
    return int(found.group(1)) if found else np.nan


lipid_names = df["lipid_name"]
df["class"] = lipid_names.apply(extract_class)

# Carbons: the integer right before a colon; insaturations: the integer right after one.
df["carbons"] = lipid_names.apply(lambda name: _captured_int(r'(\d+):', name))
df["insaturations"] = lipid_names.apply(lambda name: _captured_int(r':(\d+)', name))
df["insaturations_per_Catom"] = df["insaturations"] / df["carbons"]
# Relabel "HexCer" as "Hex1Cer".
df.loc[df['class'].eq("HexCer"), 'class'] = "Hex1Cer"

# Names ending in '_uncertain' are flagged and their parsed fields blanked.
df["broken"] = lipid_names.str.endswith('_uncertain')
is_broken = df["broken"]
df.loc[is_broken, ['carbons', 'class', 'insaturations', 'insaturations_per_Catom']] = np.nan

# Look up a colour per class in the external table; flagged rows are gray, unmapped ones black.
colors = pd.read_hdf("lipidclasscolors.h5ad", key="table")
df['color'] = df['class'].map(colors['classcolors'])
df.loc[is_broken, 'color'] = "gray"
df['color'] = df['color'].fillna("black")

# Attach the per-peak statistic positionally (row order matches `meanmoranperpeak`).
mean_series = meanmoranperpeak.rename('quant')
df['quant'] = mean_series.values

# Classes drawn in gray.
extra_classes = {'CerP', 'LPA', 'PIP O', 'PGP', 'PA', 'CAR', 'ST', 'PA O', 'CoA', 'MG', 'SHexCer', 'LPE O'}
color_dict.update(dict.fromkeys(extra_classes, "gray"))

# Pool the gray classes and all "_db" names into "others".
# NOTE(review): 'LPE O' is coloured gray but kept as its own class -- confirm this is intended.
pooled_classes = extra_classes - {'LPE O'}
df.loc[df['class'].isin(pooled_classes), 'class'] = "others"
df.loc[df['lipid_name'].str.contains("_db"), 'class'] = "others"
color_dict["others"] = "gray"

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from adjustText import adjust_text

# Order the classes by ascending median of `quant`.
order = df.groupby('class')['quant'].median().sort_values().index.tolist()

# Create a single figure and axis
fig, ax = plt.subplots(figsize=(12, 8))

# Boxplot per class
sns.boxplot(data=df, x='class', y='quant', order=order,
            palette=color_dict, showfliers=False, ax=ax)

# Overlay the individual data points using a stripplot
sns.stripplot(data=df, x='class', y='quant', order=order,
              color='black', alpha=0.5, size=3, jitter=True, ax=ax)

# Dark red dashed horizontal reference line at y=0.4
ax.axhline(y=0.4, color='darkred', linestyle='--')

# Restrict the y-axis to [0, 1].
# NOTE(review): values below 0 are not visible with this range -- confirm that is acceptable here.
ax.set_ylim(0, 1)

# Remove spines for a cleaner look.
sns.despine(ax=ax)

# Label every point whose value exceeds 1 (the labels are only collected in
# `texts`; no overlap adjustment is applied to them).
texts = []
for idx, row in df[df['quant'] > 1].iterrows():
    # Base x coordinate: position of the class in `order`.
    cat = row['class']
    try:
        x_center = order.index(cat)
    except ValueError:
        continue  # class absent from the ordering (e.g. NaN): nothing to label
    jitter = np.random.uniform(-0.1, 0.1)
    x = x_center + jitter
    y = row['quant']
    t = ax.text(x, y, row['lipid_name'], fontsize=20, ha='center', va='bottom')
    texts.append(t)

# Final touches: rotate x-axis labels, add axis labels and title.
# The labels name the statistic of this figure (saved as the "_MIN" variant);
# they previously read "mean"/"average", copied from the mean figure.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_xlabel("Lipid Class")
ax.set_ylabel("Lipid-wise minimum Moran's I across sections")
ax.set_title("Spatial evaluation of lipids by minimum Moran's I")

plt.tight_layout()
plt.savefig("moranbyclass_MIN.pdf")
plt.show()


In [None]:
# Row-wise MAXIMUM over the last 138 columns of the annotation table
# (presumably one Moran's I column per section -- TODO confirm), labelled by annotation.
# NOTE: despite its name, `meanmoranperpeak` holds the maximum in this cell.
meanmoranperpeak = annowmoran.iloc[:, -138:].max(axis=1)
meanmoranperpeak.index = annowmoran['Annotation']
# Rows without any value get 0 instead of NaN.
meanmoranperpeak = meanmoranperpeak.fillna(0)
#meanmoranperpeak = meanmoranperpeak.loc[~pd.isna(meanmoranperpeak)] ##### drop the nans
#meanmoranperpeak = meanmoranperpeak.loc[meanmoranperpeak.index != "_db"]# drop the unannotated

# Not the last expression of the cell, so this sorted view is not displayed.
meanmoranperpeak.sort_values()[::-1] # 391 got an annotation and above zero Moran's, also including those with naming that is not confident.

atlas = pd.read_parquet("atlas.parquet")
import re
import numpy as np
import pandas as pd

# One row per peak: annotation name (missing -> ''), annotation score and the
# annotation table's index, stored as `mz`.
df = pd.DataFrame(meanmoranperpeak.index).fillna('')
df.columns = ["lipid_name"]
df['Score'] = annowmoran['Score'].values
df['mz'] = annowmoran.index.values
df

def extract_class(lipid_name):
    """
    Extract the lipid class from a lipid name.

    The class is the leading alphanumeric token, optionally followed by an
    "O" marker (with or without a separating space), and it must be
    terminated by whitespace or a dash: "PC O-36:4" -> "PC O",
    "PC-36:4" -> "PC". When that pattern does not match, the first
    whitespace-separated token of the name is returned instead.

    Parameters
    ----------
    lipid_name : str
        Lipid species name.

    Returns
    -------
    str
        The extracted class. Empty or whitespace-only names yield "" rather
        than raising, since callers replace missing names with '' via
        ``fillna('')`` before applying this function.
    """
    m = re.match(r'^([A-Za-z0-9]+(?:\s?O)?)[\s-]', lipid_name)
    if m:
        return m.group(1)
    tokens = lipid_name.split()
    # "".split() is an empty list, so indexing it unconditionally raises IndexError.
    return tokens[0] if tokens else ""

# Smoke-test the class parser on a few representative names.
test_lipids = ["PC O-36:4", "PC-36:4", "PE O-38:6", "PE-38:6"]
for example in test_lipids:
    print(f"{example} -> {extract_class(example)}")


def _captured_int(pattern, text):
    """Return the first group of `pattern` found in `text` as an int, or NaN if there is no match."""
    found = re.search(pattern, text)
    return int(found.group(1)) if found else np.nan


lipid_names = df["lipid_name"]
df["class"] = lipid_names.apply(extract_class)

# Carbons: the integer right before a colon; insaturations: the integer right after one.
df["carbons"] = lipid_names.apply(lambda name: _captured_int(r'(\d+):', name))
df["insaturations"] = lipid_names.apply(lambda name: _captured_int(r':(\d+)', name))
df["insaturations_per_Catom"] = df["insaturations"] / df["carbons"]
# Relabel "HexCer" as "Hex1Cer".
df.loc[df['class'].eq("HexCer"), 'class'] = "Hex1Cer"

# Names ending in '_uncertain' are flagged and their parsed fields blanked.
df["broken"] = lipid_names.str.endswith('_uncertain')
is_broken = df["broken"]
df.loc[is_broken, ['carbons', 'class', 'insaturations', 'insaturations_per_Catom']] = np.nan

# Look up a colour per class in the external table; flagged rows are gray, unmapped ones black.
colors = pd.read_hdf("lipidclasscolors.h5ad", key="table")
df['color'] = df['class'].map(colors['classcolors'])
df.loc[is_broken, 'color'] = "gray"
df['color'] = df['color'].fillna("black")

# Attach the per-peak statistic positionally (row order matches `meanmoranperpeak`).
mean_series = meanmoranperpeak.rename('quant')
df['quant'] = mean_series.values

# Classes drawn in gray.
extra_classes = {'CerP', 'LPA', 'PIP O', 'PGP', 'PA', 'CAR', 'ST', 'PA O', 'CoA', 'MG', 'SHexCer', 'LPE O'}
color_dict.update(dict.fromkeys(extra_classes, "gray"))

# Pool the gray classes and all "_db" names into "others".
# NOTE(review): 'LPE O' is coloured gray but kept as its own class -- confirm this is intended.
pooled_classes = extra_classes - {'LPE O'}
df.loc[df['class'].isin(pooled_classes), 'class'] = "others"
df.loc[df['lipid_name'].str.contains("_db"), 'class'] = "others"
color_dict["others"] = "gray"

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from adjustText import adjust_text

# Order the classes by ascending median of `quant`.
order = df.groupby('class')['quant'].median().sort_values().index.tolist()

# Create a single figure and axis
fig, ax = plt.subplots(figsize=(12, 8))

# Boxplot per class
sns.boxplot(data=df, x='class', y='quant', order=order,
            palette=color_dict, showfliers=False, ax=ax)

# Overlay the individual data points using a stripplot
sns.stripplot(data=df, x='class', y='quant', order=order,
              color='black', alpha=0.5, size=3, jitter=True, ax=ax)

# Dark red dashed horizontal reference line at y=0.4
ax.axhline(y=0.4, color='darkred', linestyle='--')

# Restrict the y-axis to [0, 1].
ax.set_ylim(0, 1)

# Remove spines for a cleaner look.
sns.despine(ax=ax)

# Label every point whose value exceeds 1 (the labels are only collected in
# `texts`; no overlap adjustment is applied to them).
texts = []
for idx, row in df[df['quant'] > 1].iterrows():
    # Base x coordinate: position of the class in `order`.
    cat = row['class']
    try:
        x_center = order.index(cat)
    except ValueError:
        continue  # class absent from the ordering (e.g. NaN): nothing to label
    jitter = np.random.uniform(-0.1, 0.1)
    x = x_center + jitter
    y = row['quant']
    t = ax.text(x, y, row['lipid_name'], fontsize=20, ha='center', va='bottom')
    texts.append(t)

# Final touches: rotate x-axis labels, add axis labels and title.
# The labels name the statistic of this figure (saved as the "_MAX" variant);
# they previously read "mean"/"average", copied from the mean figure.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_xlabel("Lipid Class")
ax.set_ylabel("Lipid-wise maximum Moran's I across sections")
ax.set_title("Spatial evaluation of lipids by maximum Moran's I")

plt.tight_layout()
plt.savefig("moranbyclass_MAX.pdf")
plt.show()


In [None]:
# Reload the cleaned annotation table (first CSV column as index) and
# count how many rows carry each `Status` value.
annowmoran = pd.read_csv("cleanedANNOTATIONS_20250215.csv", index_col=0)

annowmoran['Status'].value_counts()

In [None]:
# Keep only the peaks whose status is "measured" or "restored".
kept_statuses = ["measured", "restored"]
annowmoransaved = annowmoran.loc[annowmoran['Status'].isin(kept_statuses), :]
annowmoransaved

In [None]:
# One row per retained peak (same index as `annowmoransaved`): annotation name
# (missing -> ''), score, status and the table index stored as `mz`.
df = (
    annowmoransaved['Annotation']
    .to_frame(name="lipid_name")
    .fillna('')
    .assign(
        Score=annowmoransaved['Score'].values,
        Status=annowmoransaved['Status'].values,
        mz=annowmoransaved.index.values,
    )
)
df

def extract_class(lipid_name):
    """
    Extract the lipid class from a lipid name.

    The class is the leading alphanumeric token, optionally followed by an
    "O" marker (with or without a separating space), and it must be
    terminated by whitespace or a dash: "PC O-36:4" -> "PC O",
    "PC-36:4" -> "PC". When that pattern does not match, the first
    whitespace-separated token of the name is returned instead.

    Parameters
    ----------
    lipid_name : str
        Lipid species name.

    Returns
    -------
    str
        The extracted class. Empty or whitespace-only names yield "" rather
        than raising, since callers replace missing names with '' via
        ``fillna('')`` before applying this function.
    """
    m = re.match(r'^([A-Za-z0-9]+(?:\s?O)?)[\s-]', lipid_name)
    if m:
        return m.group(1)
    tokens = lipid_name.split()
    # "".split() is an empty list, so indexing it unconditionally raises IndexError.
    return tokens[0] if tokens else ""

# Smoke-test the class parser on a few representative names.
test_lipids = ["PC O-36:4", "PC-36:4", "PE O-38:6", "PE-38:6"]
for example in test_lipids:
    print(f"{example} -> {extract_class(example)}")


def _captured_int(pattern, text):
    """Return the first group of `pattern` found in `text` as an int, or NaN if there is no match."""
    found = re.search(pattern, text)
    return int(found.group(1)) if found else np.nan


lipid_names = df["lipid_name"]
df["class"] = lipid_names.apply(extract_class)

# Carbons: the integer right before a colon; insaturations: the integer right after one.
df["carbons"] = lipid_names.apply(lambda name: _captured_int(r'(\d+):', name))
df["insaturations"] = lipid_names.apply(lambda name: _captured_int(r':(\d+)', name))
df["insaturations_per_Catom"] = df["insaturations"] / df["carbons"]
# Relabel "HexCer" as "Hex1Cer".
df.loc[df['class'].eq("HexCer"), 'class'] = "Hex1Cer"

# Names ending in '_uncertain' are flagged and their parsed fields blanked.
df["broken"] = lipid_names.str.endswith('_uncertain')
is_broken = df["broken"]
df.loc[is_broken, ['carbons', 'class', 'insaturations', 'insaturations_per_Catom']] = np.nan

# Look up a colour per class in the external table; flagged rows are gray, unmapped ones black.
colors = pd.read_hdf("lipidclasscolors.h5ad", key="table")
df['color'] = df['class'].map(colors['classcolors'])
df.loc[is_broken, 'color'] = "gray"
df['color'] = df['color'].fillna("black")

# Classes drawn in gray.
extra_classes = {'CerP', 'LPA', 'PIP O', 'PGP', 'PA', 'CAR', 'ST', 'PA O', 'CoA', 'MG', 'SHexCer', 'LPE O'}
color_dict.update(dict.fromkeys(extra_classes, "gray"))

# Pool the gray classes and all "_db" names into "others".
# NOTE(review): 'LPE O' is coloured gray but kept as its own class -- confirm this is intended.
pooled_classes = extra_classes - {'LPE O'}
df.loc[df['class'].isin(pooled_classes), 'class'] = "others"
df.loc[df['lipid_name'].str.contains("_db"), 'class'] = "others"
color_dict["others"] = "gray"
df

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Requires `df` (with "class" and "Status" columns) and `color_dict` from the cells above.

# Number of rows per (class, Status) pair; combinations that never occur count as 0.
grouped = df.groupby(["class", "Status"]).size().unstack(fill_value=0)

# Make sure both status columns are present even if one status never occurs.
for status in ("measured", "restored"):
    if status not in grouped.columns:
        grouped[status] = 0

classes = grouped.index.tolist()

# Horizontal bar chart, one bar per class.
fig, ax = plt.subplots(figsize=(10, 6))
y_positions = np.arange(len(classes))

# Bars are truncated at this count.
x_max = 40

bar_rows = zip(classes, grouped["measured"], grouped["restored"])
for i, (cls, count_measured, count_restored) in enumerate(bar_rows):
    cls_color = color_dict.get(cls, "gray")

    # Drawn lengths: the measured part is capped at x_max, and the restored
    # part only gets whatever room is left after the measured count.
    draw_measured = min(count_measured, x_max)
    room_left = max(0, x_max - count_measured)
    draw_restored = min(count_restored, room_left)

    outline = dict(edgecolor=cls_color, linewidth=2, height=0.6)
    # Measured: filled bar.
    ax.barh(i, draw_measured, color=cls_color, **outline)
    # Restored: hollow bar stacked to the right of the measured one.
    ax.barh(i, draw_restored, left=draw_measured, facecolor='none', **outline)

# The x-axis stops at x_max, so any overflow stays hidden.
ax.set_xlim(0, x_max)
ax.set_yticks(y_positions)
ax.set_yticklabels(classes)
ax.set_xlabel("Count")
ax.set_title("Counts per Class: Measured (filled) vs Restored (empty), truncated at 40")

# Drop the top and right spines.
sns.despine(ax=ax, top=True, right=True)

fig.tight_layout()
fig.savefig("howmanyrestored.pdf")
plt.show()


In [None]:
# Load the imputation metrics table; the first CSV column becomes the row index.
metrics_df= pd.read_csv("metrics_imputation_df.csv", index_col=0)

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import gaussian_kde

fig, ax = plt.subplots(figsize=(8, 6))

# Histogram of the validation Pearson r values (raw counts, 30 bins).
pearson_r = metrics_df['val_pearson_r']
n, bins, patches = ax.hist(pearson_r, bins=30, color='black', alpha=0.7)

# All bins share one width; take it from the first pair of edges.
bin_width = bins[1] - bins[0]

# Gaussian KDE of the non-missing values, rescaled from density to counts
# (density * number of values * bin width) so it overlays the histogram.
data = pearson_r.dropna().values
kde = gaussian_kde(data)
x_vals = np.linspace(bins[0], bins[-1], 300)
count_scale = len(data) * bin_width
kde_vals = kde(x_vals) * count_scale

ax.plot(x_vals, kde_vals, color='black', lw=2)

# Red dashed vertical reference line at x = 0.4.
ax.axvline(x=0.4, color='red', linestyle='--', lw=3)

# Drop the top and right spines.
sns.despine(ax=ax, top=True, right=True)
fig.savefig("XGB_restorationperf.pdf")
plt.show()


In [None]:
# Load the table stored in the local HDF file and display it.
alldata = pd.read_hdf("20241103_pixels_allips_allbrains_allen_pixelcleaned.h5ad")
alldata

In [None]:
# Display the atlas table currently in memory.
atlas

In [None]:
# Show only the rows of `df` whose status is "restored".
df.loc[df['Status'] == "restored",:]

In [None]:
# Metrics sorted by descending validation Pearson r, with rows containing NaN removed.
tmp = metrics_df.sort_values(by="val_pearson_r")[::-1].dropna()

# Keep entries with r below 0.82 whose index also appears among the "restored" rows of `df`.
tmp = tmp.loc[tmp["val_pearson_r"] < .82,:]
tmp = tmp.loc[tmp.index.isin(df.loc[df['Status'] == "restored",:].index),:]
# Display rows 20-39 of that ranking.
tmp.iloc[20:40,:]

In [None]:
# Look up the row of `df` whose index label is 766.514720.
df.loc[766.514720,:]

In [None]:
# Select from `alldata` the rows whose labels match the atlas index, in atlas order.
native = alldata.loc[atlas.index,:]
native

In [None]:
# TODO: many of these values come from other brains' inputs -- is this comparison fair?
# It needs a proper justification.
for section_id in range(1, 33):
    x = native.loc[native['Section'] == section_id, '766.514720']
    y = atlas.loc[atlas['Section'] == section_id, 'PE O-36:3']

    # One figure per section, plotting every 10th row of the section.
    plt.scatter(x.iloc[::10], y.iloc[::10], s=0.01, rasterized=True)
    plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.cm import ScalarMappable
from matplotlib.colors import Normalize
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42

# Frame and column to display; `atlas` must already be defined
data = atlas
currentLipid = 'PE O-36:3'

# 2nd and 98th percentile of currentLipid within each section, computed in a
# single grouped pass instead of re-filtering the whole frame per section.
by_section = data.groupby('Section')[currentLipid]
percentile_df = pd.DataFrame({
    '2-perc': by_section.quantile(0.02),
    '98-perc': by_section.quantile(0.98),
}).reset_index()

# The median of the per-section percentiles gives one colour range shared by
# every panel.
med2p = percentile_df['2-perc'].median()
med98p = percentile_df['98-perc'].median()

# Specify the sections to plot
sections_to_plot = [3, 7, 11, 14, 18, 27]

# One row, one panel per requested section
fig, axes = plt.subplots(1, len(sections_to_plot), figsize=(20, 4), squeeze=False)
axes = axes.ravel()  # ensure axes is a 1D array

# Plot each specified section
for ax, section in zip(axes, sections_to_plot):
    ddf = data[data['Section'] == section]
    ax.scatter(ddf['zccf'], -ddf['yccf'],
               c=ddf[currentLipid], cmap="plasma", s=0.5,
               rasterized=True, vmin=med2p, vmax=med98p)
    ax.axis('off')
    ax.set_aspect('equal')

# Lay out the panels first, keeping the right 10% of the figure free.
# Doing this before the colorbar axes exist avoids matplotlib's
# "Axes that are not compatible with tight_layout" warning.
plt.tight_layout(rect=[0, 0, 0.9, 1])

# Add a colorbar to the right, in the space left free above
cbar_ax = fig.add_axes([0.92, 0.15, 0.02, 0.7])
norm = Normalize(vmin=med2p, vmax=med98p)
sm = ScalarMappable(norm=norm, cmap="plasma")
fig.colorbar(sm, cax=cbar_ax)

plt.savefig("restoredexample.pdf")
plt.show()


In [None]:
# plot a lipid

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.cm import ScalarMappable
from matplotlib.colors import Normalize
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42

# Frame and column to display; `native` must already be defined
data = native
currentLipid = '766.514720'

# 2nd and 98th percentile of currentLipid within each section, computed in a
# single grouped pass instead of re-filtering the whole frame per section.
by_section = data.groupby('Section')[currentLipid]
percentile_df = pd.DataFrame({
    '2-perc': by_section.quantile(0.02),
    '98-perc': by_section.quantile(0.98),
}).reset_index()

# The median of the per-section percentiles gives one colour range shared by
# every panel.
med2p = percentile_df['2-perc'].median()
med98p = percentile_df['98-perc'].median()

# Specify the sections to plot
sections_to_plot = [3, 7, 11, 14, 18, 27]

# One row, one panel per requested section
fig, axes = plt.subplots(1, len(sections_to_plot), figsize=(20, 4), squeeze=False)
axes = axes.ravel()  # ensure axes is a 1D array

# Plot each specified section
for ax, section in zip(axes, sections_to_plot):
    ddf = data[data['Section'] == section]
    ax.scatter(ddf['zccf'], -ddf['yccf'],
               c=ddf[currentLipid], cmap="plasma", s=0.5,
               rasterized=True, vmin=med2p, vmax=med98p)
    ax.axis('off')
    ax.set_aspect('equal')

# Lay out the panels first, keeping the right 10% of the figure free.
# Doing this before the colorbar axes exist avoids matplotlib's
# "Axes that are not compatible with tight_layout" warning.
plt.tight_layout(rect=[0, 0, 0.9, 1])

# Add a colorbar to the right, in the space left free above
cbar_ax = fig.add_axes([0.92, 0.15, 0.02, 0.7])
norm = Normalize(vmin=med2p, vmax=med98p)
sm = ScalarMappable(norm=norm, cmap="plasma")
fig.colorbar(sm, cax=cbar_ax)

plt.savefig("prerestoredexample.pdf")
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Sections shown, one panel each
sections_to_plot = [3, 7, 11, 14, 18, 27]

# Derive the grid from the section list so that editing the list cannot
# silently drop panels; 3 inches per panel keeps the original 18x3 size for
# six sections.
n_panels = len(sections_to_plot)
fig, axes = plt.subplots(1, n_panels, figsize=(3 * n_panels, 3), squeeze=False)
axes = axes.ravel()  # 1D array of panels, also when there is a single one

for ax, section in zip(axes, sections_to_plot):
    # '766.514720' from `native` on x, 'PE O-36:3' from `atlas` on y,
    # both restricted to the current section
    x = native.loc[native['Section'] == section, '766.514720']
    y = atlas.loc[atlas['Section'] == section, 'PE O-36:3']

    # Plot every 10th point with darkred dots
    ax.scatter(x[::10], y[::10], s=0.01, color='darkred', rasterized=True)

    # Remove top and right spines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Remove all ticks and tick labels
    ax.set_xticks([])
    ax.set_yticks([])

    # Title each panel with its section number
    ax.set_title(f"Section {section}", fontsize=10)

plt.tight_layout()
plt.savefig("measuredvsimputed.pdf")
plt.show()
