In [None]:
import os
import numpy as np
import pandas as pd
import zarr
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import Counter
from rdkit import Chem

## Define helper functions to handle strings, adducts, databases, scores

In [None]:
# some helper functions

# Relative mass tolerance, in parts per million, used to accept a database entry as a match for a peak.
ppm = 5 # desired ppm to validate an annotation
# Representative m/z used to convert the ppm tolerance into one fixed absolute window.
reference_mz = 800 # scale of our dataset
# Absolute tolerance equal to `ppm` evaluated at `reference_mz` (5 / 1e6 * 800 = 0.004).
distance_ab5ppm = ppm / 1e6 * reference_mz


def find_matching_lipids(path_mz, lipid_mz_df):
    """Join the labels of every reference lipid lying within `ppm` of a peak.

    Parameters:
    path_mz: m/z of the peak; the window half-width is `ppm / 1e6 * path_mz`.
    lipid_mz_df (pd.DataFrame): reference table with an 'm/z' and a 'Lipids' column.

    Returns:
    str: the matching 'Lipids' values joined by ', ' ('' when nothing matches).
    """
    half_width = ppm / 1e6 * path_mz
    reference_mz_values = lipid_mz_df['m/z']
    in_window = (reference_mz_values >= path_mz - half_width) & (reference_mz_values <= path_mz + half_width)
    return ', '.join(lipid_mz_df.loc[in_window, 'Lipids'])

def create_adduct_entries(row):
    """Expand one row into one copy per entry of the global `adducts` mapping.

    Each copy has the adduct mass added to 'Exact Mass' and the adduct name
    stored under 'Adduct'. The input row itself is not modified.

    Returns:
    list: the shifted copies, in the iteration order of `adducts`.
    """
    def shifted_copy(adduct, mass_diff):
        entry = row.copy()
        entry['Exact Mass'] += mass_diff
        entry['Adduct'] = adduct
        return entry

    return [shifted_copy(adduct, mass_diff) for adduct, mass_diff in adducts.items()]

def find_closest_mz(peaks_mz, metaspace_df):
    """Return the METASPACE fdr/name/msm of the row whose 'mz' is nearest to a peak.

    The nearest row is accepted only if its absolute distance is at most the
    global `distance_ab5ppm`; otherwise a Series of three NaNs with the same
    labels is returned.
    """
    wanted = ['METASPACEfdr', 'METASPACEname', 'METASPACEmsm']
    deviation = np.abs(metaspace_df['mz'].astype(float) - peaks_mz)
    nearest_label = deviation.idxmin()
    if deviation[nearest_label] <= distance_ab5ppm:
        return metaspace_df.loc[nearest_label, wanted]
    return pd.Series([np.nan, np.nan, np.nan], index=wanted)

def map_hmdb_ids(hmdb_ids):
    """Translate a comma-separated string of ids through the global `conversion_dict`.

    Ids are stripped of surrounding whitespace before lookup; ids missing from
    the dictionary map to ''. Returns the sorted unique values as a numpy array.
    """
    abbreviations = []
    for raw_id in hmdb_ids.split(','):
        abbreviations.append(conversion_dict.get(raw_id.strip(), ''))
    return np.unique(abbreviations)

def process_values(values):
    """Split a ', '-joined candidate string and keep the text before ' -' in each item.

    Returns an empty list when `values` is missing (NaN/None) or ''.
    """
    if pd.isna(values) or values == '':
        return []
    return [candidate.split(' -')[0] for candidate in values.split(', ')]

def map_to_species_name(candidates):
    """Map candidate names through the global `name_to_species` dictionary.

    Names absent from the dictionary become 'Unknown'. Returns the sorted
    unique results as a numpy array.
    """
    species = []
    for candidate in candidates:
        species.append(name_to_species.get(candidate, 'Unknown'))
    return np.unique(species)

def find_closest_abbreviation(mz_list, lipidmaps):
    """For each mass, return the 'ABBREVIATION' of the nearest 'EXACT_MASS' row.

    A mass whose nearest row is farther than the global `distance_ab5ppm`
    yields None. The output list has one entry per element of `mz_list`.
    """
    def lookup(mz):
        abs_diffs = np.abs(lipidmaps['EXACT_MASS'] - mz)
        if np.min(abs_diffs) <= distance_ab5ppm:
            return lipidmaps.at[abs_diffs.idxmin(), 'ABBREVIATION']
        return None

    return [lookup(mz) for mz in mz_list]

def update_score(row):
    """Return the row's 'Score' plus 8 when 'ShortNamesTANDEMMS' is non-empty, else unchanged."""
    has_tandem_names = len(row['ShortNamesTANDEMMS']) > 0
    return (row['Score'] + 8) if has_tandem_names else row['Score']

def process_ILICColumn_column(cell):
    """Split a comma-separated cell and keep the stripped text before ' -' in each item."""
    names = []
    for item in cell.split(','):
        head, _, _ = item.partition(' -')
        names.append(head.strip())
    return names

def process_shortnames(shortnames_list):
    """Replace parentheses with spaces and drop trailing whitespace in every name.

    Returns None when the input is None, otherwise a new list.
    """
    if shortnames_list is None:
        return None
    parentheses_to_spaces = str.maketrans('()', '  ')
    return [entry.translate(parentheses_to_spaces).rstrip() for entry in shortnames_list]

def has_nonempty_intersection(list1, list2):
    """Tell whether two collections of names share at least one element.

    Returns False when either argument is None, when `list2` is [] or [''],
    or when `list1` is empty, not indexable, or starts with an empty/unsized
    element. Otherwise returns whether the two sets intersect.
    """
    if list1 is None or list2 is None:
        return False
    if list2 == [] or list2 == ['']:
        return False
    try:
        if len(list1[0]) == 0:
            return False
    except (IndexError, KeyError, TypeError):
        # Only the failures of the probe itself (empty/unindexable container,
        # unsized first element) mean "nothing to compare"; a bare `except:`
        # would also swallow KeyboardInterrupt and SystemExit.
        return False
    return bool(set(list1) & set(list2))

def update_annotation_with_shortnames(row):
    """Fill an empty 'Annotation' from 'ShortNamesILICColumn' and add 2 to 'Score'.

    The annotation counts as empty when it is None, has length 0, or its first
    element is ''. The candidates are used (and printed) only when non-empty
    and different from ['']. The row is modified in place and returned.
    """
    current = row['Annotation']
    if not ((current is None) or (len(current) == 0) or (current[0] == '')):
        return row
    candidates = row['ShortNamesILICColumn']
    if (len(candidates) > 0) & (candidates != ['']):
        print(candidates)
        row['Annotation'] = candidates
        row['Score'] += 2
    return row

def update_annotation_with_shortnames_reversecolumn(row):
    """Fill an empty 'Annotation' from 'NameReversePhaseColumn' and add 2 to 'Score'.

    Nothing happens when the annotation is already set, or when the
    reverse-phase name is None, empty, or equal to ['']. The accepted name is
    printed. The row is modified in place and returned.
    """
    current = row['Annotation']
    unannotated = (current is None) or (len(current) == 0) or (current[0] == '')
    name = row['NameReversePhaseColumn']
    if unannotated & (name is not None):
        if (len(name) > 0) & (name != ['']):
            print(name)
            row['Annotation'] = name
            row['Score'] += 2
    return row

def update_annotation_with_shortnames2(row):
    """Fill an empty 'Annotation' from 'AnalyticalChemistryStudy' and add 1 to 'Score'.

    Nothing happens when the study entry is None, empty or [''], or when the
    annotation is already set. The accepted entry is printed. The row is
    modified in place and returned.
    """
    study_entry = row['AnalyticalChemistryStudy']
    if study_entry is None:
        return row
    current = row['Annotation']
    if not ((current is None) or (len(current) == 0) or (current[0] == '')):
        return row
    if (len(study_entry) > 0) & (study_entry != ['']):
        print(study_entry)
        row['Annotation'] = study_entry
        row['Score'] += 1
    return row

def update_annotation_with_shortnames2b(row):
    """Fill an empty 'Annotation' from 'Fitzner within +-distance_ab5ppm' and add 1 to 'Score'.

    Nothing happens when the Fitzner entry is None, empty or [''], or when the
    annotation is already set. The accepted entry is printed. The row is
    modified in place and returned.
    """
    fitzner_entry = row['Fitzner within +-distance_ab5ppm']
    if fitzner_entry is None:
        return row
    current = row['Annotation']
    if not ((current is None) or (len(current) == 0) or (current[0] == '')):
        return row
    if (len(fitzner_entry) > 0) & (fitzner_entry != ['']):
        print(fitzner_entry)
        row['Annotation'] = fitzner_entry
        row['Score'] += 1
    return row

def update_annotation_with_metaspace(row):
    """Fill an empty 'Annotation' with the first usable 'METASPACEname'.

    numpy arrays are converted to lists first. When 'METASPACEname' is a
    list/tuple its first non-empty element is used. If a name is available and
    the current annotation is empty, the name is printed and stored, and
    'Score' grows by 1 when 'METASPACEfdr' <= 0.20 and by 0.5 otherwise.
    The row is modified in place and returned.
    """
    def is_empty_value(val):
        # None, '', empty sequences, sequences made only of ''/NaN, and NaN all count as empty.
        if val is None:
            return True
        if isinstance(val, str):
            return len(val) == 0
        if isinstance(val, (list, tuple)):
            return len(val) == 0 or all(x == '' or pd.isna(x) for x in val)
        return pd.isna(val)

    name = row['METASPACEname']
    annotation = row['Annotation']
    if isinstance(name, np.ndarray):
        name = name.tolist()
    if isinstance(annotation, np.ndarray):
        annotation = annotation.tolist()

    if isinstance(name, (list, tuple)):
        name = next((x for x in name if not is_empty_value(x)), None)

    if is_empty_value(name) or not is_empty_value(annotation):
        return row

    print(name)
    row['Annotation'] = name
    # 0.20 is considered the threshold in the community
    bonus = 1 if row['METASPACEfdr'] <= 0.20 else 0.5
    row['Score'] += bonus
    return row

def find_closest_name(mz, mz_list, name_list):
    """Return the name paired with the m/z in `mz_list` nearest to `mz`.

    The nearest value is accepted only when it lies within the global `ppm`
    tolerance, computed relative to that nearest value; otherwise None.
    """
    nearest = min(mz_list, key=lambda candidate: abs(candidate - mz))
    within_tolerance = abs(nearest - mz) <= ppm / 1e6 * nearest
    return name_list[mz_list.index(nearest)] if within_tolerance else None

def ensure_numpy_array(x):
    """Return `x` itself when it is already an ndarray, otherwise `np.array(x)`."""
    if isinstance(x, np.ndarray):
        return x
    return np.array(x)
def safe_tuple(x):
    """Convert any value into a tuple so it can be hashed or compared.

    - ndarray: its flattened elements (an empty array gives an empty tuple).
    - other non-string iterables: their elements.
    - anything else, including strings: a one-element tuple holding the value.
    """
    if isinstance(x, np.ndarray):
        if x.size == 0:
            # .item() only works on exactly one element; an empty array used to raise ValueError here.
            return ()
        return tuple(x.flatten()) if x.size > 1 else (x.item(),)
    elif hasattr(x, '__iter__') and not isinstance(x, str):
        return tuple(x)
    else:
        return (x,)

def nan_safe_key(t):
    """Copy an iterable into a tuple, replacing float NaNs with the string 'NaN'.

    NaN never equals itself, so the replacement lets equal tuples hash and
    compare as equal.
    """
    def normalise(x):
        return 'NaN' if isinstance(x, float) and np.isnan(x) else x

    return tuple(map(normalise, t))


def prioritize_annotations(row):
    """Pick the best-supported annotation(s) of a row from their frequencies.

    Reads 'Annotation' (candidate names) and 'AnnotFreq' (one count per
    candidate, paired by position) and prints the candidates.

    Returns:
    - '' when there are no candidates, the candidates have no length, or no
      (frequency, annotation) pair can be formed;
    - the top candidate when its frequency beats the runner-up by at least 3
      (a missing runner-up counts as 0);
    - otherwise a list with the top two candidates (or the single one);
    - the first candidate when ranking fails (the error is printed).
    """
    annotations = row['Annotation']
    frequencies = row['AnnotFreq']
    print(annotations)
    try:
        if len(annotations) == 0:
            return ''
    except TypeError:
        # len() is undefined for scalars such as NaN: treat as "no annotation".
        return ''
    try:
        ranked = sorted(zip(frequencies, annotations), reverse=True, key=lambda pair: pair[0])

        if len(ranked) == 0:
            return ''

        highest_freq = ranked[0][0]
        second_highest_freq = ranked[1][0] if len(ranked) > 1 else 0

        if highest_freq >= second_highest_freq + 3:
            return ranked[0][1]
        return [ranked[0][1], ranked[1][1]] if len(ranked) > 1 else [ranked[0][1]]

    except Exception as e:
        print(f"Error processing row: {e}")
        # Test the length explicitly: truth-testing a multi-element numpy array
        # raises ValueError, which used to break this fallback.
        return annotations[0] if len(annotations) > 0 else ''

def process_annotations(annotation):
    """Turn an annotation (list or string) into text without brackets or single quotes.

    A list is first joined with single spaces; then every '[', ']' and "'"
    character is removed.
    """
    text = ' '.join(annotation) if isinstance(annotation, list) else annotation
    for unwanted in ('[', ']', "'"):
        text = text.replace(unwanted, '')
    return text

def split_at_second_space(s):
    """Split a space-separated string into two lists of tokens.

    With three tokens or fewer the result is ([s], []). Otherwise the cut falls
    after the first empty token found at position 2 or later, or after the
    second token when there is no such empty token.
    """
    tokens = s.split(' ')
    if len(tokens) <= 3:
        return [s], []
    cut = tokens.index('', 2) if '' in tokens[2:] else 1
    return tokens[:cut + 1], tokens[cut + 1:]
        
def map_frequency(annotation):
    """Look up how often an annotation occurs in the global `frequency_df`.

    The annotation is printed first. When it contains exactly three spaces it
    is split with `split_at_second_space`, the first half is printed, and the
    two frequencies are returned as the string "f1; f2" (0 for a missing
    entry). Otherwise the entry's frequency minus one is returned, or 0 when
    the entry is absent.
    """
    def lookup(entry):
        return frequency_df.loc[frequency_df['Entry'] == entry, 'Frequency']

    print(annotation)
    if annotation.count(' ') != 3:
        freq = lookup(annotation)
        return freq.values[0]-1 if not freq.empty else 0

    first_part, second_part = split_at_second_space(annotation)
    part1 = ' '.join(first_part)
    part2 = ' '.join(second_part)
    print(part1)
    freq1 = lookup(part1)
    freq2 = lookup(part2)
    return f"{freq1.values[0] if not freq1.empty else 0}; {freq2.values[0] if not freq2.empty else 0}"

def split_lipids(entry):
    """Group the whitespace-separated words of a string into "word word" pairs.

    A trailing unpaired word is dropped. Non-string input gives an empty list.
    """
    if not isinstance(entry, str):
        return []
    words = entry.split()
    return [f"{head} {tail}" for head, tail in zip(words[0::2], words[1::2])]

def process_lipid_annotations(filtered_df, lipid_dict):
    """
    Keep only the rows whose annotation is selected by `lipid_dict`, plus unannotated rows.

    For every (lipid, annotations) pair of `lipid_dict`:
    - if the lipid itself appears among its annotations, only the lipid is selected;
    - otherwise all of its annotations are selected.
    Rows whose 'Annotation' is NaN are always kept.

    Parameters:
    filtered_df (pd.DataFrame): DataFrame containing an 'Annotation' column
    lipid_dict (dict): Dictionary where keys are unique lipids and values are lists of annotations

    Returns:
    pd.DataFrame: A copy of the selected rows
    """
    # A dict is used as an insertion-ordered set of the selected annotations.
    selected = {}
    for lipid, annotations in lipid_dict.items():
        chosen = [lipid] if lipid in annotations else annotations
        selected.update(dict.fromkeys(chosen))

    annotation_column = filtered_df['Annotation']
    keep_rows = annotation_column.isin(list(selected)) | annotation_column.isna()
    return filtered_df[keep_rows].copy()

def find_closest_entry(peaks_df, posionmode_analytical_df, tolerance_ppm=None):
    """Annotate every peak with the closest reference entry inside the ppm window.

    For each index value of `peaks_df` (used as the peak m/z), the reference
    rows whose 'Measured' value lies within the tolerance (relative to
    'Measured') are collected, and the 'Species Level ID' of the nearest one
    is stored as a string. Peaks without any candidate get None.

    Parameters:
    peaks_df (pd.DataFrame): peaks, indexed by m/z. Gains the column
        'AnalyticalChemistryStudy' (modified in place).
    posionmode_analytical_df (pd.DataFrame): reference table with 'Measured'
        and 'Species Level ID'. A scratch column 'Difference' is (re)written
        on it, as in the previous implementation.
    tolerance_ppm: tolerance in ppm; defaults to the global `ppm`.

    Returns:
    pd.DataFrame: `peaks_df` itself.
    """
    tolerance = ppm if tolerance_ppm is None else tolerance_ppm
    analytical_study = []

    for peak_mz in peaks_df.index:
        posionmode_analytical_df['Difference'] = np.abs(posionmode_analytical_df['Measured'] - peak_mz)
        candidates = posionmode_analytical_df[
            posionmode_analytical_df['Difference'] <= tolerance / 1e6 * posionmode_analytical_df['Measured']
        ]

        if candidates.empty:
            analytical_study.append(None)
            continue

        # Ascending sort puts the nearest entry first; the previous iloc[-1]
        # picked the farthest entry that was still inside the window.
        closest_row = candidates.sort_values(by='Difference').iloc[0]
        analytical_study.append(f"{closest_row['Species Level ID']}")

    peaks_df['AnalyticalChemistryStudy'] = analytical_study
    return peaks_df

def split_at_second_space2(s):
    """Split a string at its second space into a (head, tail) pair of strings.

    Strings with three space-separated tokens or fewer are returned whole as
    (s, None).
    """
    if len(s.split(' ')) <= 3:
        return s, None
    first_space = s.find(' ')
    second_space = s.find(' ', first_space + 1)
    return s[:second_space], s[second_space + 1:]

## Preprocess MS datasets and databases and use each of them to annotate our peaks

## In house LC-MS

In [None]:
# import the m/z peaks

PATH_DATA = '/data/LBA_DATA/ALL_DATA_PROCESSED/181024_BrainTOTAL_ALL_MOLECULES'
# zarr persistence modes are 'r', 'r+', 'a', 'w' and 'w-'; 'rb' is not one of them.
# The store is only listed here, so it is opened read-only.
root = zarr.open(PATH_DATA, mode='r')
# one group per peak; the group keys are the m/z values stored as strings (sorted as strings)
PATH_MZ = np.sort(list(root.group_keys()))

# annotate lipids with the ESI LC-MS generated in house
lipid_mz_df = pd.read_csv("lcms_mar2022_withcounterions (2).txt", index_col=0)
lipid_mz_df["m/z"] = lipid_mz_df["m/z"].astype(float)
# label of the form "<lipid> - <adduct>_<m/z>", later parsed back by splitting on ' -'
lipid_mz_df["Lipids"] = lipid_mz_df["Lipid"] + " - " + lipid_mz_df["Adduct"] + "_" + lipid_mz_df["m/z"].astype(str)

# plain lipid names of the in-house dataset
ilic = lipid_mz_df['Lipid'].values.copy()

In [None]:
# One row per peak: the m/z (read as a string from the zarr group keys) becomes both
# the float index and the float 'PATH_MZ' column.
assignment_df = pd.DataFrame(PATH_MZ, columns=['PATH_MZ'])             
assignment_df.index = assignment_df['PATH_MZ'].astype(float)
assignment_df['PATH_MZ'] = assignment_df['PATH_MZ'].astype(float)
# In-house LC-MS candidates within the ppm window of each peak, joined by ', ' ('' when none).
assignment_df['ILICColumn within +-distance_ab5ppm'] = assignment_df['PATH_MZ'].apply(lambda mz: find_matching_lipids(mz, lipid_mz_df))
assignment_df

In [None]:
# check how many were annotated
len(assignment_df['ILICColumn within +-distance_ab5ppm'].unique())

## Tandem LC-MS2

In [None]:
# annotate lipids with the TANDEM LC-MS, passed through the goslin database online portal (https://lifs-tools.org/goslin.html)

# NOTE: `lipid_mz_df` is reused here; from now on it holds the goslin output, not the in-house table.
lipid_mz_df = pd.read_csv("goslin_output.tsv",sep='\t')
goslin = lipid_mz_df[['Normalized Name', 'Exact Mass']]

# add to exact masses the ions
# Mass added to the neutral exact mass for each adduct; read as a global by create_adduct_entries.
adducts = {
    'Na+': 22.989769,
    'K+': 38.963707,
    'H+': 1.007276,
    'NH4+': 18.033823
}
# Species names of the tandem dataset, skipping the first row.
# NOTE(review): the reason for dropping the first entry is not visible here — confirm.
tandem = lipid_mz_df[['Species Name']].iloc[:,0].values.copy()[1:]

lipid_mz_df

In [None]:
# Drop entries without an exact mass, then expand every remaining entry into one row per adduct.
filtered_goslin = goslin.dropna(subset=['Exact Mass'])
new_entries = filtered_goslin.apply(create_adduct_entries, axis=1)
# `new_entries` holds one list of rows per input row; flatten it into a single frame.
goslin = pd.DataFrame([entry for sublist in new_entries for entry in sublist])
# `lipid_mz_df` and `goslin` are the same object from here on.
lipid_mz_df = goslin
lipid_mz_df["m/z"] = lipid_mz_df["Exact Mass"].astype(float)
# label of the form "<normalized name> - <adduct>_<m/z>"
lipid_mz_df["Lipids"] = lipid_mz_df["Normalized Name"] + " - " + lipid_mz_df["Adduct"] + "_" + lipid_mz_df["m/z"].astype(str)

# Tandem MS candidates within the ppm window of each peak.
assignment_df['TANDEMMS within +-distance_ab5ppm'] = assignment_df['PATH_MZ'].apply(lambda mz: find_matching_lipids(mz, lipid_mz_df))
lipid_mz_df

In [None]:
# little snippet to check fast what's around (and how far) a given peak of interest
lipid_mz_df.loc[(lipid_mz_df['Exact Mass'] - 869.649307).abs().idxmin()]

In [None]:
len(assignment_df['TANDEMMS within +-distance_ab5ppm'].unique())

## Literature key datasets

In [None]:
# annotate lipids with the Fitzner dataset, passed through the goslin database online portal (https://lifs-tools.org/goslin.html)

fitzner = pd.read_csv("fitzner_normalized.tsv",sep='\t')
goslinfitzner = fitzner[['Species Name', 'Exact Mass']]

# add to exact masses the ions
# Same values as the `adducts` table of the tandem section; redefined so this cell can run on its own.
adducts = {
    'Na+': 22.989769,
    'K+': 38.963707,
    'H+': 1.007276,
    'NH4+': 18.033823
}

# Species names of the Fitzner dataset, skipping the first row.
fitzner_ = fitzner[['Species Name']].iloc[:,0].values.copy()[1:]

# Same expansion as for the tandem data: one row per (species, adduct).
filtered_goslinfitzner = goslinfitzner.dropna(subset=['Exact Mass'])
new_entries = filtered_goslinfitzner.apply(create_adduct_entries, axis=1)
goslinfitzner = pd.DataFrame([entry for sublist in new_entries for entry in sublist])
lipid_mz_dffitzner = goslinfitzner
lipid_mz_dffitzner["m/z"] = lipid_mz_dffitzner["Exact Mass"].astype(float)
# Unlike the other datasets, the label is the bare species name (no adduct / m/z suffix).
lipid_mz_dffitzner["Lipids"] = lipid_mz_dffitzner["Species Name"] #+ " - " + lipid_mz_dffitzner["Adduct"] + "_" + lipid_mz_dffitzner["m/z"].astype(str)
lipid_mz_dffitzner = lipid_mz_dffitzner.drop_duplicates()
lipid_mz_dffitzner

In [None]:
# Fitzner candidates within the ppm window of each peak, as one ', '-joined string per peak.
assignment_df['Fitzner within +-distance_ab5ppm'] = assignment_df['PATH_MZ'].apply(lambda mz: find_matching_lipids(mz, lipid_mz_dffitzner))

# number of distinct candidate strings (the empty string counts as one)
len(assignment_df['Fitzner within +-distance_ab5ppm'].unique())

In [None]:
# import the dataset from a nice MSI study of brain lipidomics (https://pubs.acs.org/doi/10.1021/acs.analchem.3c02724)

posionmode_analytical = pd.read_csv("posionmode_acs.csv")

# augment that dataset to also keep into account all possible counterions we may get
# Keys must match the values of the 'Adduct' column of the CSV (a missing key raises KeyError below).
adduct_masses = {
    '[M+H]+': 1.007276,
    '[M+Na]+': 22.989769,
    '[M+K]+': 38.963707,
    '[M+NH4]+': 18.033823
}

# Back out the neutral mass from the measured m/z and the reported adduct ...
posionmode_analytical['Original Mass'] = posionmode_analytical.apply(lambda row: row['Measured'] - adduct_masses[row['Adduct']], axis=1)
# ... then emit one row per possible adduct, with 'Measured' recomputed for that adduct.
new_rows = []
for _, row in posionmode_analytical.iterrows():
    for adduct, mass in adduct_masses.items():
        new_row = row.copy()
        new_row['Adduct'] = adduct
        new_row['Measured'] = row['Original Mass'] + mass
        new_rows.append(new_row)

# NOTE(review): each copied row keeps the label of its source row, so index labels
# presumably repeat once per adduct in the rebuilt frame — confirm before label-based lookups.
posionmode_analytical = pd.DataFrame(new_rows)
analchem=posionmode_analytical["Species Level ID"].values
posionmode_analytical

In [None]:
# closest entry for each peak within +-distance_ab5ppm
# find_closest_entry adds the column to `assignment_df` in place and returns that same object,
# so `peaks_df` and `assignment_df` are one DataFrame from here on.
peaks_df = find_closest_entry(assignment_df, posionmode_analytical)

# Intended to wrap every non-missing study annotation in a one-element list.
# NOTE(review): `peaks_df.iloc[i,:]` builds a new row Series, so this assignment most likely
# never reaches `peaks_df` and the column keeps plain strings — confirm. Writing real lists
# here would also make `.unique()` on this column fail (lists are unhashable).
for i in range(peaks_df.shape[0]):
    if peaks_df.iloc[i,:]['AnalyticalChemistryStudy'] is None:
        continue
    else:
        peaks_df.iloc[i,:]['AnalyticalChemistryStudy'] = [peaks_df.iloc[i,:]['AnalyticalChemistryStudy']]

peaks_df

In [None]:
len(peaks_df['AnalyticalChemistryStudy'].unique())

## LIPIDMAPS

In [None]:
# preprocess LIPIDMAPS

# load SDF file
supplier = Chem.SDMolSupplier('structures.sdf')

# load the first molecule from the supplier
first_molecule = next(iter(supplier))

if first_molecule is not None:
    # get the names of all keys
    keys = first_molecule.GetPropNames()
    print(list(keys))

# SDF property name -> output column name, in output column order.
# The 'INCHY_KEY' spelling is the column name the HMDB merge below joins on.
sdf_property_to_column = {
    'LM_ID': 'LM_ID',
    'NAME': 'NAME',
    'SYSTEMATIC_NAME': 'SYSTEMATIC_NAME',
    'CATEGORY': 'CATEGORY',
    'MAIN_CLASS': 'MAIN_CLASS',
    'EXACT_MASS': 'EXACT_MASS',
    'ABBREVIATION': 'ABBREVIATION',
    'INCHI_KEY': 'INCHY_KEY',
}

# one list per output column, filled in parallel
data = {column: [] for column in sdf_property_to_column.values()}

# iterate over the molecules in the SDF file; unparsable records (None) are skipped
for molecule in tqdm(supplier):
    if molecule is None:
        continue
    for sdf_property, column in sdf_property_to_column.items():
        # a property missing on this molecule is recorded as None
        data[column].append(molecule.GetProp(sdf_property) if molecule.HasProp(sdf_property) else None)

# the individual lists stay available under their previous names
lm_id_list = data['LM_ID']
name_list = data['NAME']
systematic_name_list = data['SYSTEMATIC_NAME']
category_list = data['CATEGORY']
main_class_list = data['MAIN_CLASS']
mass_list = data['EXACT_MASS']
abbreviation_list = data['ABBREVIATION']
ik_list = data['INCHY_KEY']

df = pd.DataFrame(data)

lipidmaps = df
lipidmaps_hmdb = lipidmaps.copy()

# prepare using HMDB to match METASPACE and LIPIDMAPS naming conventions

hmdb = pd.read_csv("HMDB_complete.csv", index_col=0)
merged_df = pd.merge(lipidmaps_hmdb, hmdb, left_on='INCHY_KEY', right_on='InchiKey', how='left')

# (HMDB id, LIPIDMAPS abbreviation) pairs for which both sides are known
conversionhmdb = merged_df[['DBID', 'ABBREVIATION']].dropna()

## METASPACE

In [None]:
# METASPACE annotations downloaded from the portal after uploading our raw data
data_path = '/data/LBA_DATA/METASPACE_annotations/' ######
# every file of the folder is read as a CSV (no extension filter)
file_paths = [os.path.join(data_path, f) for f in os.listdir(data_path)]
# the first two lines of each export are skipped before the header row
df_list = [pd.read_csv(f, skiprows=2, index_col=0)[['datasetName', 'adduct', 'mz', 'msm', 'fdr', 'moleculeNames', 'moleculeIds']].reset_index(drop=True) for f in file_paths]

metaspace = pd.concat(df_list)
metaspace = metaspace[['mz',	'msm',	'fdr',	'moleculeNames', 'adduct', 'moleculeIds']].drop_duplicates()

# the concatenated frame repeats index labels across files; reset to get unique ones
metaspace = metaspace.reset_index()
# placeholder columns, filled peak by peak in the next cell
# NOTE(review): these start as float columns while 'METASPACEname' later receives arrays of
# strings — confirm the cell-wise assignment behaves as intended with the installed pandas.
peaks_df['METASPACEfdr'] = np.nan
peaks_df['METASPACEname'] = np.nan
peaks_df['METASPACEmsm'] = np.nan

# HMDB id -> LIPIDMAPS abbreviation; read as a global by map_hmdb_ids
conversion_dict = conversionhmdb.set_index('DBID')['ABBREVIATION'].to_dict()

metaspace['MappedAbbreviations'] = metaspace['moleculeIds'].apply(map_hmdb_ids)

metaspace

In [None]:
# add the metaspace annotations

metaspace = metaspace[['mz', 'msm',	'fdr',	'MappedAbbreviations', 'adduct']]

# rename to the column names expected by find_closest_mz (positional rename, order matters)
metaspace.columns = ['mz', 'METASPACEmsm', 'METASPACEfdr', 'METASPACEname', 'adduct']

# for every peak (the index is the m/z), copy the values of the nearest METASPACE entry,
# or NaN when nothing lies within the absolute tolerance
for idx, row in peaks_df.iterrows():
    peaks_mz = float(idx) 
    closest_metaspace_values = find_closest_mz(peaks_mz, metaspace)
    peaks_df.at[idx, 'METASPACEfdr'] = closest_metaspace_values['METASPACEfdr']
    peaks_df.at[idx, 'METASPACEname'] = closest_metaspace_values['METASPACEname']
    peaks_df.at[idx, 'METASPACEmsm'] = closest_metaspace_values['METASPACEmsm']

peaks_df

In [None]:
# some further preprocessing of different nomenclatures

# re-read the raw goslin output (`lipid_mz_df` is rebound once more)
lipid_mz_df = pd.read_csv("goslin_output.tsv",sep='\t')
shortnaming = lipid_mz_df[['Normalized Name', 'Species Name']]
# "name - adduct_mz, ..." strings -> list of normalized names
peaks_df['ListCandidatesTANDEMMS'] = peaks_df['TANDEMMS within +-distance_ab5ppm'].apply(process_values)
# normalized name -> species-level name; read as a global by map_to_species_name
name_to_species = dict(zip(shortnaming['Normalized Name'], shortnaming['Species Name']))
# unique species-level names per peak (numpy array; empty when there are no candidates)
peaks_df['ShortNamesTANDEMMS'] = peaks_df['ListCandidatesTANDEMMS'].apply(map_to_species_name)
peaks_df

In [None]:
len(peaks_df['ShortNamesTANDEMMS'].apply(tuple).unique())

In [None]:
# add LIPIDMAPS annotation

# fall back to the full name where LIPIDMAPS provides no abbreviation
lipidmaps.loc[lipidmaps['ABBREVIATION'].isna(), 'ABBREVIATION'] = lipidmaps['NAME']

# .copy() so the numeric conversion below edits an independent frame, not a column slice
lipidmaps = lipidmaps[['EXACT_MASS', 'ABBREVIATION']].copy()

# reconsider all possible adducts: neutral-mass candidates of each peak, obtained by removing
# Na+, K+, H+ and NH4+ (in that order). The H+ shift is the proton mass 1.007276 used by every
# other adduct table in this notebook (it was 1.007825, the hydrogen-atom mass, here only).
adduct_mass_shifts = [22.989769, 38.963707, 1.007276, 18.033823]
peaks_df['mz'] = [
    [peak_mz - shift for shift in adduct_mass_shifts]
    for peak_mz in peaks_df.index.astype(float)
]

# masses come out of the SDF file as strings; unparsable values become NaN
lipidmaps['EXACT_MASS'] = pd.to_numeric(lipidmaps['EXACT_MASS'], errors='coerce')

# one abbreviation (or None) per adduct hypothesis
peaks_df['LIPIDMAPS'] = peaks_df['mz'].apply(lambda x: find_closest_abbreviation(x, lipidmaps))

len(peaks_df['LIPIDMAPS'].apply(tuple).unique())

## Combine the studies to define an annotation and score it

In [None]:
# first, annotate with the tandem MS2
# the tandem species names are the starting annotation of every peak
peaks_df['Annotation'] = peaks_df['ShortNamesTANDEMMS']
# every peak starts at 0; update_score adds 8 to peaks that have a tandem annotation
peaks_df['Score'] = 0
peaks_df['Score'] = peaks_df.apply(update_score, axis=1)
peaks_df

In [None]:
peaks_df['Score'].value_counts()

In [None]:
# in-house LC-MS candidates: strip the " - adduct_mz" suffix, then replace parentheses by spaces
peaks_df['ShortNamesILICColumn'] = peaks_df['ILICColumn within +-distance_ab5ppm'].apply(process_ILICColumn_column)
peaks_df['ShortNamesILICColumn'] = peaks_df['ShortNamesILICColumn'].apply(process_shortnames)

In [None]:
# normalise "no annotation" to the sentinel [''], whether it was NaN or an empty container
peaks_df['Annotation'] = peaks_df['Annotation'].apply(lambda x: [''] if isinstance(x, float) and pd.isna(x) else x)
peaks_df['Annotation'] = peaks_df['Annotation'].apply(lambda x: [''] if len(x) == 0 else x)

# number of distinct annotations (tuples are used because lists/arrays are not hashable)
len(peaks_df['Annotation'].apply(tuple).unique())

In [None]:
# +3 when the current annotation shares at least one name with the in-house LC-MS candidates
peaks_df['Score'] += peaks_df.apply(lambda row: 3 if has_nonempty_intersection(row['Annotation'], row['ShortNamesILICColumn']) else 0, axis=1)
peaks_df['Score'].value_counts()

In [None]:
# second, annotate with the in-house LCMS
# peaks still unannotated take the in-house candidates (+2); apply(axis=1) rebuilds the frame row by row
peaks_df = peaks_df.apply(update_annotation_with_shortnames, axis=1)
len(peaks_df['Annotation'].apply(tuple).unique())

In [None]:
# use also another ESI LC-MS dataset of mouse brain generated for another project in the lab
ReversePhaseColumn = pd.read_csv("Transitions_list_240527_CG.csv")
# precursor names of that dataset, kept as a plain array
reverseph = ReversePhaseColumn["PrecursorName"].values
reverseph

In [None]:
# keep only name and m/z of each precursor, under project-specific column names
ReversePhaseColumn = ReversePhaseColumn[["PrecursorName", "PrecursorMz"]]
ReversePhaseColumn.columns = ["ReversePhaseColumnName", "ReversePhaseColumnMZ"]

# nearest reverse-phase precursor within the ppm tolerance, or None
# (the two lists are rebuilt from the frame on every call of the lambda)
peaks_df['NameReversePhaseColumn'] = peaks_df.index.astype(float).map(lambda mz: find_closest_name(mz, ReversePhaseColumn['ReversePhaseColumnMZ'].astype(float).tolist(), ReversePhaseColumn['ReversePhaseColumnName'].tolist()))

# peaks still unannotated take the reverse-phase name (+2)
peaks_df = peaks_df.apply(update_annotation_with_shortnames_reversecolumn, axis=1)

# +3 when the annotation agrees with the reverse-phase name
# NOTE(review): this runs after the fill above, so a peak annotated by that fill presumably
# also collects this +3 (unlike the in-house LC-MS step, which scores before filling) — confirm.
for x in range(len(peaks_df)):
    try:
        row = peaks_df.iloc[x]
        
        if len(np.intersect1d(row['Annotation'], np.array(row['NameReversePhaseColumn']))) > 0:
            peaks_df.loc[peaks_df.index[x], 'Score'] = row['Score'] + 3
            
    except Exception as e:
        # rows whose values cannot be compared are skipped silently
        continue

peaks_df['Score'].value_counts()

In [None]:
# third, annotate with the Analytical Chemistry paper and the Fitzner dataset
peaks_df = peaks_df.apply(update_annotation_with_shortnames2, axis=1)
len(peaks_df['Annotation'].apply(tuple).unique())

In [None]:
peaks_df = peaks_df.apply(update_annotation_with_shortnames2b, axis=1)
len(peaks_df['Annotation'].apply(tuple).unique())

In [None]:
# fourth, annotate with METASPACE
peaks_df = peaks_df.apply(update_annotation_with_metaspace, axis=1)
len(peaks_df['Annotation'].apply(tuple).unique())

In [None]:
peaks_df['Score'].value_counts()

In [None]:
# annotations are now a mix of lists, arrays and strings: make each one a tuple ...
unique_annotations = peaks_df['Annotation'].apply(safe_tuple)

unique_annotations_list = list(unique_annotations)
# ... and replace NaN by 'NaN' so that equal tuples collapse in the set
unique_annotations_set = set(map(nan_safe_key, unique_annotations_list))

len(unique_annotations_set)

In [None]:
# +3 for every peak whose annotation is also reported by the Analytical Chemistry study
study_bonus = np.zeros(peaks_df.shape[0], dtype=int)
for i in range(peaks_df.shape[0]):
    try:
        study_entry = np.array(peaks_df['AnalyticalChemistryStudy'].iloc[i])
        if len(np.intersect1d(study_entry, peaks_df['Annotation'].iloc[i])) > 0:
            study_bonus[i] = 3
    except (TypeError, ValueError):
        # values that cannot be compared with the annotation (e.g. a None entry) give no bonus
        continue
# One positional update of the whole column. The previous chained assignment
# `peaks_df['Score'].iloc[i] = ...` writes into a temporary under pandas Copy-on-Write
# and then never reaches `peaks_df`.
peaks_df['Score'] = peaks_df['Score'] + study_bonus
peaks_df['Score'].value_counts()

In [None]:
peaks_df['Fitzner within +-distance_ab5ppm'].iloc[0]

In [None]:
# +3 for every peak whose annotation equals its (non-empty) Fitzner candidate string
fitzner_bonus = np.zeros(peaks_df.shape[0], dtype=int)
for i in range(peaks_df.shape[0]):
    try:
        fitzner_entry = peaks_df['Fitzner within +-distance_ab5ppm'].iloc[i]
        if (fitzner_entry is not None) & (fitzner_entry != ''):
            if len(np.intersect1d(np.array(fitzner_entry), peaks_df['Annotation'].iloc[i])) > 0:
                fitzner_bonus[i] = 3
    except (TypeError, ValueError):
        # values that cannot be compared with the annotation give no bonus
        continue
# One positional update of the whole column. The previous chained assignment
# `peaks_df['Score'].iloc[i] = ...` writes into a temporary under pandas Copy-on-Write
# and then never reaches `peaks_df`.
peaks_df['Score'] = peaks_df['Score'] + fitzner_bonus
peaks_df['Score'].value_counts()

In [None]:
n_peaks = peaks_df.shape[0]

# +0.5 when the annotation agrees with one of the METASPACE names
metaspace_bonus = np.zeros(n_peaks)
for i in range(n_peaks):
    if len(np.intersect1d(np.array(peaks_df['METASPACEname'].iloc[i]), np.array(peaks_df['Annotation'].iloc[i]))) > 0:
        metaspace_bonus[i] = 0.5

# +0.5 when the annotation agrees with one of the LIPIDMAPS abbreviations
lipidmaps_bonus = np.zeros(n_peaks)
for i in range(n_peaks):
    try:
        # Drop the None placeholders of adducts without a match. np.intersect1d sorts its
        # inputs and cannot order None against strings, so any list holding a None used to
        # raise TypeError, which the bare `except:` turned into "no bonus".
        lipidmaps_names = [name for name in peaks_df['LIPIDMAPS'].iloc[i] if name is not None]
        if len(np.intersect1d(np.array(lipidmaps_names), np.array(peaks_df['Annotation'].iloc[i]))) > 0:
            lipidmaps_bonus[i] = 0.5
    except (TypeError, ValueError):
        # cells that are not iterable or not comparable give no bonus
        continue

# One positional update of the whole column. The previous chained assignment
# `peaks_df['Score'].iloc[i] = ...` writes into a temporary under pandas Copy-on-Write
# and then never reaches `peaks_df`.
peaks_df['Score'] = peaks_df['Score'] + metaspace_bonus + lipidmaps_bonus
peaks_df

In [None]:
peaks_df['Score'].value_counts()

In [None]:
pdf = peaks_df.sort_values(by='Score', ascending=False)
pdf

## Use quantitative LC-MS results to break some ties when one of the candidate lipids is much more abundant in the brain

In [None]:
quantlcms = pd.read_csv("QuantitativeLCMS.csv", index_col=0)

atlas = quantlcms[['Male',	'Male.1',	'Male.2',	'Male.3']] # use the males as the "reference atlas" we are going to use mostly is a male
# mean over the four male columns, skipping the first row of the table
ref = atlas.iloc[1:,:].astype(float).mean(axis=1)
# goslin output again, this time used to translate original names into species-level names
annots = pd.read_csv("goslin_output.tsv",sep='\t')
convt = annots[['Original Name', 'Species Name']]
convt.index = convt['Original Name'].astype(str)
refvals = pd.DataFrame(ref.values, index = ref.index, columns=["nmol_fraction_LCMS"])
# harmonise the class name with the one used on the goslin side
refvals.index = refvals.index.str.replace('Hex1Cer', 'HexCer')
# manually curated name mapping; rows with missing values are dropped
tmp = pd.read_csv("manuallyannotated_addlcms.csv", index_col=0).dropna()
# quantified lipids covered by the manual mapping (matched on its first column)
refvalstmp = refvals.loc[refvals.index.isin(tmp.iloc[:,0]),:]
rvl = np.array(refvals.index)
convl = np.array(convt.index)
annots.index = annots['Original Name']
# restrict both tables to the names present in both the quantification and the goslin output
annots = annots.loc[np.intersect1d(rvl, convl),:]
refvals = refvals.loc[np.intersect1d(rvl, convl),:]
indivannots = annots[['Species Name']]
# one species name per original name (first occurrence)
indivannots = indivannots.groupby('Original Name').first()
refvals['Species Name'] = refvals.index.map(indivannots['Species Name'])
tmp.index = tmp.iloc[:,0]
tmp = tmp.loc[refvalstmp.index,:]
# NOTE(review): 'Unnamed: 2' is presumably the species-name column of the manual file — confirm.
refvalstmp['Species Name'] = tmp['Unnamed: 2']
quantlcms = pd.concat([refvals, refvalstmp], axis=0)
quantlcms.index = quantlcms['Species Name']
quantlcms = quantlcms[['nmol_fraction_LCMS']]
quantlcms = pd.DataFrame(quantlcms['nmol_fraction_LCMS'].groupby(quantlcms.index).sum()) # merge lipids that are distinguished in LCMS but undistinguishable in IMS
quantlcms

In [None]:
# example of non-disambiguated lipid that is known to be superabundant, indeed it takes a sizable fraction of the three possible matches, but with the (arbitrary) 80% threshold we chose, it indeed loses by a tiny bit
quantlcms.loc['PC 34:1',:] / (quantlcms.loc['PC 34:1',:] + quantlcms.loc['PC 36:4',:] + quantlcms.loc['PE 39:4',:])

In [None]:
THRESHOLD = 0.8 # a lipid to be prioritized should be at least 80% molar fraction

pdf['AnnotationLCMSPrioritized'] = pdf['Annotation']
annotation_col = pdf.columns.get_loc('Annotation')
prioritized_col = pdf.columns.get_loc('AnnotationLCMSPrioritized')
# iterate over a snapshot of the column, since the frame is edited inside the loop
for i, annot in enumerate(list(pdf['Annotation'])):
    if isinstance(annot, str):
        # single-name annotations become one-element lists so every row has the same shape
        annot = [annot]
        # positional scalar write: the chained `pdf['Annotation'].iloc[i] = ...` edits a
        # temporary under pandas Copy-on-Write and would leave `pdf` unchanged
        pdf.iat[i, annotation_col] = annot
    # molar fraction of every candidate, relative to the candidates of this peak only
    now = quantlcms.loc[quantlcms.index.isin(annot),:]
    now = now/now.sum()
    # with no quantified candidate the maximum is NaN and the comparison is False
    if now['nmol_fraction_LCMS'].max() > THRESHOLD:
        pdf.iat[i, prioritized_col] = now.index[now['nmol_fraction_LCMS'] > THRESHOLD].values[0]
pdf['AnnotationLCMSPrioritized']

In [None]:
# report the weighted annotation frequencies across studies as well #####?????
# one (initially empty) list of counts per peak
pdf['AnnotFreq'] = [[] for _ in range(pdf.shape[0])]

col_index = pdf.columns.get_loc('AnnotFreq')

## manually giving more weight to MS-based ones
# a column listed k times contributes k times to the count of a matching annotation
columns_to_check = ['AnalyticalChemistryStudy', 'AnalyticalChemistryStudy',  'ShortNamesTANDEMMS', 'ShortNamesTANDEMMS', 'ShortNamesTANDEMMS', 'LIPIDMAPS', 'ShortNamesILICColumn','ShortNamesILICColumn', 'NameReversePhaseColumn', 'NameReversePhaseColumn', 'METASPACEname']

for xxx in tqdm(range(0, pdf.shape[0])):
    listona = []

    CURRENTANNOT = pdf.iloc[xxx]['Annotation']
    
    # NOTE(review): the counting code lives in the `except` branch, so it only runs when the
    # comparison below raises — e.g. for a numpy array with several names, whose elementwise
    # comparison has no single truth value. Rows for which the comparison succeeds keep an
    # empty 'AnnotFreq'. Confirm this is the intended selection of "ambiguous" rows.
    try:
        if CURRENTANNOT == ['']:
            continue
    except:
        for annotation in np.array(CURRENTANNOT):

            # weighted number of source columns that contain this candidate
            count = 0

            for col in columns_to_check:
                col_values = pdf.iloc[xxx][col]

                # normalise the cell to a list, whatever it holds (scalar, array or list)
                if np.isscalar(col_values):
                    col_values = [col_values]
                elif isinstance(col_values, np.ndarray):
                    col_values = col_values.tolist()

                # `annotation` is rebound to a one-element list on the first pass and stays one
                if np.isscalar(annotation):
                    annotation = [annotation]
                # None placeholders are dropped on both sides before comparing
                try:
                    annotation = [x for x in annotation if x is not None]
                except:
                    annotation = ''
                try:
                    col_values = [x for x in col_values if x is not None]
                except:
                    col_values = 0

                count_now = len(np.intersect1d(annotation, np.array(col_values)))
                count = count + count_now

            listona.append(count)

        # store the list of counts, positionally aligned with the candidates of this row
        pdf.iat[xxx, col_index] = listona

pdf['Annotation'] = pdf['Annotation'].apply(ensure_numpy_array)
pdf['AnnotationTiesPrioritized'] = pdf.apply(prioritize_annotations, axis=1)
pdf['AnnotationTiesPrioritized'] = pdf['AnnotationTiesPrioritized'].apply(process_annotations)
pdf

In [None]:
# snapshot of the table before the columns are converted to text (rebinds the name `df`)
df = pdf.copy()
# lists are joined with ', '; everything else is converted with str()
pdf['AnnotationLCMSPrioritized'] = pdf['AnnotationLCMSPrioritized'].apply(
    lambda x: ', '.join(x) if isinstance(x, list) else x
).astype(str)
allnames = pdf['AnnotationLCMSPrioritized']

# keep the non-empty names only
allnames = pdf.loc[pdf['AnnotationLCMSPrioritized'] != '', 'AnnotationLCMSPrioritized']
allnames = allnames[~pd.isna(allnames)]
allnames

In [None]:
# flatten the names: an entry with exactly three spaces is treated as two "class chain"
# names and split in two; every other entry is kept whole
result = []

for name in allnames:
    if name.count(' ') == 3:
        first_part, second_part = split_at_second_space(name)
        result.extend([' '.join(first_part), ' '.join(second_part)])
    else:
        result.append(name)

In [None]:
allnames = np.array(result)

# how many times each name occurs across peaks; read as a global by map_frequency
frequency_counter = Counter(allnames)
frequency_df = pd.DataFrame(list(frequency_counter.items()), columns=['Entry', 'Frequency'])

In [None]:
# frequency of each prioritized annotation as computed by map_frequency
# (count minus one for single names, "f1; f2" for double names)
pdf['FrequencyLipidOtherPeaks'] = pdf['AnnotationTiesPrioritized'].astype(str).apply(map_frequency)
# turn the annotations into plain text: str() representation without outer brackets and quotes
pdf['Annotation'] = pdf['Annotation'].astype(str).str.strip('[]').str.replace("'", "")
pdf['AnnotationTiesPrioritized'] = pdf['AnnotationTiesPrioritized'].str.replace(r'[,\[\]]', '', regex=True)
#pdf.loc[pdf['AnnotationTiesPrioritized'] != '', 'Annotation'] = pdf.loc[pdf['AnnotationTiesPrioritized'] != '', 'AnnotationTiesPrioritized']
pdf['Annotation']

In [None]:
pdf.loc[pdf['Annotation'] == "SM 34:1;O2",:] # a fast sanity check...

In [None]:
# Wherever an LC-MS-prioritized name is present (non-empty string) it replaces
# the current annotation; afterwards commas, square brackets and single quotes
# are removed from every annotation.
has_lcms_name = pdf['AnnotationLCMSPrioritized'] != ''
pdf.loc[has_lcms_name, 'Annotation'] = pdf.loc[has_lcms_name, 'AnnotationLCMSPrioritized']
pdf['Annotation'] = (
    pdf['Annotation']
    .str.replace(r'[,\[\]]', '', regex=True)
    .str.replace("'", "")
)

## Archive the annotations for the raw dataset

In [None]:
np.sum(pdf['Score'] > 8)

In [None]:
pdf['Score'].value_counts()

In [None]:
# --- Clean up the LIPIDMAPS hits ----------------------------------------------
# The all-None placeholder string is marked as missing ...
pdf['LIPIDMAPS'] = pdf['LIPIDMAPS'].apply(lambda x: pd.NA if x == '[None, None, None, None]' else x)

# ... then every entry is cast to text, brackets and quotes are removed.
# NOTE(review): `.astype(str)` presumably also stringifies the pd.NA set just
# above (giving the text '<NA>'), in which case the `pd.notnull` guard below is
# always true - confirm whether missing values should survive this step.
pdf['LIPIDMAPS'] = pdf['LIPIDMAPS'].astype(str).str.strip('[]').str.replace("'", "")

# Drop the individual 'None' placeholders and re-join the remaining names.
pdf['LIPIDMAPS'] = pdf['LIPIDMAPS'].apply(
    lambda x: ', '.join([i.strip() for i in x.split(',') if i.strip() != 'None']) if pd.notnull(x) else x
)

# --- Tag low-confidence annotations with the "_db" suffix -----------------------
# Peaks without an annotation fall back to their LIPIDMAPS name(s).
pdf.loc[pdf['Annotation'].isna(), 'Annotation'] = pdf['LIPIDMAPS'] + "_db"

low_score = pdf['Score'] < 3.
pdf.loc[low_score, 'Annotation'] = pdf.loc[low_score, 'Annotation'] + "_db"

# PI-Cer and SHexCer annotations are tagged and their score is reset to 0.
# `regex=False` makes the match a literal substring test and `na=False` keeps
# the mask strictly boolean even if an annotation is missing.
mask_pi_cer = pdf['Annotation'].str.contains('PI-Cer', regex=False, na=False)
pdf.loc[mask_pi_cer, 'Annotation'] = pdf.loc[mask_pi_cer, 'Annotation'] + '_db'
pdf.loc[mask_pi_cer, 'Score'] = 0.0

mask_shex_cer = pdf['Annotation'].str.contains('SHexCer', regex=False, na=False)
pdf.loc[mask_shex_cer, 'Annotation'] = pdf.loc[mask_shex_cer, 'Annotation'] + '_db'
pdf.loc[mask_shex_cer, 'Score'] = 0.0

# --- Resolve 'O3' annotations ---------------------------------------------------
# If the 'O2' spelling of an 'O3' annotation already occurs in the table, the
# 'O3' one is tagged "_db"; otherwise it is renamed to the 'O2' spelling.
# The lookup text comes from the data, so it must be matched literally: used as
# a regular expression, characters such as '(', '+' or '.' in a lipid name would
# change the meaning of the search or raise an error.
o3_mask = pdf['Annotation'].str.contains('O3', regex=False, na=False)
for idx in pdf[o3_mask].index:
    o2_name = pdf.loc[idx, 'Annotation'].replace('O3', 'O2')
    o2_exists = pdf['Annotation'].str.contains(o2_name, regex=False, na=False).any()
    if o2_exists:
        pdf.loc[idx, 'Annotation'] = pdf.loc[idx, 'Annotation'] + '_db'
    else:
        pdf.loc[idx, 'Annotation'] = o2_name

In [None]:
# Load two peak lists from CSV; in both cases the first CSV column is used as
# the index and the first remaining column is kept as a 1-D array.
# NOTE(review): the first 23 rows of survivedpeaks.csv are skipped - the reason
# for this offset is not visible here; confirm.
survived = pd.read_csv("survivedpeaks.csv", index_col=0).values[23:][:,0]
original_columns = pd.read_csv("originals.csv", index_col=0).values[:,0]
len(original_columns)

In [None]:
# Every peak starts as "unusable" and is then upgraded:
#   "restored" - its index value is found in `survived`
#   "measured" - its index value is found in `original_columns`
#                (applied last, so it overrides "restored")
# NOTE(review): matching here is done on float values, whereas the
# de-duplication cell further down matches the same index against `survived`
# as strings - confirm both select the same rows.
pdf['Status'] = "unusable"
pdf.loc[pdf.index.isin(survived.astype(float)), 'Status'] = "restored"
pdf.loc[pdf.index.isin(original_columns), 'Status'] =  "measured"
pdf['Status'].value_counts()

In [None]:
# Among the peaks listed in `survived` (matched on the text form of the index),
# find annotations that occur more than once and mark every occurrence except
# the one with the highest score.
survived_rows = pdf.index.astype(str).isin(survived.astype(str))
pdf2 = pdf.loc[survived_rows, :]

duplicated_mask = pdf2['Annotation'].duplicated(keep=False)
max_score_indices = (
    pdf2.loc[duplicated_mask]
    .groupby('Annotation')['Score']
    .idxmax()
)
is_best_scoring = pdf2.index.isin(max_score_indices)
duplicate_label_mask = duplicated_mask & ~is_best_scoring
duplicate_label_mask

In [None]:
# Append "_duplicated" to the status of every peak flagged in `duplicate_label_mask`.
duplicated_peaks = duplicate_label_mask.index[duplicate_label_mask]
rows_to_tag = pdf.index.isin(duplicated_peaks)
pdf.loc[rows_to_tag, 'Status'] = pdf.loc[rows_to_tag, 'Status'] + "_duplicated"

In [None]:
pdf = pdf.drop("PATH_MZ", axis=1)
pdf

In [None]:
# save the complete dataset to file
pdf.to_csv("ALLANNOTATIONSCORES_20250215.csv")
#pdf[['Annotation', 'Score', 'AnnotationLCMSPrioritized', 'TANDEMMS within +-distance_ab5ppm', 'ILICColumn within +-distance_ab5ppm', 'NameReversePhaseColumn','AnalyticalChemistryStudy', 'METASPACEname', 'METASPACEfdr', 'LIPIDMAPS', 'FrequencyLipidOtherPeaks']].to_csv("ALLANNOTATIONS_20241213.csv")

In [None]:
morans = pd.read_csv("morans_by_sec.csv", index_col=0)
morans.columns = "moran_acquisition"+morans.columns.astype(str)
morans

In [None]:
pdf = pd.concat([pdf, morans], axis=1)
pdf

In [None]:
pdf = pdf.drop(['mz','AnnotationTiesPrioritized', 'FrequencyLipidOtherPeaks'], axis=1)

In [None]:
# Put the three key columns first and keep all other columns in their current order.
cols_to_move = ['Annotation', 'Score', 'Status']
remaining_cols = []
for col in pdf.columns:
    if col in cols_to_move:
        continue
    remaining_cols.append(col)
pdf = pdf[cols_to_move + remaining_cols]
pdf

In [None]:
# Sort the table by status (measured -> restored -> unusable) and, within each
# status, by decreasing score.
status_order = ['measured', 'restored', 'unusable']

# Peaks flagged by the de-duplication step carry a '<status>_duplicated' label.
# Every label that is not listed in `categories` is turned into NaN by
# pd.Categorical, so those variants are listed as well, each one right after
# its base status.
status_categories = [
    label
    for status in status_order
    for label in (status, status + '_duplicated')
]
pdf['Status'] = pd.Categorical(pdf['Status'], categories=status_categories, ordered=True)
pdf = pdf.sort_values(by=['Status', 'Score'], ascending=[True, False])
pdf

In [None]:
pdf.to_csv("cleanedANNOTATIONS_20250215.csv")

## Prepare a list of all lipids found generically in the brain by combining different studies

In [None]:
# Pool the name arrays of the five reference sources (cast to str) and keep
# each distinct name once; np.unique also returns them sorted.
reference_sources = [ilic, tandem, fitzner_, analchem, reverseph]
pooled_names = np.concatenate([names.astype(str) for names in reference_sources])
brainlipids = np.unique(pooled_names)

brainlipids

In [None]:
len(brainlipids)

In [None]:
pd.DataFrame(brainlipids).to_csv("brainlipids_unique.csv")

## Use the annotations and signals to prepare a subset of peaks to be normalized with uMAIA and used for lipidomic analysis

In [None]:
anns = pd.read_csv("ALLANNOTATIONS.csv") # computed at a previous iteration, similar to this but with less stringent thresholds - yielded more peaks, 
# but naming for the dataset used throughout the study follows uMAIA and uses the above procedure for stringent naming

In [None]:
# Mark rows whose LIPIDMAPS entry is the all-None placeholder string as missing.
anns['LIPIDMAPS'] = anns['LIPIDMAPS'].apply(lambda x: pd.NA if x == '[None, None, None, None]' else x)

# Annotation sources: a peak is kept when at least one of these columns is non-null.
columns_to_check = ['Juljiana within +-0.01', 'Jonathan within +-0.01', 'NameCharlotte', "AnalyticalChemistryStudy", "LIPIDMAPS"]

# Drop the peaks that have no hit in any of the sources above.
filtered_df = anns.dropna(subset=columns_to_check, how='all')
filtered_df

In [None]:
## prepare to prioritize the best adduct for each molecule to reduce the computational burden for uMAIA

lipidomic_peaks = filtered_df['PATH_MZ']

# Format every m/z with 6 decimals (missing values become ""). Series.map is
# used because DataFrame.applymap is deprecated in recent pandas releases;
# `.to_frame()` keeps the result a one-column DataFrame as before.
lipidomic_peaks_str = lipidomic_peaks.map(lambda x: f"{x:.6f}" if pd.notna(x) else "").to_frame()
np.save("lipidomic_peaks_alldata_5annotsources.npy", lipidomic_peaks_str.values[:,0])

# Annotations shared by more than one peak, with the list of m/z values of each.
vcs = filtered_df['Annotation'].value_counts()
to_dc = filtered_df.loc[filtered_df['Annotation'].isin(vcs.index[vcs>1]),:]
grouped = to_dc.groupby('Annotation')['PATH_MZ'].agg(list)
grouped

In [None]:
filtered_df#['AnnotationTiesPrioritized'] ### note the lipidmaps one do not have a name... we can add it with low confidence in case.

In [None]:
lipidnames = np.array(filtered_df['Annotation'])

# Map every individual lipid name to the full annotation strings it appears in.
# A single pass is enough; entries that are not strings (e.g. missing
# annotations) are skipped *before* `split_lipids` is called, so the helper
# only ever receives text.
lipid_dict = {}
for entry in lipidnames:
    if not isinstance(entry, str):
        continue
    for lipid in split_lipids(entry):
        lipid_dict.setdefault(lipid, []).append(entry)

unique_lipids = set(lipid_dict)

# how many unique lipid names do we get?
len(list(lipid_dict.keys()))

In [None]:
processed_df = process_lipid_annotations(filtered_df, lipid_dict)
processed_df

In [None]:
### add LIPIDMAPS names
processed_df['LIPIDMAPS'] = processed_df['LIPIDMAPS'].astype(str).str.strip('[]').str.replace("'", "")
processed_df['LIPIDMAPS']

In [None]:
# Remove the literal 'None' placeholders from each comma-separated LIPIDMAPS
# string and re-join the remaining names with ', '.
processed_df['LIPIDMAPS'] = processed_df['LIPIDMAPS'].apply(
    lambda x: ', '.join([i.strip() for i in x.split(',') if i.strip() != 'None']) if pd.notnull(x) else x
)
# Peaks without an annotation fall back to their LIPIDMAPS name(s).
processed_df.loc[processed_df['Annotation'].isna(), 'Annotation'] = processed_df['LIPIDMAPS']
processed_df

## Prioritize isobars and remove untrustworthy lipids for uMAIA normalization

In [None]:
lipidnames = np.array(processed_df['Annotation'])

# Map every individual lipid name to the full annotation strings it appears in.
# A single pass is enough; entries that are not strings are skipped *before*
# `split_lipids` is called, so the helper only ever receives text.
lipid_dict = {}
for entry in lipidnames:
    if not isinstance(entry, str):
        continue
    for lipid in split_lipids(entry):
        lipid_dict.setdefault(lipid, []).append(entry)

unique_lipids = set(lipid_dict)

len(lipid_dict) ### several are lipidmaps... 624 -> 2541

# map to isobars to prioritize one based on total signal
annotation_to_mz = processed_df.groupby('Annotation')['PATH_MZ'].agg(list).to_dict()

In [None]:
## calculate the total signal inside the mask for each peak for each section

path_save = '/data/LBA_DATA/'

acquisitions=[ 
    #### BRAIN 2
    'BrainAtlas/BRAIN2/20211201_MouseBrain2_S11_306x248_Att30_25um',
 'BrainAtlas/BRAIN2/20211202_MouseBrain2_S12_332x246_Att30_25um',
 'BrainAtlas/BRAIN2/20211203_MouseBrain2_S13_319x262_Att30_25um',
 'BrainAtlas/BRAIN2/20211206_MouseBrain2_S14_354x299_Att30_25um',
 'BrainAtlas/BRAIN2/20211209_MouseBrain2_S15_359x281_Att30_25um',
 'BrainAtlas/BRAIN2/20220127_MouseBrain2_S16_398x303_Att30_25um',
 'BrainAtlas/BRAIN2/20220105_MouseBrain2_S17_395x294_Att32_25um',
 'BrainAtlas/BRAIN2/20220106_MouseBrain2_S18_393x309_Att32_25um',
 'BrainAtlas/BRAIN2/20211210_MouseBrain2_S19_423x320_Att32_25um',
 'BrainAtlas/BRAIN2/20220321_MouseBrain2_S20_Duplicate_443x313_Att30_25um',
 'BrainAtlas/BRAIN2/20211213_MouseBrain2_S21_412x360_Att30_25um',
 'BrainAtlas/BRAIN2/20220107_MouseBrain2_S22_417x310_Att32_25um',
 'BrainAtlas/BRAIN2/20220224_MouseBrain2_S23_427x319_Att30_25um',
 'BrainAtlas/BRAIN2/20220115_MouseBrain2_S24_427x322_Att30_25um',
 'BrainAtlas/BRAIN2/20220311_MouseBrain2_S25_duplicate_476x336_Att30_25um',
 'BrainAtlas/BRAIN2/20220120_MouseBrain2_S26_415x315_Att30_25um',
 'BrainAtlas/BRAIN2/20220122_MouseBrain2_S27_443x301_Att30_25um',
 'BrainAtlas/BRAIN2/20220125_MouseBrain2_S28_312x431_Att30_25um',
 'BrainAtlas/BRAIN2/20220126_MouseBrain2_S29_410x290_Att30_25um',
 'BrainAtlas/BRAIN2/20220111_MouseBrain2_S30_400x248_Att32_25um',
 'BrainAtlas/BRAIN2/20220124_MouseBrain2_S31_394x261_Att30_25um',
 'BrainAtlas/BRAIN2/20220130_MouseBrain2_S32_370x325_Att30_25um',
 'BrainAtlas/BRAIN2/20220201_MouseBrain2_S33_359x314_Att30_25um',
 'BrainAtlas/BRAIN2/20220203_MouseBrain2_S34_377x322_Att30_25um',
 'BrainAtlas/BRAIN2/20220207_MouseBrain2_S35_375x344_Att30_25um',
 'BrainAtlas/BRAIN2/20220210_MouseBrain2_S36_363x307_Att30_25um',
 'BrainAtlas/BRAIN2/20220213_MouseBrain2_S37_354x308_Att30_25um',
 'BrainAtlas/BRAIN2/20220216_MouseBrain2_S38_363x304_Att30_25um',
 'BrainAtlas/BRAIN2/20220225_MouseBrain2_S39_354x296_Att30_25um',
 'BrainAtlas/BRAIN2/20220308_MouseBrain2_S40_322x290_Att30_25um',
 'BrainAtlas/BRAIN2/20220303_MouseBrain2_S41_333x262_Att30_25um',
 'BrainAtlas/BRAIN2/20220222_MouseBrain2_S42_278x251_Att30_25um',

#### BRAIN 3
'BrainAtlas/BRAIN3/20221103_MouseBrain_LipidAtlas_Brain2_1A_231x231_Att30_25um',
'BrainAtlas/BRAIN3/20221104_MouseBrain_LipidAtlas_Brain2_2B_257x209_Att30_25um',
'BrainAtlas/BRAIN3/20221105_MouseBrain_LipidAtlas_Brain2_1C_249x247_Att30_25um',
'BrainAtlas/BRAIN3/20221107_MouseBrain_LipidAtlas_Brain2_1D_274x247_Att30_25um',
'BrainAtlas/BRAIN3/20221107_MouseBrain_LipidAtlas_Brain2_1E_286x251_Att30_25um',
'BrainAtlas/BRAIN3/20221109_MouseBrain_LipidAtlas_Brain2_1F_308x267_Att30_25um',
'BrainAtlas/BRAIN3/20221110_MouseBrain_LipidAtlas_Brain2_1G_338x266_Att30_25um',
'BrainAtlas/BRAIN3/20221111_MouseBrain_LipidAtlas_Brain2_1H_345x239_Att30_25um',
'BrainAtlas/BRAIN3/20221112_MouseBrain_LipidAtlas_Brain2_1I_368x259_Att30_25um',
'BrainAtlas/BRAIN3/20221113_MouseBrain_LipidAtlas_Brain2_1J_374x262_Att30_25um',
'BrainAtlas/BRAIN3/20221118_MouseBrain_LipidAtlas__Brain2_1K_390x291_Att30_25um',
'BrainAtlas/BRAIN3/20221119_MouseBrain_LipidAtlas__Brain2_1L_394x310_Att30_25um',
'BrainAtlas/BRAIN3/20221120_MouseBrain_LipidAtlas__Brain2_1M_400x303_Att30_25um',
'BrainAtlas/BRAIN3/20221121_MouseBrain_LipidAtlas__Brain2_1N_390x309_Att30_25um',
# '20221122_MouseBrain_LipidAtlas__Brain2_1O_422x284_Att30_25um',
'BrainAtlas/BRAIN3/20221123_MouseBrain_LipidAtlas__Brain2_1P_434x327_Att30_25um',
#### '20221209_MouseBrainLipidAlas_Brain2_Q1_429x315_Att30_25um',
#### '20221210_MouseBrain_LipidAlas_Brain2_R1_431x328_Att30_25um',
#### '20221211_MouseBrain_LipidAlas_Brain2_S2_430x335_Att30_25um',
'BrainAtlas/BRAIN3/20221227_MouseBrain_LipdAtlas_R2_430x346_Att30_25um',
'BrainAtlas/BRAIN3/20221229_MouseBrain_LipdAtlas_T1_448x317_Att30_25um',
'BrainAtlas/BRAIN3/20221230_MouseBrain_LipdAtlas_Brain2_U1_441x330_Att30_25um',
'BrainAtlas/BRAIN3/20230612_MouseBrain_LipidAtlas_Brain2_box1_V2_458x346_Att30_25um_Tx',
'BrainAtlas/BRAIN3/20230103_MouseBrain_LipdAtlas_Brain2_W1_444x354_Att30_25um',
'BrainAtlas/BRAIN3/20230310_MouseBrain_LipidAtlas_Brain2_X2_450x338_Att30_25um',
'BrainAtlas/BRAIN3/20230316_MouseBrain_LipidAtlas_Brain2_Y2_452x337_Att30_25um',
'BrainAtlas/BRAIN3/20230318_MouseBrain_LipidAtlas_Brain2_2_A1_453x358_Att30_25um',
'BrainAtlas/BRAIN3/20230319_MouseBrain_LipidAtlas_Brain2_2_B1_457x349_Att30_25um',
'BrainAtlas/BRAIN3/20230325_MouseBrain_LipidAtlas_Brain2_2_C2_451x352_Att30_25um',
'BrainAtlas/BRAIN3/20230328_MouseBrain_LipidAtlas_Brain2_2_D1_456x339_Att30_25um',
'BrainAtlas/BRAIN3/20230331_MouseBrain_LipidAtlas_Brain2_2_E1_449x315_Att30_25um',
'BrainAtlas/BRAIN3/20230402_MouseBrain_LipidAtlas_Brain2_2_F1_447x324_Att30_25um',
'BrainAtlas/BRAIN3/20230415_MouseBrain_LipidAtlas_Brain2_2_G2_434x282_Att30_25um',
'BrainAtlas/BRAIN3/20230416_MouseBrain_LipidAtlas_Brain2_2_H1_418x283_Att30_25um',
'BrainAtlas/BRAIN3/20230417_MouseBrain_LipidAtlas_Brain2_2_I1_413x291_Att30_25um',
'BrainAtlas/BRAIN3/20230421_MouseBrain_LipidAtlas_Brain2_2_J3_417x323_Att30_25um',
'BrainAtlas/BRAIN3/20230422_MouseBrain_LipidAtlas_Brain2_2_K1_404x305_Att30_25um',
'BrainAtlas/BRAIN3/20230424_MouseBrain_LipidAtlas_Brain2_2_L2_409x296_Att30_25um',
'BrainAtlas/BRAIN3/20230425_MouseBrain_LipidAtlas_Brain2_2_M1_403x303_Att30_25um',
'BrainAtlas/BRAIN3/20230429_MouseBrain_LipidAtlas_Brain2_2_N1_394x304_Att30_25um',
'BrainAtlas/BRAIN3/20230430_MouseBrain_LipidAtlas_Brain2_2_O4_391x294_Att30_25um',
'BrainAtlas/BRAIN3/20230508_MouseBrain_LipidAtlas_Brain2_2_P2_379x312_Att30_25um',
    'BrainAtlas/BRAIN3/20230515_MouseBrain_LipidAtlas_2_Q1_350x301_Att30_25um',
    'BrainAtlas/BRAIN3/20230516_MouseBrain_LipidAtlas_2_R1_351x301_Att30_25um',
    'BrainAtlas/BRAIN3/20230522_MouseBrain_LipidAtlas_Brain2_S1_344x279_Att30_25um',

#### CONTROL BRAINS

    
    'BrainAtlas/Control_Brains/female/20220411_MouseBrain_female_217G_349x316_Att30_25um',
    'BrainAtlas/Control_Brains/female/20220412_MouseBrain_female_217B_374x286_Att30_25um',
    'BrainAtlas/Control_Brains/female/20220416_MouseBrain_female_217D_447x332_Att30_25um',
    'BrainAtlas/Control_Brains/female/20220419_MouseBrain_female_217E_433x309_Att30_25um',
    'BrainAtlas/Control_Brains/female/20220420_MouseBrain_female_217F_383x296_Att30_25um',
    
    'BrainAtlas/Control_Brains/female/20220617_MouseBrain_214_A_386x291_Att30_25um',
    'BrainAtlas/Control_Brains/female/20220627_MouseBrain_214_F_383x313_Att30_25um',
    'BrainAtlas/Control_Brains/female/20220709_MouseBrain_214_C_421x328_Att30_25um',
    'BrainAtlas/Control_Brains/female/20220710_MouseBrain_214_E_415x292_Att30_25um',
    'BrainAtlas/Control_Brains/female/20220711_MouseBrain_214_D_447x327_Att30_25um',
    
    'BrainAtlas/Control_Brains/female/20220712_MouseBrain_308_A_371x297_Att30_25um',
    'BrainAtlas/Control_Brains/female/20220713_MouseBrain_308_B_399x315_Att30_25um',
    'BrainAtlas/Control_Brains/female/20220725_MouseBrain_308_F_368x284_Att30_25um',
    'BrainAtlas/Control_Brains/female/20220731_MouseBrain_308_C_401x326_Att30_25um',
    'BrainAtlas/Control_Brains/female/20220810_MouseBrain_308_D_405x296_Att30_25um',
    'BrainAtlas/Control_Brains/female/20220811_MouseBrain_308_E_410x292_Att30_25um',
    
    'BrainAtlas/Control_Brains/male/20220531_MouseBrain_203_B_409x281_Att30_25um',
    'BrainAtlas/Control_Brains/male/20220605_MouseBrain_203_C_419x330_Att30_25um',
    'BrainAtlas/Control_Brains/male/20220607_MouseBrain_203_A_362x283_Att30_25um',
    'BrainAtlas/Control_Brains/male/20220609_MouseBrain_203_D_451x343_Att30_25um',
    'BrainAtlas/Control_Brains/male/20220616_MouseBrain_203_E_430x282_Att30_25um',
    'BrainAtlas/Control_Brains/male/20220608_MouseBrain_203_F_351x281_Att30_25um',
    
    'BrainAtlas/Control_Brains/male/20220421_MouseBrain_male_212B_387x285_Att30_25um',
    'BrainAtlas/Control_Brains/male/20220423_MouseBrain_male_212C_412x334_Att30_25um',
    'BrainAtlas/Control_Brains/male/20220425_MouseBrain_male_212G_382x305_Att30_25um',
    'BrainAtlas/Control_Brains/male/20220426_MouseBrain_male_212D_dupl_422x338_Att30_25um',
    'BrainAtlas/Control_Brains/male/20220427_MouseBrain_male_212E_443x322_Att30_25um',
    'BrainAtlas/Control_Brains/male/20220502_MouseBrain_male_212F_330x243_Att30_25um',

    'BrainAtlas/Control_Brains/male/20220920_MouseBrain_LipidAtlas_M_305_1A_406x268_Att30_25um',
    'BrainAtlas/Control_Brains/male/20220920_MouseBrain_LipidAtlas_M_305_1B_434x308_Att30_25um',
    'BrainAtlas/Control_Brains/male/20220920_MouseBrain_LipidAtlas_M_305_1C_430x325_Att30_25um',
    'BrainAtlas/Control_Brains/male/20220923_MouseBrain_LipidAtlas_M_305_3D_448x322_Att30_25um',
    'BrainAtlas/Control_Brains/male/20221030_MouseBrain_LipidAtlas_M_305_1E_393x340_Att30_25um',
    'BrainAtlas/Control_Brains/male/20221031_MouseBrain_LipidAtlas_M_305_1F_365x313_Att30_25um',


    ## PREGNANCY BRAINS
    'PREGNANT/20240708_MouseBrain_LipidAtlas_Pregnant_Brain1_A1_395x280_25um_Att30',
    'PREGNANT/20240709_MouseBrain_LipidAtlas_Pregnant_Brain1_B1_458x337_25um_Att30',
    'PREGNANT/20240712_MouseBrain_LipidAtlas_Pregnant_Brain1_C2_459x352_25um_Att30',
    'PREGNANT/20240714_MouseBrain_LipidAtlas_Pregnant_Brain1_D1_477x346_25um_Att30',
    'PREGNANT/20240715_MouseBrain_LipidAtlas_Pregnant_Brain1_E1_454x310_25um_Att30',
    'PREGNANT/20240711_MouseBrain_LipidAtlas_Pregnant_Brain1_F1_424x338_25um_Att30',

    'PREGNANT/20240710_MouseBrain_LipidAtlas_Pregnant_Brain2_A1_380x297_25um_Att30',
    'PREGNANT/20240721_MouseBrain_LipidAtlas_Pregnant_Brain2_B1_434x310_25um_Att30',
    'PREGNANT/20240716_MouseBrain_LipidAtlas_Pregnant_Brain2_C1_445x318_25um_Att30',
    'PREGNANT/20240717_MouseBrain_LipidAtlas_Pregnant_Brain2_D1_452x333_25um_Att30',
    'PREGNANT/20240720_MouseBrain_LipidAtlas_Pregnant_Brain2_E1_436x299_25um_Att30',
    'PREGNANT/20240719_MouseBrain_LipidAtlas_Pregnant_Brain2_F1_383x288_25um_Att30',

    'PREGNANT/20240722_MouseBrain_LipidAtlas_Pregnant_Brain4_A1_383x301_25um_Att30',
    'PREGNANT/20240723_MouseBrain_LipidAtlas_Pregnant_Brain4_B1_432x321_25um_Att30',
    'PREGNANT/20240724_MouseBrain_LipidAtlas_Pregnant_Brain4_C1_467x353_25um_Att30',
    'PREGNANT/20240727_MouseBrain_LipidAtlas_Pregnant_Brain4_D1_452x307_25um_Att30',
    'PREGNANT/20240726_MouseBrain_LipidAtlas_Pregnant_Brain4_E1_436x307_25um_Att30',
    'PREGNANT/20240725_MouseBrain_LipidAtlas_Pregnant_Brain4_F1_336x273_25um_Att30',
    

    ## GBA

    'GBA1/20220505_MouseBrain_GBA1__7580_2A_395x287_Att30_25um',
    'GBA1/20220506_MouseBrain_GBA1__7580_B2_421x333_Att30_25um',
    'GBA1/20220519_MouseBrain_GBA1_7583_C1_430x310_Att30_25um',
    'GBA1/20220507_MouseBrain_GBA1__7580_C2_427x324_Att30_25um',
    'GBA1/20220510_MouseBrain_GBA1_7580_D2_444x312_Att30_25um',
    'GBA1/20220511_MouseBrain_GBA1_7580_E2_406x314_Att30_25um',
    'GBA1/20220513_MouseBrain_GBA1__7580_F2_352x295_Att30_25um',


    # CERT
    'CERT1/20240803_GDA990_Brain_Slide5_A_195x175_25um_Att30',
    'CERT1/20240730_GDA990_Brain_Slide5_B_393x342_25um_Att30',
    # 'CERT1/20240624_GDA990_BRAINc05_419X340_25UM_Att30',
    'CERT1/20240620_GDA990_Brain_Slide5_D_413x331_25um_Att30',
    'CERT1/20240729_GDA990_Brain_Slide6_E_395x309_25um_Att30',
    'CERT1/20240731_GDA990_Brain_Slide5_F_414x324_25um_Att30',
    'CERT1/20240802_GDA990_Brain_Slide5_G_366x304_25um_Att30'

    
]
print(len(acquisitions))

# One mask per acquisition, in the same order as `acquisitions`. The location is
# built from `path_save` (defined at the top of this cell) instead of repeating
# the hard-coded data directory.
masks = [np.load(os.path.join(path_save, section, 'mask.npy')) for section in acquisitions]

# Signal table: one row per peak (m/z), one column per acquisition index
# (column labels are the indices as strings), initialised to zero.
features = processed_df['PATH_MZ'].values
totsig_df = pd.DataFrame(
    np.zeros((len(features), len(acquisitions))),
    index=features,
    columns=np.arange(0, len(acquisitions)).astype(str),
)
totsig_df

In [None]:
# Number of sections sampled per peak: only a subset is used for now, assuming
# it is representative enough to prioritize at least the good lipids; the full
# dataframe can be computed later.
# NOTE(review): the note originally attached to this line said "check 4
# sections" while the value is 5 - confirm which one is intended.
TBC = 5

# NOTE(review): `root` must already be an open zarr group when this cell runs;
# within the visible part of the notebook it is first opened in a later cell -
# confirm it is defined earlier so that Restart & Run All works.
for xxx, feat in tqdm(enumerate(features)): 
    # store keys are the m/z values formatted with 6 decimals
    feat_dec = f"{float(feat):.6f}"
    # first TBC section keys of this peak, in the order the store lists them
    # (they are not sorted numerically here)
    ns = np.array(list(root[feat_dec].keys())).astype(int).astype(str) [:TBC]
    
    for nnn in ns:

        # the section key doubles as the position of its mask in `masks`
        MASK = masks[int(nnn)]
        image = root[feat_dec][nnn][:]

        # pixels where the mask is 0 are zeroed and still enter the mean,
        # which is taken over the whole image after scaling by 1e6
        image[MASK == 0] = 0
        sig = np.mean(image*1e6)
        
        totsig_df.loc[feat, nnn] = sig
        
# any NaN mean is stored as 0
totsig_df = totsig_df.fillna(0)

totsig_df

In [None]:
totsig_df.to_hdf("totsig_df.h5ad", key="table")

In [None]:
featuresum = totsig_df.sum(axis=1)
featuresum.sort_values()

In [None]:
import pickle

output_file = 'annotation_to_mz.pkl'
with open(output_file, 'wb') as f:
    pickle.dump(annotation_to_mz, f)

In [None]:
# For every annotation keep the m/z whose summed signal (`featuresum`) is the
# largest; ties keep the first candidate. Candidates missing from `featuresum`
# are reported and ignored, annotations without any usable candidate are
# reported and left out of the result.
annotation_to_mz_bestisobar = {}

for annotation, mz_values in annotation_to_mz.items():
    best_mz, max_featuresum = None, -float('inf')

    for mz_value in mz_values:
        if mz_value not in featuresum.index:
            print(f"m/z value {mz_value} not found in featuresum index.")
            continue
        featuresum_value = featuresum.loc[mz_value]
        if featuresum_value > max_featuresum:
            best_mz, max_featuresum = mz_value, featuresum_value

    if best_mz is None:
        print(f"No valid m/z values found for annotation {annotation}.")
        continue
    annotation_to_mz_bestisobar[annotation] = best_mz

len(annotation_to_mz_bestisobar) # ok decent cut of 800+ features

In [None]:
adducts_cleaned = processed_df.loc[processed_df['PATH_MZ'].isin(list(annotation_to_mz_bestisobar.values())),:]
adducts_cleaned

In [None]:
annotation_to_mz['SM 34:1;O2']

In [None]:
annotation_to_mz_bestisobar['SM 34:1;O2'] # this was chosen, does it make sense? check below

In [None]:
annotation_to_mz['PC 36:3']

In [None]:
annotation_to_mz_bestisobar['PC 36:3'] # this was chosen, does it make sense? check below

In [None]:
#### do some checks that the choice makes sense
PATH_DATA = '/data/LBA_DATA/ALL_DATA_PROCESSED/181024_BrainTOTAL_ALL_MOLECULES'
# zarr's access modes are 'r', 'r+', 'a', 'w' and 'w-'; 'rb' is a file-open mode
# and not one of them. The store is only read here, so open it read-only.
root = zarr.open(PATH_DATA, mode='r')
PATH_MZ = np.sort(list(root.group_keys()))
print(len(PATH_MZ))  # number of peaks in the store (does not change inside the loop)

# Show every stored section image for each candidate m/z of one annotation.
for esca in annotation_to_mz['PC 36:3']:#annotation_to_mz['SM 34:1;O2']:
    esca = f"{float(esca):.6f}"  # store keys are m/z values with 6 decimals
    print(esca)
    imgs = root[esca]
    # section keys in increasing numeric order
    ns = np.sort(np.array(list(imgs.keys())).astype(int)).astype(str)

    for k in ns:
        plt.imshow(imgs[k][:])
        plt.title(f"{esca} - section {k}")
        plt.show()

In [None]:
# Count, for every peak, how many sections are stored under its key.
ndetsecs = []
for xxx, feat in tqdm(enumerate(features)):
    feat = f"{float(feat):.6f}"
    section_ids = np.array(list(root[feat].keys())).astype(int)
    ns = np.sort(section_ids).astype(str)
    ndetsecs.append(len(ns))

ndet = pd.Series(ndetsecs, index=features)
ndet

In [None]:
# Same selection as for the signal-based choice, but ranking the candidate m/z
# values by the number of sections in which they are stored (`ndet`); ties keep
# the first candidate.
annotation_to_mz_bestisobar_basedonNsec = {}

for annotation, mz_values in annotation_to_mz.items():
    best_mz, max_ndet = None, -float('inf')

    for mz_value in mz_values:
        if mz_value not in ndet.index:
            print(f"m/z value {mz_value} not found in ndet index.")
            continue
        ndet_value = ndet.loc[mz_value]
        if ndet_value > max_ndet:
            best_mz, max_ndet = mz_value, ndet_value

    if best_mz is None:
        print(f"No valid m/z values found for annotation {annotation}.")
        continue
    annotation_to_mz_bestisobar_basedonNsec[annotation] = best_mz

# Fraction of the signal-based choices that the section-count criterion also selects.
sel1 = list(annotation_to_mz_bestisobar.values())
sel2 = list(annotation_to_mz_bestisobar_basedonNsec.values())
len(np.intersect1d(sel1, sel2)) / len(sel1) # 92% choosing most sections is equivalent to mean signal choice based on 4 sampled sections

In [None]:
peaks = adducts_cleaned['PATH_MZ'].values

# Keys of all peaks stored in the zarr group, sorted. The path gets its own
# name so that `alldet` only ever holds the key array, and the store is opened
# with 'r': 'rb' is a file-open mode, not a zarr access mode, and nothing is
# written here.
alldet_path = '/data/LBA_DATA/ALL_DATA_PROCESSED/181024_BrainTOTAL_ALL_MOLECULES'
root = zarr.open(alldet_path, mode='r')
alldet = np.sort(list(root.group_keys()))

In [None]:
# also keep the heme along with lipids: for each of the two reference m/z
# values pick the stored peak key that is numerically closest

heme0 = 639.1666068
heme1 = 653.14

alldet_mz = alldet.astype(float)
heme0peak = alldet[np.abs(alldet_mz - heme0).argmin()]
heme1peak = alldet[np.abs(alldet_mz - heme1).argmin()]
heme1peak

In [None]:
# Summed signal of the retained peaks, in increasing order.
kept_sorted_signal = featuresum.loc[adducts_cleaned['PATH_MZ']].sort_values()

# Show all stored sections of the three peaks with the lowest summed signal.
weakest_three = kept_sorted_signal.index[:3]  # these were considered undetected, shall we discard?
for esca in weakest_three:
    esca = f"{float(esca):.6f}"
    print(esca)
    section_ids = np.array(list(root[esca].keys())).astype(int)
    ns = np.sort(section_ids).astype(str)
    imgs = root[esca]

    for k in ns:
        plt.imshow(imgs[k][:])
        plt.show()

In [None]:
plt.plot(kept_sorted_signal.values)

In [None]:
# Show all stored sections of the three peaks with the highest summed signal
# (`kept_sorted_signal` is sorted in increasing order).
for esca in kept_sorted_signal.index[-3:]:
    esca = f"{float(esca):.6f}"  # store keys are m/z values with 6 decimals
    print(esca)
    # section keys in increasing numeric order
    ns = np.sort(np.array(list(root[esca].keys())).astype(int)).astype(str)
    imgs = root[esca]

    for k in ns:
        plt.imshow(imgs[k][:])
        plt.show()

In [None]:
# Show all stored sections of the peaks at positions 650-652 of the
# signal-sorted list, i.e. around the manually chosen cut-off.
for esca in kept_sorted_signal.index[650:653]: # manually selected as threshold
    esca = f"{float(esca):.6f}"  # store keys are m/z values with 6 decimals
    print(esca)
    # section keys in increasing numeric order
    ns = np.sort(np.array(list(root[esca].keys())).astype(int)).astype(str)
    imgs = root[esca]

    for k in ns:
        plt.imshow(imgs[k][:])
        plt.show()

In [None]:
## i'll discard those in the 600 bottom mean signal that survived only thanks to lipidmaps so multiple reasons to be untrusted

# `adducts_cleaned` was obtained by slicing another frame, so the new column is
# added with `.assign` (which returns a new frame) rather than by item
# assignment on the slice, which triggers pandas' SettingWithCopy warning.
adducts_cleaned = adducts_cleaned.assign(
    totsig=np.array(kept_sorted_signal.loc[adducts_cleaned['PATH_MZ']])
)

# Signal of the peak at position 600 of the increasing ranking: exactly the 600
# weakest peaks lie strictly below this value when there are no ties.
thr = kept_sorted_signal.iloc[600]

# A peak is dropped when its annotation is just its LIPIDMAPS name AND its
# signal is below the threshold.
lipidmaps_only = adducts_cleaned['Annotation'] == adducts_cleaned['LIPIDMAPS']
weak_signal = adducts_cleaned['totsig'] < thr
adducts_cleaned = adducts_cleaned.loc[~(lipidmaps_only & weak_signal), :]
adducts_cleaned

In [None]:
### remove features that were discarded by a preliminary permissive threshold on Moran's I

toremove = np.load("tmpbadfeatures.npy")
# NOTE(review): the comparison is made on the default text form of each m/z
# value; presumably `toremove` holds strings in that same format - confirm,
# otherwise nothing is removed for values whose formatting differs.
adducts_cleaned = adducts_cleaned.loc[~adducts_cleaned['PATH_MZ'].astype(str).isin(toremove),:]
# The two heme peaks are appended to the retained m/z values.
# NOTE(review): the heme peaks are elements of `alldet` (keys of the zarr
# store) while 'PATH_MZ' comes from the annotation table; if one is text and
# the other numeric, numpy coerces the concatenated array to a single dtype -
# confirm the saved values are in the format the consumers expect.
bestfeatures = np.concatenate([np.array(adducts_cleaned['PATH_MZ']), np.array([heme1peak, heme0peak])])

bestfeatures = np.unique(bestfeatures)  # de-duplicated and sorted
print(len(bestfeatures))
# NOTE(review): the file name hard-codes a feature count; compare with the
# number printed above.
np.save("best1402features.npy", bestfeatures)
adducts_cleaned.to_csv("ALLANNOTATIONS_CLEAN.csv")

## Quantitatively justify total signal as a good metric for adduct prioritization

In [None]:
# calculate the correlation between peaks mapping to the same lipid, one section at a time, for all sections where they both appear, in the raw data

import itertools

annots = pd.read_csv("ALLANNOTATIONS_20241208.csv")
lipidomic_peaks = annots['PATH_MZ']
# Series.map replaces the deprecated DataFrame.applymap; the result is kept as
# a one-column DataFrame.
lipidomic_peaks_str = lipidomic_peaks.map(lambda x: f"{x:.6f}" if pd.notna(x) else "").to_frame()

# Annotations shared by more than one peak, with the list of their m/z values.
vcs = annots['Annotation'].value_counts()
to_dc = annots.loc[annots['Annotation'].isin(vcs.index[vcs>1]),:]
grouped = to_dc.groupby('Annotation')['PATH_MZ'].agg(list)

corrs = []
pairs0 = []
pairs1 = []
secs = []

# Open the store once (read-only) instead of once per pair and section.
root = zarr.open(PATH_DATA, mode='r')

for test in tqdm(grouped):
    for test0, test1 in itertools.combinations(test, 2):
        # Index the store with the same 6-decimal text keys that the rest of
        # the notebook uses. The default text form of a float drops trailing
        # zeros, so looking peaks up by the raw value can miss existing groups,
        # and the failure used to be swallowed silently.
        key0 = f"{float(test0):.6f}"
        key1 = f"{float(test1):.6f}"
        for NNN in range(32):  # only the first 32 sections are compared
            try:
                # the mask is used as a boolean selector whatever dtype it was saved with
                mask = masks[NNN].flatten().astype(bool)
                # images are exponentiated before correlating
                # (presumably they are stored log-transformed - confirm)
                a = np.exp(root[key0][str(NNN)][:])
                b = np.exp(root[key1][str(NNN)][:])
                a = a.flatten()[mask]
                b = b.flatten()[mask]
                corr = np.corrcoef(a, b)[0,1]
            except Exception:
                # best effort: a pair is skipped for sections where either peak
                # is missing or the data cannot be compared; unlike a bare
                # `except`, this still lets Ctrl-C interrupt the long loop
                continue

            corrs.append(corr)
            pairs0.append(test0)
            pairs1.append(test1)
            secs.append(NNN)


paircorr = pd.DataFrame([pairs0, pairs1, corrs, secs]).T
paircorr.columns = ['peak1', 'peak2', 'corr', 'section']
paircorr

In [None]:
# remove the peaks that were tagged by a gentle threshold on Moran's I as spatial noise:
# a pair is flagged (1) as soon as either of its peaks is listed in `toremove`,
# and only unflagged pairs are kept

toremove = np.load("tmpbadfeatures.npy")
peak1_noisy = paircorr['peak1'].astype(str).isin(toremove)
peak2_noisy = paircorr['peak2'].astype(str).isin(toremove)
paircorr['flagged'] = (peak1_noisy | peak2_noisy).astype('int64')
paircorr = paircorr.loc[paircorr['flagged'] == 0,:]
paircorr

In [None]:
# extract the adduct that was chosen as "winner" vs all other putative adducts of the same molecule

# Summed signal of every peak across the columns of `totsig_df`.
featuresum = totsig_df.sum(axis=1)
featuresum.sort_values()

# For every annotation keep the m/z with the largest summed signal (ties keep
# the first candidate). This repeats the selection made earlier in the notebook.
annotation_to_mz_bestisobar = {}

for annotation, mz_values in annotation_to_mz.items():
    max_featuresum = -float('inf') 
    best_mz = None  
    
    for mz_value in mz_values:
        if mz_value in featuresum.index:
            featuresum_value = featuresum.loc[mz_value]
            if featuresum_value > max_featuresum:
                max_featuresum = featuresum_value
                best_mz = mz_value
        else:
            print(f"m/z value {mz_value} not found in featuresum index.")
            
    if best_mz is not None:
        annotation_to_mz_bestisobar[annotation] = best_mz
    else:
        print(f"No valid m/z values found for annotation {annotation}.")
        
len(annotation_to_mz_bestisobar)

# One row per annotation that has more than one candidate m/z: the annotation,
# the chosen m/z ("winner") and the full candidate list ("contenders", which
# still includes the winner itself).
# NOTE(review): `anns` is re-used here as a plain list and no longer refers to
# the table loaded from ALLANNOTATIONS.csv earlier in the notebook.
pairs_involved = []
winners = []
anns = []

for ann in list(annotation_to_mz.keys()):
    if len(list(annotation_to_mz[ann])) > 1:
        pairs_involved.append(list(annotation_to_mz[ann]))
        # raises KeyError if no winner could be selected for this annotation
        winners.append(annotation_to_mz_bestisobar[ann])
        anns.append(ann)
        
checking_vs_winner = pd.DataFrame([anns, winners, pairs_involved]).T
checking_vs_winner.columns = ["ann", "winner", "contenders"]

# i want to remove the metaspace/lipidmaps ones, i don't care much for them as they are already low confidence hits
# (only annotations that have a Score above 2 in `annots` are kept)
checking_vs_winner = checking_vs_winner.loc[checking_vs_winner['ann'].isin(annots.loc[annots['Score'] > 2, 'Annotation'].values),:]
checking_vs_winner # so there are 195 important cases where we did make a choice

In [None]:
# extract the correlation of these adduct peak pairs

# `paircorr` stores every unordered pair once, in the order produced by
# itertools.combinations, so the winner can sit in either 'peak1' or 'peak2'.
# The merge below only matches rows with peak1 == winner and peak2 == contender,
# therefore the mirrored orientation of every pair is added first; otherwise
# every pair whose winner happens to be listed second is lost.
paircorr_mirrored = paircorr.rename(columns={'peak1': 'peak2', 'peak2': 'peak1'})
paircorr_both = pd.concat([paircorr, paircorr_mirrored], ignore_index=True, sort=False)

# After the merge 'peak1' is always the winner and 'peak2' one of its contenders.
filtered_df = paircorr_both.merge(
    checking_vs_winner.explode('contenders'),
    left_on=['peak1', 'peak2'],
    right_on=['winner', 'contenders']
)

len(filtered_df['ann'].unique()) # number of adduct-prioritized lipids with at least one winner-vs-contender correlation
# (36 was recorded when only one orientation of each pair was matched - re-check)
# the others do not appear consistently in the same sections, so they do not have any correlation pairs - empirically, often because one of them is so broken/low level to drop out often

In [None]:
inuseadducts = filtered_df[['ann', 'peak1', 'peak2', 'corr']].groupby(['ann', 'peak1', 'peak2']).mean()
inuseadducts

In [None]:
problematic = inuseadducts.loc[inuseadducts['corr'] < 0.5,:]
problematic.shape # 32 debatable cases, for the others there is robust correlation (as this is the mean across sections, and in some sections we see clear reassuring co-distribution)

In [None]:
# inspect the debatable cases one by one manually for a couple of sections, what do we see? in most cases we are taking the good one whereas the other is just noise, explaining the lack of correlation.
# it's never two beautiful but discordant spatial patterns

# Open the store once (read-only) instead of once per section.
root = zarr.open(PATH_DATA, mode='r')

for xxx in problematic.index:

    # index entries are (annotation, winner m/z, contender m/z)
    print(xxx[0])
    peak1 = xxx[1]
    peak2 = xxx[2]

    # the store is keyed by m/z values formatted with 6 decimals, as in the
    # other cells that read from it
    key1 = f"{float(peak1):.6f}"
    key2 = f"{float(peak2):.6f}"

    flag = 0  # number of sections already shown for this pair

    for NNN in range(32):
        if flag >= 2:
            break  # two sections per pair are enough for a visual check

        try:
            # images are exponentiated before display
            # (presumably they are stored log-transformed - confirm)
            a = np.exp(root[key1][str(NNN)][:])
            b = np.exp(root[key2][str(NNN)][:])
        except Exception:
            # this section does not hold both peaks: try the next one
            continue

        plt.imshow(a)
        plt.title("winner "+xxx[0])
        plt.show()
        plt.imshow(b)
        plt.title("contender "+xxx[0])
        plt.show()
        flag = flag +1