In [None]:
import zarr
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Location of the normalised all-brains zarr store.
PATH_DATA = '/data/luca/lipidatlas/uMAIA_allbrains/021124_ALLBRAINS_normalised.zarr'

# Open read-only. zarr's persistence modes are 'r', 'r+', 'a', 'w' and 'w-';
# the previous 'rb' is a file-object mode, not a zarr one, so the store was not
# guaranteed to be opened read-only (and strict zarr versions may reject it).
root = zarr.open(PATH_DATA, mode='r')

# Sorted names of the top-level groups (presumably one group per m/z peak).
PATH_MZ = np.sort(list(root.group_keys()))

print(len(PATH_MZ))

# Quick visual sanity check: exponentiated image of entry 10 of the 30th group.
plt.imshow(np.exp(root[PATH_MZ[29]][10][:]))

## Extract the data

In [None]:
datasetsize = 138 # OUR NUMBER OF ACQUISITIONS
n_lipids = len(PATH_MZ)

# Dense (lipid, section, 500, 500) tensor, NaN-initialised so that every pixel
# that is never written below stays NaN.
lipid_native_sections_array = np.full((n_lipids, datasetsize, 500, 500), np.nan)

for lipid_idx in tqdm(range(n_lipids)):
    lipid_group = root[PATH_MZ[lipid_idx]]
    for section_idx in range(datasetsize):
        section_img = lipid_group[section_idx][:]
        n_rows, n_cols = section_img.shape
        # Images are written into the top-left corner of the 500 x 500 canvas.
        lipid_native_sections_array[lipid_idx, section_idx, :n_rows, :n_cols] = section_img

print(lipid_native_sections_array.shape)

In [None]:
import pandas as pd

# Build the input for a pixel x lipid dataframe (similar to "cell x gene").
lipid_tensor = lipid_native_sections_array
n_lipid_rows = lipid_tensor.shape[0]

# Collapse (section, row, col) into one pixel axis: one row per lipid.
flattened_lipid_tensor = lipid_tensor.reshape(n_lipid_rows, -1)

# Temporary 1-based lipid labels; replaced by the real names further down.
lipid_names = ["lipid" + str(n) for n in range(1, n_lipid_rows + 1)]

flattened_lipid_tensor

In [None]:
# Pixel labels encode the (1-based) section, row and column so that each pixel
# keeps its spatial position as a unique identifier. The iteration order
# (section, then row, then column) matches the C-order flattening of the tensor.
n_sections, n_rows, n_cols = lipid_tensor.shape[1:]
column_names = [
    f"section{sec}_pixel{row}_{col}"
    for sec in range(1, n_sections + 1)
    for row in range(1, n_rows + 1)
    for col in range(1, n_cols + 1)
]

df = pd.DataFrame(flattened_lipid_tensor, index=lipid_names, columns=column_names)
df

In [None]:
## removing out-of-brain pixels

# Transpose to pixel x lipid, then drop every pixel whose lipid values are all
# NaN (i.e. canvas positions that were never filled).
df_transposed = df.T.dropna(how='all')

df_transposed

In [None]:
# Replace the temporary "lipidN" labels with the sorted zarr group keys; the
# order matches the first axis of the tensor the frame was built from.
df_transposed.columns = PATH_MZ

In [None]:
# Parse the "section<S>_pixel<X>_<Y>" row labels back into integer columns.
label_parts = df_transposed.index.to_series().str.split('_', expand=True)
label_parts.columns = ['Section', 'x', 'y']

# Strip the textual prefixes so that only the numbers remain.
label_parts['x'] = label_parts['x'].str.split('pixel').str.get(1)
label_parts['Section'] = label_parts['Section'].str.replace('section', '')

df_index = label_parts.astype(int)

# Append Section / x / y as the last three columns of the pixel table.
pixels = df_transposed = df_transposed.join(df_index)

pixels

In [None]:
# Checkpoint the pixel x lipid table (values not yet exponentiated) to disk.
pixels.to_parquet("20241102_pixels_allipids_allbrains.parquet")

## Exponentiate

In [None]:
# restart the kernel due to memory

import pandas as pd
import zarr
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

PATH_DATA = '/data/luca/lipidatlas/uMAIA_allbrains/021124_ALLBRAINS_normalised.zarr'

### wait for the "full" dataset

# Open read-only. zarr's persistence modes are 'r', 'r+', 'a', 'w' and 'w-';
# the previous 'rb' is a file-object mode, not a zarr one.
root = zarr.open(PATH_DATA, mode='r')

# Only the sorted group names are needed here (to know how many lipid columns there are).
PATH_MZ = np.sort(list(root.group_keys()))

# Reload the checkpoint written before the kernel restart.
pixels = pd.read_parquet("20241102_pixels_allipids_allbrains.parquet")

In [None]:
Nlipids = len(PATH_MZ)

# Only the first `Nlipids` columns hold lipid values; the trailing
# Section / x / y columns are left untouched.
lipid_block = pixels.iloc[:, :Nlipids]
pixels.iloc[:, :Nlipids] = np.exp(lipid_block)
pixels

In [None]:
# Checkpoint the exponentiated pixel table.
pixels.to_parquet("20241102_exp_pixels_allipids_allbrains.parquet")

## Metadata

In [None]:
import pandas as pd
import zarr
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

#pixels = pd.read_parquet("20241024_exp_pixels_allipids_allbrains.parquet")

# Per-acquisition metadata table (joined onto the pixels in the next cell).
metadata = pd.read_csv("acquisitions_metadata.csv")

# Keep the pixel labels: they are re-applied after the merge.
ind = pixels.index

# The third-from-last column becomes the merge key "SectionID".
cols = pixels.columns.to_list()
cols[-3] = "SectionID"
pixels.columns = cols
pixels.index = ind

In [None]:
# Attach the acquisition metadata to every pixel. `validate='many_to_one'`
# makes pandas raise a MergeError if `metadata` lists a SectionID more than
# once; without it such a duplicate would silently multiply pixel rows and only
# surface later as a length mismatch when the saved index is re-applied.
# NOTE: merge() returns a fresh integer index; the pixel labels are restored afterwards.
pixels = pixels.merge(metadata, on='SectionID', how='left', validate='many_to_one')

In [None]:
# Re-apply the pixel labels saved in `ind`: the merge above replaced them with
# a new integer index. This relies on the left merge keeping row count and order.
pixels.index = ind
pixels

## Remove background

In [None]:
## clean up from background pixels

# A pixel counts as background when every lipid value is below 0.00011.
lipid_values = pixels.iloc[:, :len(PATH_MZ)]
mask = lipid_values.lt(0.00011).all(axis=1)
pixels = pixels.loc[~mask]
pixels

## CCF coordinates

In [None]:
# for the reference atlas

import os
import numpy as np
import pandas as pd

# Initial CCF coordinates; only the three coordinate columns are used.
coordinates_initialization = pd.read_hdf("ccf_coordinates_colas_initialization.h5ad", key="table")
coordinates_initialization = coordinates_initialization[['xccf', 'yccf', 'zccf']]

# Every file in STalignOutput except the Jupyter checkpoint folder, in sorted order.
files = np.sort(os.listdir("STalignOutput"))
files = files[files != '.ipynb_checkpoints']

print(files)

# Read all STalign outputs and stack them into one table indexed by pixel label.
stalign_dir = os.getcwd()+"/STalignOutput/"
stal = pd.concat([pd.read_csv(stalign_dir+fname, index_col=0) for fname in files])
stal = stal[['density', 'y_index_new', 'z_index_new']]

# STalign provides refined y/z voxel indices; dividing by 40 gives yccf / zccf.
for ccf_col, refined_col in (('yccf', 'y_index_new'), ('zccf', 'z_index_new')):
    stal[ccf_col] = stal[refined_col] / 40

# xccf is taken unchanged from the initialization table, aligned on pixel label.
coordinates_initialization = coordinates_initialization.loc[stal.index,:]
stal['xccf'] = coordinates_initialization['xccf']

# Integer voxel indices: coordinate * 40, truncated.
for index_col, ccf_col in (('x_index', 'xccf'), ('y_index', 'yccf'), ('z_index', 'zccf')):
    stal[index_col] = (stal[ccf_col]*40).astype(int)

In [None]:
# for the second atlas

import re

# Acquisition paths of the "SecondAtlas" sample, in metadata order.
second_atlas_paths = metadata.loc[metadata['Sample'] == "SecondAtlas", "Path"]

# Pull a section label out of each path: first try "_<digits><LETTER>_", then
# "_<LETTERS><digits>_"; None when neither pattern is present.
exprs = []
for path in second_atlas_paths:
    match = re.search(r'_(\d+[A-Z])_', path) or re.search(r'_([A-Z]+\d+)_', path)
    exprs.append(match.group(1) if match else None)

In [None]:
import os
import re
import skimage

COORDS_SUFFIX = "_Coords.tif"

filez = os.listdir("/data/luca/lipidatlas/ManuscriptAnalysisRound2/Atlas_Brain3/")
# Keep only the coordinate images and strip their suffix. Previously the suffix
# length was cut from every directory entry, so any other file (e.g. a
# checkpoint folder) produced a mangled name and broke the label parsing below.
filez = [file[:-len(COORDS_SUFFIX)] for file in filez if file.endswith(COORDS_SUFFIX)]

# Each file stem contains bracketed, quoted tokens (['...']); the second one is
# used as the section label.
matches = [re.findall(r"\[['\"]([^'\"]+)['\"]\]", text)[1] for text in filez]

# Drop a leading "2_" from labels that carry it.
ms = [m[2:] if m[:2] == '2_' else m for m in matches]

print(len(ms)) # author's note: so 7 sections of brain 3 (OB i guess) are missing.

# Keep the labels found in the metadata paths that also have a coordinate file,
# in metadata order, and look up the matching file stem for each.
exprssel = [e for e in exprs if e in ms]
sorted_filez = pd.DataFrame(filez, index=ms).loc[exprssel,:]
sorted_filez.columns = ['Path_coordinates']

In [None]:
# Metadata rows of the second atlas, re-indexed by the section label parsed
# from each path. Explicit copies are taken so that the column assignment below
# acts on an independent frame and not on a slice of `metadata` (this avoids
# pandas' chained-assignment warning and its ambiguity).
tmp = metadata.loc[metadata['Sample'] == "SecondAtlas", :].copy()
tmp.index = exprs

# Keep only sections that have a coordinate file and attach the file stem.
tmp = tmp.loc[sorted_filez.index,:].copy()
tmp['Path_coordinates'] = sorted_filez['Path_coordinates']
tmp

In [None]:
import numpy as np
import pandas as pd
import skimage.io

# One CCF table per second-atlas section: each pixel of the coordinate image
# becomes a row labelled like the rows of `pixels`.
dfs = []
for it in range(len(tmp)):
    coords_stem = tmp['Path_coordinates'].iloc[it]
    section_id = tmp['SectionID'].iloc[it]

    image = skimage.io.imread(f"/data/luca/lipidatlas/ManuscriptAnalysisRound2/Atlas_Brain3/{coords_stem}_Coords.tif")
    height, width, _ = image.shape

    # Row / column position of every pixel, flattened in the same (row-major)
    # order as the image reshape below.
    col_grid, row_grid = np.meshgrid(np.arange(width), np.arange(height))
    row_labels = pd.Series(row_grid.flatten() + 1).astype(str)
    col_labels = pd.Series(col_grid.flatten() + 1).astype(str)

    # The three image channels are read as xccf, yccf, zccf.
    section_ccf = pd.DataFrame(image.reshape(-1, 3), columns=['xccf', 'yccf', 'zccf'])
    section_ccf.index = "section"+section_id.astype(str)+"_pixel"+row_labels+"_"+col_labels

    dfs.append(section_ccf)

secondatlas_ccf = pd.concat(dfs)

# Keep only pixels that are still present in the `pixels` table.
pixels_index_set = set(pixels.index)
goodindexes = [label for label in secondatlas_ccf.index if label in pixels_index_set]
secondatlas_ccf = secondatlas_ccf.loc[goodindexes,:]
secondatlas_ccf

In [None]:
# Integer voxel indices: coordinate * 40, truncated (same convention as for the reference atlas).
for index_col, ccf_col in (('x_index', 'xccf'), ('y_index', 'yccf'), ('z_index', 'zccf')):
    secondatlas_ccf[index_col] = (secondatlas_ccf[ccf_col]*40).astype(int)
secondatlas_ccf

In [None]:
# Stack the CCF coordinates of both atlases (rows), using the same six columns.
ccf_columns = ["xccf", "yccf", "zccf", "x_index", "y_index", "z_index"]
ccf_dataframe = pd.concat([stal[ccf_columns], secondatlas_ccf], axis=0)

In [None]:
### there will be several NaN's wherever we do not yet have CCF coordinates
# Column-wise concat aligned on the pixel label.
pixels_with_ccf = pd.concat([pixels, ccf_dataframe], axis=1)

# remove the extrapixels unfiltered in the STAlign procedure (rows with no 'Condition')
pixels = pixels_with_ccf.dropna(subset=['Condition'])
pixels

In [None]:
# Force the 'Section' column to integer dtype.
pixels['Section'] = np.array(pixels['Section']).astype(int)

In [None]:
# Checkpoint: exponentiated pixels with metadata and CCF coordinates.
pixels.to_parquet("20241102_exp_pixels___allipids_allbrains.parquet")

In [None]:
# Visual QC: for every sample, draw one panel per section coloured by a single
# lipid, so that bad sections can be spotted.
for samp in pixels['Sample'].unique():

    data = pixels.loc[pixels['Sample'] == samp,:] # just do a check on a lipid to omit bad sections

    # Shared extents so that all panels of a sample use the same frame.
    global_min_x = data['x'].min()
    global_max_x = data['x'].max()
    global_min_y = data['y'].min()
    global_max_y = data['y'].max()

    cmap = plt.cm.plasma

    for currentPC in ['741.530795']:

        fig, axes = plt.subplots(4, 11, figsize=(20, 10))
        axes = axes.flatten()

        n_sections = int(np.max(data['Section']))
        for section in range(1, n_sections + 1):
            ax = axes[section - 1]
            ddf = data[(data['Section'] == section)]

            ax.axis('off')
            ax.set_title(section)
            if ddf.empty:
                # No pixels for this section in this sample: leave an empty,
                # titled panel instead of computing percentiles of nothing.
                continue

            ax.scatter(ddf['y'], -ddf['x'], c=np.array(ddf[currentPC]), cmap=cmap, s=0.1,rasterized=True, vmin=np.percentile(ddf[currentPC], 10), vmax=np.percentile(ddf[currentPC], 98))
            ax.set_aspect('equal')
            # The vertical coordinate is plotted as -x, so the limits must be
            # negated as well; the former (min x, max x) limits were on the
            # positive side and left the points outside the visible range.
            ax.set_ylim(-global_max_x, -global_min_x)
            ax.set_xlim(global_min_y, global_max_y)

        # Hide every panel that is not used by a section.
        for ax in axes[n_sections:]:
            ax.axis('off')

        plt.tight_layout(rect=[0, 0, 0.9, 1])
        plt.show()
        # Release the figure so that looping over samples does not accumulate memory.
        plt.close(fig)

## Allen metadata

In [None]:
from bg_atlasapi import BrainGlobeAtlas

# Load the "allen_mouse_25um" BrainGlobe atlas and keep its `reference`
# attribute (presumably the reference image volume; its shape is displayed further down).
atlas = BrainGlobeAtlas("allen_mouse_25um")
reference_image = atlas.reference

In [None]:
import numpy as np

# Precomputed array that is later indexed with (x_index, y_index, z_index) to
# fill the 'boundary' column (presumably an eroded annotation volume - confirm).
erodedannot = np.load("eroded_annot.npy")
erodedannot.shape

In [None]:
import pandas as pd 

# Reload the checkpoint that contains metadata and CCF coordinates.
pixels = pd.read_parquet("20241102_exp_pixels___allipids_allbrains.parquet")

In [None]:
coord_cols = ['x_index', 'y_index', 'z_index']

# Default: no boundary value. Only pixels with a usable voxel are filled below.
pixels['boundary'] = np.nan

# Pixels that have all three voxel indices, cast to integers.
has_coords = pixels[coord_cols].notna().all(axis=1)
voxel_idx = pixels.loc[has_coords, coord_cols].astype(int)

# Keep only voxels that fall inside the annotation volume on every axis.
X_MAX, Y_MAX, Z_MAX = erodedannot.shape
upper = np.array([X_MAX, Y_MAX, Z_MAX])
inside = ((voxel_idx >= 0) & voxel_idx.lt(upper, axis='columns')).all(axis=1)
in_volume = voxel_idx.loc[inside]

# Look up the annotation value of each remaining voxel.
pixels.loc[in_volume.index, 'boundary'] = erodedannot[
    in_volume['x_index'].to_numpy(),
    in_volume['y_index'].to_numpy(),
    in_volume['z_index'].to_numpy()
]

In [None]:
import matplotlib.pyplot as plt

In [None]:
# check the registration is good enough

# Restrict to the two atlas samples.
atlas_samples = ["ReferenceAtlas", "SecondAtlas"]
data = pixels.loc[pixels['Sample'].isin(atlas_samples), :]

In [None]:
# Display the atlas-only subset for inspection.
data

In [None]:
def _rasterize_section(section_df, value_col, shape=(320, 456), extra_required=()):
    """Paint `value_col` of one section onto a zero-filled image.

    The image row is taken from 'y_index' and the image column from 'z_index'.
    Pixels are skipped when either index (or any column listed in
    `extra_required`) is missing / non-finite, or when the voxel falls outside
    `shape`. When several pixels map to the same voxel, the last one wins.

    Parameters
    ----------
    section_df : pandas.DataFrame
        Pixels of a single section.
    value_col : str
        Column whose values are written into the image. A missing column
        raises KeyError instead of silently producing an empty image.
    shape : tuple of int
        (rows, columns) of the output image.
    extra_required : iterable of str
        Additional columns that must hold a finite number for a pixel to be drawn.

    Returns
    -------
    numpy.ndarray
        Float image of the given shape.
    """
    image = np.zeros(shape)
    rows = section_df['y_index'].to_numpy(dtype=float)
    cols = section_df['z_index'].to_numpy(dtype=float)
    values = section_df[value_col].to_numpy(dtype=float)

    # NaN coordinates are legitimate (pixels without CCF registration): drop them.
    usable = np.isfinite(rows) & np.isfinite(cols)
    for col in extra_required:
        usable &= np.isfinite(section_df[col].to_numpy(dtype=float))

    rows = rows[usable].astype(int)
    cols = cols[usable].astype(int)
    values = values[usable]

    # Explicit bounds check: negative indices must not wrap around to the
    # opposite edge of the image.
    inside = (rows >= 0) & (rows < shape[0]) & (cols >= 0) & (cols < shape[1])
    image[rows[inside], cols[inside]] = values[inside]
    return image


for SEC in data['SectionID'].unique()[32:]: #brain 3 check
    sec10 = data.loc[data['SectionID'] == SEC,:]

    # Lipid image in CCF space ...
    plt.imshow(_rasterize_section(sec10, '870.540956'))

    # ... with the 'boundary' values overlaid semi-transparently.
    # "Greys" is the canonical matplotlib name of this colormap.
    boundary_image = _rasterize_section(sec10, 'boundary', extra_required=('x_index',))
    plt.imshow(boundary_image, cmap="Greys", alpha=0.3)
    plt.show()

In [None]:
# Shape of the BrainGlobe reference array, for comparison with the image size used above.
reference_image.shape

In [None]:
#### assigning Allen regions

from allensdk.core.mouse_connectivity_cache import MouseConnectivityCache
import pandas as pd
import numpy as np
from tqdm import tqdm

## use with care!
# NOTE: this silences every warning for the rest of the kernel session. The
# lookup below no longer depends on it; it is kept only so that later cells
# behave as before.
import warnings
warnings.filterwarnings('ignore')

mcc = MouseConnectivityCache(manifest_file='mouse_connectivity_manifest.json')
structure_tree = mcc.get_structure_tree()

data = pixels  # alias, not a copy: the new columns are added to `pixels` as well
annotation, _ = mcc.get_annotation_volume()

import pandas as pd
import matplotlib.pyplot as plt

num_rows = len(data)

# --- 1. Voxel of every pixel -------------------------------------------------
# Rows without finite CCF indices, or whose voxel lies outside the annotation
# volume, keep the default (empty) metadata.
coords = data[['x_index', 'y_index', 'z_index']].to_numpy(dtype=float)
has_coords = np.isfinite(coords).all(axis=1)
voxels = np.zeros(coords.shape, dtype=np.int64)
voxels[has_coords] = coords[has_coords].astype(np.int64)
in_volume = has_coords & ((voxels >= 0) & (voxels < np.array(annotation.shape))).all(axis=1)

row_positions = np.flatnonzero(in_volume)
region_ids = annotation[tuple(voxels[row_positions].T)]

# --- 2. Resolve each distinct annotation id once -----------------------------
# Ids unknown to the structure tree come back as None and are skipped.
region_by_id = {}
for region_id in np.unique(region_ids).tolist():
    brain_region = structure_tree.get_structures_by_id([region_id])[0]
    if brain_region is not None:
        region_by_id[region_id] = brain_region

# --- 3. Build the metadata columns -------------------------------------------
# The columns are filled as plain object arrays and attached to the frame in
# one assignment each. The former per-row `data[col][i] = value` was a chained
# assignment wrapped in a bare `except`, which can silently write nothing
# (e.g. with pandas Copy-on-Write).
allen_defaults = {
    'acronym': '',
    'id': 0,
    'name': '',
    'structure_id_path': '',
    'structure_set_ids': '',
    'rgb_triplet': '',
}
allen_columns = {}
for field, default in allen_defaults.items():
    column = np.empty(num_rows, dtype=object)
    column[:] = default
    allen_columns[field] = column

for pos, region_id in tqdm(zip(row_positions.tolist(), region_ids.tolist()), total=len(row_positions)):
    brain_region = region_by_id.get(region_id)
    if brain_region is None:
        continue
    for field in allen_defaults:
        allen_columns[field][pos] = brain_region[field]

# Attach in the original column order (a later cell selects columns by position).
for field in allen_defaults:
    column = allen_columns[field]
    data[field] = column.astype(np.int64) if field == 'id' else column
            

def rgb_to_hex_safe(rgb):
    """Format an RGB triplet of integers as a '#rrggbb' string.

    Returns black ("#000000") when `rgb` cannot be unpacked (TypeError, e.g.
    None or NaN) or provides fewer than three values (IndexError, e.g. '').
    """
    fallback = "#000000"
    try:
        hex_color = '#{:02x}{:02x}{:02x}'.format(*rgb)
    except (TypeError, IndexError):
        return fallback
    return hex_color

# Hex colour per pixel derived from the Allen RGB triplet; pixels without a
# triplet (empty default) get "#000000" via the fallback of rgb_to_hex_safe.
data['allencolor'] = data['rgb_triplet'].apply(rgb_to_hex_safe)

data

In [None]:
# Persist only the last 14 columns - presumably the 6 CCF coordinate/index
# columns, 'boundary' and the 7 Allen columns added above (this depends on the
# column order; verify if columns are added or removed upstream).
data.iloc[:,-14:].to_hdf("allenmeta.h5ad", key="table")

In [None]:
import pandas as pd
import numpy as np

# Reload the pixel checkpoint and the Allen metadata saved above.
dat = pd.read_parquet("20241102_exp_pixels___allipids_allbrains.parquet")
allenmeta = pd.read_hdf("allenmeta.h5ad", key="table")
dat

In [None]:
# Column-wise concat aligned on the pixel label. NOTE(review): `allenmeta`
# appears to repeat columns already present in `dat` (e.g. 'zccf', 'yccf'), so
# some column names occur twice here; later cells select `.iloc[:, 0]` and
# eventually drop the duplicated columns.
data = pd.concat([dat, allenmeta], axis=1)
data

In [None]:
# check that Allen metadata transfer went smooth
import matplotlib.pyplot as plt
import numpy as np


# One section (SectionID 49) coloured by its Allen region colour.
is_section_49 = data['SectionID'] == 49
tmp = data.loc[is_section_49, :]
tmp['allencolor'] = tmp['allencolor'].fillna("#000000")

# 'zccf' / 'yccf' occur twice in `data`, hence the selection of the first copy.
horizontal = tmp['zccf'].iloc[:, 0]
vertical = -tmp['yccf'].iloc[:, 0]
plt.scatter(horizontal, vertical, c=tmp['allencolor'].to_numpy(), s=0.5)
plt.show()

## Clean up out-of-brain pixels by using the Allen annotation

In [None]:
data['allencolor'] = data['allencolor'].fillna("#000000")

# A pixel is kept when at least one of the following holds:
#   * it received an Allen region (non-empty rgb_triplet),
#   * it does not belong to one of the two atlas samples,
#   * it belongs to the SecondAtlas sample and to a section below 8.
has_allen_region = data["rgb_triplet"] != ""
not_atlas_sample = ~data["Sample"].isin(['ReferenceAtlas', 'SecondAtlas'])
early_second_atlas = (data["Sample"] == "SecondAtlas") & (data["Section"] < 8)

data = data.loc[has_allen_region | not_atlas_sample | early_second_atlas, :]

In [None]:
# check that Allen cleanup went smooth
import matplotlib.pyplot as plt

# Same section as in the transfer check, after the cleanup filter.
tmp = data[data['SectionID'] == 49]
horizontal = tmp['zccf'].iloc[:, 0]
vertical = -tmp['yccf'].iloc[:, 0]
plt.scatter(horizontal, vertical, c=tmp['allencolor'], s=0.5)
plt.show()

In [None]:
# Keep the first occurrence of every duplicated column name.
data = data.loc[:, ~data.columns.duplicated()]


def _as_underscore_string(value):
    """Render one cell as an underscore-joined string.

    Lists are joined element-wise, null values become the empty string, and
    any other value is rendered via str().
    """
    if isinstance(value, list):
        items = value
    elif pd.isnull(value):
        items = []
    else:
        items = [value]
    return '_'.join(map(str, items))


# Flatten the list-valued Allen columns into plain strings.
for list_col in ['structure_id_path', 'structure_set_ids', 'rgb_triplet']:
    data[list_col] = data[list_col].apply(_as_underscore_string)

In [None]:
# Final table: pixels with Allen metadata, after the annotation-based cleanup.
data.to_hdf("20241103_pixels_allips_allbrains_allen_pixelcleaned.h5ad", key="df")

In [None]:
# Separate export restricted to the "ReferenceAtlas" sample.
data.loc[data['Sample'] == "ReferenceAtlas",:].to_parquet("brain2only.parquet")

In [None]:
# Number of remaining pixels per SectionID.
data['SectionID'].value_counts() # fast identity check