In [None]:
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.colors import Normalize
from matplotlib.cm import ScalarMappable
import pandas as pd
import networkx as nx
from adjustText import adjust_text
import random
import squidpy as sq
import scanpy as sc
import numpy as np
from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Write text in exported PDFs as Type 42 (TrueType) fonts.
matplotlib.rcParams['pdf.fonttype'] = 42

# Goal of this notebook: combine spatial autocorrelation (Moran's I) with the
# variance within vs across sections to select the lipids that show a spatially
# informative pattern across all sections; those lipids are then used for clustering.
# Load the table and keep only the rows whose 'BadSection' flag equals 0.
data = pd.read_parquet("brain2only.parquet").loc[lambda frame: frame['BadSection'] == 0]
data

## Check various metrics for each lipid: variance of section-wise variances, Moran's I, dropout level

In [None]:
# Load the precalculated Moran's I coefficients (section-wise, lipid-wise).
# NOTE: missing coefficients are replaced by 0, so they count as "no spatial
# autocorrelation" in any later average instead of being skipped (the author
# flagged this as a risky move).
moran = pd.read_csv("morans_by_sec.csv", index_col=0).fillna(0)
moran

In [None]:
# Restrict the Moran table to the sections of the atlas: the first 32 columns.
atlas_section_columns = slice(None, 32)
moran = moran.iloc[:, atlas_section_columns]
moran

In [None]:
# standardize the data

# Every column except the trailing 23 is taken as a lipid intensity.
# Values above 1 are treated as broken and replaced by a small constant.
# `mask` returns a new frame, so the replacement does not raise
# SettingWithCopyWarning and can never write through to `data`, whatever the
# pandas version / copy-on-write mode.
# NOTE(review): if `data` itself is also meant to be cleaned, do it explicitly.
inputlips = data.iloc[:,:-23].mask(lambda intensities: intensities > 1., 0.0001) ### broken values

# z-score every lipid column (zero mean, unit variance)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(inputlips)

# put the scaled matrix back into a frame with the original labels
inputlips = pd.DataFrame(scaled_data, columns=inputlips.columns, index=inputlips.index)
inputlips

In [None]:
# define a function to evaluate the variances and the means of section-wise variances

# Wrap the standardized lipid matrix in an AnnData object and attach, for every
# row, the (zccf, yccf, Section) triplet as its 'spatial' entry; the third
# column is what the ranking function reads as the section label.
adata = sc.AnnData(X=inputlips)
spatial_columns = ['zccf', 'yccf', 'Section']
adata.obsm['spatial'] = data.loc[data.index, spatial_columns].to_numpy()

def rank_features_by_combined_score(tempadata, var_weight=0.5):
    """Score every feature by its within-section variance across sections.

    For each feature (column of ``tempadata.X``) the variance is computed
    separately inside every section; the section label of an observation is
    read from column 2 of ``tempadata.obsm['spatial']``. The per-section
    variances are then summarised across sections.

    Parameters
    ----------
    tempadata
        Object exposing ``X`` (observations x features, dense or with a
        ``toarray`` method) and ``obsm['spatial']`` whose third column holds
        the section label of each observation.
    var_weight : float, default 0.5
        Penalty applied to the variance of the section-wise variances. The
        default reproduces the original ``-var_of_vars/2 + mean_of_vars``.

    Returns
    -------
    var_of_vars : numpy.ndarray
        Variance, across sections, of each feature's within-section variance.
    mean_of_vars : numpy.ndarray
        Mean, across sections, of each feature's within-section variance.
    combined_score : numpy.ndarray
        ``mean_of_vars - var_weight * var_of_vars``.
    """
    sections = np.asarray(tempadata.obsm['spatial'])[:, 2]

    values = tempadata.X
    if hasattr(values, "toarray"):
        # densify sparse input so boolean row selection and `var` work below
        values = values.toarray()
    values = np.asarray(values)

    unique_sections = np.unique(sections)

    # One row mask per section (instead of one per section *and* feature),
    # reducing all features of that section in a single call.
    per_section = [values[sections == section].var(axis=0) for section in unique_sections]
    # reshape keeps a well-formed (n_sections, n_features) array even when
    # there are no sections at all
    section_variances = np.asarray(per_section).reshape(len(unique_sections), values.shape[1])

    var_of_vars = section_variances.var(axis=0)
    mean_of_vars = section_variances.mean(axis=0)

    # reward large within-section variance, penalise uneven variance across sections
    combined_score = mean_of_vars - var_weight * var_of_vars

    return var_of_vars, mean_of_vars, combined_score

var_of_vars, mean_of_vars, combined_score = rank_features_by_combined_score(adata)
# feature positions ordered from the highest to the lowest combined score
ranked_indices = np.argsort(combined_score)[::-1]

# One labelled curve per metric, all drawn in the same (combined-score) order,
# so the three figures can be told apart when the notebook is skimmed.
for metric_name, metric_values in (
    ("var_of_vars", var_of_vars),
    ("mean_of_vars", mean_of_vars),
    ("combined_score", combined_score),
):
    fig, ax = plt.subplots()
    ax.plot(metric_values[ranked_indices])
    ax.set(xlabel="feature rank (by combined score)", ylabel=metric_name, title=metric_name)
    plt.show()

In [None]:
# check the scores

# Built column by column so that every numeric score keeps its float dtype
# (stacking the label array together with the score arrays and transposing
# can turn every column into dtype `object`).
scores = pd.DataFrame({
    "spatvar": np.array(inputlips.columns)[ranked_indices],
    "var_of_vars": var_of_vars[ranked_indices],
    "mean_of_vars": mean_of_vars[ranked_indices],
    "combined_score": combined_score[ranked_indices],
})
scores

In [None]:
# Average Moran's I of every row over the selected sections, highest first.
# The ascending sort is reversed (rather than sorting descending) on purpose:
# it fixes the order of tied values, and later cells index rows by position.
mean_moran = moran.mean(axis=1)
moran_sorted = mean_moran.sort_values().iloc[::-1]
moran_sorted

In [None]:
# Index both tables with the same normalised label (float -> str) so that the
# lookups below cannot miss because of formatting differences.
scores.index = scores['spatvar'].astype(float).astype(str)
moran_labels = moran_sorted.index.astype(float).astype(str)
# reorder the scores to follow the Moran ranking
scores = scores.loc[moran_labels,:]

# bad is bad, control outliers: floor the combined score at -5.
# A single .loc assignment is used because the chained form
# scores['combined_score'][mask] = -5 writes to a temporary object under
# pandas Copy-on-Write and then leaves `scores` unchanged.
scores.loc[scores['combined_score'] < -5, 'combined_score'] = -5
scores.index = scores.index.astype(float).astype(str)

# a very permissive threshold on Moran's I
good_moran_labels = moran_labels[(moran_sorted > 0.4).to_numpy()]
scores_good_moran = scores.loc[good_moran_labels,:]
scores = scores_good_moran
scores

In [None]:
# a permissive filter over section-wise dropout: too many dropouts => lipids should be excluded for clustering and reimputed

# Mean intensity of each of the first 1400 columns within every section.
# NOTE(review): the lipid block is taken here as "the first 1400 columns" but
# elsewhere as "all but the last 23 columns" - confirm the two definitions agree.
peakmeans = data.iloc[:,:1400].groupby(data['Section']).mean()

# Number of sections in which a lipid's mean intensity is below 0.00015.
# DataFrame.sum(axis=0) is called explicitly: np.sum(frame) forwards axis=None,
# which recent pandas deprecates and will eventually reduce to a single scalar.
missinglipid = (peakmeans < 0.00015).sum(axis=0).sort_values()

fig, ax = plt.subplots()
ax.plot(missinglipid.to_numpy())
ax.set(xlabel="lipid rank (by dropout count)", ylabel="sections below dropout threshold")
plt.show()

In [None]:
# Keep the lipids that drop out in fewer than 4 sections; labels are normalised
# (float -> str) to match the index of `scores`.
low_dropout = missinglipid[missinglipid < 4]
dropout_acceptable_lipids = low_dropout.index.astype(float).astype(str)

is_acceptable = scores.index.isin(dropout_acceptable_lipids)
scores = scores[is_acceptable]
scores

## Cluster the lipids in the space of scores to detect "good" and "bad" groups of lipids

In [None]:
# preprocess, then cluster

# Normalise the labels of both lookup series (float -> str) so they match the
# index of `scores`.
moran_sorted.index = moran_sorted.index.astype(float).astype(str)
missinglipid.index = missinglipid.index.astype(float).astype(str)
score_labels = scores.index.astype(float).astype(str)

# `assign` returns a new frame, so adding columns to the previously filtered
# `scores` does not raise SettingWithCopyWarning.
scores = scores.assign(
    moran=moran_sorted.loc[score_labels],
    missinglipid=missinglipid.loc[score_labels],
)

# only lipids with a positive combined score enter the clustering
scores = scores.loc[scores['combined_score'] > 0,:]
X = scores[['var_of_vars', 'combined_score', 'moran', 'missinglipid']]

# put the four score dimensions on a common scale before k-means
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn versions; the cluster ids annotated by hand further down depend
# on it, so consider pinning it together with the library version.
kmeans = KMeans(n_clusters=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

fig, ax = plt.subplots()
ax.scatter(X['combined_score'], X['moran'], c=cluster_labels, s=2, cmap="tab20")
ax.set(xlabel="combined_score", ylabel="moran", title="k-means clusters in score space")
plt.show()

In [None]:
# Rewrite the labels of the first 1400 columns of `data` in the normalised
# "float -> str" form, so that columns can be selected with the same labels
# that index `scores`; the remaining column labels are left untouched.
cols = np.array(data.columns)
lipid_positions = slice(None, 1400)
normalized_lipid_labels = cols[lipid_positions].astype(float).astype(str)
cols[lipid_positions] = normalized_lipid_labels
data.columns = cols
data

## Manually inspect and annotate "good" and "bad" groups of lipids

In [None]:
# attach the k-means label of every lipid (new frame: no write into a filtered slice)
scores = scores.assign(cl=cluster_labels)

# Fixed seed: the same example lipids are drawn on every run, so the manual
# annotation made from these figures can be reproduced.
rng = np.random.default_rng(42)
n_examples_per_cluster = 5

for cluster_id in sorted(scores['cl'].unique()):

    print("**************")

    cluster_scores = scores.loc[scores['cl'] == cluster_id,:]

    # draw distinct lipids (no duplicates), at most as many as the cluster holds
    candidates = np.array(cluster_scores.index)
    sampled = rng.choice(
        candidates,
        size=min(n_examples_per_cluster, len(candidates)),
        replace=False,
    )

    for currentPC in sampled.astype(float).astype(str):
        lipid = str(currentPC)
        print(cluster_scores.loc[cluster_scores['spatvar'].astype(float).astype(str) == currentPC,:])
        filtered_data = data[['yccf', 'zccf', 'Section', lipid]]

        # colour limits: median over sections of the per-section 2nd / 98th percentiles
        results = []
        for section in filtered_data['Section'].unique():
            section_values = filtered_data.loc[filtered_data['Section'] == section, lipid]
            results.append([section, section_values.quantile(0.02), section_values.quantile(0.98)])
        percentile_df = pd.DataFrame(results, columns=['Section', '2-perc', '98-perc'])
        med2p = percentile_df['2-perc'].median()
        med98p = percentile_df['98-perc'].median()

        cmap = plt.cm.inferno

        # one panel per section, sections 1..32 laid out on a 4 x 8 grid
        fig, axes = plt.subplots(4, 8, figsize=(20, 10))
        axes = axes.flatten()

        for section in range(1, 33):
            ax = axes[section - 1]
            ddf = filtered_data[(filtered_data['Section'] == section)]

            ax.scatter(ddf['zccf'], -ddf['yccf'], c=ddf[lipid], cmap=cmap, s=0.5, rasterized=True, vmin=med2p, vmax=med98p)
            ax.axis('off')
            ax.set_aspect('equal')

        # shared colour bar using the same limits as every panel
        cbar_ax = fig.add_axes([0.92, 0.15, 0.02, 0.7])
        norm = Normalize(vmin=med2p, vmax=med98p)
        sm = ScalarMappable(norm=norm, cmap=cmap)
        fig.colorbar(sm, cax=cbar_ax)

        plt.tight_layout(rect=[0, 0, 0.9, 1])
        plt.show()

In [None]:
# Manual annotation of the score clusters: lipids in clusters 1, 2, 3, 7 and 8
# are discarded (keep = 0); lipids in every other cluster are kept (keep = 1).
discarded_clusters = [1, 2, 3, 7, 8]
scores['keep'] = (~scores['cl'].isin(discarded_clusters)).astype('int64')

In [None]:
# Look at where kept (1) and discarded (0) lipids fall in score space,
# then retain only the kept ones.
fig, ax = plt.subplots()
ax.scatter(scores['combined_score'], scores['moran'], c=scores['keep'], s=2)
plt.show()

is_kept = scores['keep'] == 1
scores = scores[is_kept]
scores

In [None]:
# Render one page per retained lipid (its spatial distribution in every section)
# into a PDF, to support a further round of manual curation (removing bad lipids).

with PdfPages('ranking_clustering_featsel.pdf') as pdf:
    lipid_labels = np.array(scores['spatvar'].astype(float).astype(str))
    for currentPC in tqdm(lipid_labels):
        lipid = str(currentPC)
        # every 5th row only, to go faster
        subsampled = data[['yccf', 'zccf', 'Section', lipid]][::5]

        # colour limits: median over sections of the per-section 2nd / 98th percentiles
        low_by_section = []
        high_by_section = []
        for section in subsampled['Section'].unique():
            section_values = subsampled.loc[subsampled['Section'] == section, lipid]
            low_by_section.append(section_values.quantile(0.02))
            high_by_section.append(section_values.quantile(0.98))
        med2p = pd.Series(low_by_section).median()
        med98p = pd.Series(high_by_section).median()

        # sections 1..32, one panel each, on a 4 x 8 grid
        fig, axes = plt.subplots(4, 8, figsize=(20, 10))
        for section, ax in enumerate(axes.flat, start=1):
            in_section = subsampled[subsampled['Section'] == section]
            ax.scatter(
                in_section['zccf'],
                -in_section['yccf'],
                c=in_section[lipid],
                cmap="inferno",
                s=2.0,
                alpha=0.8,
                rasterized=True,
                vmin=med2p,
                vmax=med98p,
            )
            ax.axis('off')
            ax.set_aspect('equal')

        # shared colour bar with the same limits as the panels
        colorbar_axes = fig.add_axes([0.92, 0.15, 0.02, 0.7])
        mappable = ScalarMappable(norm=Normalize(vmin=med2p, vmax=med98p), cmap=plt.cm.inferno)
        fig.colorbar(mappable, cax=colorbar_axes)

        fig.tight_layout(rect=[0, 0, 0.9, 1])
        pdf.savefig(fig)
        # release the figure so memory does not grow with the number of lipids
        plt.close(fig)

In [None]:
# The automatic selection was too gentle: several remaining peaks still break
# the NMF, so they are removed by hand for now (this mostly corresponds to
# raising the Moran's I threshold to 0.5).
# NOTE: these are row *positions* in the current `scores` table, so they are
# only meaningful for the exact row order produced by the cells above.
bad_folks = np.array([
    25, 26, 34, 35, 45, 51, 55, 59, 62, 67, 70,
    72, 73, 76, 77, 91, 92, 95, 97, 101, 102, 103,
    106, 107, 110, 116, 117, 118, 121, 122, 127, 132, 134,
])
sub_scores = scores.iloc[bad_folks]

In [None]:
# Should any cluster be discarded altogether? Compare, per cluster, the number
# of manually flagged lipids with the cluster size (it seems not to be the case).

lipids_per_cluster = scores['cl'].value_counts()
flagged_per_cluster = sub_scores['cl'].value_counts()
flagged_per_cluster / lipids_per_cluster.loc[flagged_per_cluster.index]

In [None]:
# Drop the manually flagged lipids; what remains is the final selection.
is_flagged = scores.index.isin(sub_scores.index)
goodscores = scores[~is_flagged]
goodscores

## Export the feature-selected dataset with lipids that are overall consistent across sections to be used for clustering and to recover other lipids by imputation

In [None]:
### make a feature-selected dataframe

# scores of the retained lipids
goodscores.to_csv("scores_featsellipids_log.csv")

# trailing 23 columns of `data` followed by the columns of the retained lipids
selected_lipid_columns = np.array(goodscores['spatvar']).astype(float).astype(str)
trailing_block = data.iloc[:, -23:]
selected_lipid_block = data.loc[:, selected_lipid_columns]
featsel_lba = pd.concat([trailing_block, selected_lipid_block], axis=1)

featsel_lba.to_hdf("20241103_featsel_lba.h5", key="table")

# labels of the retained lipids, for the imputation and clustering steps
# NOTE(review): if 'spatvar' holds Python strings this is an object array,
# which np.save can only store through pickle - confirm readers expect that.
peaks_for_imputation_and_clustering = np.array(goodscores['spatvar'])
np.save("peaks_for_imputation_and_clustering.npy", peaks_for_imputation_and_clustering)

featsel_lba