In [None]:
import os
import joblib
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from threadpoolctl import threadpool_limits, threadpool_info
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.cm import ScalarMappable
from matplotlib.colors import Normalize
import seaborn as sns
import scanpy as sc
import umap.umap_ as umap

# Cap the number of threads used by the numerical back-ends.
# NOTE(review): these variables are assigned after numpy/scipy/xgboost were imported above.
# BLAS/OpenMP runtimes typically read them when the library is loaded, so they may have
# no effect at this point — confirm, and consider setting them before the imports.
os.environ["OPENBLAS_NUM_THREADS"] = "10"
os.environ["MKL_NUM_THREADS"] = "10"
os.environ["OMP_NUM_THREADS"] = "6"
# Called as a plain function rather than as a context manager.
# NOTE(review): the limit of 8 differs from the 10/10/6 requested above — confirm which is intended.
threadpool_limits(limits=8)

## Import and prepare the data

In [None]:
# import the harmonized-NMF embeddings: they will be used as predictors
# The row index of this table is reused later to pick the matching rows of the
# coordinates and of the lipid targets.
# NOTE(review): the file has an .h5ad extension but is read as a pandas HDF5 table — confirm the format.

embeddings = pd.read_hdf("corrected_nmfall_nochunking.h5ad", key="table")
embeddings

In [None]:
# import the Moran's I scores - they will be used to pick sections that are spatially good enough to train on
# Layout as used below: one row per lipid, one column per section
# (rows are counted per lipid with sum(axis=1); column labels are compared with SectionID values).

morans = pd.read_csv("morans_by_sec.csv", index_col=0)
morans

In [None]:
# per lipid: number of sections whose Moran's I exceeds 0.4, sorted ascending
isitrestorable = morans.gt(0.4).sum(axis=1).sort_values()

# a lipid is restorable when more than three sections qualify
# (at least three good sections to train on and one to evaluate on)
torestore = isitrestorable.index[(isitrestorable > 3).to_numpy()]
torestore

In [None]:
# import and preprocess the whole dataset
# NOTE(review): the file has an .h5ad extension but is read as a pandas HDF5 table — confirm the format.
alldata = pd.read_hdf("20241103_pixels_allips_allbrains_allen_pixelcleaned.h5ad")

# Rewrite the first 1400 column labels in the str(float(label)) form that is used
# for lipid names everywhere below.
# (assumes the first 1400 columns are the lipid columns — TODO confirm; 1400 is hard-coded)
cols = np.array(alldata.columns)
cols[:1400]=cols[:1400].astype(float).astype(str)
alldata.columns = cols

# keep only the columns of the lipids listed in `torestore`
lipids_to_restore = alldata.loc[:,torestore.astype(float).astype(str)]
# NOTE(review): this drops the last 5 ROWS (not columns) — confirm that this is intended.
lipids_to_restore = lipids_to_restore.iloc[:-5,:]
lipids_to_restore

## Select the sections to be used to train XGB models for imputation

In [None]:
# Candidate sections: the first 70 columns of the Moran's I table
# (presumably the atlas sections — TODO confirm; 70 is hard-coded)
usage_dataframe = morans.iloc[:,:70].copy() # use the atlases as the basis to impute on

# remove the broken sections
# NOTE: despite its name, `brokenones` holds every distinct (SectionID, BadSection) pair;
# `goodones` are the SectionIDs whose BadSection flag equals 0.
brokenones = alldata[['SectionID', 'BadSection']].drop_duplicates().dropna()
goodones = brokenones.loc[brokenones['BadSection'] == 0,'SectionID'].values
# column labels are cast to float so that they can be compared with the SectionID values
usage_dataframe = usage_dataframe.loc[:, usage_dataframe.columns.astype(float).isin(goodones)]

# choose the best sections to train and validate XGBoost models on
def top_3_above_threshold(row, threshold=0.4):
    """Build a boolean mask of the sections to use for one lipid.

    Parameters
    ----------
    row : pd.Series
        Scores of one lipid, indexed by section.
    threshold : float, default 0.4
        Minimum score (inclusive) for a section to qualify.

    Returns
    -------
    pd.Series
        Boolean Series on the same index as ``row``. When at least three
        sections reach ``threshold``, only the three highest-scoring sections
        are True; otherwise every section that reaches ``threshold`` is True.
    """
    qualifies = row >= threshold

    # fewer than three qualifying sections: the plain threshold mask is the answer
    if qualifies.sum() < 3:
        return qualifies

    selection = pd.Series(False, index=row.index)
    selection[row.nlargest(3).index] = True
    return selection

# one boolean row per lipid: True marks the sections picked for that lipid
usage_dataframe = usage_dataframe.apply(top_3_above_threshold, axis=1)

# keep only the lipids for which more than two sections were picked
n_selected = usage_dataframe.sum(axis=1)
lipids_with_three = np.array(n_selected.index[n_selected > 2])
usage_dataframe = usage_dataframe.loc[lipids_with_three, :]

# NOTE(review): lipid 953.120019 is excluded by hand; the reason is not recorded here
lipid_labels = usage_dataframe.index.astype(float).astype(str)
usage_dataframe = usage_dataframe.loc[lipid_labels != '953.120019', :]

# could be further optimized by ensuring the 3 selected sections are not too close to each other
usage_dataframe

In [None]:
# column-wise count of True: for how many lipids each section was selected
usage_dataframe.sum() # (strange distribution...)

In [None]:
# smallest number of selected sections over all remaining lipids
usage_dataframe.sum(axis=1).min() # ok all good

In [None]:
# some data prep
# restrict the targets to the lipids that remain in `usage_dataframe`
lipids_to_restore = lipids_to_restore.loc[:,usage_dataframe.index.astype(float).astype(str)]
# attach the section of every pixel (pandas aligns this assignment on the row index)
lipids_to_restore['SectionID'] = alldata['SectionID']
# spatial metadata for exactly the rows that have embeddings
coordinates = alldata.loc[embeddings.index, ['SectionID', 'x', 'y']]
# SectionID becomes an integer-like string, e.g. 1.0 -> "1"
# NOTE(review): it is later matched against the column labels of `usage_dataframe`;
# confirm that those labels use the same string form.
coordinates['SectionID'] = coordinates['SectionID'].astype(float).astype(int).astype(str)

## Train XGB models for imputation

In [None]:
# Train one XGBoost regressor per lipid, predicting its intensity from the NMF embeddings.
# Of the three sections selected for a lipid, the first and the third (in column order)
# are used for fitting; the middle one is held out to check whether the model generalises.
model_dir = 'xgbmodels_onmnnnmf'
os.makedirs(model_dir, exist_ok=True)  # joblib.dump does not create a missing folder

metric_columns = ['train_pearson_r', 'train_rmse', 'val_pearson_r', 'val_rmse']
metric_rows = {}  # lipid -> metrics; turned into a DataFrame once, after the loop

for index, row in tqdm(usage_dataframe.iterrows(), total=usage_dataframe.shape[0]):
    selected_sections = row[row].index.tolist()
    val_sections = selected_sections[1]
    train_sections = [selected_sections[0], selected_sections[2]]

    train_data = embeddings.loc[coordinates['SectionID'].isin(train_sections), :]
    y_train = lipids_to_restore.loc[train_data.index, str(index)]

    # take one out and use it for validation: can we trust this XGB model?
    val_data = embeddings.loc[coordinates['SectionID'] == val_sections, :]
    y_val = lipids_to_restore.loc[val_data.index, str(index)]

    model = XGBRegressor()
    model.fit(train_data, y_train)

    train_pred = model.predict(train_data)
    val_pred = model.predict(val_data)

    metric_rows[index] = {
        'train_pearson_r': pearsonr(y_train, train_pred)[0],
        'train_rmse': np.sqrt(mean_squared_error(y_train, train_pred)),
        'val_pearson_r': pearsonr(y_val, val_pred)[0],
        'val_rmse': np.sqrt(mean_squared_error(y_val, val_pred)),
    }

    # save the model
    model_path = os.path.join(model_dir, str(index) + '_xgb_model.joblib')
    joblib.dump(model, model_path)

# one row per lipid (indexed like `usage_dataframe`), one column per metric
metrics_df = pd.DataFrame(
    list(metric_rows.values()),
    index=list(metric_rows.keys()),
    columns=metric_columns,
)

In [None]:
# distribution of the validation-section Pearson's R across the to-be-imputed lipids
fig, ax = plt.subplots()
ax.hist(metrics_df['val_pearson_r'], bins=30)
plt.show()

## Deploy the trained XGB models across all acquisitions

In [None]:
# Load every saved model and predict its lipid for all pixels. The models are deployed on
# all sections, including the training ones, to be in-distribution.
model_dir = 'xgbmodels_onmnnnmf'
model_suffix = '_xgb_model.joblib'

# os.listdir returns entries in arbitrary order, so dropping "the first entry" can silently
# skip a real model. Select the model files by name instead; this also ignores stray files.
model_files = sorted(f for f in os.listdir(model_dir) if f.endswith(model_suffix))

coordinates = coordinates[['SectionID', 'x', 'y']]

predicted = {}  # lipid name -> predicted intensities, in the row order of `embeddings`
for file in tqdm(model_files):
    # NOTE: joblib.load unpickles arbitrary objects — only load files written by this notebook
    model = joblib.load(os.path.join(model_dir, file))
    predicted[file[:-len(model_suffix)]] = model.predict(embeddings)

# attach all prediction columns at once (avoids inserting columns one by one)
coordinates = pd.concat(
    [coordinates, pd.DataFrame(predicted, index=coordinates.index)],
    axis=1,
)

In [None]:
# persist the per-lipid train/validation metrics
metrics_df.to_csv("metrics_imputation_df.csv")

# rewrite the lipid column labels (everything after the first three columns) as str(float(label))
cols = np.array(coordinates.columns)
cols[3:] = cols[3:].astype(float).astype(str)
coordinates.columns = cols

# keep only the lipids whose generalization Pearson's R is good enough (0.4 threshold)
passes_validation = (metrics_df['val_pearson_r'] > 0.4).to_numpy()
reliable_lipids = metrics_df.index[passes_validation].astype(float).astype(str)

# NOTE: this selection keeps the lipid columns only; 'SectionID', 'x' and 'y' are dropped
coordinates = coordinates.loc[:, reliable_lipids]
coordinates.to_hdf("20241113_xgboost_recovered_lipids.h5ad", key="table")
coordinates

## Check the effect of imputation on example lipids with spatial plots

In [None]:
# Plot one imputed lipid on every section, with a colour scale shared by all panels.
currentPC = '1002.581042'

# The reliability filter leaves `coordinates` with lipid columns only, so re-attach
# whichever spatial columns are missing (rows are matched through the shared index).
spatial_cols = ['SectionID', 'x', 'y']
missing_cols = [c for c in spatial_cols if c not in coordinates.columns]
if missing_cols:
    filtered_data = coordinates.assign(
        **{c: alldata.loc[coordinates.index, c].to_numpy() for c in missing_cols}
    )
else:
    filtered_data = coordinates

# section IDs serve both as grouping key and as panel position: use plain integers
section_ids = filtered_data['SectionID'].astype(float).astype(int).to_numpy()

# robust colour limits: median over sections of the per-section 2nd / 98th percentiles
by_section = filtered_data[currentPC].groupby(section_ids)
percentile_df = pd.DataFrame({
    '2-perc': by_section.quantile(0.02),
    '98-perc': by_section.quantile(0.98),
})
med2p = percentile_df['2-perc'].median()
med98p = percentile_df['98-perc'].median()

cmap = plt.cm.plasma

# 10 panels per row and at least 14 rows; add rows if a section ID would not fit
n_cols = 10
max_section = int(section_ids.max()) if len(section_ids) else 0
n_rows = max(14, -(-max_section // n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 38 * n_rows / 14))
axes = axes.flatten()
for ax in axes:
    ax.axis('off')  # also blanks the panels of sections that are not present

for section, ddf in filtered_data.groupby(section_ids):
    ax = axes[section - 1]  # section k is drawn in the k-th panel
    ax.scatter(ddf['y'], -ddf['x'], c=ddf[currentPC], cmap=cmap, s=0.5,
               rasterized=True, vmin=med2p, vmax=med98p)
    ax.set_aspect('equal')

# lay out the panels first: tight_layout cannot handle the manually placed colourbar axes
fig.tight_layout(rect=[0, 0, 0.9, 1])

cbar_ax = fig.add_axes([0.92, 0.15, 0.02, 0.7])
sm = ScalarMappable(norm=Normalize(vmin=med2p, vmax=med98p), cmap=cmap)
fig.colorbar(sm, cax=cbar_ax, label=currentPC)

plt.show()