In [None]:
import pandas as pd

# MERFISH cell table (Allen MERFISH coronal atlas after basic preprocessing)
# and the lipid table, both read from the local zenodo folder.
merfish = pd.read_parquet("./zenodo/multimodal/cell_filtered_w500genes.parquet")
datavignettes = pd.read_parquet("./zenodo/maindata_2.parquet")

# Coordinate-only tables for both modalities, using the lipid table's column names.
lipidsinallen = datavignettes[['xccf', 'yccf', 'zccf']].dropna()
# rename() returns a new frame instead of relabelling a column subset of
# `merfish` in place, so later column assignments on `merfishinallen` do not
# act on a slice of `merfish` (avoids SettingWithCopyWarning).
merfishinallen = merfish[['x_ccf', 'y_ccf', 'z_ccf']].rename(
    columns={'x_ccf': 'xccf', 'y_ccf': 'yccf', 'z_ccf': 'zccf'}
)
merfishinallen

In [None]:
# Sanity check: print the maximum of every CCF axis for the MERFISH table and
# then for the lipid table, so the two coordinate ranges can be compared by eye.
for xx in ('xccf', 'yccf', 'zccf'):
    print(merfishinallen[xx].max())
    print(lipidsinallen[xx].max())

## Match the two datasets by constrained neighbor search

In [None]:
from allensdk.core.mouse_connectivity_cache import MouseConnectivityCache

# Annotation volume obtained through the AllenSDK connectivity cache.
mcc = MouseConnectivityCache(manifest_file='mouse_connectivity_manifest.json')
annotation, _ = mcc.get_annotation_volume()

# Convert every CCF coordinate into an integer index into the annotation volume.
# presumably coordinates are in mm and the volume has 40 voxels per mm — TODO confirm
for axis in ('x', 'y', 'z'):
    merfish[f'{axis}_index'] = (merfish[f'{axis}_ccf'] * 40).astype(int)

# Read the annotation value stored at each cell's voxel.
voxel = tuple(merfish[f'{axis}_index'] for axis in ('x', 'y', 'z'))
merfish['id'] = annotation[voxel]
merfish['id']

In [None]:
# --- Lipid table: keep rows that have an annotation id and an x coordinate ---
datavignettes = datavignettes.dropna(subset=['id'])
# Store ids as integer-valued strings, the same representation given to the
# MERFISH ids below, so the two tables can be matched on `id`.
id_as_text = datavignettes['id'].astype(int).astype(str)
datavignettes['id'] = id_as_text
datavignettes = datavignettes.dropna(subset=['xccf'])
# Independent copy of the cleaned lipid table, kept under its own name.
datavignettess = datavignettes.copy()

# --- MERFISH tables ---
merfish['id'] = merfish['id'].astype(str)
merfishinallen['id'] = merfish['id'].values
merfishinallen['division'] = merfish['division'].values
# Drop vascular and immune cells from the coordinate table.
# NOTE(review): only `merfishinallen` is filtered here; `merfish` itself keeps
# these cells — confirm that is what later cells expect.
is_vascular_or_immune = merfishinallen['division'].isin(['6 Vascular', '7 Immune'])
merfishinallen = merfishinallen.loc[~is_vascular_or_immune, :]

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from threadpoolctl import threadpool_limits, threadpool_info
threadpool_limits(limits=8)
import os
os.environ['OMP_NUM_THREADS'] = '6'
from tqdm import tqdm

# Positional slice of `merfish` columns that is averaged over neighbouring cells.
# The column labels are resolved once here instead of being read from a loop
# variable after the loop has finished.
FEATURE_SLICE = slice(-554, -55)
feature_cols = merfish.columns[FEATURE_SLICE]

# 1) Pre-group MERFISH cells by annotation id and build one KD-tree per id.
#    This is done a priori, once, for all sections.
# NOTE(review): the trees are built from `merfish`, not from the filtered
# `merfishinallen` table — confirm that vascular/immune cells should take part.
trees = {}
feats = {}
for id_, sub in merfish.groupby('id'):
    trees[id_] = cKDTree(sub[['x_ccf', 'y_ccf', 'z_ccf']].to_numpy())
    feats[id_] = sub.iloc[:, FEATURE_SLICE].to_numpy()

# Search radius, in the units of the CCF coordinates (presumably mm — TODO confirm).
thr = 0.075
idxs = []   # lipid-table row labels that found at least one MERFISH neighbour
means = []  # mean feature vector of those neighbours, same order as `idxs`

# 2) For each section and each annotation id, average the features of all
#    MERFISH cells with the same id that lie within `thr` of the lipid pixel.
for section_id in tqdm(datavignettess['SectionID'].unique()):
    # A dedicated name is used for the per-section subset so that
    # `datavignettes` is never rebound while the loop is running.
    section_pixels = datavignettess.loc[datavignettess['SectionID'] == section_id, :]

    for id_, dsub in section_pixels.groupby('id'):
        tree = trees.get(id_)
        if tree is None:
            # no MERFISH cell carries this annotation id
            continue
        qpts = dsub[['xccf', 'yccf', 'zccf']].to_numpy()
        nbrs_list = tree.query_ball_point(qpts, r=thr)
        arr = feats[id_]
        for i, nbrs in enumerate(nbrs_list):
            if nbrs:
                idxs.append(dsub.index[i])
                means.append(arr[nbrs].mean(axis=0))

# The lookups below (and later cells) index `datavignettes` with rows from every
# section, so point it at the full cleaned table explicitly.
datavignettes = datavignettess

# reshape keeps the matrix 2-D even when no pixel found a neighbour
result = pd.DataFrame(
    np.array(means).reshape(-1, len(feature_cols)),
    index=idxs,
    columns=feature_cols,
)
# Carry over section, position and boundary information for the matched pixels.
for col in ['SectionID', 'xccf', 'yccf', 'zccf', 'boundary']:
    result[col] = datavignettes.loc[result.index, col]
result.to_parquet("spatialgoodgexpr.parquet")
result

## Check imputation quality visually

In [None]:
import matplotlib.pyplot as plt

# Hand-picked SectionID values used for the visual check.
curated_sections = [
    76.,  82., 106.,   2., 131.,  88.,  63., 112.,  60.,  62., 118.,
    21.,  45., 123.,  58., 100.,  83.,  61.,  59.,  98.,  28.,  19.,
    43.,  18., 107.,  29., 104., 124.,  52., 129.,  14.,  78.,  15.,
    65.,  89.,  41., 117., 111.,  68.,  70., 125.,  92.,  16., 122.,
    114.,  91.,  11.,  24.,  71.,  46.,  57., 120.,  75.,
]
r = result.loc[result['SectionID'].isin(curated_sections), :]

# Order the curated sections by their mean x coordinate and keep every 5th one,
# giving roughly equispaced sections along that axis.
mean_x_by_section = r.groupby('SectionID')['xccf'].mean()
sections_top10_fast = mean_x_by_section.sort_values()[::5].index
sections_top10_fast

# One figure per selected section: imputed expression of a single transcript in
# red, with the pixels flagged as boundary drawn in black on top.
for xxx in sections_top10_fast:
    mer = result[result['SectionID'] == xxx]
    cont = mer[mer['boundary'] == 1]

    fig, ax = plt.subplots()
    ax.scatter(mer['zccf'], -mer['yccf'], c=mer['ENSMUST00000102665'], cmap="Reds", s=0.1, rasterized=True)
    ax.scatter(cont['zccf'], -cont['yccf'], c='black', s=0.01, alpha=1.0, rasterized=True)

    plt.show()

## Train gene-to-lipid XGBoost models

In [None]:
# Targets: the first 173 columns of the lipid table.
lipids2learn = datavignettes.columns[:173]
# Predictors: the first 499 columns of the imputed table.
genes = result.iloc[:, :499]
# Lipid-table rows for exactly the pixels that received imputed values.
matched_pixels = datavignettes.loc[genes.index, :]
lipids = matched_pixels.iloc[:, :173]
sids = matched_pixels['SectionID']

import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split, ParameterSampler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import xgboost as xgb

from threadpoolctl import threadpool_limits
# Called as a plain function (not as a context manager), requesting a limit of 8
# threads for the native thread pools threadpoolctl controls.
threadpool_limits(limits=8)
# NOTE(review): this is set after numpy/xgboost have been imported; OpenMP
# runtimes typically read OMP_NUM_THREADS when they are loaded, so this may have
# no effect here, and it disagrees with the limit of 8 above — confirm.
os.environ['OMP_NUM_THREADS'] = '6'

# ---- SECTIONS TO LOOP OVER ----
# SectionID values for which a separate model is trained and tested.
sections_to_use = np.array([41.0, 91.0, 11.0, 104.0, 58.0, 60.0, 88.0, 76.0, 65.0, 83.0, 78.0])

# ---- SCORER FOR PEARSON R ----
def pearson_scorer(y_true, y_pred):
    """Return Pearson's r between `y_true` and `y_pred`.

    When either input has zero standard deviation the correlation is
    undefined; 0.0 is returned in that case.
    """
    both_vary = np.std(y_true) != 0 and np.std(y_pred) != 0
    if not both_vary:
        return 0.0
    correlation = pearsonr(y_true, y_pred)
    return correlation[0]

# ---- HYPERPARAM DISTRIBUTION & SAMPLER ----
# Every hyperparameter has a single candidate value and one draw is requested,
# so `param_list` ends up holding exactly one parameter dictionary.
param_dist = dict(
    n_estimators=[300],
    learning_rate=[0.05],
    max_depth=[6],
    subsample=[0.6],
)
n_iter = 1
sampler = ParameterSampler(param_dist, n_iter=n_iter, random_state=42)
param_list = list(sampler)

results_by_lipid = {}

# The gene matrix of a section, its train/test split (fixed random_state) and
# therefore the fitted scaler and PCA are identical for every lipid. Fit them
# once per section here instead of once per (lipid, section) pair.
section_cache = {}
for sect in tqdm(sections_to_use, desc="Preparing sections"):
    # select only this section
    idx = sids[sids == sect].index
    X_sec = genes.loc[idx]

    # Split row positions; applying the same positions to the target below
    # reproduces the split that splitting X and y together would give.
    pos_tr, pos_te = train_test_split(
        np.arange(len(X_sec)), test_size=0.2, random_state=42
    )
    X_tr, X_te = X_sec.iloc[pos_tr], X_sec.iloc[pos_te]

    # fit scaler & PCA on TRAIN, apply to TRAIN & TEST
    scaler = StandardScaler().fit(X_tr)
    X_tr_s = scaler.transform(X_tr)
    X_te_s = scaler.transform(X_te)

    pca = PCA(n_components=0.95, random_state=42).fit(X_tr_s)
    section_cache[sect] = (idx, pos_tr, pos_te, pca.transform(X_tr_s), pca.transform(X_te_s))

for lipid in tqdm(lipids2learn, desc="Lipids (XGB Single Split)"):
    records = []

    for sect in tqdm(sections_to_use, desc=f"  Sections for {lipid}", leave=False):
        idx, pos_tr, pos_te, X_tr_p, X_te_p = section_cache[sect]

        # target values for this section, split with the cached positions
        y_sec = lipids.loc[idx, lipid]
        y_tr, y_te = y_sec.iloc[pos_tr], y_sec.iloc[pos_te]

        # evaluate each parameter set on TRAIN→TEST
        for params in param_list:
            model = xgb.XGBRegressor(
                objective='reg:squarederror',
                n_jobs=4,
                random_state=42,
                **params
            )
            model.fit(X_tr_p, y_tr)
            y_te_pred = model.predict(X_te_p)

            rec = params.copy()
            rec.update({
                'Section':  sect,
                'Test_MSE': mean_squared_error(y_te, y_te_pred),
                'Test_R':   pearson_scorer(y_te, y_te_pred)
            })
            records.append(rec)

    # aggregate & save: one CSV per lipid, spaces and slashes replaced so the
    # lipid name can be used as a file name
    results_df = pd.DataFrame(records)
    fname = f"{lipid.replace(' ', '_').replace('/', '_')}_xgb_no_cv_results.csv"
    results_df.to_csv(fname, index=False)
    results_by_lipid[lipid] = results_df

import pandas as pd
import numpy as np
from threadpoolctl import threadpool_limits, threadpool_info
threadpool_limits(limits=8)
import os
os.environ['OMP_NUM_THREADS'] = '6'

# Collect the per-lipid result files written by the gene-to-lipid training loop.
pattern = '_xgb_no_cv_results.csv'
# os.listdir() returns names in arbitrary order; sorting makes the row order of
# the performance table reproducible from run to run.
files = sorted(
    f
    for f in os.listdir('.')
    if f.endswith(pattern) and 'l2l' not in f
)

# Key each table by its file name with only the trailing suffix removed.
lipid_dfs = {
    f[:-len(pattern)]: pd.read_csv(f)
    for f in files
}

# Average each metric over the sections stored in a lipid's file.
testr = [df['Test_R'].mean() for df in lipid_dfs.values()]
testmse = [df['Test_MSE'].mean() for df in lipid_dfs.values()]

# One row per lipid, columns "test R" and "test MSE".
performance_gene2lipideasy = pd.DataFrame([testr, testmse], index = ["test R", "test MSE"], columns = list(lipid_dfs.keys())).T
performance_gene2lipideasy

## As a baseline for estimating the irreducible noise, also train lipid-to-lipid XGBoost models on the same data

In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split, ParameterSampler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import xgboost as xgb

from threadpoolctl import threadpool_limits
# Called as a plain function (not as a context manager), requesting a limit of 8
# threads for the native thread pools threadpoolctl controls.
threadpool_limits(limits=8)
# NOTE(review): set after the numerical libraries were imported and different
# from the limit of 8 above; it may have no effect at this point — confirm.
os.environ['OMP_NUM_THREADS'] = '6'

# ---- SECTIONS TO LOOP OVER ----
# Same SectionID values as in the gene-to-lipid run, one model per section.
sections_to_use = np.array([41.0, 91.0, 11.0, 104.0, 58.0, 60.0, 88.0, 76.0, 65.0, 83.0, 78.0])

# ---- SCORER FOR PEARSON R ----
def pearson_scorer(y_true, y_pred):
    """Pearson correlation of `y_true` and `y_pred`, or 0.0 if either is constant.

    NOTE(review): `pearson_scorer` is defined again further down in this cell;
    the later definition is the one in effect when the training loop runs.
    """
    for values in (y_true, y_pred):
        if np.std(values) == 0:
            return 0.0
    return pearsonr(y_true, y_pred)[0]

# ---- HYPERPARAM DISTRIBUTION & SAMPLER ----
# Single-valued hyperparameter "distribution": one candidate per parameter and
# one draw, so `param_list` contains exactly one parameter dictionary.
param_dist = dict(
    n_estimators=[300],
    learning_rate=[0.05],
    max_depth=[6],
    subsample=[0.6],
)
n_iter = 1
sampler = ParameterSampler(param_dist, n_iter=n_iter, random_state=42)
param_list = list(sampler)

# Per-lipid result tables are collected here.
results_by_lipid = {}

def pearson_scorer(y_true, y_pred):
    """Return the Pearson correlation between `y_true` and `y_pred`.

    Parameters
    ----------
    y_true, y_pred : array-like of equal length
        Observed and predicted values.

    Returns
    -------
    float
        The off-diagonal entry of ``np.corrcoef(y_true, y_pred)``, or 0.0 when
        either input has zero standard deviation.

    The zero-variance guard matches the scorer used for the gene-to-lipid
    models. Without it a constant input yields NaN, which the later
    ``.mean()`` over sections skips, so the two runs would not be scored by
    the same rule.
    """
    if np.std(y_true) == 0 or np.std(y_pred) == 0:
        return 0.0
    return np.corrcoef(y_true, y_pred)[0, 1]

import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from threadpoolctl import threadpool_limits
threadpool_limits(limits=8)
os.environ['OMP_NUM_THREADS'] = '6'

results_by_lipid = {}
all_lipids = lipids.columns.values

# Targets are the lipids without an entry in `results_by_lipid`; the dictionary
# was reset just above, so this is every lipid (in setdiff1d's sorted order).
already_done = np.array(list(results_by_lipid.keys()))
targets = np.setdiff1d(all_lipids, already_done)

for lipid in tqdm(targets, desc="Lipids (XGB Single Split)"):
    records = []

    # Predict the target lipid from all the other lipids.
    predictors = [name for name in all_lipids if name != lipid]

    for sect in tqdm(sections_to_use, desc=f"  Sections for {lipid}", leave=False):
        section_rows = sids[sids == sect].index

        # One 80/20 split per section, same seed for every lipid.
        X_tr, X_te, y_tr, y_te = train_test_split(
            lipids.loc[section_rows, predictors],
            lipids.loc[section_rows, lipid],
            test_size=0.2,
            random_state=42,
        )

        # Scaler and PCA are fitted on the training part only.
        scaler = StandardScaler().fit(X_tr)
        X_tr_s, X_te_s = scaler.transform(X_tr), scaler.transform(X_te)

        pca = PCA(n_components=0.95, random_state=42).fit(X_tr_s)
        X_tr_p, X_te_p = pca.transform(X_tr_s), pca.transform(X_te_s)

        # Train and score one model per parameter set.
        for params in param_list:
            model = xgb.XGBRegressor(
                objective='reg:squarederror', n_jobs=4, random_state=42, **params
            )
            model.fit(X_tr_p, y_tr)
            y_te_pred = model.predict(X_te_p)

            records.append({
                **params,
                'Section': sect,
                'Test_MSE': mean_squared_error(y_te, y_te_pred),
                'Test_R': pearson_scorer(y_te, y_te_pred),
            })

    # One CSV per target lipid; spaces and slashes are replaced for the file name.
    results_df = pd.DataFrame(records)
    safe_name = lipid.replace(' ', '_').replace('/', '_')
    results_df.to_csv(f"{safe_name}_xgb_no_cv_results_L2L.csv", index=False)
    results_by_lipid[lipid] = results_df

# Persist the complete dictionary of result tables as well.
with open("results_by_lipid_complete_L2L.pkl", "wb") as f:
    pickle.dump(results_by_lipid, f, protocol=pickle.HIGHEST_PROTOCOL)

import pandas as pd
import numpy as np
from threadpoolctl import threadpool_limits, threadpool_info
threadpool_limits(limits=8)
import os
os.environ['OMP_NUM_THREADS'] = '6'

# Collect the per-lipid result files written by the lipid-to-lipid training loop.
pattern = '_xgb_no_cv_results_L2L.csv'
# os.listdir() returns names in arbitrary order; sorting makes the row order of
# the performance table reproducible from run to run.
files = sorted(
    f
    for f in os.listdir('.')
    if f.endswith(pattern)
)

# Key each table by its file name with only the trailing suffix removed.
lipid_dfs = {
    f[:-len(pattern)]: pd.read_csv(f)
    for f in files
}

# Average each metric over the sections stored in a lipid's file.
testr = [df['Test_R'].mean() for df in lipid_dfs.values()]
testmse = [df['Test_MSE'].mean() for df in lipid_dfs.values()]

# One row per lipid, columns "test R" and "test MSE".
perf_test = pd.DataFrame([testr, testmse], index = ["test R", "test MSE"], columns = list(lipid_dfs.keys())).T
performance_lipid2lipideasy = perf_test.copy()
performance_lipid2lipideasy

## Compare lipid predictions from genes and from lipids

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
matplotlib.rcParams['pdf.fonttype'] = 42

g2l = performance_gene2lipideasy
l2l = performance_lipid2lipideasy

# Pair the two tables by lipid name. They are built from separate directory
# listings, so their row order (and even their length) need not agree, and a
# positional comparison could pair different lipids with each other.
shared_lipids = g2l.index.intersection(l2l.index)
data_l2l = l2l.loc[shared_lipids, 'test R'].to_numpy()
data_g2l = g2l.loc[shared_lipids, 'test R'].to_numpy()

# Keep only lipids with a finite score in both runs; the KDE below and the
# per-lipid ratios computed from these arrays cannot handle NaN.
finite = np.isfinite(data_l2l) & np.isfinite(data_g2l)
if not finite.all():
    print(f"Dropping {int((~finite).sum())} lipids without a finite test R in both runs")
data_l2l, data_g2l = data_l2l[finite], data_g2l[finite]

# Per-lipid comparison: lipid-to-lipid score on x, gene-to-lipid score on y.
plt.scatter(data_l2l, data_g2l)
plt.xlabel('Test Pearson r (l2l)')
plt.ylabel('Test Pearson r (g2l)')
plt.show()

# Shared binning/grid over the full range a Pearson r can take.
pearson_min, pearson_max = -1.0, 1.0
bins = np.linspace(pearson_min, pearson_max, 30)
purples = plt.cm.Purples(np.linspace(0.3, 0.8, 2))
labels  = ['l2l', 'g2l']
datasets = [data_l2l, data_g2l]

plt.figure(figsize=(8, 5))
# Histograms first, KDE curves second, so the curves are drawn on top.
for data, label, color in zip(datasets, labels, purples):
    plt.hist(data, bins=bins, density=True, alpha=0.4, label=f'{label} hist', color=color)
x_grid = np.linspace(pearson_min, pearson_max, 300)
for data, label, color in zip(datasets, labels, purples):
    kde = gaussian_kde(data)
    plt.plot(x_grid, kde(x_grid), color=color, lw=2, label=f'{label} KDE')

ax = plt.gca()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.grid(False)

plt.xlabel('Test Pearson r')
plt.ylabel('Density')
plt.title('Density Histograms + KDEs of Test Pearson r')
plt.legend(frameon=False)
plt.tight_layout()
plt.savefig("prediction_test_set_SPAT.pdf")
plt.show()

In [None]:
import numpy as np

# Per-lipid test correlations from the two runs.
r_l2l, r_g2l = data_l2l, data_g2l

# -----------------------------------------------------------------------------
# Per-lipid irreducible fraction:
#      irreducible_i = 1 - (R_g2l[i]^2) / (R_l2l[i]^2)
# `eps` keeps the denominator away from zero.
# -----------------------------------------------------------------------------
eps = 1e-8
denom = np.square(r_l2l) + eps
var_explained_by_genes = np.square(r_g2l) / denom
irreducible = 1.0 - var_explained_by_genes

# -----------------------------------------------------------------------------
# Summary statistics over lipids
# -----------------------------------------------------------------------------
mean_irred = np.mean(irreducible)
median_irred = np.median(irreducible)
quartiles = np.percentile(irreducible, [25, 75])
p25, p75 = quartiles

print(f"Mean irreducible fraction:   {mean_irred:.3f}")
print(f"Median irreducible fraction: {median_irred:.3f}")
print(f"25th / 75th percentiles:     {p25:.3f} / {p75:.3f}")

# -----------------------------------------------------------------------------
# 4. bootstrap a 95% CI on the *mean* irreducible fraction
# -----------------------------------------------------------------------------
def bootstrap_mean_irred(r_l2l, r_g2l, n_boot=5000, seed=0, eps=1e-8):
    """Bootstrap a 95% confidence interval for the mean irreducible fraction.

    The per-lipid irreducible fraction is ``1 - r_g2l**2 / (r_l2l**2 + eps)``.
    Lipids are resampled with replacement ``n_boot`` times and the mean
    fraction of each resample is recorded.

    Parameters
    ----------
    r_l2l, r_g2l : 1-D array-like of equal length
        Paired per-lipid test correlations of the two model families.
    n_boot : int
        Number of bootstrap resamples.
    seed : int
        Seed for the ``RandomState`` that draws the resamples.
    eps : float
        Small constant added to the denominator. It is a parameter (default
        1e-8) so the function no longer depends on a module-level ``eps``.

    Returns
    -------
    numpy.ndarray
        The 2.5th and 97.5th percentiles of the bootstrap means.

    Raises
    ------
    ValueError
        If the inputs are not 1-D, differ in length, or are empty.
    """
    # Coerce to arrays so lists and pandas objects are indexed by position.
    r_l2l = np.asarray(r_l2l, dtype=float)
    r_g2l = np.asarray(r_g2l, dtype=float)
    if r_l2l.ndim != 1 or r_l2l.shape != r_g2l.shape:
        raise ValueError("r_l2l and r_g2l must be 1-D and of equal length")
    n = len(r_l2l)
    if n == 0:
        raise ValueError("cannot bootstrap an empty sample")

    rng = np.random.RandomState(seed)
    # The fraction is computed element-wise, so resampling the precomputed
    # values is the same as resampling the pairs and recomputing each time.
    irreducible = 1.0 - (r_g2l**2) / (r_l2l**2 + eps)
    boot_means = np.empty(n_boot)
    for b in range(n_boot):
        idx = rng.choice(n, size=n, replace=True)
        boot_means[b] = irreducible[idx].mean()
    return np.percentile(boot_means, [2.5, 97.5])

# Bootstrap the interval with the function's default n_boot and seed, then report it.
ci_lower, ci_upper = bootstrap_mean_irred(r_l2l, r_g2l)
print(f"95% CI on mean irreducible fraction: [{ci_lower:.3f}, {ci_upper:.3f}]")

## CCA analysis on metabolic genes

In [None]:
# NOTE(review): `lipidome`, `transcriptome` and `metabolic` are not assigned in
# the visible part of this notebook; they must already exist in the kernel for
# this cell to run — confirm where they come from.
# Lipid block: rows selected by the transcriptome's index, first 173 columns.
df_lipids = lipidome.loc[transcriptome.index, :].iloc[:,:173]
# Gene block: only the transcriptome columns whose names appear in `metabolic`.
df_genes = transcriptome.loc[:, transcriptome.columns.isin(metabolic)]
df_genes.shape

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import CCA
from sklearn.model_selection import KFold

from threadpoolctl import threadpool_limits, threadpool_info
threadpool_limits(limits=8)
import os
os.environ['OMP_NUM_THREADS'] = '6'

# Standardise each block with its own scaler.
scaler_g, scaler_l = StandardScaler(), StandardScaler()
X = scaler_g.fit_transform(df_genes.values)
Y = scaler_l.fit_transform(df_lipids.values)
print(X.shape)

# CCA is run with scale=False because both blocks were standardised above.
n_components = 50
cca = CCA(n_components=n_components, scale=False)
Xc, Yc = cca.fit_transform(X, Y)

# Correlation between the k-th gene score and the k-th lipid score, and its square.
corrs = np.array([
    np.corrcoef(gene_scores, lipid_scores)[0, 1]
    for gene_scores, lipid_scores in zip(Xc.T, Yc.T)
])
shared_var = corrs**2

import pickle

# Bundle the fitted objects and derived arrays, then write them to disk.
output = dict(
    scaler_genes=scaler_g,
    scaler_lipids=scaler_l,
    cca_model=cca,
    Xc=Xc,
    Yc=Yc,
    canonical_correlations=corrs,
    shared_variance=shared_var,
)

with open('cca_results.pkl', 'wb') as f:
    pickle.dump(output, f, protocol=pickle.HIGHEST_PROTOCOL)

print("All outputs saved to cca_results.pkl")

# Re-bind the working names from the dictionary (these are the same objects).
scaler_g, scaler_l, cca = output['scaler_genes'], output['scaler_lipids'], output['cca_model']
Xc, Yc = output['Xc'], output['Yc']
corrs, shared_var = output['canonical_correlations'], output['shared_variance']

import numpy as np

n_samples, n_components = Xc.shape
p = Y.shape[1]

# Correlation of every lipid column of Y with every gene-side canonical score.
lipid_loading = np.zeros((p, n_components))
for k in range(n_components):
    for j in range(p):
        lipid_loading[j, k] = np.corrcoef(Y[:, j], Xc[:, k])[0, 1]

# Mean squared loading over lipids, per component.
lipid_loading_sq = lipid_loading**2
comm_Y = lipid_loading_sq.mean(axis=0)

# Redundancy per component and its running total.
redundancy = comm_Y * shared_var
cum_redundancy = np.cumsum(redundancy)


def print_redundancy_summary(n_shown):
    """Print the per-component summary for the first `n_shown` components.

    `n_shown` is capped at `n_components`, so asking for more components than
    were fitted does not raise an IndexError.
    """
    for k in range(min(n_shown, n_components)):
        print(f"Component {k+1}:")
        print(f"  • ρ_{k+1}^2 = {shared_var[k]:.4f}")
        print(f"  •  avg. lipid communality = {comm_Y[k]:.4f}")
        print(f"  •  redundancy (i.e. lipid‐variance explained by X) = {redundancy[k]:.4f}")
        print(f"  •  cumulative up to {k+1} = {cum_redundancy[k]:.4f}")
        print("")


# Short overview first, then every fitted component.
print_redundancy_summary(5)
print_redundancy_summary(n_components)

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['pdf.fonttype'] = 42

# 1-based component numbers for the x axis.
components = np.arange(1, len(redundancy) + 1)

fig, ax1 = plt.subplots(figsize=(8, 5))

# Left axis: redundancy of each component as bars.
ax1.bar(components, redundancy, color='lightgray', alpha=0.8, label='Redundancy per component')
ax1.set(xlabel='Canonical Component (k)', ylabel='Redundancy (mean $r^2$)')
ax1.tick_params(axis='y')
ax1.set_xticks(components)
ax1.set_xticklabels(components, rotation=0)

# Right axis: cumulative redundancy as a line.
ax2 = ax1.twinx()
ax2.plot(components, cum_redundancy, color='black', marker='o', linewidth=2, label='Cumulative redundancy')
ax2.set_ylabel('Cumulative Redundancy')
ax2.tick_params(axis='y')

plt.title('Redundancy‐Scree Plot')

# One legend holding the entries of both axes, bars first.
handles, legend_labels = [], []
for axis in (ax1, ax2):
    axis_handles, axis_labels = axis.get_legend_handles_labels()
    handles = handles + axis_handles
    legend_labels = legend_labels + axis_labels
ax1.legend(handles, legend_labels, loc='upper left')

fig.tight_layout()
fig.savefig("ccaresult.pdf")
plt.show()