In [None]:
import os

# force all backends to use just 1 thread
os.environ["OPENBLAS_NUM_THREADS"]   = "1"
os.environ["OMP_NUM_THREADS"]        = "1"
os.environ["MKL_NUM_THREADS"]        = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"]    = "1"

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from scipy.stats import pearsonr
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42

# NOTE(review): the environment variables at the top of this cell request a
# single thread per backend, but the two statements below ask for 8 and 6
# threads respectively - confirm which setting is actually intended.
from threadpoolctl import threadpool_limits, threadpool_info
# Called as a plain function here (not used as a context manager).
threadpool_limits(limits=8)
import os
# Set after numpy/scipy/torch were already imported above; presumably this only
# affects libraries or worker processes initialised later - TODO confirm.
os.environ['OMP_NUM_THREADS'] = '6'

In [None]:
# Load the main table and keep only the rows whose Sample is the reference atlas.
dat = pd.read_parquet("./zenodo/maindata_2.parquet")
dat = dat[dat["Sample"] == "ReferenceAtlas"]

## Summarize genes and lipids at the cell type level

In [None]:
# Multimodal table read from parquet; for duplicated index labels only the
# first occurrence is kept.
gexpr = pd.read_parquet("./zenodo/multimodal/multimodal_on_macoscko.parquet")
gexpr = gexpr[~gexpr.index.duplicated(keep='first')]
# Positional column split: everything except the last 176 columns goes to
# `genes`, the last 173 columns go to `lipids`.
# NOTE(review): the 3 columns in between are dropped - presumably metadata;
# confirm against the file schema.
genes = gexpr.iloc[:,:-176]
lipids = gexpr.iloc[:,-173:]
# Grouping key read from HDF (key "table"); presumably one cell-type label per
# row of gexpr, aligned on the index - TODO confirm.
celltypesnow = pd.read_hdf("./zenodo/multimodal/celltypesnow.h5ad", key="table")
# Average every feature within each group.
lipids = lipids.groupby(celltypesnow).mean()
genes = genes.groupby(celltypesnow).mean()
genes

## Prepare the data

In [None]:
# Predictors: group-averaged genes. Targets: group-averaged lipids.
X = genes
y = lipids
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
for split_name, split in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_name} set size: {split.shape[0]} samples")

# Standardise using statistics learnt on the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# PCA keeping 99% of the gene-expression variance; working on the PCs (and
# reading their loadings afterwards) sidesteps multicollinearity between genes.
pca = PCA(n_components=0.99, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
n_features, n_components = X_train.shape[1], X_train_pca.shape[1]
print(f"Original number of features: {n_features}")
print(f"Reduced number of principal components: {n_components}")

## Linear predictive model genes to lipids

In [None]:
# Ordinary least squares on the gene PCs, one independent model per lipid.
linear_regressor = LinearRegression()
multi_output_regressor = MultiOutputRegressor(linear_regressor, n_jobs=8).fit(X_train_pca, y_train)
print("Linear Regression MultiOutputRegressor training complete.")

y_train_pred = multi_output_regressor.predict(X_train_pca)
y_test_pred = multi_output_regressor.predict(X_test_pca)

# Per-lipid errors (one value per output column).
train_mse = mean_squared_error(y_train, y_train_pred, multioutput='raw_values')
test_mse = mean_squared_error(y_test, y_test_pred, multioutput='raw_values')

# Per-lipid Pearson correlation between measured and predicted values.
train_r = [
    pearsonr(y_train[lipid], y_train_pred[:, i])[0]
    for i, lipid in enumerate(y_train.columns)
]
test_r = [
    pearsonr(y_test[lipid], y_test_pred[:, i])[0]
    for i, lipid in enumerate(y_train.columns)
]

pearson_df = pd.DataFrame({
    'Lipid': y_train.columns,
    'Train_Pearson_R': train_r,
    'Test_Pearson_R': test_r,
    'Train_MSE': train_mse,
    'Test_MSE': test_mse
})

# NOTE(review): unlike the elastic-net export, this CSV also stores the index column.
pearson_df.to_csv("linear_genestolipids.csv")

# Test (dark red) and train (black) correlation histograms on a bare x axis.
ax = plt.gca()
ax.hist(pearson_df['Test_Pearson_R'], bins=20, color="darkred", alpha=0.8)
ax.hist(pearson_df['Train_Pearson_R'], bins=20, color="black", alpha=0.8)
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.yaxis.set_ticks([])
ax.xaxis.set_ticks_position('bottom')
plt.show()

# The 20 lipids with the lowest test correlation.
pearson_df.sort_values('Test_Pearson_R').head(20)

## Elastic net: a linear but regularized model

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from tqdm import tqdm

# Hyper-parameter grid searched independently for every lipid.
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1.0],
    'l1_ratio': [0.1, 0.5, 0.9]
}

# One result record per lipid. Records are collected here rather than in a
# global list called `lipids`, so the `lipids` DataFrame built in the
# summarisation cell is not overwritten and the data-preparation cell
# (`y = lipids`) remains re-runnable.
elastic_pca_records = []

for lipid in tqdm(y_train.columns):

    y_train_lipid = y_train[lipid]
    y_test_lipid = y_test[lipid]

    grid_search = GridSearchCV(
        ElasticNet(max_iter=100000, random_state=42),
        param_grid,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=8,
        verbose=0,  # tqdm already reports progress; per-fit logs flood the cell output
    )
    grid_search.fit(X_train_pca, y_train_lipid)

    # GridSearchCV uses refit=True by default, so best_estimator_ has already
    # been refit on the whole training set with the best parameters; fitting
    # it a second time would only repeat the same work.
    best_model = grid_search.best_estimator_

    y_train_pred = best_model.predict(X_train_pca)
    y_test_pred = best_model.predict(X_test_pca)

    # metrics
    elastic_pca_records.append({
        'Lipid': lipid,
        'Train_Pearson_R': pearsonr(y_train_lipid, y_train_pred)[0],
        'Test_Pearson_R': pearsonr(y_test_lipid, y_test_pred)[0],
        'Train_MSE': mean_squared_error(y_train_lipid, y_train_pred),
        'Test_MSE': mean_squared_error(y_test_lipid, y_test_pred),
        'Best_Alpha': grid_search.best_params_['alpha'],
        'Best_L1_Ratio': grid_search.best_params_['l1_ratio'],
    })

pearson_df = pd.DataFrame(
    elastic_pca_records,
    columns=['Lipid', 'Train_Pearson_R', 'Test_Pearson_R',
             'Train_MSE', 'Test_MSE', 'Best_Alpha', 'Best_L1_Ratio'],
)

pearson_df.to_csv("elastic_genestolipids.csv", index=False)

# Test (dark red) and train (black) correlation histograms on a bare x axis.
plt.hist(pearson_df['Test_Pearson_R'], bins=20, color="darkred", alpha=0.8)
plt.hist(pearson_df['Train_Pearson_R'], bins=20, color="black", alpha=0.8)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_visible(False)
plt.gca().yaxis.set_ticks([])
plt.gca().xaxis.set_ticks_position('bottom')
plt.show()

# The 20 lipids with the lowest test correlation.
pearson_df.sort_values('Test_Pearson_R')[:20]

## Elastic net but using genes as predictors

In [None]:
import pickle

import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from tqdm import tqdm

# Hyper-parameter grid searched independently for every lipid.
param_grid = {
    'alpha': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2],
    'l1_ratio': [0.1, 0.5, 0.9]
}

# One result record per lipid. Records are collected here rather than in a
# global list called `lipids`, so the `lipids` DataFrame built in the
# summarisation cell is not overwritten and the data-preparation cell
# (`y = lipids`) remains re-runnable.
elastic_gene_records = []
# One DataFrame (columns 'feature', 'importance') per lipid, in the order of
# y_train.columns; consumed by the feature-importance cells further down.
all_feature_importances = []

for lipid in tqdm(y_train.columns):

    y_train_lipid = y_train[lipid]
    y_test_lipid = y_test[lipid]

    grid_search = GridSearchCV(
        ElasticNet(max_iter=100000, random_state=42),
        param_grid,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=8,
        verbose=0,  # tqdm already reports progress; per-fit logs flood the cell output
    )
    grid_search.fit(X_train_scaled, y_train_lipid)

    # GridSearchCV uses refit=True by default, so best_estimator_ has already
    # been refit on the whole training set with the best parameters; fitting
    # it a second time would only repeat the same work.
    best_model = grid_search.best_estimator_

    y_train_pred = best_model.predict(X_train_scaled)
    y_test_pred = best_model.predict(X_test_scaled)

    # metrics
    elastic_gene_records.append({
        'Lipid': lipid,
        'Train_Pearson_R': pearsonr(y_train_lipid, y_train_pred)[0],
        'Test_Pearson_R': pearsonr(y_test_lipid, y_test_pred)[0],
        'Train_MSE': mean_squared_error(y_train_lipid, y_train_pred),
        'Test_MSE': mean_squared_error(y_test_lipid, y_test_pred),
        'Best_Alpha': grid_search.best_params_['alpha'],
        'Best_L1_Ratio': grid_search.best_params_['l1_ratio'],
    })

    # Absolute coefficient of every (standardised) gene is used as its importance.
    all_feature_importances.append(pd.DataFrame({
        'feature': X_train.columns,
        'importance': np.abs(best_model.coef_)
    }))

    # Persist the fitted model for this lipid in the working directory.
    # NOTE(review): the lipid name goes into the file name verbatim; names
    # containing path separators would break this - confirm naming scheme.
    with open(f'elastic_net_model_{lipid}.pkl', 'wb') as f:
        pickle.dump(best_model, f)

pearson_df = pd.DataFrame(
    elastic_gene_records,
    columns=['Lipid', 'Train_Pearson_R', 'Test_Pearson_R',
             'Train_MSE', 'Test_MSE', 'Best_Alpha', 'Best_L1_Ratio'],
)

#pearson_df.to_csv("elastic_genestolipids.csv", index=False)

# Test (dark red) and train (black) correlation histograms on a bare x axis.
plt.hist(pearson_df['Test_Pearson_R'], bins=20, color="darkred", alpha=0.8)
plt.hist(pearson_df['Train_Pearson_R'], bins=20, color="black", alpha=0.8)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_visible(False)
plt.gca().yaxis.set_ticks([])
plt.gca().xaxis.set_ticks_position('bottom')
#plt.savefig("elastic_genestolipids.pdf")
plt.show()

# The 20 lipids with the lowest test correlation.
pearson_df.sort_values('Test_Pearson_R')[:20]

## Study the gene feature importances to predict lipids to chase functional categories

In [None]:
# One Series per lipid: importance values indexed by gene name.
all_feature_importancesss = [
    pd.Series(per_lipid['importance'].values, index=per_lipid['feature'].values)
    for per_lipid in all_feature_importances
]
# Stack into a lipids x genes table.
featimps = pd.DataFrame(all_feature_importancesss)
featimps.index = y_train.columns

# Genes whose importance summed over lipids exceeds the 90th percentile.
summed_importance = featimps.sum()
summed_importance[summed_importance > np.percentile(summed_importance, 90)].index # 2188 genes that contribute significantly to the prediction

In [None]:
# Rebuild the lipids x genes importance table and keep only the genes whose
# importance summed over lipids exceeds the 90th percentile.
featimps = pd.DataFrame(all_feature_importancesss)
featimps.index = y_train.columns
summed_importance = featimps.sum()
top_decile_genes = summed_importance[summed_importance > np.percentile(summed_importance, 90)].index
featimps = featimps.loc[:, top_decile_genes]
featimps

In [None]:
# compare relative ranks of features rather than absolute importance values

from scipy.signal import savgol_filter

# Rank genes within each lipid so that rank 1 is the MOST important gene.
# DataFrame.rank defaults to ascending=True, which gives rank 1 to the least
# important gene; that contradicts both the 9999 "worst rank" penalty applied
# to zero-importance genes below and the downstream selection of the genes
# with the SMALLEST summed rank, so the ranking is made descending explicitly.
df_ranked = featimps.rank(axis=1, method='max', ascending=False)
# Genes the model did not use for a lipid get a rank worse than any real one.
df_ranked[featimps == 0] = 9999

# Summed rank per gene, best (smallest) first.
ranked_totals = df_ranked.sum().sort_values()
x = range(len(ranked_totals))
y = ranked_totals.values

# The smoothing window (51) and the 50-point margins below need enough genes.
assert len(y) > 100, "need more than 100 genes to locate the elbow with these settings"

# Elbow = point of maximum curvature of the smoothed curve, ignoring 50 points
# at each end where the smoothed derivatives are unreliable.
y_smooth = savgol_filter(y, window_length=51, polyorder=4)
dy = np.gradient(y_smooth)
dy2 = np.gradient(dy)
curvature = np.abs(dy2) / (1 + dy**2)**1.5
elbow_point = np.argmax(curvature[50:-50]) + 50

plt.plot(x, y)
plt.xticks([])
plt.axvline(x=elbow_point, color='r', linestyle='--', label=f'Elbow at {elbow_point}')
plt.legend()
plt.show()

In [None]:
# Keep the genes whose summed rank comes before the elbow position.
genesthatpredictlipids = df_ranked.sum().sort_values().index[:elbow_point]
# Negated ranks of the selected genes.
lipidpredgene = -df_ranked.loc[:,genesthatpredictlipids]
# Blank out entries that carried the 9999 sentinel in df_ranked.
# NOTE(review): the mask has every column of df_ranked while lipidpredgene only
# has the selected ones; presumably pandas aligns the mask on labels - confirm.
lipidpredgene[df_ranked == 9999] = None
# Shift so that the smallest remaining value becomes 0.
lipidpredgene = lipidpredgene-lipidpredgene.min().min()
# All gene columns of the group-averaged table.
background = genes.columns.values
background

In [None]:
# 1) How many of the high-feature-importance genes are cell type markers? Are markers over- or under-represented?

# Focus on the "smaller" classes and re-add cell type markers in a controlled way - we need to say something about them too...
# Unique index labels of the group-averaged gene table.
celltypes = genes.index.unique()
import numpy as np

def extract_markers(celltypes):
    """
    Collect the non-numeric tokens that follow the first underscore of each name.

    Each name is expected to look like ``<prefix>_<token>_<token>...``. The
    prefix is discarded; of the remaining underscore-separated tokens, those
    that parse as integers are ignored and the rest are collected.

    Parameters
    ----------
    celltypes : iterable of str
        Names to parse. Names without any underscore contribute no tokens
        (previously they raised ``ValueError``).

    Returns
    -------
    list of str
        Sorted, de-duplicated tokens. Empty tokens (from doubled or trailing
        underscores) are never returned.
    """
    markers = set()

    for celltype in celltypes:
        # partition() never raises: `separator` is empty when there is no '_'.
        _, separator, marker_part = celltype.partition('_')
        if not separator:
            continue
        for token in marker_part.split('_'):
            if not token:
                continue
            try:
                int(token)
            except ValueError:
                # Not a number, so keep it.
                markers.add(token)

    return sorted(markers)

# Marker tokens parsed from the cell type names.
markers = extract_markers(celltypes)
# Number of distinct marker tokens.
len(np.unique(markers)) # they are nice but too many! but we can downsample many times to have 300 of them and take the best iteration

In [None]:
# Marker tokens that are also among the selected lipid-predicting genes.
np.intersect1d(markers,genesthatpredictlipids)

In [None]:
from scipy.stats import chi2_contingency

# C is the universe (background genes). A (markers) and B (lipid-predicting
# genes) are restricted to C, so that the four cells below partition C exactly.
# Without the restriction, marker tokens absent from the background would be
# counted in the "A and not B" cell although they could never have been in B.
C = set(background)
A = set(markers) & C
B = set(genesthatpredictlipids) & C

A_B = A & B                # A ∩ B
A_notB = A - B             # A ∩ (C \ B)
notA_B = B - A             # (C \ A) ∩ B
notA_notB = C - (A | B)    # (C \ A) ∩ (C \ B)

n11, n12, n21, n22 = len(A_B), len(A_notB), len(notA_B), len(notA_notB)
assert n11 + n12 + n21 + n22 == len(C), "contingency cells must partition the background"

# Rows: marker / not marker. Columns: predictive / not predictive.
table = np.array([[n11, n12],
                  [n21, n22]], dtype=np.float64)
table

In [None]:
# Chi-square test on the 2x2 `table` built in the previous cell; prints the
# statistic, p-value, degrees of freedom and the expected counts.
chi2_stat, chi2_p, dof, expected = chi2_contingency(table)
print("Chi-square statistic:", chi2_stat)
print("p-value:", chi2_p)
print("Degrees of freedom:", dof)
print("Expected counts:\n", expected)

import numpy as np

def odds_ratio_and_ci_2x2(n11, n12, n21, n22, alpha=0.05):
    """
    Odds ratio and approximate (1 - alpha) confidence interval of a 2x2 table.

    The interval is the usual Wald interval on the log odds ratio,
    ``exp(ln(OR) +/- z * SE)`` with ``SE = sqrt(1/n11 + 1/n12 + 1/n21 + 1/n22)``.

    Parameters
    ----------
    n11, n12, n21, n22 : number
        Non-negative cell counts of the table ``[[n11, n12], [n21, n22]]``.
    alpha : float, optional
        Two-sided significance level, strictly between 0 and 1. The default
        0.05 keeps the conventional z = 1.96; any other level uses the exact
        normal quantile (previously other levels failed with a TypeError).

    Returns
    -------
    (float, (float, float))
        The odds ratio and the ``(lower, upper)`` confidence limits.

    Raises
    ------
    ValueError
        If ``alpha`` is outside (0, 1) or any count is negative.

    Notes
    -----
    If any cell is zero, 0.5 is added to every cell (Haldane-Anscombe
    correction) so that the ratio and its standard error stay finite.
    """
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")

    cells = [float(n11), float(n12), float(n21), float(n22)]
    if min(cells) < 0:
        raise ValueError("cell counts must be non-negative")
    if min(cells) == 0:
        # continuity correction: avoids division by zero / log(0)
        cells = [count + 0.5 for count in cells]
    a, b, c, d = cells

    # odds ratio
    or_ = (a * d) / (b * c)

    # standard error of ln(OR)
    se_log_or = np.sqrt(1/a + 1/b + 1/c + 1/d)

    # z-value for the two-sided (1-alpha) confidence
    if alpha == 0.05:
        z = 1.96  # conventional rounded value, kept so existing results are unchanged
    else:
        from statistics import NormalDist
        z = NormalDist().inv_cdf(1 - alpha / 2)

    # confidence limits on the log scale, exponentiated back
    log_or = np.log(or_)
    lower = np.exp(log_or - z * se_log_or)
    upper = np.exp(log_or + z * se_log_or)

    return or_, (lower, upper)

# Odds ratio and 95% interval for the four contingency counts computed above.
OR, (ci_lower, ci_upper) = odds_ratio_and_ci_2x2(n11, n12, n21, n22)
print(f"Odds Ratio: {OR:.4f}")
print(f"95% CI: ({ci_lower:.4f}, {ci_upper:.4f})") # no over, no under representation, can't say

In [None]:
# 2) Gene ontology? Display the selected lipid-predicting genes first.

genesthatpredictlipids

In [None]:
from __future__ import print_function
import os
import numpy as np
import matplotlib.pyplot as plt
import goatools
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.base import download_go_basic_obo, download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.test_data.genes_NCBI_10090_ProteinCoding import GENEID2NT as GeneID2nt_mus
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
import collections as cx
import pandas as pd
from goatools.godag_plot import plot_gos, plot_results, plot_goid2goobj
from goatools.associations import read_ncbi_gene2go
from goatools.anno.factory import get_objanno
from goatools.go_enrichment import GOEnrichmentStudy
import mygene
import matplotlib as mpl
import matplotlib.pyplot as plt
from goatools.gosubdag.gosubdag import GoSubDag
mpl.rcParams['pdf.fonttype'] = 42

# Gene Ontology graph parsed from the local OBO file.
obo_dag = GODag("./zenodo/mixed/go-basic.obo")
# wget http://geneontology.org/ontology/go-basic.obo
obo_fname = download_go_basic_obo()

# Gene -> GO term associations for mouse (taxid 10090), all namespaces.
# (An earlier read_ncbi_gene2go(..., namespace='MF') call was removed here: its
# result was overwritten by the assignment below before ever being used.)
# NOTE(review): keys are presumably NCBI gene ids rather than symbols - confirm.
obj_ncbi = get_objanno('./zenodo/mixed/gene2go', taxid=10090)
associations = obj_ncbi.get_id2gos(namespace='all')

# GO ids of the three namespaces. The name `bp_terms` is kept for
# compatibility although it is not limited to biological_process.
kept_namespaces = {'biological_process', 'molecular_function', 'cellular_component'}
bp_terms = [
    go_id for go_id, go_term in obo_dag.items()
    if go_term.namespace in kept_namespaces
]
# Set copy for O(1) membership tests; scanning the list for every term of
# every gene is quadratic in practice.
bp_term_set = set(bp_terms)

go_subdag = GoSubDag(bp_terms, obo_dag)
# For each gene keep only the terms present in the kept namespaces.
bp_associations = {
    gene: [term for term in terms if term in bp_term_set]
    for gene, terms in associations.items()
}
    
# MyGene.info client used by the symbol conversion below.
mg = mygene.MyGeneInfo()

def convert_symbols_to_entrez(gene_symbols):
    """Query MyGene.info for mouse gene symbols and return the Entrez ids as ints.

    Hits that carry no 'entrezgene' field are skipped, so the result can be
    shorter than (and is not aligned with) `gene_symbols`.
    """
    hits = mg.querymany(gene_symbols, scopes='symbol', fields='entrezgene', species='mouse')
    entrez_ids = []
    for hit in hits:
        if 'entrezgene' in hit:
            entrez_ids.append(int(hit['entrezgene']))
    return entrez_ids

# Entrez ids of the background genes.
population_genes = convert_symbols_to_entrez(background)

## XGBoost as a vanilla but powerfully generalizing nonlinear model

In [None]:
import anndata
import scanpy as sc

# Wrap the scaled arrays as DataFrames carrying the original row and column labels.
X_train_scaled = pd.DataFrame(X_train_scaled, index = X_train.index, columns = X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, index = X_test.index, columns = X_test.columns)

# At least some feature selection is needed before fitting the models below.

adata = anndata.AnnData(X_train_scaled)

# Flag highly variable genes on the training matrix only.
# NOTE(review): the input here is standardised data; confirm that is the
# intended input for this scanpy routine. Also confirm whether the mean /
# dispersion cutoffs still apply when n_top_genes is given.
sc.pp.highly_variable_genes( #################
    adata, 
    n_top_genes=3000, 
    min_mean=0.0125, 
    max_mean=3, 
    min_disp=0.5
)

# Keep only the flagged columns.
adata_hvg = adata[:, adata.var['highly_variable']]

X_train_scaled_fc = pd.DataFrame(
    adata_hvg.X, 
    index=X_train_scaled.index, 
    columns=adata_hvg.var.index
)

# The test matrix takes the same columns, selected by name.
X_test_scaled_fc = X_test_scaled.loc[:, X_train_scaled_fc.columns]
X_test_scaled_fc

In [None]:
import itertools
import joblib

# Per-lipid residual tables (train and test), concatenated at the end.
residuals_list = []

# Hyper-parameter grid, scored on a validation split carved out of the training data.
param_grid = {
    "n_estimators": [50, 200, 400],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 5, 7],
    # "subsample": [0.8, 1.0],
    # "colsample_bytree": [0.8, 1.0],
}
# Row / column subsampling is currently fixed; extend these lists to search them too.
subsample_values = [1.0]
colsample_values = [1.0]

# One result record per lipid. Records are collected here rather than in a
# global list called `lipids`, so the `lipids` DataFrame built in the
# summarisation cell is not overwritten and the data-preparation cell
# (`y = lipids`) remains re-runnable.
xgb_records = []

for lipid in tqdm(y_train.columns):

    X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
        X_train_scaled_fc,
        y_train[lipid],
        test_size=0.2,
        random_state=42
    )

    best_val_mse = float("inf")
    best_params = None
    best_model = None

    # Same visiting order as nested loops over the five lists, so ties in
    # validation MSE are resolved exactly as before (first candidate wins).
    candidates = itertools.product(
        param_grid["n_estimators"],
        param_grid["learning_rate"],
        param_grid["max_depth"],
        subsample_values,
        colsample_values,
    )
    for n_estimators, learning_rate, max_depth, subsample, colsample_bytree in candidates:
        params = {
            "n_estimators": n_estimators,
            "learning_rate": learning_rate,
            "max_depth": max_depth,
            "subsample": subsample,
            "colsample_bytree": colsample_bytree,
        }
        xgb_regressor = xgb.XGBRegressor(
            objective='reg:squarederror',
            **params,
            random_state=42,
            n_jobs=8
        )
        xgb_regressor.fit(X_train_sub, y_train_sub)
        val_mse = mean_squared_error(y_val_sub, xgb_regressor.predict(X_val_sub))

        if val_mse < best_val_mse:
            best_val_mse, best_params, best_model = val_mse, params, xgb_regressor

    print(f"Best val MSE for {lipid}: {best_val_mse}")
    print(f"Best params for {lipid}: {best_params}")

    # Refit with the winning parameters on the full training set (train + validation).
    xgb_regressor_final = xgb.XGBRegressor(
        objective='reg:squarederror',
        **best_params,
        random_state=42,
        n_jobs=8
    )
    xgb_regressor_final.fit(X_train_scaled_fc, y_train[lipid])

    y_train_pred = xgb_regressor_final.predict(X_train_scaled_fc)
    y_test_pred = xgb_regressor_final.predict(X_test_scaled_fc)

    xgb_records.append({
        "Lipid": lipid,
        "Train_MSE": mean_squared_error(y_train[lipid], y_train_pred),
        "Test_MSE": mean_squared_error(y_test[lipid], y_test_pred),
        "Train_Pearson_R": pearsonr(y_train[lipid], y_train_pred)[0],
        "Test_Pearson_R": pearsonr(y_test[lipid], y_test_pred)[0],
        "Best_Params": best_params,
    })

    # Residuals (measured - predicted), tagged with the lipid and the split.
    for split_name, measured, predicted in (
        ("train", y_train[lipid], y_train_pred),
        ("test", y_test[lipid], y_test_pred),
    ):
        split_residuals = pd.DataFrame({"lipid": lipid, "residual": measured - predicted})
        split_residuals["set"] = split_name
        residuals_list.append(split_residuals)

    # NOTE(review): the lipid name goes into the file name verbatim; names
    # containing path separators would break this - confirm naming scheme.
    joblib.dump(xgb_regressor_final, f"{lipid}_best_xgb_model.pkl")

pearson_df = pd.DataFrame(
    xgb_records,
    columns=["Lipid", "Train_MSE", "Test_MSE",
             "Train_Pearson_R", "Test_Pearson_R", "Best_Params"],
)

#pearson_df.to_csv("xgb_genestolipids.csv", index=False)

# Test (dark red) and train (black) correlation histograms on a bare x axis.
plt.hist(pearson_df['Test_Pearson_R'], bins=20, color="darkred", alpha=0.8)
plt.hist(pearson_df['Train_Pearson_R'], bins=20, color="black", alpha=0.8)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_visible(False)
plt.gca().yaxis.set_ticks([])
plt.gca().xaxis.set_ticks_position('bottom')
plt.show()

# Long-format residual table wrapped as AnnData (one observation per residual).
residuals_df = pd.concat(residuals_list)
adata = anndata.AnnData(
    X=residuals_df[["residual"]].values,
    obs=residuals_df[["lipid", "set"]].copy(),
    var=pd.DataFrame(index=["residual"])
)
#adata.write_h5ad("residuals.h5ad")

In [None]:
# Redraw the test (dark red) / train (black) correlation histograms for the
# current contents of pearson_df, on a bare x axis.
ax = plt.gca()
for column, colour in (('Test_Pearson_R', "darkred"), ('Train_Pearson_R', "black")):
    ax.hist(pearson_df[column], bins=20, color=colour, alpha=0.8)
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.yaxis.set_ticks([])
ax.xaxis.set_ticks_position('bottom')
plt.show()

In [None]:
# Lipids whose test correlation is below 0.34, worst first.
difficult = pearson_df.sort_values(by="Test_Pearson_R")
difficult = difficult[difficult["Test_Pearson_R"] < 0.34]
print(difficult['Lipid'].values)

## Characterize the hard-to-predict lipids: tests to assess whether any shift in class, chain length, or unsaturation occurs in a given lipid set (on elastic net)

In [None]:
import re

# Test-correlation threshold below which a lipid counts as hard to predict.
THR = 0.4
# NOTE(review): pearson_df holds the results of whichever model cell ran last;
# confirm it is the elastic-net table when this cell is executed.
ranked_by_test_r = pearson_df.sort_values('Test_Pearson_R')
hardtopredict = ranked_by_test_r['Lipid'][ranked_by_test_r['Test_Pearson_R'] < THR]

# Raw-string patterns: the previous non-raw ' |\(' relied on an invalid escape
# sequence, which newer Python versions warn about.
class_split_re = re.compile(r' |\(')
carbons_re = re.compile(r'(\d+):')
insaturations_re = re.compile(r':(\d+)')

def _first_int(pattern, name):
    """Return the first captured group of `pattern` in `name` as int, or NaN if absent."""
    match = pattern.search(name)
    return int(match.group(1)) if match else np.nan

df = pd.DataFrame(pearson_df['Lipid']).fillna('')
df.columns = ["lipid_name"]

# extract the "class" etc from the lipid_name
# class: text before the first space or '('; carbons: digits before the first
# ':' match; insaturations: digits after the first ':' match.
df["class"] = df["lipid_name"].apply(lambda name: class_split_re.split(name)[0])
df["carbons"] = df["lipid_name"].apply(lambda name: _first_int(carbons_re, name))
df["insaturations"] = df["lipid_name"].apply(lambda name: _first_int(insaturations_re, name))
df["insaturations_per_Catom"] = df["insaturations"] / df["carbons"]

# Names ending in '_uncertain' get their annotations blanked and a gray colour.
df["broken"] = df["lipid_name"].str.endswith('_uncertain')
df.loc[df["broken"], 'carbons'] = np.nan
df.loc[df["broken"], 'class'] = np.nan
df.loc[df["broken"], 'insaturations'] = np.nan
df.loc[df["broken"], 'insaturations_per_Catom'] = np.nan
df.loc[df["broken"], 'color'] = "gray"

df.index = df['lipid_name']
df = df.drop_duplicates()

# Hard-to-predict lipids versus all the others.
test = df.loc[hardtopredict,:]
nontest = df.loc[~df.index.isin(hardtopredict),:]

test

In [None]:
def permutation_test_categorical(
    test_labels,
    other_labels,
    n_permutations=10_000,
    alternative='two-sided',
    random_state=None
):
    """
    Perform a permutation test to assess whether each category in test_labels
    is over- or under-represented compared to what we would expect by chance.

    The pooled labels are shuffled ``n_permutations`` times; each time the
    first ``len(test_labels)`` labels play the role of the test set and the
    count of every category is recorded. The p-value is the fraction of
    permutations at least as extreme as the observed count.

    Parameters
    ----------
    test_labels : 1D array-like of categorical labels (the "test" set)
    other_labels : 1D array-like of categorical labels (all non-test elements)
        Missing labels (NaN / None in object arrays) are dropped from both
        inputs before testing; previously they made the category lookup fail.
    n_permutations : int, optional
        Number of random permutations
    alternative : {'two-sided', 'greater', 'less'}, optional
        - 'two-sided': tests if the proportion differs in either direction
        - 'greater': tests if test_labels has a higher proportion of the category
        - 'less': tests if test_labels has a lower proportion of the category
    random_state : int, optional
        Seed for reproducibility. A private generator is seeded with it, so
        the global NumPy random state is left untouched; results for a given
        seed are the same as with the previous global seeding. When None, the
        global NumPy random state is used, as before.

    Returns
    -------
    results : pd.DataFrame
        A DataFrame with columns: 'category', 'observed_count', 'expected_count',
        'observed_proportion', 'expected_proportion', 'p_value'

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the supported values, or if no
        non-missing test label is left.
    """
    if alternative not in ('two-sided', 'greater', 'less'):
        raise ValueError(
            f"alternative must be 'two-sided', 'greater' or 'less', got {alternative!r}"
        )

    # Private legacy generator: same stream as np.random.seed(random_state)
    # followed by np.random.shuffle, but without reseeding the global state.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    test_labels = np.array(test_labels)
    other_labels = np.array(other_labels)
    # Drop missing labels; they are not a category and cannot be sorted with strings.
    test_labels = test_labels[~pd.isna(test_labels)]
    other_labels = other_labels[~pd.isna(other_labels)]

    n_test = len(test_labels)
    if n_test == 0:
        raise ValueError("test_labels contains no non-missing labels")

    all_labels = np.concatenate([test_labels, other_labels])
    unique_categories = np.unique(all_labels)
    total_counts = {cat: np.sum(all_labels == cat) for cat in unique_categories}
    expected_props = {cat: count/len(all_labels) for cat, count in total_counts.items()}
    observed_counts = {cat: np.sum(test_labels == cat) for cat in unique_categories}
    observed_props = {cat: count/n_test for cat, count in observed_counts.items()}
    perm_counts = {cat: np.zeros(n_permutations) for cat in unique_categories}

    # Null distribution: category counts in a random "test set" of the same size.
    for i in range(n_permutations):
        rng.shuffle(all_labels)
        perm_test = all_labels[:n_test]
        for cat in unique_categories:
            perm_counts[cat][i] = np.sum(perm_test == cat)

    results = []
    for cat in unique_categories:
        observed = observed_counts[cat]
        distribution = perm_counts[cat]
        expected = expected_props[cat] * n_test

        if alternative == 'two-sided':
            # deviation from the expected count, in either direction
            observed_dev = abs(observed - expected)
            p_value = np.mean(abs(distribution - expected) >= observed_dev)
        elif alternative == 'greater':
            p_value = np.mean(distribution >= observed)
        else:  # 'less'
            p_value = np.mean(distribution <= observed)

        results.append({
            'category': cat,
            'observed_count': observed,
            'expected_count': expected,
            'observed_proportion': observed_props[cat],
            'expected_proportion': expected_props[cat],
            'p_value': p_value
        })

    return pd.DataFrame(results)

# Two-sided permutation test (5000 shuffles, seed 42) on the 'class' labels of
# the hard-to-predict lipids versus all remaining lipids.
class_enrichments = permutation_test_categorical(
    test['class'].values, nontest['class'].values, 
    n_permutations=5000, 
    alternative='two-sided', 
    random_state=42
)

class_enrichments # no class seems to be statistically differentially represented in this set

## Stratification: (lipid) metabolism genes, transporters, TFs, cellular localization, cell-cell communication, NTs

## Can we impute genes from lipidomic measurements using the lipizones? Assess predictability first, spatial patterns next

In [None]:
# Predictors: lipid centroids. Targets: gene centroids (both read from HDF, key "table").
# NOTE(review): this cell rebinds X, y, the train/test splits, scaler and pca
# used by the genes-to-lipids cells above.
X = pd.read_hdf("./zenodo/multimodal/lipicent.h5ad", key="table")
y = pd.read_hdf("./zenodo/multimodal/genecent.h5ad", key="table")

# !!! make a larger test set since some lipizone centroids are almost redundant
# given how we define a lipizone and we want to prevent the model from trivially memorizing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Standardise using statistics learnt on the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# PCA keeping 99% of the lipid variance; working on the PCs (and reading their
# loadings afterwards) sidesteps multicollinearity between lipids.
pca = PCA(n_components=0.99, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
n_features, n_components = X_train.shape[1], X_train_pca.shape[1]
print(f"Original number of features: {n_features}")
print(f"Reduced number of principal components: {n_components}")

# Ordinary least squares on the PCs, one independent model per target column.
linear_regressor = LinearRegression()
multi_output_regressor = MultiOutputRegressor(linear_regressor, n_jobs=8).fit(X_train_pca, y_train)
print("Linear Regression MultiOutputRegressor training complete.")

y_train_pred = multi_output_regressor.predict(X_train_pca)
y_test_pred = multi_output_regressor.predict(X_test_pca)

train_mse = mean_squared_error(y_train, y_train_pred, multioutput='raw_values')
test_mse = mean_squared_error(y_test, y_test_pred, multioutput='raw_values')

# Per-target Pearson correlation between measured and predicted values.
train_r = [
    pearsonr(y_train[target], y_train_pred[:, i])[0]
    for i, target in enumerate(y_train.columns)
]
test_r = [
    pearsonr(y_test[target], y_test_pred[:, i])[0]
    for i, target in enumerate(y_train.columns)
]

# NOTE: ONCE LIPIZONES ARE DEFINED, IT IS VERY EASY TO PREDICT GENES FROM LIPIDS
# NOTE(review): the 'Lipid' column holds the target (gene) column names here;
# the label is kept unchanged for compatibility.
pearson_df = pd.DataFrame({
    'Lipid': y_train.columns,
    'Train_Pearson_R': train_r,
    'Test_Pearson_R': test_r,
    'Train_MSE': train_mse,
    'Test_MSE': test_mse
})

# Test (dark red) and train (black) correlation histograms; the y axis is kept here.
ax = plt.gca()
ax.hist(pearson_df['Test_Pearson_R'], bins=20, color="darkred", alpha=0.8)
ax.hist(pearson_df['Train_Pearson_R'], bins=20, color="black", alpha=0.8)
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
ax.xaxis.set_ticks_position('bottom')
plt.show()

# The 20 targets with the lowest test correlation.
pearson_df.sort_values('Test_Pearson_R').head(20)

## Study enzyme-lipid correlations

In [None]:
# Reaction table (the columns reagent / product / enzyme are used below).
# NOTE(review): these are absolute, machine-specific paths, unlike the relative
# "./zenodo/..." paths used elsewhere in this notebook - confirm the portable
# location before sharing.
filtered_premanannot = pd.read_csv("/data/luca/lipidatlas/ManuscriptGithub/zenodo/csv/corereactions_wenzymes.csv")

# Correlation table read from HDF (key "table"); it is looked up below by row
# label and column label.
corronz = pd.read_hdf("/data/luca/lipidatlas/ManuscriptGithub/zenodo/csv/allvsallcorr.h5ad", key="table")

# Column subset of the reaction table.
expanded_reamat = filtered_premanannot[["reagent", "product", "enzyme"]]
def convert_to_tuple(text):
    """
    Parse the textual form of a tuple, e.g. "('A', 'B')", into a real tuple.

    Parameters
    ----------
    text : object
        Value to convert. Non-strings, and strings not wrapped in
        parentheses, are returned unchanged.

    Returns
    -------
    tuple of str, or the input unchanged
        Items are split on commas and stripped of surrounding whitespace and
        of single or double quotes. Empty items, such as the one produced by
        the trailing comma of a one-element tuple "('A',)", are dropped.
    """
    if not isinstance(text, str):
        return text

    if not (text.startswith('(') and text.endswith(')')):
        return text

    # Strip whitespace first, then either kind of quote, from every item.
    stripped = (item.strip().strip("'\"") for item in text[1:-1].split(','))
    return tuple(item for item in stripped if item)

# Parse the enzyme column into tuples and give every enzyme its own row.
# assign() returns a new frame, so no column is written into a slice of
# filtered_premanannot (which triggers pandas' SettingWithCopy warning).
expanded_reamat = expanded_reamat.assign(
    enzyme=expanded_reamat['enzyme'].apply(convert_to_tuple)
)
expanded_reamat = expanded_reamat.explode("enzyme", ignore_index=True)

# Keep only reactions whose product, reagent and enzyme all appear in the
# index of the correlation table; copy so new columns can be added safely.
# NOTE(review): enzymes are checked against the index but looked up as
# columns below - presumably the table is square; confirm.
known_labels = corronz.index
is_known = (
    expanded_reamat['product'].isin(known_labels)
    & expanded_reamat['reagent'].isin(known_labels)
    & expanded_reamat['enzyme'].isin(known_labels)
)
expanded_reamat = expanded_reamat.loc[is_known, :].copy()

# Correlation of each enzyme with the product and with the reagent of its reaction.
expanded_reamat['corr_enz_prod'] = [
    corronz.loc[product, np.array(enzyme)]
    for product, enzyme in zip(expanded_reamat['product'], expanded_reamat['enzyme'])
]
expanded_reamat['corr_enz_rea'] = [
    corronz.loc[reagent, np.array(enzyme)]
    for reagent, enzyme in zip(expanded_reamat['reagent'], expanded_reamat['enzyme'])
]

import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42

# 1) enzymes ordered by their median enzyme-product correlation
corr_by_enzyme = expanded_reamat.groupby('enzyme')['corr_enz_prod']
medians = corr_by_enzyme.median()
order = medians.sort_values().index.tolist()

# 2) one series of correlations per enzyme, in that order
data = [corr_by_enzyme.get_group(enz) for enz in order]

fig, ax = plt.subplots(figsize=(10, 6))

# 3) filled light-gray boxes (patch_artist=True) with white median lines
gray_line = dict(color='lightgray', linewidth=1.5)
bp = ax.boxplot(
    data,
    labels=order,
    patch_artist=True,
    boxprops=dict(facecolor='lightgray', edgecolor='none', linewidth=1.5),
    whiskerprops=gray_line,
    capprops=gray_line,
    flierprops=dict(
        marker='o',
        markerfacecolor='lightgray',
        markeredgecolor='lightgray',
        markersize=3,
        linestyle='none'
    ),
    medianprops=dict(color='white', linewidth=1.5)
)

# 4) dark red dashed zero line with 50% transparency
ax.axhline(0.0, color='darkred', linestyle='--', linewidth=1, alpha=0.5)

fig.suptitle("")
ax.set_xlabel("Enzyme")
ax.set_ylabel("Correlation enzyme-product")
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
#plt.savefig("correnzprod.pdf")
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.colors import Normalize
import matplotlib.cm as cm

matplotlib.rcParams['pdf.fonttype'] = 42

# 1) compute medians and get sorted enzyme order
medians = expanded_reamat.groupby('enzyme')['corr_enz_prod'].median()
order = medians.sort_values().index.tolist()

# 2) prepare DataFrame with numeric x positions + jitter
# (named strip_df so the lipid-annotation table `df` built earlier is not overwritten)
strip_df = expanded_reamat.copy()
strip_df['enzyme'] = pd.Categorical(strip_df['enzyme'], categories=order, ordered=True)
strip_df['x'] = strip_df['enzyme'].cat.codes
np.random.seed(0)
strip_df['x_jitter'] = strip_df['x'] + np.random.uniform(-0.25, 0.25, size=len(strip_df))

# The product column provides the lipid name used for the point labels.
if 'lipid' not in strip_df.columns and 'product' in strip_df.columns:
    strip_df = strip_df.rename(columns={'product': 'lipid'})

# 3) set up colormap normalization
vmin, vmax = strip_df['corr_enz_prod'].min(), strip_df['corr_enz_prod'].max()
norm = Normalize(vmin=vmin, vmax=vmax)
# pyplot.get_cmap instead of matplotlib.cm.get_cmap, which was deprecated and
# later removed from Matplotlib.
cmap = plt.get_cmap('coolwarm')

plt.figure(figsize=(10, 6))

# 4) scatter each point colored by its correlation
# (not named `sc`, which is the scanpy alias used elsewhere in this notebook)
points = plt.scatter(
    strip_df['x_jitter'],
    strip_df['corr_enz_prod'],
    c=strip_df['corr_enz_prod'],
    cmap=cmap,
    norm=norm,
    s=30,
    alpha=0.8,
    edgecolors='none'
)

# 5) add a colorbar
cbar = plt.colorbar(points)
cbar.set_label('Correlation enzyme–product')

# 6) horizontal zero line
plt.axhline(0.0, color='darkred', linestyle='--', linewidth=1, alpha=0.5)

# 7) annotate the 3 highest and 3 lowest correlations by lipid name
extremes = pd.concat([
    strip_df.nlargest(3, 'corr_enz_prod'),
    strip_df.nsmallest(3, 'corr_enz_prod')
])
# With fewer than 6 rows the two selections overlap; label each point once.
extremes = extremes[~extremes.index.duplicated(keep='first')]

# 8) annotate up to 14 additional random points (excluding extremes);
# sampling more rows than are available would raise an error.
remaining = strip_df.drop(extremes.index)
random_pts = remaining.sample(n=min(14, len(remaining)), random_state=42)

to_label = pd.concat([extremes, random_pts])

for _, row in to_label.iterrows():
    label = row['lipid'] if 'lipid' in row else ''
    plt.text(
        row['x_jitter'],
        row['corr_enz_prod'],
        f"{label}\n{row['corr_enz_prod']:.2f}",
        fontsize=7,
        ha='center',
        # labels go above positive points and below the others
        va='bottom' if row['corr_enz_prod'] > 0 else 'top'
    )

# 9) finalize axes
plt.xticks(range(len(order)), order, rotation=45, ha='right')
plt.xlabel("Enzyme")
plt.ylabel("Correlation enzyme–product")
plt.tight_layout()
#plt.savefig("strip.pdf")
plt.show()


In [None]:
# Number of enzyme-product pairs with a correlation above 0.5.
np.sum(expanded_reamat['corr_enz_prod'] > 0.5)

In [None]:
# Display the enzyme-product correlation column.
expanded_reamat['corr_enz_prod']

In [None]:
# Mean enzyme-product correlation over all retained reaction rows.
expanded_reamat['corr_enz_prod'].mean()

In [None]:
# Standard deviation of the enzyme-product correlation over the same rows.
expanded_reamat['corr_enz_prod'].std()