In [2]:
import warnings 
# NOTE(review): this silences every warning raised for the rest of the session,
# including any convergence or deprecation warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')

# Libraries

In [3]:
#Basic libraries
import os
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import LeaveOneGroupOut, RepeatedStratifiedKFold, LeaveOneOut, RepeatedKFold
from sklearn.base import clone
from scipy.stats import mannwhitneyu, spearmanr
from stabl.visualization import scatterplot_features, boxplot_features
from stabl.stabl import Stabl, save_stabl_results

from sklearn.linear_model import Lasso, LassoCV, LogisticRegressionCV, LogisticRegression, LinearRegression, ElasticNetCV, Lasso

#STABL pipelines
from stabl.multi_omic_pipelines import multi_omic_stabl, multi_omic_stabl_cv
from stabl.single_omic_pipelines import single_omic_stabl, single_omic_stabl_cv

#Preprocessing functions
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from stabl.preprocessing import LowInfoFilter, remove_low_info_samples


# Import Data

In [4]:
# Feature tables, all indexed by the "ID" column of their CSV file.
X_noEGA_pen = pd.read_csv('./Onset of Labor csv/immunome_noEGA_pen_OOL.csv',index_col="ID")
X_noEGA = pd.read_csv('./Onset of Labor csv/immunome_noEGA_OOL.csv',index_col="ID")
# Only the "EGA" column of this file is kept (as a Series).
EGA = pd.read_csv('./Onset of Labor csv/immunome_EGA_OOL.csv',index_col="ID")["EGA"]


# Feature matrix used by the rest of the notebook.
X = X_noEGA_pen
# NOTE(review): `pen` is not used in the cells visible here; presumably a tag for output names - confirm.
pen = "pen"

# Outcome: first column of the outcome file, indexed by "ID" like the feature tables.
y = pd.read_csv('./Onset of Labor csv/outcome_OOL.csv',index_col="ID").iloc[:,0]

# Preprocessing

In [40]:
# `remove_low_info_samples` returns the filtered frame (see the DataFrame it displays
# further down in this notebook); the result must be assigned, otherwise the filter
# has no effect on X.
X = remove_low_info_samples(X, threshold=1.)
# Keep the outcome aligned with the samples that survived the filter.
y = y.loc[X.index]

# Feature-level preprocessing: low-information filter, removal of zero-variance
# features, then median imputation of the remaining missing values.
preprocessing = Pipeline(
    steps=[
        ('lif', LowInfoFilter(0.2)),
        ('variance', VarianceThreshold(0.0)),
        ('impute', SimpleImputer(strategy='median'))
    ])

# Rebuild a labelled DataFrame from the transformed array, using the names of the
# features kept by the pipeline.
X = pd.DataFrame(
    data=preprocessing.fit_transform(X),
    index=X.index,
    columns=preprocessing.get_feature_names_out()
)


# Models

## single_omic_stabl_pipeline personalized

STABL did not give good results on this dataset, so it was removed from the pipeline. We also added code to export the coefficients of the fitted models (Lasso, Lasso 1SE and ElasticNet).

In [41]:
from sklearn.svm import l1_min_c

from stabl.metrics import jaccard_matrix
from stabl.utils import compute_CI, permutation_test_between_clfs
from stabl.preprocessing import LowInfoFilter, remove_low_info_samples

from scipy import stats
from scipy.stats import mannwhitneyu
from sklearn.metrics import roc_auc_score, average_precision_score, r2_score, mean_squared_error, mean_absolute_error

from stabl.pipelines_utils import save_plots, compute_scores_table

# Estimator templates. `single_omic_cv` below clones `lasso_cv`, `en_cv`,
# `logit_lasso_cv` and `logit_en_cv` for every outer fold and sets their `cv` splitter.

# --- Regression templates ---
# NOTE(review): `lasso` is not referenced by the code visible in this notebook.
lasso = Lasso(max_iter=int(1e6))
lasso_cv = LassoCV(n_alphas=50, max_iter=int(1e6), n_jobs=-1)
en_cv = ElasticNetCV(n_alphas=50, max_iter=int(1e6), n_jobs=-1, l1_ratio=.5)

# --- Classification templates ---
# L1-penalised logistic regression; C is searched over 50 log-spaced values in
# [1e-2, 1e2] and selected on ROC AUC.
logit_lasso_cv = LogisticRegressionCV(penalty="l1", solver="liblinear", Cs=np.logspace(-2, 2, 50),
                                      max_iter=int(1e6), class_weight="balanced", scoring="roc_auc",
                                      n_jobs=-1
                                      )

# Elastic-net logistic regression with a single l1_ratio of 0.5, same C grid and scoring.
logit_en_cv = LogisticRegressionCV(penalty="elasticnet", solver="saga", Cs=np.logspace(-2, 2, 50),
                                   max_iter=int(1e6), class_weight="balanced", scoring="roc_auc",
                                   n_jobs=-1, l1_ratios=[.5]
                                   )

# Unpenalised models.
# NOTE(review): `logit` and `linreg` are not referenced by the code visible in this notebook.
logit = LogisticRegression(penalty=None, class_weight="balanced", max_iter=int(1e6))
linreg = LinearRegression()

def compute_scores_table_without_STABL(
        predictions_dict,
        y,
        task_type="binary",
        selected_features_dict=None
):
    """Build the table of scores of the benchmark models (no STABL reference model).

    Each cell is formatted as "value [lower, upper]": the performance metrics are
    reported with the interval returned by `compute_CI`, the feature statistics
    with their median and interquartile range.

    Parameters
    ----------
    predictions_dict: dict
        Dictionary mapping a model name to its predictions.

    y: pd.Series
        pandas Series containing the outcomes.

    task_type: string, default="binary"
        Type of task, can either be "binary" or "regression".

    selected_features_dict: dict or None, default=None
        If provided, maps each model name to an object exposing the keys
        "Fold nb of features" and "Fold selected features". The columns
        "N features" and "CVS" (pairwise Jaccard values between folds) are then
        added to the table.

    Returns
    -------
    table_of_scores: pd.DataFrame
    """

    def _format_cell(center, bounds):
        # Common "value [low, high]" layout shared by every cell of the table.
        return f"{center:.3f} [{bounds[0]:.3f}, {bounds[1]:.3f}]"

    # (column name, metric function, scoring name understood by compute_CI)
    metric_specs = {
        "binary": [
            ("ROC AUC", roc_auc_score, "roc_auc"),
            ("Average Precision", average_precision_score, "average_precision"),
        ],
        "regression": [
            ("R2", r2_score, "r2"),
            ("RMSE", lambda truth, pred: np.sqrt(mean_squared_error(truth, pred)), "rmse"),
            ("MAE", mean_absolute_error, "mae"),
        ],
    }.get(task_type, [])

    with_features = selected_features_dict is not None

    scores_columns = [column for column, _, _ in metric_specs]
    if with_features and scores_columns:
        scores_columns = scores_columns + ["N features", "CVS"]

    table_of_scores = pd.DataFrame(data=None, columns=scores_columns)

    for model, preds in predictions_dict.items():
        # Performance metrics, each with its confidence interval.
        for column, metric, scoring in metric_specs:
            value = metric(y, preds)
            interval = compute_CI(y, preds, scoring=scoring)
            table_of_scores.loc[model, column] = _format_cell(value, interval)

        if not with_features or model == "STABL":
            continue

        # Feature-selection statistics across folds.
        fold_sizes = selected_features_dict[model]["Fold nb of features"]
        pairwise = jaccard_matrix(selected_features_dict[model]["Fold selected features"], remove_diag=False)
        # Only the strict upper triangle: each pair of folds counted once.
        pairwise_values = pairwise[np.triu_indices_from(pairwise, k=1)]

        table_of_scores.loc[model, "N features"] = _format_cell(
            np.median(fold_sizes), np.quantile(fold_sizes, [.25, .75])
        )
        table_of_scores.loc[model, "CVS"] = _format_cell(
            np.median(pairwise_values), np.quantile(pairwise_values, [.25, .75])
        )

    return table_of_scores

def single_omic_cv(
        X,
        y,
        outer_splitter,
        task_type,
        save_path,
        outer_groups=None
):
    """Benchmark Lasso, Lasso 1SE and ElasticNet on one omic with an outer CV loop.

    For every outer fold the three models are fitted on the training samples and
    evaluated on the test samples. Selected features, per-fold coefficients,
    the table of scores and the plots are written under `save_path`.

    Parameters
    ----------
    X: pd.DataFrame
        Feature matrix (samples in rows). It is used as is: no preprocessing is
        applied inside the folds.

    y: pd.Series
        Outcomes, in the same row order as `X`.

    outer_splitter: cross-validation splitter
        Object exposing `split(X, y, groups=...)` and `get_n_splits(X, y, groups=...)`.

    task_type: str
        "binary" uses the logistic models; any other value uses the regression models.

    save_path: str or Path
        Root output directory; "Training CV" and "Summary" sub-directories are created.

    outer_groups: array-like or None, default=None
        Group labels forwarded to the outer splitter.

    Returns
    -------
    predictions_dict: dict
        For each model, a pd.Series with the median prediction of every sample
        across the folds in which it was a test sample.
    """
    models = ["Lasso", "Lasso 1SE", "ElasticNet"]

    summary_res_path = Path(save_path, "Summary")
    cv_res_path = Path(save_path, "Training CV")
    os.makedirs(cv_res_path, exist_ok=True)
    os.makedirs(summary_res_path, exist_ok=True)

    # One container per model: predictions (samples x folds), selected features
    # (one list per fold) and coefficients (features x folds).
    predictions_dict = {name: pd.DataFrame(data=None, index=y.index) for name in models}
    selected_features_dict = {name: [] for name in models}
    coefficients_dict = {name: pd.DataFrame(data=None, index=X.columns) for name in models}

    def record_fold(name, fitted_model, fold_predictions, test_idx, fold_label):
        # `coef_` has shape (1, n_features) for the logistic models and (n_features,)
        # for the linear ones; flatten it so it always fits one column per fold.
        coefs = np.ravel(fitted_model.coef_)
        selected_features_dict[name].append(list(X.columns[np.flatnonzero(coefs)]))
        predictions_dict[name].loc[test_idx, fold_label] = fold_predictions
        coefficients_dict[name][fold_label] = coefs

    # Passing X, y and groups gives the number of splits for every kind of splitter
    # (LeaveOneOut needs X, LeaveOneGroupOut needs groups), whatever the container
    # type of `outer_groups`.
    n_outer_splits = outer_splitter.get_n_splits(X, y, groups=outer_groups)
    is_binary = task_type == "binary"

    for i, (train, test) in enumerate(outer_splitter.split(X, y, groups=outer_groups), start=1):
        print(f" Iteration {i} over {n_outer_splits} ".center(80, '*'), "\n")
        train_idx, test_idx = y.iloc[train].index, y.iloc[test].index
        fold_label = f"Fold n°{i}"

        print(f"{len(train_idx)} train samples, {len(test_idx)} test samples")

        y_train = y.loc[train_idx]
        X_train = X.loc[train_idx]
        X_test = X.loc[test_idx]

        # __Lasso__
        if is_binary:
            inner_splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)
            lasso_model = clone(logit_lasso_cv).set_params(cv=inner_splitter)
            predictions = lasso_model.fit(X_train, y_train).predict_proba(X_test)[:, 1]
        else:
            inner_splitter = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)
            lasso_model = clone(lasso_cv).set_params(cv=inner_splitter)
            predictions = lasso_model.fit(X_train, y_train).predict(X_test)
        record_fold("Lasso", lasso_model, predictions, test_idx, fold_label)

        # __Lasso 1SE__
        # Refit a single Lasso with a stronger penalty than the CV optimum.
        if is_binary:
            # Smaller C means stronger regularisation; fall back to the CV optimum
            # when the shifted value would be negative.
            new_best_c_corr = (
                lasso_model.C_[0]
                - lasso_model.scores_[True].std() / np.sqrt(inner_splitter.get_n_splits())
            )
            if new_best_c_corr < 0:
                best_c_corr = abs(lasso_model.C_[0])
            else:
                best_c_corr = new_best_c_corr
            model_1se = LogisticRegression(penalty='l1', solver='liblinear', C=best_c_corr,
                                           class_weight='balanced', max_iter=2_000_000)
            predictions = model_1se.fit(X_train, y_train).predict_proba(X_test)[:, 1]
        else:
            # Regression counterpart of the shift above: larger alpha means stronger
            # regularisation. Without this branch the "Lasso 1SE" results were a
            # plain copy of the "Lasso" ones.
            alpha_1se = (
                lasso_model.alpha_
                + lasso_model.mse_path_.std() / np.sqrt(inner_splitter.get_n_splits())
            )
            model_1se = Lasso(alpha=alpha_1se, max_iter=int(1e6))
            predictions = model_1se.fit(X_train, y_train).predict(X_test)
        record_fold("Lasso 1SE", model_1se, predictions, test_idx, fold_label)

        # __EN__
        if is_binary:
            en_model = clone(logit_en_cv).set_params(cv=inner_splitter)
            predictions = en_model.fit(X_train, y_train).predict_proba(X_test)[:, 1]
        else:
            en_model = clone(en_cv).set_params(cv=inner_splitter)
            predictions = en_model.fit(X_train, y_train).predict(X_test)
        record_fold("ElasticNet", en_model, predictions, test_idx, fold_label)

    # __SAVING_RESULTS__

    # Give the outcome a name without modifying the caller's Series.
    if y.name is None:
        y = y.rename("outcome")

    # Row labels of the per-fold feature tables (0-based, as in the files written so far).
    fold_index = [f"Fold {k}" for k in range(n_outer_splits)]

    formatted_features_dict = dict()
    for name in models:
        formatted_features_dict[name] = pd.DataFrame(
            data={
                "Fold selected features": selected_features_dict[name],
                "Fold nb of features": [len(el) for el in selected_features_dict[name]]
            },
            index=fold_index
        )
        formatted_features_dict[name].to_csv(Path(cv_res_path, f"Selected Features {name}.csv"))
        coefficients_dict[name].to_csv(Path(cv_res_path, f"{name} coefficients.csv"))

    # One prediction per sample: median over the folds where it was in the test set.
    predictions_dict = {name: predictions_dict[name].median(axis=1) for name in predictions_dict.keys()}

    table_of_scores = compute_scores_table_without_STABL(
        predictions_dict=predictions_dict,
        y=y,
        task_type=task_type,
        selected_features_dict=formatted_features_dict
    )

    table_of_scores.to_csv(Path(summary_res_path, "Scores training CV.csv"))
    table_of_scores.to_csv(Path(cv_res_path, "Scores training CV.csv"))

    save_plots(
        predictions_dict=predictions_dict,
        y=y,
        task_type=task_type,
        save_path=cv_res_path
    )

    return predictions_dict

In [42]:
# Alternative outer splitter, kept for reference:
#outer_splitter=RepeatedStratifiedKFold(n_repeats=10, n_splits=5, random_state=42)
outer_splitter = LeaveOneGroupOut()

ID = pd.read_csv("./Onset of Labor csv/ID.csv", index_col="ID")

# Joining on the index of X puts the group labels in the same row order as X and y.
all_data = X.join([y, ID])
Id = all_data["Id"].to_numpy(copy=True)

single_omic_cv(
    X=X,
    y=y,
    outer_splitter=outer_splitter,
    task_type='regression',
    save_path="./Results_EGA_correction/immunome_noEGA_pen_OOL/Other_models_with_LeaveOneGroupOut",
    outer_groups=Id,
)

***************************** Iteration 1 over 53 ****************************** 

147 train samples, 3 test samples


KeyboardInterrupt: 

# Univariate

In [26]:
# Single output directory for this cell. The original created
# ".../Other_models_with_LeaveOneGroupOut/Univariate" but wrote the CSV and the plots
# to ".../Other_models/Univariate", which was never created; the directory that is
# actually written to is now the one that is created.
# NOTE(review): the folder is named "immunome_EGA_pen_OOL" while X is loaded from the
# "noEGA_pen" file - confirm this is the intended location.
univariate_dir = "../Results/immunome_EGA_pen_OOL/Other_models/Univariate"
os.makedirs(univariate_dir, exist_ok=True)

# Median-impute missing values so that every feature can be correlated with y.
impute_X = SimpleImputer(strategy="median").fit_transform(X)
impute_X = pd.DataFrame(data = impute_X, index = X.index, columns = X.columns)

# Spearman correlation (and its p-value) between each feature and the outcome.
Spearmancorr = {}
features = impute_X.columns
for feature in features:
    corr, pval = spearmanr(impute_X[feature], y)
    Spearmancorr[feature] = [corr, pval]

SpearmanPvalue = pd.DataFrame(Spearmancorr).T
SpearmanPvalue.columns = ['Spearman corr', 'pvalue']
SpearmanPvalue = SpearmanPvalue.sort_values('pvalue')
SpearmanPvalue.to_csv(Path(univariate_dir, "SpearmanCorrelationsPval.csv"), index=True)

# Scatter plots of the 10 features with the smallest p-values.
scatterplot_features(
    SpearmanPvalue[:10].index,
    X,
    y,
    show_fig=False,
    export_file=True,
    path=univariate_dir)

## Univariate Analysis

We tried restricting the analysis to the last weeks, but it did not give good results, so we are doing a standard univariate analysis instead.

In [7]:
all_data = pd.read_csv("./Onset of Labor csv/immunome_noEGA_DOS_pen_OOL.csv", index_col=0)
# `remove_low_info_samples` returns the filtered frame; assign it so that the cells
# below work on the filtered data rather than on the raw table.
all_data = remove_low_info_samples(all_data, threshold=1.)
all_data

Unnamed: 0_level_0,Bcells_CREB_IFNa,Bcells_CREB_IL246,Bcells_CREB_unstim,Bcells_ERK_IL246,Bcells_ERK_unstim,Bcells_IkB_IFNa,Bcells_IkB_IL246,Bcells_IkB_unstim,Bcells_MAPKAPK2_IFNa,Bcells_MAPKAPK2_IL246,...,Tregs_STAT3_IFNa,Tregs_STAT3_IL246,Tregs_STAT3_unstim,Tregs_STAT5_IFNa,Tregs_STAT5_IL246,Tregs_STAT5_unstim,Tregs_STAT6_IFNa,Tregs_STAT6_IL246,Tregs_STAT6_unstim,DOS
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P1_26,0.0,0.0,0.0,0.0,0,0.034718,0.041471,0.025620,0.003377,-0.001789,...,2.296418,2.109044,0.107672,1.634649,2.102302,0.296542,0.598535,1.940051,0.438072,-101
P1_33,0.0,0.0,0.0,0.0,0,0.026869,0.017180,0.045330,0.010623,0.001202,...,2.232350,2.111159,0.016361,1.666357,2.086241,0.292513,0.418868,1.917416,0.566338,-51
P1_35,0.0,0.0,0.0,0.0,0,-0.015819,0.002324,0.082293,-0.013121,-0.015124,...,2.291796,2.254705,0.019508,1.672324,2.162579,0.348858,0.650719,2.164235,0.509774,-37
P100_29,0.0,0.0,0.0,0.0,0,-0.009388,-0.005600,0.097715,-0.032358,-0.021496,...,1.994880,2.000276,0.000000,1.801344,2.339593,0.115737,0.467283,1.671560,0.401472,-71
P100_37,0.0,0.0,0.0,0.0,0,0.033034,0.039467,0.110837,-0.000648,0.024808,...,2.192700,1.925426,0.000000,1.613582,2.078898,0.456924,0.436636,1.743342,0.572790,-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P98_36,0.0,0.0,0.0,0.0,0,-0.099873,-0.083251,0.169816,-0.095115,-0.074781,...,2.244208,2.186482,0.048295,1.637413,2.112697,0.493131,0.511886,1.972187,0.577122,-24
P98_38,0.0,0.0,0.0,0.0,0,-0.019887,-0.018802,0.088509,-0.009145,-0.010932,...,2.321106,2.218698,0.000000,1.715735,2.200151,0.394697,0.558892,1.941205,0.524335,-9
P99_24,0.0,0.0,0.0,0.0,0,0.000000,0.000000,0.000000,-0.007880,-0.011182,...,1.926415,2.058333,0.000000,2.023936,2.565416,0.109877,0.491894,1.712550,0.504544,-106
P99_38,0.0,0.0,0.0,0.0,0,0.004561,0.016001,0.050217,-0.003302,0.017738,...,2.145378,2.043997,0.000000,1.765265,2.362462,0.328198,0.442802,1.884497,0.606656,-7


In [8]:
os.makedirs("./Onset of Labor csv/Univariate regated OOL", exist_ok=True)

# Outcome is the "DOS" column; every other column is a feature.
y = all_data.loc[:, "DOS"]
X = all_data.drop(columns="DOS")

# Median-imputed copy of X, used only to compute the correlations.
impute_X = pd.DataFrame(
    data=SimpleImputer(strategy="median").fit_transform(X),
    index=X.index,
    columns=X.columns,
)

# feature name -> [Spearman correlation with y, p-value]
features = impute_X.columns
Spearmancorr = {feature: list(spearmanr(impute_X[feature], y)) for feature in features}

# One row per feature, ordered by increasing p-value.
SpearmanPvalue = (
    pd.DataFrame(Spearmancorr)
    .T
    .set_axis(['Spearman corr', 'pvalue'], axis=1)
    .sort_values('pvalue')
)
SpearmanPvalue.to_csv("./Onset of Labor csv/Univariate regated OOL/SpearmanCorrelationsPval.csv", index=True)

# Scatter plots of the 10 features with the smallest p-values.
scatterplot_features(
    SpearmanPvalue.index[:10],
    X,
    y,
    show_fig=False,
    export_file=True,
    path="./Onset of Labor csv/Univariate regated OOL",
)

Some features have neither a correlation coefficient nor a p-value: they contain only zero values, so they are constant and the Spearman correlation is undefined for them.

In [42]:
# Features for which no correlation / p-value was obtained in the univariate analysis.
unfitted = [
    "Bcells_ERK_unstim",
    "Bcells_p38_unstim",
    "CD4Teff_CREB_unstim",
    "CD4Teff_p38_unstim",
    "CD4Tnaive_p38_unstim",
    "CD56loCD16posNK_ERK_unstim",
    "mDCs_ERK_unstim",
    "ncMCs_ERK_unstim",
    "ncMCs_p38_unstim",
    "Tregs_p38_unstim",
]

os.makedirs("./Onset of Labor csv/Univariate unfited features", exist_ok=True)

# Export one scatter plot per listed feature against the outcome.
scatterplot_features(
    unfitted,
    X,
    y,
    show_fig=False,
    export_file=True,
    path="./Onset of Labor csv/Univariate unfited features",
)