In [1]:
import warnings
# Install a global "ignore" filter so no warnings are shown in the cell outputs below.
# NOTE(review): this also hides deprecation / convergence warnings — confirm that is intended.
warnings.filterwarnings('ignore')

# Libraries

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import GroupShuffleSplit
from sklearn.base import clone
from sklearn.linear_model import Lasso

from stabl.stabl import Stabl, plot_stabl_path, plot_fdr_graph, save_stabl_results
from stabl.preprocessing import LowInfoFilter, remove_low_info_samples

%config InlineBackend.figure_formats=['retina']

In [3]:
from stabl.multi_omic_pipelines import multi_omic_stabl, multi_omic_stabl_cv, late_fusion_lasso_cv
from stabl.single_omic_pipelines import single_omic_stabl, single_omic_stabl_cv
from stabl.pipelines_utils import compute_features_table

# Data

## Training Data

In [6]:
# Training outcome (column "DOS") and the "Id" column that is later passed as
# the grouping labels of the outer cross-validation split.
y_train = pd.read_csv(
    "../Sample Data/Onset of Labor/Training/DOS.csv", index_col=0
)["DOS"]
patients_id = pd.read_csv(
    "../Sample Data/Onset of Labor/Training/ID.csv", index_col=0
)["Id"]

# One table per omic layer; the first CSV column becomes the row index.
cyto_train = pd.read_csv("../Sample Data/Onset of Labor/Training/CyTOF.csv", index_col=0)
prot_train = pd.read_csv("../Sample Data/Onset of Labor/Training/Proteomics.csv", index_col=0)
meta_train = pd.read_csv("../Sample Data/Onset of Labor/Training/Metabolomics.csv", index_col=0)

## Validation Data

In [5]:
# Validation cohort: only CyTOF and Proteomics tables are loaded here
# (no metabolomics validation file is read in this notebook).
cyto_test = pd.read_csv("../Sample Data/Onset of Labor/Validation/CyTOF_validation.csv", index_col=0)
prot_test = pd.read_csv("../Sample Data/Onset of Labor/Validation/Proteomics_validation.csv", index_col=0)

# Validation outcome, taken from the "DOS" column.
y_test = pd.read_csv(
    "../Sample Data/Onset of Labor/Validation/DOS_validation.csv", index_col=0
)["DOS"]

## Data dictionaries

Here we define separate data dictionaries depending on whether or not the validation data is used.

In [6]:
# Reduced training set: the two omic layers that also exist in the validation data.
train_data_dict_red = dict(CyTOF=cyto_train, Proteomics=prot_train)

# Full training set: the reduced layers followed by Metabolomics
# (key order is CyTOF, Proteomics, Metabolomics).
train_data_dict = dict(train_data_dict_red, Metabolomics=meta_train)

# Validation set, with the same keys as the reduced training dictionary.
test_data_dict = dict(CyTOF=cyto_test, Proteomics=prot_test)

# Results folder

In [7]:
# Root output directory, given relative to the current working directory;
# the pipeline calls below receive sub-folders of it as `save_path`.
result_folder = "./Results Onset of Labor"

# Multi-omic Training-CV

In [8]:
# The outcome is continuous (regression case), so the base model is redefined as a Lasso.
lasso = Lasso(max_iter=1_000_000)

# 100 random group-wise splits, 20% of the groups held out in each split.
outer_splitter = GroupShuffleSplit(n_splits=100, test_size=0.2, random_state=42)

# STABL configuration used inside the cross-validation loop.
stabl = Stabl(
    base_estimator=clone(lasso),
    lambda_name="alpha",
    lambda_grid=np.logspace(0, 2, num=10),
    n_bootstraps=300,
    artificial_type="random_permutation",
    artificial_proportion=1.0,
    replace=False,
    sample_fraction=0.5,
    fdr_threshold_range=np.arange(0.1, 1, 0.01),
    random_state=42,
)

# Same settings, but with no artificial features and a fixed threshold of 0.3.
# NOTE(review): presumably this is the plain stability-selection baseline — confirm.
stability_selection = clone(stabl).set_params(artificial_type=None, hard_threshold=0.3)

In [1]:
# Cross-validated multi-omic run on the three training layers; samples are
# grouped by `patients_id` in the outer splitter, and results are written
# under "<result_folder>/Cytof_Prot_Meta".
predictions_dict = multi_omic_stabl_cv(
    data_dict=train_data_dict,
    y=y_train,
    outer_splitter=outer_splitter,
    outer_groups=patients_id,
    stabl=stabl,
    stability_selection=stability_selection,
    task_type="regression",
    save_path=Path(result_folder) / "Cytof_Prot_Meta",
)

# Multi-omic Training (Cytof+Prot+Meta)

In [10]:
# STABL configuration for the training run on all layers:
# 30 log-spaced alpha values in [1, 100] and 1000 bootstraps.
stabl_multi = Stabl(
    base_estimator=lasso,
    lambda_name="alpha",
    lambda_grid=np.logspace(0, 2, num=30),
    n_bootstraps=1000,
    artificial_type="random_permutation",
    artificial_proportion=1.0,
    replace=False,
    sample_fraction=0.5,
    fdr_threshold_range=np.arange(0.2, 1, 0.01),
    random_state=42,
)

# Copy of the configuration above with no artificial features and a fixed threshold of 0.1.
stability_selection = clone(stabl_multi).set_params(artificial_type=None, hard_threshold=0.1)

In [11]:
# Training-only run on CyTOF + Proteomics + Metabolomics (no test data is passed).
predictions_dict = multi_omic_stabl(
    data_dict=train_data_dict,
    y=y_train,
    stabl=stabl_multi,
    stability_selection=stability_selection,
    task_type="regression",
    save_path=Path(result_folder) / "Cytof_Prot_Meta",
)

                                                               18<00:00,  4.53s/it]

STABL finished on CyTOF; 2 features selected


                                                               23<00:00, 47.08s/it]

STABL finished on Proteomics; 17 features selected


                                                               14<00:00, 66.34s/it]

STABL finished on Metabolomics; 11 features selected


                                                               58<00:00, 51.38s/it]

# Multi-omic Training-Validation (Cytof+Prot)

In [12]:
# STABL configuration for the training/validation run:
# 30 log-spaced alpha values in [1, 100] and 1000 bootstraps.
stabl_multi = Stabl(
    base_estimator=lasso,
    lambda_name="alpha",
    lambda_grid=np.logspace(0, 2, num=30),
    n_bootstraps=1000,
    artificial_type="random_permutation",
    artificial_proportion=1.0,
    replace=False,
    sample_fraction=0.5,
    fdr_threshold_range=np.arange(0.2, 1, 0.01),
    random_state=42,
)

# Copy of the configuration above with no artificial features and a fixed threshold of 0.1.
stability_selection = clone(stabl_multi).set_params(artificial_type=None, hard_threshold=0.1)

In [13]:
# Train on CyTOF + Proteomics and evaluate on the validation cohort; the
# validation layers are joined column-wise into a single test matrix.
predictions_dict = multi_omic_stabl(
    data_dict=train_data_dict_red,
    y=y_train,
    stabl=stabl_multi,
    stability_selection=stability_selection,
    task_type="regression",
    save_path=Path(result_folder) / "Cytof_Prot",
    X_test=pd.concat(list(test_data_dict.values()), axis="columns"),
    y_test=y_test,
)

                                                               55<00:00,  4.85s/it]

STABL finished on CyTOF; 2 features selected


                                                               25<00:00, 45.67s/it]

STABL finished on Proteomics; 17 features selected


                                                               47<00:00,  5.53s/it]

# Late fusion Lasso

In [14]:
# Late-fusion Lasso benchmark on the three training layers, reusing the same
# outer splitter and patient grouping as the STABL cross-validation.
late_fusion_lasso_cv(
    train_data_dict=train_data_dict,
    y=y_train,
    outer_splitter=outer_splitter,
    groups=patients_id,
    task_type="regression",
    save_path=Path(result_folder) / "Cytof_Prot_Meta",
)

Omic CyTOF
Iteration 1 over 100
Iteration 2 over 100
Iteration 3 over 100
Iteration 4 over 100
Iteration 5 over 100
Iteration 6 over 100
Iteration 7 over 100
Iteration 8 over 100
Iteration 9 over 100
Iteration 10 over 100
Iteration 11 over 100
Iteration 12 over 100
Iteration 13 over 100
Iteration 14 over 100
Iteration 15 over 100
Iteration 16 over 100
Iteration 17 over 100
Iteration 18 over 100
Iteration 19 over 100
Iteration 20 over 100
Iteration 21 over 100
Iteration 22 over 100
Iteration 23 over 100
Iteration 24 over 100
Iteration 25 over 100
Iteration 26 over 100
Iteration 27 over 100
Iteration 28 over 100
Iteration 29 over 100
Iteration 30 over 100
Iteration 31 over 100
Iteration 32 over 100
Iteration 33 over 100
Iteration 34 over 100
Iteration 35 over 100
Iteration 36 over 100
Iteration 37 over 100
Iteration 38 over 100
Iteration 39 over 100
Iteration 40 over 100
Iteration 41 over 100
Iteration 42 over 100
Iteration 43 over 100
Iteration 44 over 100
Iteration 45 over 100
Iteratio

# Table of features

## Table of features for Cytof+Prot+Meta case

In [15]:
# Collect, for each model, the feature names stored as the row index of its
# saved coefficient file. A model whose file is missing or empty gets an empty
# list; any other error (bad path type, parsing problems, interrupts) is
# allowed to surface instead of being silently swallowed.
selected_features_dict = dict()
for model in ["STABL", "EF Lasso", "SS 03", "SS 05", "SS 08"]:
    path = Path(result_folder, "Cytof_Prot_Meta", "Training-Validation", f"{model} coefficients.csv")
    try:
        selected_features_dict[model] = list(pd.read_csv(path, index_col=0).index)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print(f"No coefficients found for {model} at {path}; using an empty feature list")
        selected_features_dict[model] = []

In [16]:
# Feature table for the three-layer models. No test data is passed because no
# validation metabolomics table is loaded in this notebook.
features_table = compute_features_table(
    selected_features_dict,
    X_train=pd.concat(list(train_data_dict.values()), axis="columns"),
    y_train=y_train,
    task_type="regression",
)

In [17]:
# Store the table in the same folder the coefficient files were read from.
features_table.to_csv(
    Path(result_folder) / "Cytof_Prot_Meta" / "Training-Validation" / "Table of features.csv"
)

## Table of features for Cytof+Prot case (with validation)

In [18]:
# Collect, for each model, the feature names stored as the row index of its
# saved coefficient file. A model whose file is missing or empty gets an empty
# list; any other error (bad path type, parsing problems, interrupts) is
# allowed to surface instead of being silently swallowed.
selected_features_dict = dict()
for model in ["STABL", "EF Lasso", "SS 03", "SS 05", "SS 08"]:
    path = Path(result_folder, "Cytof_Prot", "Training-Validation", f"{model} coefficients.csv")
    try:
        selected_features_dict[model] = list(pd.read_csv(path, index_col=0).index)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print(f"No coefficients found for {model} at {path}; using an empty feature list")
        selected_features_dict[model] = []

In [19]:
# Feature table for the CyTOF + Proteomics models. The training matrix is built
# from the reduced dictionary so that it holds the same omic layers as the
# models being summarised and as the validation matrix.
features_table = compute_features_table(
    selected_features_dict,
    X_train=pd.concat(train_data_dict_red.values(), axis=1),
    y_train=y_train,
    X_test=pd.concat(test_data_dict.values(), axis=1),
    y_test=y_test,
    task_type="regression"
)

In [20]:
# Store the table in the same folder the coefficient files were read from.
features_table.to_csv(
    Path(result_folder) / "Cytof_Prot" / "Training-Validation" / "Table of features.csv"
)

Spearman correlation and p-value pipeline
---

**Cyto-Prot-Meta**

In [117]:
from scipy.stats import spearmanr
import numpy as np

# Univariate screening: Spearman correlation of every feature with the outcome,
# computed on training and validation samples stacked together.
data_dict = {
    "CyTOF": pd.concat([cyto_train, cyto_test], axis=0),
    "Proteomics": pd.concat([prot_train, prot_test], axis=0),
    "Metabolomics": meta_train,
}
X_tot = pd.concat(data_dict.values(), axis="columns")
y = pd.concat([y_train, y_test], axis=0)

features = X_tot.columns

# Features and outcome are paired by position, as before.
# NOTE(review): this assumes the rows of X_tot and y are in the same order — confirm.
y_values = y.to_numpy()

Spearmancorr = {}
for feature in features:
    x_values = X_tot[feature].to_numpy()
    # Metabolomics has training rows only, so after the column-wise concat its
    # columns are NaN on the validation rows. spearmanr propagates NaN by
    # default, which would make every such feature's result NaN; use the
    # pairwise-complete observations instead.
    observed = ~(pd.isna(x_values) | pd.isna(y_values))
    if observed.sum() < 2:
        # Not enough paired observations to define a rank correlation.
        corr, pval = np.nan, np.nan
    else:
        corr, pval = spearmanr(x_values[observed], y_values[observed])
    Spearmancorr[feature] = [corr, pval]

# One row per feature, most significant first (NaN p-values are sorted last).
SpearmanPvalue = pd.DataFrame.from_dict(
    Spearmancorr, orient="index", columns=['Spearman corr', 'pvalue']
).sort_values('pvalue')

# Write under the notebook's result folder rather than a user-specific absolute path.
summary_dir = Path(result_folder, "Cytof_Prot_Meta", "Summary")
summary_dir.mkdir(parents=True, exist_ok=True)
SpearmanPvalue.to_csv(summary_dir / "SpearmanCorrelationsPval.csv", index=True)

In [126]:
from stabl.visualization import scatterplot_features

# Export scatter plots of the 10 features with the smallest Spearman p-values.
# The output folder is derived from `result_folder` rather than a
# user-specific absolute path, so the cell runs on any machine.
scatterplot_features(
    SpearmanPvalue.head(10).index,
    X_tot,
    y,
    show_fig=False,
    export_file=True,
    path=str(Path(result_folder, "Cytof_Prot_Meta", "Univariate")),
)

**Cyto-Prot**

In [128]:
from scipy.stats import spearmanr
import numpy as np

# Univariate screening restricted to CyTOF + Proteomics: Spearman correlation of
# every feature with the outcome, on training and validation samples stacked together.
data_dict = {
    "CyTOF": pd.concat([cyto_train, cyto_test], axis=0),
    "Proteomics": pd.concat([prot_train, prot_test], axis=0),
}
X_tot = pd.concat(data_dict.values(), axis="columns")
y = pd.concat([y_train, y_test], axis=0)

features = X_tot.columns

# Features and outcome are paired by position, as before.
# NOTE(review): this assumes the rows of X_tot and y are in the same order — confirm.
y_values = y.to_numpy()

Spearmancorr = {}
for feature in features:
    x_values = X_tot[feature].to_numpy()
    # spearmanr propagates NaN by default, so a single missing value would turn
    # the whole result into NaN; use the pairwise-complete observations instead.
    observed = ~(pd.isna(x_values) | pd.isna(y_values))
    if observed.sum() < 2:
        # Not enough paired observations to define a rank correlation.
        corr, pval = np.nan, np.nan
    else:
        corr, pval = spearmanr(x_values[observed], y_values[observed])
    Spearmancorr[feature] = [corr, pval]

# One row per feature, most significant first (NaN p-values are sorted last).
SpearmanPvalue = pd.DataFrame.from_dict(
    Spearmancorr, orient="index", columns=['Spearman corr', 'pvalue']
).sort_values('pvalue')

# Write under the notebook's result folder rather than a user-specific absolute path.
summary_dir = Path(result_folder, "Cytof_Prot", "Summary")
summary_dir.mkdir(parents=True, exist_ok=True)
SpearmanPvalue.to_csv(summary_dir / "SpearmanCorrelationsPval.csv", index=True)

In [129]:
from stabl.visualization import scatterplot_features

# Export scatter plots of the 10 features with the smallest Spearman p-values.
# The output folder is derived from `result_folder` rather than a
# user-specific absolute path, so the cell runs on any machine.
scatterplot_features(
    SpearmanPvalue.head(10).index,
    X_tot,
    y,
    show_fig=False,
    export_file=True,
    path=str(Path(result_folder, "Cytof_Prot", "Univariate")),
)