In [1]:
import warnings
warnings.filterwarnings('ignore')

# Libraries

## Basic libraries

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.base import clone

from stabl.stabl import Stabl, plot_stabl_path, plot_fdr_graph, save_stabl_results
from stabl.preprocessing import LowInfoFilter, remove_low_info_samples

%config InlineBackend.figure_formats=['retina']

## Stabl Pipelines

In [3]:
from stabl.multi_omic_pipelines import multi_omic_stabl, multi_omic_stabl_cv
from stabl.single_omic_pipelines import single_omic_stabl, single_omic_stabl_cv
from stabl.pipelines_utils import compute_features_table

# Data

## Training data

In [4]:
# Training cohort: proteomics feature table (rows indexed by the "sampleID" column).
X_train = pd.read_csv(
    '../Sample Data/COVID-19/Training/Proteomics.csv',
    index_col="sampleID",
)
# Training labels: the first CSV column becomes the index and the first
# remaining column is kept as the target Series.
y_train = pd.read_csv(
    "../Sample Data/COVID-19/Training/Mild&ModVsSevere.csv",
    index_col=0,
).iloc[:, 0]

## Validation Data

In [5]:
# Validation cohort features; the first CSV column is used as the row index.
X_val = pd.read_csv(
    "../Sample Data/COVID-19/Validation/Validation_proteomics.csv",
    index_col=0,
)
# Validation labels: first data column of the outcome file, inverted with `~`.
# NOTE(review): presumably the column is boolean and is flipped so that its
# coding matches y_train; on an integer 0/1 column `~` would yield -1/-2 — confirm.
y_val = ~(
    pd.read_csv(
        "../Sample Data/COVID-19/Validation/Validation_outcome(WHO.0 ≥ 5).csv",
        index_col=0,
    ).iloc[:, 0]
)

In [20]:
# Quick look at the training and validation labels stacked into one Series.
# NOTE(review): the saved output of this cell reports `dtype: object`, which
# suggests the two label Series do not share a dtype — confirm.
pd.concat([y_train, y_val], axis="index")

007-0003    1
007-0005    1
007-0007    1
007-0008    0
007-0009    1
           ..
384_D0      1
384_D7      1
385_D3      1
385_D0      1
386_D0      1
Length: 852, dtype: object

# Result folder name

In [6]:
# Output directory, relative to the notebook's working directory. Later cells
# pass it as `save_path` and build result-file paths from it.
result_folder = "./Results COVID-19"

# Single-omic Training-CV

In [7]:
# Base Stabl estimator. All hyper-parameters are pinned here so the
# cross-validation cell below reuses one seeded configuration.
stabl = Stabl(
    random_state=42,
    n_bootstraps=1000,
    sample_fraction=0.5,
    replace=False,
    lambda_grid=np.linspace(0.01, 5, 10),
    fdr_threshold_range=np.arange(0.1, 1, 0.01),
    artificial_type="random_permutation",
)

# Independent copy of the estimator above with hard_threshold set to 0.1 and
# artificial_type set to None.
stability_selection = clone(stabl).set_params(hard_threshold=0.1, artificial_type=None)

# 5 stratified folds repeated 20 times, i.e. 100 train/test splits in total.
outer_splitter = RepeatedStratifiedKFold(n_repeats=20, n_splits=5, random_state=42)

In [15]:
# Run the single-omic cross-validation pipeline on the training cohort, with
# `result_folder` as the save path and no grouping of samples in the outer splits.
# NOTE(review): the saved output below ends in a KeyboardInterrupt, so this run
# looks like it was stopped before finishing — confirm before relying on it.
predictions_dict = single_omic_stabl_cv(
    task_type="binary",
    X=X_train,
    y=y_train,
    stabl=stabl,
    stability_selection=stability_selection,
    outer_splitter=outer_splitter,
    outer_groups=None,
    save_path=result_folder,
)

***************************** Iteration 1 over 100 ***************************** 

54 train samples, 14 test samples


exception calling callback for <Future at 0x7fa2ae798f40 state=finished returned list>
Traceback (most recent call last):
  File "/Users/jonasamar/Stabl/.venv/lib/python3.9/site-packages/joblib/externals/loky/_base.py", line 26, in _invoke_callbacks
    callback(self)
  File "/Users/jonasamar/Stabl/.venv/lib/python3.9/site-packages/joblib/parallel.py", line 385, in __call__
    self.parallel.dispatch_next()
  File "/Users/jonasamar/Stabl/.venv/lib/python3.9/site-packages/joblib/parallel.py", line 834, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/Users/jonasamar/Stabl/.venv/lib/python3.9/site-packages/joblib/parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/jonasamar/Stabl/.venv/lib/python3.9/site-packages/joblib/parallel.py", line 819, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/Users/jonasamar/Stabl/.venv/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 556, 

KeyboardInterrupt: 

# Tables of features

In [8]:
# For each model, collect the names of its selected features: the row labels
# of "<model> coefficients.csv" in the Training-Validation results folder.
# A model with no results file (or an empty one) contributes an empty list.
selected_features_dict = dict()
for model in ["STABL", "Lasso", "Lasso 1SE", "ElasticNet", "SS 03", "SS 05", "SS 08"]:
    path = Path(result_folder, "Training-Validation", f"{model} coefficients.csv")
    try:
        selected_features_dict[model] = list(pd.read_csv(path, index_col=0).iloc[:, 0].index)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Only "no results for this model" is tolerated. Anything else
        # (permission problems, malformed file, Ctrl-C) must surface rather
        # than being silently turned into an empty selection.
        selected_features_dict[model] = []

In [9]:
# Build the per-feature summary table for the features each model selected,
# with the training cohort as the train set and the validation cohort as the test set.
features_table = compute_features_table(
    selected_features_dict,
    task_type="binary",
    X_train=X_train,
    y_train=y_train,
    X_test=X_val,
    y_test=y_val,
)

In [12]:
import os

# Create the output folder if needed, then write the features table into it.
# exist_ok=True keeps the cell re-runnable: without it, a second execution
# (or a folder already created by the pipeline) raises FileExistsError.
os.makedirs(Path(result_folder, "Training-Validation"), exist_ok=True)
features_table.to_csv(Path(result_folder, "Training-Validation", "Table of features.csv"))

In [4]:
import csv

# Read the training-CV score summary and print it row by row.
# The path is built from `result_folder` instead of a user-specific absolute
# path, so the cell works on any machine where the notebook is run from its
# own directory (the same assumption the relative data paths above make).
scores_path = Path(result_folder, "Summary", "Scores training CV.csv")
with open(scores_path, newline='') as csvfile:
    for row in csv.reader(csvfile):
        print(row)

['', 'ROC AUC', 'Average Precision', 'N features', 'CVS']
['STABL', '0.871 [0.770, 0.951]', '0.785 [0.618, 0.936]', '7.500 [5.000, 13.250]', '0.222 [0.154, 0.294]']
['SS 03', '0.847 [0.739, 0.938] (p=0.281)', '0.755 [0.563, 0.923] (p=0.316)', '3.500 [2.000, 4.000] (p=6.237e-19)', '0.200 [0.100, 0.333] (p=5.364e-14)']
['SS 05', '0.480 [0.432, 0.500] (p=0.0)', '0.359 [0.250, 0.478] (p=0.0)', '0.000 [0.000, 1.000] (p=6.859e-36)', '0.000 [0.000, 0.000] (p=0.000e+00)']
['SS 08', '0.500 [0.500, 0.500] (p=0.0)', '0.368 [0.265, 0.485] (p=0.0)', '0.000 [0.000, 0.000] (p=5.233e-39)', '0.000 [0.000, 0.000] (p=0.000e+00)']
['Lasso', '0.855 [0.739, 0.954] (p=0.367)', '0.829 [0.675, 0.936] (p=0.33)', '21.500 [8.000, 105.250] (p=6.825e-09)', '0.105 [0.043, 0.217] (p=0.000e+00)']
['Lasso 1SE', '0.854 [0.747, 0.943] (p=0.362)', '0.823 [0.675, 0.932] (p=0.338)', '22.000 [5.000, 103.250] (p=3.913e-05)', '0.071 [0.011, 0.200] (p=0.000e+00)']
['ElasticNet', '0.859 [0.738, 0.957] (p=0.411)', '0.811 [0.644, 

In [38]:
from PyPDF2 import PdfReader

def get_pvalue_from_Boxplot(model, results_dir=None):
    """Return the U-test p-value printed on a model's median-predictions boxplot.

    Opens "<results_dir>/<model>/<model> Boxplot of median predictions.pdf",
    extracts the text of its first page and returns whatever follows the
    marker "U-test pvalue = " up to the end of that line.

    Parameters
    ----------
    model : str
        Model name; used both as the sub-folder name and as the file-name prefix.
    results_dir : str or Path, optional
        Folder containing one sub-folder per model. When omitted, the
        "Training CV" folder inside the notebook's `result_folder` is used.

    Returns
    -------
    str
        The p-value text as printed in the PDF, without surrounding whitespace.

    Raises
    ------
    ValueError
        If the first page does not contain the "U-test pvalue = " marker.
    """
    marker = 'U-test pvalue = '
    if results_dir is None:
        # Resolved at call time so the function follows the notebook's setting.
        results_dir = Path(result_folder, "Training CV")
    pdf_path = Path(results_dir, model, model + ' Boxplot of median predictions.pdf')

    reader = PdfReader(str(pdf_path))
    # Only the first page is inspected.
    text = reader.pages[0].extract_text() or ""

    _, found, tail = text.partition(marker)
    if not found:
        # str.find() would return -1 here and a meaningless slice of the page
        # text would be returned; fail loudly instead.
        raise ValueError(f"'{marker}' not found in first page of {pdf_path}")
    # Keep everything up to the end of the line. This also works when the
    # value is the very last thing on the page (no trailing newline).
    return tail.split('\n', 1)[0].strip()

In [39]:
# Modifying a csv file to add the U-test pvalue
with open('/Users/jonasamar/Stabl/Notebook examples/Results COVID-19/Summary/Scores training CV.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    with open('/Users/jonasamar/Stabl/Notebook examples/Results COVID-19/Summary/Scores training CV (2).csv', mode='w', newline='') as new_csvfile:
        writer = csv.writer(new_csvfile)
        for i, row in enumerate(reader):
            # modified values
            if i == 0:
                row.append('U-test pvalue')
            else:
                model = row[0]
                row.append(get_pvalue_from_Boxplot(model))
            writer.writerow(row)