In [None]:
import warnings
# Install a global "ignore" filter: no warning of any category is displayed after this cell runs.
# NOTE(review): this would also hide pandas deprecation / chained-assignment warnings raised by
# the data-wrangling cells further down - consider narrowing the filter while developing.
warnings.filterwarnings('ignore')

# Libraries

## Basic libraries

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
import os

from sklearn.model_selection import LeaveOneOut, RepeatedStratifiedKFold, RepeatedKFold
from sklearn.base import clone

from stabl.stabl import Stabl, plot_stabl_path, plot_fdr_graph, save_stabl_results
from stabl.preprocessing import LowInfoFilter, remove_low_info_samples

%config InlineBackend.figure_formats=['retina']

## Stabl pipelines

In [4]:
from stabl.multi_omic_pipelines import multi_omic_stabl, multi_omic_stabl_cv
from stabl.single_omic_pipelines import single_omic_stabl, single_omic_stabl_cv
from stabl.pipelines_utils import compute_features_table

# Data

## Importing dataset

In [20]:
# Load the preprocessed stroke dataset from the sample-data folder (path is relative to the notebook).
# NOTE(review): later cells read the columns 'population', 'time', 'sampleID', 'reagent',
# 'feature' and 'group', so the file appears to be in long format - confirm against the CSV.
dataset = pd.read_csv("../Sample Data/Stroke/preprocessed_HT.csv")
# Last expression of the cell: shows (n_rows, n_columns).
dataset.shape


(19320, 7)

In [28]:
cellpop = 'Bcells*'

# Rows of the selected cell population, split by time point.
is_cellpop = dataset['population'] == cellpop
data1 = dataset[is_cellpop & (dataset['time'] == 'P1')]
data2 = dataset[is_cellpop & (dataset['time'] == 'P2')]

# Rearrangement of the data: long format -> one row per sample, one column per reagent.
# Samples and reagents are both taken from the P1 frame, in order of first appearance.
cellpop_samples = data1['sampleID'].unique()   # unique: each sample is processed exactly once
cellpop_reagents = data1['reagent'].unique()   # hoisted: invariant across samples

dict_x1 = {}
dict_x2 = {}
dict_y = {}
for sample in cellpop_samples:
    # Slice once per sample so the inner loop only scans that sample's rows.
    rows1 = data1[data1['sampleID'] == sample]
    rows2 = data2[data2['sampleID'] == sample]
    dict_x1[sample] = {}
    dict_x2[sample] = {}
    for feature in cellpop_reagents:
        # .item() returns the single matching value and raises ValueError when the
        # (sample, reagent) pair matches zero or several rows; it replaces the
        # deprecated float(Series) conversion.
        value1 = rows1.loc[rows1['reagent'] == feature, 'feature'].item()
        value2 = rows2.loc[rows2['reagent'] == feature, 'feature'].item()
        dict_x1[sample][feature] = float(value1)
        dict_x2[sample][feature] = float(value2)
    # Outcome label of the sample, read from its first P1 row.
    dict_y[sample] = rows1['group'].iloc[0]

# Outer dict keys are samples, so transpose to get samples as rows.
X1 = pd.DataFrame(dict_x1).T
X2 = pd.DataFrame(dict_x2).T


  dict_x1[sample][feature] = float(data1[mask1]['feature'])
  dict_x2[sample][feature] = float(data2[mask2]['feature'])


In [47]:
# Display the P1 sample-by-reagent matrix built in the previous cell.
X1

Unnamed: 0,CREB,STAT5,p38,STAT1,STAT3,S6,IkB,NFkB,ERK,STAT6,MAPKAPK2,Frequency
1326,0.163761,0.019402,0.0,0.0,0.0,0.168335,0.38331,0.544126,0.015134,0.0,0.337807,3.491947
1330,0.129051,0.025996,0.0,0.0,0.0,0.132083,0.300183,0.509023,0.04269,0.0,0.345787,2.088344
1331,0.083876,0.0,0.0,0.0,0.0,0.104575,0.254709,0.430262,0.027357,0.0,0.31462,6.88761
1337,0.076364,0.0,0.0,0.0,0.0,0.096712,0.298459,0.521528,0.009084,0.0,0.200995,4.322059
1357,0.071108,0.0,0.0,0.0,0.0,0.112229,0.284054,0.415279,0.0,0.0,0.230635,5.422968
1377,0.147159,0.0,0.0,0.0,0.0,0.176046,0.382887,0.799698,0.059829,0.0,0.387676,5.540952
1381,0.098972,0.0,0.0,0.0,0.0,0.107852,0.253246,0.434483,0.0,0.0,0.307554,3.07903
1400,0.060437,0.0,0.0,0.0,0.0,0.108241,0.27314,0.376081,0.0,0.0,0.187076,4.309668
1414,0.753279,0.079632,1.628586,0.0,0.0,0.544961,1.196805,1.447339,0.082607,0.030337,0.86241,3.456166
1418,0.156495,0.0,0.0,0.0,0.0,0.15638,0.482679,0.79652,0.048431,0.0,0.454194,5.11677


In [None]:
# For every column, print its distinct values followed by how many there are.
for column in dataset.columns:
    distinct_values = dataset[column].unique()
    print(f"{column} values : {distinct_values}")
    print(f"card({column}) : {len(distinct_values)}")
    print()

# Test 1 : for Bcells

In [None]:
# Keep only the B-cell population. The explicit .copy() makes the result an independent
# frame, so the assignments below are guaranteed to modify `dataset` itself.
dataset = dataset[dataset['population']=='Bcells*'].copy()
# Recode the outcome label to 0/1 with a single .loc call per value. The previous chained
# form (dataset['group'][mask] = value) may write to a temporary copy and leave the
# frame unchanged.
dataset.loc[dataset['group']=='No', 'group'] = 0
dataset.loc[dataset['group']=='Yes', 'group'] = 1
#dataset

# P1

In [None]:
# Folder receiving the intermediate X / y CSV files for this analysis.
data_path = Path('./Data', './BCells_P1')
# Create the data and results folders; existing folders are left untouched.
for directory in (data_path, './Results'):
    os.makedirs(directory, exist_ok=True)

In [48]:
# Rows of the current dataset measured at time point 'P1'.
is_p1 = dataset['time'] == 'P1'
P1_dataset = dataset.loc[is_p1]

## Multivariate Analysis

### Dataset

In [None]:
# Build the P1 design matrix (one row per sample, one column per reagent) and the outcome
# vector, then write both to CSV in `data_path`.
# Sample IDs and reagents come from `dataset`, in order of first appearance. Iterating over
# the unique values avoids rebuilding the same sample once per row of the long table.
sample_ids = dataset['sampleID'].unique()
reagents = dataset['reagent'].unique()

P1_dict_x = {}
P1_dict_y = {}
for sample in sample_ids:
    # Slice once per sample so the inner loop only scans that sample's P1 rows.
    sample_rows = P1_dataset[P1_dataset['sampleID'] == sample]
    P1_dict_x[sample] = {}
    for feature in reagents:
        # .item() returns the single matching value and raises ValueError when the
        # (sample, reagent) pair matches zero or several P1 rows; it replaces the
        # deprecated float(Series) conversion.
        value = sample_rows.loc[sample_rows['reagent'] == feature, 'feature'].item()
        P1_dict_x[sample][feature] = float(value)
    # Outcome label of the sample, read from its first P1 row.
    P1_dict_y[sample] = sample_rows['group'].iloc[0]

# Outer dict keys are samples, so transpose to get samples as rows before saving.
pd.DataFrame(P1_dict_x).T.to_csv(Path(data_path, "X_P1_Bcells.csv"), index=True)
pd.DataFrame([P1_dict_y]).T.to_csv(Path(data_path, "y_P1_Bcells.csv"), index=True)

In [None]:
# Reload the matrices written by the previous cell; the first CSV column is the sample index.
X = pd.read_csv(Path(data_path, "X_P1_Bcells.csv"), index_col=0)
y_frame = pd.read_csv(Path(data_path, "y_P1_Bcells.csv"), index_col=0)
# The outcome is the only data column; take it as a Series and drop its name.
y = y_frame.iloc[:, 0]
y.name = None

### Result folder name

In [None]:
# Root folder for everything this P1 / B-cell analysis writes (read by all cells below).
result_folder = "./Results/Results P1 Bcells Kfold"

### Single-omic Training-CV

In [None]:
# Reference STABL estimator passed to the cross-validation pipeline below.
stabl = Stabl(
    random_state=42,
    sample_fraction=.5,
    replace=False,
    n_bootstraps=1000,
    lambda_grid=np.linspace(0.01, 5, 10),
    fdr_threshold_range=np.arange(0.1, 1, 0.01),
    artificial_type="random_permutation",
)

# Comparison estimator: a copy of the same configuration with a fixed hard threshold
# and artificial features switched off.
stability_selection = clone(stabl).set_params(artificial_type=None, hard_threshold=.1)

# Outer cross-validation scheme. Alternatives tried previously:
#outer_splitter = LeaveOneOut()
#outer_splitter = RepeatedStratifiedKFold(n_splits=len(X), n_repeats=20, random_state=42)
# NOTE(review): n_splits equals the number of samples, so every fold holds out a single
# sample - confirm that repeating this 10 times is intended.
outer_splitter = RepeatedKFold(n_repeats=10, n_splits=len(X), random_state=42)

In [None]:
# Run the single-omic cross-validation pipeline on the P1 B-cell data.
# `single_omic_stabl_cv` is already imported with the other stabl pipelines near the top of
# the notebook, so it is not re-imported here.
# Outputs are written under `result_folder`; the returned object is kept in
# `predictions_dict`.
predictions_dict = single_omic_stabl_cv(
    X=X,
    y=y,
    outer_splitter=outer_splitter,
    stabl=stabl,
    stability_selection=stability_selection,
    task_type="binary",   # `y` was recoded to 0/1 earlier in the notebook
    save_path=result_folder,
    outer_groups=None
)

### Tables of features

In [None]:
# Collect, for every model, the features listed in its saved coefficients file.
# NOTE(review): coefficients are looked up under "Training-Validation" while the boxplots
# are later read from "Training CV" - confirm which folder the pipeline actually writes.
selected_features_dict = dict()
for model in ["STABL", "Lasso", "Lasso 1SE", "ElasticNet", "SS 03", "SS 05", "SS 08"]:
    path = Path(result_folder, "Training-Validation", f"{model} coefficients.csv")
    try:
        # The feature names are the index (first CSV column) of the coefficients table.
        selected_features_dict[model] = list(pd.read_csv(path, index_col=0).iloc[:, 0].index)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError, IndexError):
        # Missing, unreadable, empty or column-less file: treat the model as having
        # selected no feature. Other exceptions are real errors and propagate.
        selected_features_dict[model] = []

features_table = compute_features_table(
    selected_features_dict,
    X_train=X,
    y_train=y,
    X_test=None,
    y_test=None,
    task_type="binary")

# exist_ok=True: the folder normally already exists (the coefficients were just read from
# it), and without the flag os.makedirs raises FileExistsError in exactly that case.
os.makedirs(Path(result_folder, "Training-Validation"), exist_ok=True)
features_table.to_csv(Path(result_folder, "Training-Validation", "Table of features.csv"))

## Univariate

In [None]:
from scipy.stats import spearmanr
import numpy as np

# Univariate screening: Spearman rank correlation of every feature with the outcome.
Spearmancorr = {}

features = X.columns

for feature in features:
    corr, pval = spearmanr(X[feature], y)
    Spearmancorr[feature] = [corr, pval]

# One row per feature, sorted by increasing p-value. The sort returns a new frame instead
# of mutating in place, so re-running the cell is side-effect free.
SpearmanPvalue = pd.DataFrame.from_dict(
    Spearmancorr, orient='index', columns=['Spearman corr', 'pvalue']
).sort_values('pvalue')

# Make sure the destination folder exists before writing (no-op when it already does).
os.makedirs(Path(result_folder, 'Summary'), exist_ok=True)
SpearmanPvalue.to_csv(Path(result_folder, 'Summary', 'SpearmanCorrelationsPval.csv'), index=True)

In [None]:
from stabl.visualization import boxplot_features

# Output folder for the univariate figures. exist_ok=True lets the cell be re-run: without
# it os.makedirs raises FileExistsError from the second execution onwards.
univariate_folder = Path(result_folder, 'Univariate')
os.makedirs(univariate_folder, exist_ok=True)

# Export boxplots for the 5 features with the smallest Spearman p-values
# (SpearmanPvalue is sorted by increasing p-value).
boxplot_features(
        SpearmanPvalue[:5].index,
        X,
        y,
        show_fig=False,
        export_file=True,
        path=univariate_folder)

### Rearrangement of results

In [None]:
import shutil

# Gather every model's boxplot files into the shared 'Summary' folder.
for model in ["STABL", "Lasso", "Lasso 1SE", "ElasticNet", "SS 03", "SS 05", "SS 08"]:
    src_folder = Path(result_folder, 'Training CV', model)
    dst_folder = Path(result_folder, 'Summary')

    # Only files whose name contains "Boxplot" are copied; names are kept unchanged.
    boxplot_names = [name for name in os.listdir(src_folder) if "Boxplot" in name]
    for filename in boxplot_names:
        shutil.copy(os.path.join(src_folder, filename), os.path.join(dst_folder, filename))

In [None]:
from PyPDF2 import PdfReader
import csv

def get_pvalue_from_Boxplot(model):
    """Read the U-test p-value printed on a model's median-predictions boxplot.

    Opens ``<result_folder>/Summary/<model> Boxplot of median predictions.pdf``,
    extracts the text of its first page and returns whatever follows the
    ``'U-test pvalue = '`` marker up to the end of that line.

    Parameters
    ----------
    model : str
        Model name used as the prefix of the PDF file name.

    Returns
    -------
    str
        The p-value exactly as written in the PDF (not converted to a number).

    Raises
    ------
    ValueError
        If the marker is absent from the first page. The previous implementation
        returned an unrelated slice of the page text in that case.
    """
    marker = 'U-test pvalue = '
    pdf_path = Path(result_folder, 'Summary', model + ' Boxplot of median predictions.pdf')

    # Text of the first page only: that is where the p-value is looked up.
    text = PdfReader(pdf_path).pages[0].extract_text()

    _, found, tail = text.partition(marker)
    if not found:
        raise ValueError(f"'{marker}' not found on the first page of {pdf_path}")

    # Keep the remainder of the marker's line. Splitting (instead of slicing up to
    # find('\n')) also handles a value that ends the text without a trailing newline.
    return tail.split('\n', 1)[0]

# Write a copy of the CV scores table with an extra 'U-test pvalue' column.
scores_path = Path(result_folder, 'Summary', 'Scores training CV.csv')
augmented_path = Path(result_folder, 'Summary', 'Scores training CV (2).csv')

# newline='' belongs to open(), as the csv module requires for both reading and writing.
# It was previously passed to Path(), where it has no effect on how the file is opened.
with open(scores_path, newline='') as csvfile, \
        open(augmented_path, mode='w', newline='') as new_csvfile:
    reader = csv.reader(csvfile)
    writer = csv.writer(new_csvfile)
    for i, row in enumerate(reader):
        if not row:
            # Blank line in the input: copy it through unchanged (row[0] would fail).
            writer.writerow(row)
            continue
        if i == 0:
            # Header row: name the new column.
            row.append('U-test pvalue')
        else:
            # Data row: the first cell is the model name used to locate its boxplot PDF.
            model = row[0]
            row.append(get_pvalue_from_Boxplot(model))
        writer.writerow(row)