In [4]:
import warnings 
warnings.filterwarnings('ignore')

# Libraries

In [1]:
#Basic libraries
import os
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import GroupShuffleSplit, RepeatedStratifiedKFold, LeaveOneOut
from sklearn.base import clone
from scipy.stats import mannwhitneyu, spearmanr
from stabl.visualization import scatterplot_features, boxplot_features
from stabl.stabl import Stabl, save_stabl_results

from sklearn.linear_model import LassoCV, LogisticRegressionCV, LogisticRegression, LinearRegression, ElasticNetCV, Lasso

#STABL pipelines
from stabl.multi_omic_pipelines import multi_omic_stabl, multi_omic_stabl_cv
from stabl.single_omic_pipelines import single_omic_stabl, single_omic_stabl_cv

#Preprocessing functions
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from stabl.preprocessing import LowInfoFilter, remove_low_info_samples


# Import Data

In [31]:
X_EGA_pen = pd.read_csv('Onset of Labor csv/immunome_EGA_pen_OOL.csv',index_col="ID")
X_EGA = pd.read_csv('Onset of Labor csv/immunome_EGA_OOL.csv',index_col="ID")

X = X_EGA_pen
data_name = "immunome_EGA_pen_OOL"

y = pd.read_csv('./Onset of Labor csv/outcome_OOL.csv',index_col="ID").iloc[:,0]
EGA_error = pd.read_csv('./Onset of Labor csv/EGA_error.csv',index_col="ID").iloc[:,0]

# Preprocessing

In [28]:
remove_low_info_samples(X, threshold=1.)

Unnamed: 0_level_0,Bcells_149Sm_CREB_IFNa,Bcells_149Sm_CREB_IL246,Bcells_149Sm_CREB_unstim,Bcells_150Nd_STAT5_IFNa,Bcells_150Nd_STAT5_IL246,Bcells_150Nd_STAT5_unstim,Bcells_151Eu_p38_unstim,Bcells_153Eu_STAT1_IFNa,Bcells_153Eu_STAT1_unstim,Bcells_154Sm_STAT3_IFNa,...,Tregs_166Er_NFkB_LPS,Tregs_166Er_NFkB_unstim,Tregs_167Er_ERK_IFNa,Tregs_167Er_ERK_IL246,Tregs_167Er_ERK_LPS,Tregs_167Er_ERK_unstim,Tregs_168Er_STAT6_IFNa,Tregs_168Er_STAT6_IL246,Tregs_168Er_STAT6_unstim,EGA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P1_26,0.0,0.0,0.0,0.859409,0.334423,0.00000,0,0.997950,0.0,1.895377,...,-0.027191,0.976451,-0.042583,-0.015911,-0.003624,0.244893,0.598535,1.940051,0.438072,26.4
P1_33,0.0,0.0,0.0,0.943593,0.364598,0.00000,0,0.640385,0.0,1.825286,...,0.076991,0.926734,-0.022876,0.030015,-0.027112,0.204665,0.418868,1.917416,0.566338,33.6
P1_35,0.0,0.0,0.0,1.001723,0.372325,0.00000,0,0.710037,0.0,1.887490,...,0.030714,1.036537,-0.001069,0.049183,-0.000579,0.207885,0.650719,2.164235,0.509774,35.6
P100_29,0.0,0.0,0.0,0.950186,0.343260,0.00000,0,0.573795,0.0,1.690015,...,0.020408,0.930944,0.054910,-0.009537,0.002500,0.169933,0.467283,1.671560,0.401472,29.4
P100_37,0.0,0.0,0.0,1.022674,0.360202,0.00000,0,0.955863,0.0,1.844373,...,-0.033676,1.035290,-0.036671,0.017481,-0.003831,0.261298,0.436636,1.743342,0.572790,37.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P98_36,0.0,0.0,0.0,1.117574,0.583635,0.02525,0,0.661128,0.0,1.899606,...,-0.081461,1.218162,-0.051101,0.006277,-0.050852,0.262577,0.511886,1.972187,0.577122,36.6
P98_38,0.0,0.0,0.0,1.044718,0.472136,0.00000,0,0.764981,0.0,1.872822,...,-0.006901,1.160724,-0.013569,0.046043,0.022717,0.223747,0.558892,1.941205,0.524335,38.7
P99_24,0.0,0.0,0.0,1.125924,0.172951,0.00000,0,0.359964,0.0,1.684167,...,0.027199,0.957135,0.001288,0.011170,-0.017698,0.253924,0.491894,1.712550,0.504544,24.6
P99_38,0.0,0.0,0.0,1.078673,0.219154,0.00000,0,0.585111,0.0,1.727729,...,0.007123,1.092816,0.018081,0.051817,0.011974,0.253305,0.442802,1.884497,0.606656,38.7


In [29]:
preprocessing = Pipeline(
	steps=[
		('lif', LowInfoFilter(0.2)),
		('variance', VarianceThreshold(0.0)),
		('impute', SimpleImputer(strategy='median')),
		('std', StandardScaler())
	])

# Training CV

In [None]:
run_name = "LassoKF_L0.2_V0.1_B0.5"

In [14]:
logit_en = LogisticRegression(penalty='l1', max_iter=int(1e6), solver='liblinear', class_weight='balanced')

stabl = Stabl(base_estimator=clone(logit_en),
	lambda_name='C',
	lambda_grid=list(np.linspace(0.01, 1, 30)),
	n_bootstraps=1000,
	artificial_type='knockoff',
	artificial_proportion=1.,
	sample_fraction=0.5,
	replace=False,
	fdr_threshold_range=list(np.arange(0., 1., .01)),
	sample_weight_bootstrap=None,
	bootstrap_threshold=1e-5,
	backend_multi='threading',
	verbose=0,
	n_jobs=-1,
	random_state=42)

stability_selection = clone(stabl).set_params(artificial_type=None, hard_threshold=0.3)

In [None]:
outer_splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)

single_omic_stabl_cv(
	X=X,
	y=y.astype(int),
	outer_splitter=outer_splitter,
	stabl=stabl,
	stability_selection=stability_selection,
	task_type='regression',
	save_path=f"../Results/{data_name}/{run_name}"
)

# Univariate

In [37]:
#os.makedirs(f"../Results/{data_name}/{run_name}" + '/Univariate', exist_ok=True)
os.makedirs(f"./Results/{data_name}" + '/Univariate', exist_ok=True)

impute_X = SimpleImputer(strategy="median").fit_transform(X)
impute_X = pd.DataFrame(data = impute_X, index = X.index, columns = X.columns)

Spearmancorr = {}
features = impute_X.columns
for feature in features:
	corr, pval = spearmanr(impute_X[feature], y)
	Spearmancorr[feature] = [corr, pval]

SpearmanPvalue = pd.DataFrame(Spearmancorr).T
SpearmanPvalue.columns = ['Spearman corr', 'pvalue']
SpearmanPvalue.sort_values('pvalue', inplace=True)
#SpearmanPvalue.to_csv(f"../Results/{data_name}/{run_name}"+'/Univariate/SpearmanCorrelationsPval.csv', index=True)
SpearmanPvalue.to_csv(f"./Results/{data_name}"+'/Univariate/SpearmanCorrelationsPval.csv', index=True)

scatterplot_features(
	SpearmanPvalue[:10].index,
	X,
	y,
	show_fig=False,
	export_file=True,
	#path=f"../{run_name}/Results/Univariate")
	path=f"./Results/{data_name}/Univariate")



# Final STABL

In [None]:
X_STD = pd.DataFrame(
	data=preprocessing.fit_transform(X),
	index=X.index,
	columns=preprocessing.get_feature_names_out()
)

finalstabl = clone(stabl)
finalstabl.fit(X_STD,y)

save_stabl_results(finalstabl,f"../Results/{data_name}/{run_name}"+'/FinalSTABL/',X_STD,y,task_type='regression')