In [4]:
import warnings 
warnings.filterwarnings('ignore')

# Libraries

In [5]:
#Basic libraries
import os
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import GroupShuffleSplit, RepeatedStratifiedKFold, LeaveOneOut
from sklearn.base import clone
from scipy.stats import mannwhitneyu, spearmanr
from stabl.visualization import scatterplot_features, boxplot_features
from stabl.stabl import Stabl, save_stabl_results

from sklearn.linear_model import LassoCV, LogisticRegressionCV, LogisticRegression, LinearRegression, ElasticNetCV, Lasso

#STABL pipelines
from stabl.multi_omic_pipelines import multi_omic_stabl, multi_omic_stabl_cv
from stabl.single_omic_pipelines import single_omic_stabl, single_omic_stabl_cv

#Preprocessing functions
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from stabl.preprocessing import LowInfoFilter, remove_low_info_samples


# Import Data

In [7]:
# Load the two immunome feature tables, both indexed by their "ID" column.
X_EGA_pen = pd.read_csv(
    'Onset of Labor csv/immunome_EGA_pen_OOL.csv',
    index_col="ID",
)
X_EGA = pd.read_csv(
    'Onset of Labor csv/immunome_EGA_OOL.csv',
    index_col="ID",
)

# Feature table analysed in this run. data_name is used to build the results
# paths and has to be kept in sync with the table assigned to X by hand.
X, data_name = X_EGA_pen, "immunome_EGA_pen_OOL"

# Outcome: first column of the outcome table, as a Series indexed by "ID".
outcome_table = pd.read_csv('./Onset of Labor csv/outcome_OOL.csv', index_col="ID")
y = outcome_table.iloc[:, 0]

# Preprocessing

In [None]:
# The filtered table is the function's return value; calling it without
# assigning the result leaves X unchanged, so the filtering was silently lost.
# NOTE(review): assumes remove_low_info_samples returns a new DataFrame and
# does not modify X in place — confirm against the installed stabl version.
X = remove_low_info_samples(X)

# Keep the outcome restricted to the samples that are still present in X
# (boolean mask: preserves the original order of y).
y = y[y.index.isin(X.index)]

# Show the resulting dimensions instead of dumping the whole table.
X.shape

In [None]:
# Preprocessing pipeline (fitted later on X before the final Stabl fit).
# Steps run in the order listed: low-information filter, variance filter,
# median imputation, standardisation.
# NOTE(review): run_name contains "V0.1" while the variance threshold here is
# 0.01 — confirm which value is intended.
preprocessing_steps = [
    ('lif', LowInfoFilter(0.2)),
    ('variance', VarianceThreshold(0.01)),
    ('impute', SimpleImputer(strategy='median')),
    ('std', StandardScaler()),
]
preprocessing = Pipeline(steps=preprocessing_steps)

# Training CV

In [None]:
# Label of this run; it is interpolated into the results paths built in the
# cells below.
# NOTE(review): the label reads "Lasso…V0.1" whereas the notebook configures an
# L1-penalised LogisticRegression and VarianceThreshold(0.01) — confirm the
# label still describes the run.
run_name = "LassoKF_L0.2_V0.1_B0.5"

In [14]:
# Base estimator handed to Stabl: L1-penalised logistic regression.
# NOTE(review): the variable name suggests elastic net ("en"), but penalty='l1'
# is a pure L1 penalty.
# NOTE(review): this is a classifier, while the later cells call the Stabl
# helpers with task_type='regression' — confirm which task is intended.
logit_en = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    class_weight='balanced',
    max_iter=int(1e6),
)

# Stabl configuration, collected in one mapping and expanded into the constructor.
stabl_params = dict(
    base_estimator=clone(logit_en),
    lambda_name='C',
    lambda_grid=list(np.linspace(0.01, 1, 30)),
    n_bootstraps=1000,
    artificial_type='knockoff',
    artificial_proportion=1.,
    sample_fraction=0.5,
    replace=False,
    fdr_threshold_range=list(np.arange(0., 1., .01)),
    sample_weight_bootstrap=None,
    bootstrap_threshold=1e-5,
    backend_multi='threading',
    verbose=0,
    n_jobs=-1,
    random_state=42,
)
stabl = Stabl(**stabl_params)

# Comparison model: a copy of the Stabl configuration with artificial_type=None
# and hard_threshold=0.3.
stability_selection = clone(stabl)
stability_selection.set_params(artificial_type=None, hard_threshold=0.3)

In [None]:
# Outer cross-validation: stratified 5-fold, repeated 10 times, fixed seed.
outer_splitter = RepeatedStratifiedKFold(n_repeats=10, n_splits=5, random_state=42)

# Results folder of this run and the integer-cast outcome passed to the CV helper.
cv_save_path = f"../Results/{data_name}/{run_name}"
y_int = y.astype(int)

# NOTE(review): the outcome is cast to int and the folds are stratified (a
# classification-style setup) while task_type is 'regression' — confirm that
# this combination is intended.
single_omic_stabl_cv(
    X=X,
    y=y_int,
    outer_splitter=outer_splitter,
    stabl=stabl,
    stability_selection=stability_selection,
    task_type='regression',
    save_path=cv_save_path,
)

# Univariate

In [None]:
# Single folder for every univariate output (CSV table and scatter plots).
# The plots used to be exported to "../{run_name}/Results/Univariate", which is
# not the folder created here and used for the CSV; both now share one path.
univariate_dir = f"../Results/{data_name}/{run_name}" + '/Univariate'
os.makedirs(univariate_dir, exist_ok=True)

# Spearman rank correlation (and its p-value) of every feature against the outcome.
# NOTE(review): X is the raw, un-imputed table; with spearmanr's default
# nan_policy='propagate' any feature containing missing values yields NaN.
Spearmancorr = {}
features = X.columns
for feature in features:
    corr, pval = spearmanr(X[feature], y)
    Spearmancorr[feature] = [corr, pval]

# One row per feature, most significant first (sort_values returns a new frame,
# so no in-place mutation is needed).
SpearmanPvalue = pd.DataFrame.from_dict(
    Spearmancorr,
    orient='index',
    columns=['Spearman corr', 'pvalue'],
).sort_values('pvalue')
SpearmanPvalue.to_csv(f"{univariate_dir}/SpearmanCorrelationsPval.csv", index=True)

# Scatter plots of the ten features with the smallest p-values, exported to
# the same folder as the CSV.
scatterplot_features(
    SpearmanPvalue.index[:10],
    X,
    y,
    show_fig=False,
    export_file=True,
    path=univariate_dir)

# Final STABL

In [None]:
# Fit the preprocessing pipeline on the full feature table and wrap the result
# in a DataFrame labelled with the original sample index and the names of the
# features that remain after preprocessing.
X_std_values = preprocessing.fit_transform(X)
X_STD = pd.DataFrame(
    X_std_values,
    index=X.index,
    columns=preprocessing.get_feature_names_out(),
)

# Fresh copy of the configured Stabl model, fitted on the preprocessed data.
# NOTE(review): y is passed as-is here, whereas the CV cell uses y.astype(int)
# — confirm the two are meant to differ.
finalstabl = clone(stabl)
finalstabl.fit(X_STD, y)

# Save the results of the final model under this run's results folder.
final_stabl_dir = f"../Results/{data_name}/{run_name}"+'/FinalSTABL/'
save_stabl_results(finalstabl, final_stabl_dir, X_STD, y, task_type='regression')