In [2]:
import pandas as pd
import numpy as np
import os
import asgl
from pathlib import Path
from sklearn import clone

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, LinearRegression, LassoCV, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold, LeaveOneOut, LeaveOneGroupOut
from sklearn.svm import l1_min_c
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from scipy.stats import spearmanr

from stabl.pipelines_utils import compute_scores_table, save_plots
from stabl.preprocessing import LowInfoFilter
from stabl.metrics import jaccard_matrix

from stabl.stacked_generalization import stacked_multi_omic
import random

# Lasso with built-in cross-validation. The candidate alphas are 10**e for
# exponents e starting at -3 and increasing in steps of 0.05.
lasso_cv = LassoCV(
    alphas=[10.**exponent for exponent in np.arange(-3, 1.1, 0.05)],
    max_iter=int(1e6),
    n_jobs=-1,
)

# Unfitted estimator templates; the random forest is seeded for repeatability.
linreg = LinearRegression()
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Feature preprocessing, applied in this order: variance filter (threshold 0),
# LowInfoFilter, median imputation, standardisation.
preprocessing = Pipeline(steps=[
    ("variance", VarianceThreshold(0.)),
    ("lif", LowInfoFilter()),
    ("impute", SimpleImputer(strategy="median")),
    ("std", StandardScaler()),
])

## Import Data

In [89]:
# Import data
# Tables stored under ./Data/All (first CSV column is used as the row index).
X_ga = pd.read_csv('./Data/All/GA.csv', index_col=0)
X_cytof = pd.read_csv('./Data/All/Cytof.csv', index_col=0)
X_prot = pd.read_csv('./Data/All/Proteomics.csv', index_col=0)

# Same three tables from ./Data/Term.
X_ga_term = pd.read_csv('./Data/Term/GA.csv', index_col=0)
X_cytof_term = pd.read_csv('./Data/Term/Cytof.csv', index_col=0)
X_prot_term = pd.read_csv('./Data/Term/Proteomics.csv', index_col=0)

# Same three tables from ./Data/Preterm.
X_ga_preterm = pd.read_csv('./Data/Preterm/GA.csv', index_col=0)
X_cytof_preterm = pd.read_csv('./Data/Preterm/Cytof.csv', index_col=0)
X_prot_preterm = pd.read_csv('./Data/Preterm/Proteomics.csv', index_col=0)

# Layer name -> feature table; the insertion order fixes the column order of X_tot.
all_data_dict = {
    'CYTOF': X_cytof,
    'GA': X_ga,
    'Proteomics': X_prot,
}

# All layers side by side (column-wise concatenation).
X_tot = pd.concat(list(all_data_dict.values()), axis=1)

# First data column of ID.csv / DOS.csv, each as a Series.
# `groups` is later handed to the splitter as the grouping variable, `y` as the target.
groups = pd.read_csv('./Data/All/ID.csv', index_col=0).iloc[:, 0]
y = pd.read_csv('./Data/All/DOS.csv', index_col=0).iloc[:, 0]

In [133]:
# Cross-validation scheme: every fold holds out one group.
splitter = LeaveOneGroupOut()

# Prediction stores, one empty frame per model name, for the EF and LF variants.
ef_predictions_dict = {
    model_name: pd.DataFrame()
    for model_name in ("GA", "CyTOF&Proteomics", "CyTOF&Proteomics&GA")
}
lf_predictions_dict = {
    model_name: pd.DataFrame()
    for model_name in ("GA", "CyTOF&Proteomics", "CyTOF&Proteomics&GA")
}

# Per-fold coefficient and importance tables; there is no "GA" entry here.
ef_coefs = {
    model_name: pd.DataFrame()
    for model_name in ("CyTOF&Proteomics", "CyTOF&Proteomics&GA")
}
lf_coefs = {
    model_name: pd.DataFrame()
    for model_name in ("CyTOF&Proteomics", "CyTOF&Proteomics&GA")
}
lf_importances = {
    model_name: pd.DataFrame()
    for model_name in ("CyTOF&Proteomics", "CyTOF&Proteomics&GA")
}

def build_lasso_and_predict(X_train, y_train, X_test):
    """Fit a cross-validated Lasso on preprocessed features and predict the test rows.

    The module-level ``preprocessing`` pipeline is refit on ``X_train`` (so the
    shared pipeline object holds this call's fit afterwards) and then applied
    to ``X_test``. A fresh clone of ``lasso_cv`` is trained on the result.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Training target, row-aligned with ``X_train``.
    X_test : pandas.DataFrame
        Features of the rows to predict.

    Returns
    -------
    tuple
        ``(coefs, predictions)``: ``coefs`` is a DataFrame indexed by
        ``'feature'`` with one ``'coef'`` column covering the features kept by
        the preprocessing; ``predictions`` is what the fitted Lasso returns
        for the preprocessed test rows.
    """
    train_values = preprocessing.fit_transform(X_train)
    # Names of the columns that survive the filtering steps.
    kept_features = preprocessing.get_feature_names_out()

    train_frame = pd.DataFrame(train_values, index=X_train.index, columns=kept_features)
    test_frame = pd.DataFrame(
        preprocessing.transform(X_test),
        index=X_test.index,
        columns=kept_features,
    )

    model = clone(lasso_cv).fit(train_frame, y_train)

    coef_table = pd.DataFrame({'feature': model.feature_names_in_, 'coef': model.coef_})
    coef_table = coef_table.set_index('feature')
    return coef_table, model.predict(test_frame)

def build_stack_model_and_predict(train_data_dict, y_train, test_data_dict):
    """Fit one Lasso per omic layer and stack their predictions with a random forest.

    For every entry of ``train_data_dict`` the module-level ``preprocessing``
    pipeline is refit on the training table and applied to the test table of
    the same key, and a fresh clone of ``lasso_cv`` is fit on the preprocessed
    training table. The per-layer Lasso predictions, one column per layer in
    dict order, are the inputs of a fresh clone of ``rf_regressor``.

    Neither input dict is modified; preprocessed tables are kept local.

    Parameters
    ----------
    train_data_dict : dict[str, pandas.DataFrame]
        Training features per omic layer.
    y_train : array-like
        Training target, row-aligned with every training table.
    test_data_dict : dict[str, pandas.DataFrame]
        Test features per omic layer; needs every key of ``train_data_dict``.

    Returns
    -------
    tuple
        ``(coef_df, imp, predictions)``:
        ``coef_df`` is indexed by ``'feature'`` with a ``'coef'`` column holding
        the Lasso coefficients of all layers, concatenated in layer order;
        ``imp`` is indexed by ``'omic'`` with an ``'importance'`` column holding
        the random forest's feature importances, one row per layer;
        ``predictions`` is the random forest output for the test rows.

    Raises
    ------
    KeyError
        If ``test_data_dict`` lacks a key present in ``train_data_dict``.
    """
    train_preds = {}
    test_preds = {}
    features = []
    coefs = []

    for omic_name, X_train in train_data_dict.items():
        X_test = test_data_dict[omic_name]

        # Preprocess into local frames so the caller's dicts stay untouched.
        X_train_prep = pd.DataFrame(
            data=preprocessing.fit_transform(X_train),
            columns=preprocessing.get_feature_names_out(),
            index=X_train.index,
        )
        X_test_prep = pd.DataFrame(
            data=preprocessing.transform(X_test),
            columns=preprocessing.get_feature_names_out(),
            index=X_test.index,
        )

        fit_lasso = clone(lasso_cv).fit(X_train_prep, y_train)
        # NOTE(review): the stacker is trained on in-sample Lasso predictions
        # (same rows the Lasso was fit on), not on out-of-fold predictions.
        train_preds[omic_name] = pd.DataFrame(fit_lasso.predict(X_train_prep))
        test_preds[omic_name] = pd.DataFrame(fit_lasso.predict(X_test_prep))
        features.extend(fit_lasso.feature_names_in_.tolist())
        coefs.extend(fit_lasso.coef_.tolist())

    # One column per layer, in the dict's insertion order.
    train_preds_df = pd.concat(list(train_preds.values()), axis=1)
    test_preds_df = pd.concat(list(test_preds.values()), axis=1)
    stack_model = clone(rf_regressor).fit(train_preds_df, y_train)

    coef_df = pd.DataFrame({'feature': features, 'coef': coefs}).set_index('feature')
    imp = pd.DataFrame(
        {"omic": list(train_preds.keys()), "importance": stack_model.feature_importances_}
    ).set_index("omic")

    return coef_df, imp, stack_model.predict(test_preds_df)


# Omic layers used by each model. Dict order matters: it fixes the column order
# of the stacked predictions and the row order of the importance tables, and
# the models are fitted in this order within every fold.
omic_layers = {
    "CyTOF&Proteomics&GA": {'CYTOF': X_cytof, 'GA': X_ga, 'Proteomics': X_prot},
    "CyTOF&Proteomics": {'CYTOF': X_cytof, 'Proteomics': X_prot},
}

# EF design matrices do not depend on the fold, so they are built once here
# instead of being re-concatenated inside the loop.
ef_matrices = {
    "CyTOF&Proteomics&GA": X_tot,
    "CyTOF&Proteomics": pd.concat([all_data_dict["CYTOF"], all_data_dict["Proteomics"]], axis=1),
}

for i, (train_index, test_index) in enumerate(splitter.split(X_ga, y, groups)):
    print(f"Fold {i}")
    fold_label = f"Fold {i}"

    # The splitter yields integer *positions*. Select with .iloc: plain [] on a
    # Series is label-based for integer indexes and only a deprecated
    # positional fallback otherwise.
    ids = groups.iloc[test_index].index.to_list()
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    for model_name, layers in omic_layers.items():
        # EF: a single Lasso on the column-wise concatenation of the layers.
        X_train = ef_matrices[model_name].iloc[train_index]
        X_test = ef_matrices[model_name].iloc[test_index]

        coefs, preds = build_lasso_and_predict(X_train, y_train, X_test)
        # Results are appended fold by fold so that an interrupted run still
        # leaves the completed folds in the stores.
        ef_predictions_dict[model_name] = pd.concat(
            [
                ef_predictions_dict[model_name],
                pd.DataFrame({'sampleID': ids, 'pred': preds}).set_index('sampleID'),
            ],
            axis=0,
        )
        ef_coefs[model_name] = pd.concat(
            [ef_coefs[model_name], coefs.rename(columns={'coef': fold_label})],
            axis=1,
        )

        # LF: one Lasso per layer, stacked by a random forest.
        # Fresh dicts are built for every call.
        train_data_dict = {name: table.iloc[train_index] for name, table in layers.items()}
        test_data_dict = {name: table.iloc[test_index] for name, table in layers.items()}

        coefs, imp, preds = build_stack_model_and_predict(train_data_dict, y_train, test_data_dict)
        lf_predictions_dict[model_name] = pd.concat(
            [
                lf_predictions_dict[model_name],
                pd.DataFrame({'sampleID': ids, 'pred': preds}).set_index('sampleID'),
            ],
            axis=0,
        )
        lf_coefs[model_name] = pd.concat(
            [lf_coefs[model_name], coefs.rename(columns={'coef': fold_label})],
            axis=1,
        )
        lf_importances[model_name] = pd.concat(
            [lf_importances[model_name], imp.rename(columns={'importance': fold_label})],
            axis=1,
        )
    

Fold 0


['CYTOF' 'GA' 'Proteomics']
[0.0326053  0.53119046 0.43620424]
['CYTOF' 'Proteomics']
[0.06222248 0.93777752]
Fold 1
['CYTOF' 'GA' 'Proteomics']
[0.56285096 0.2948851  0.14226393]
['CYTOF' 'Proteomics']
[0.7745862 0.2254138]
Fold 2
['CYTOF' 'GA' 'Proteomics']
[0.04066454 0.03804535 0.92129011]


KeyboardInterrupt: 

In [109]:
# GA-only reference predictor, 7*(GA - 40), stored under "GA" in both stores.
# Presumably this converts weeks relative to week 40 into days - TODO confirm units.
# The expression is evaluated once per store so the two entries stay separate objects.
for _prediction_store in (ef_predictions_dict, lf_predictions_dict):
    _prediction_store["GA"] = 7*(X_ga - 40)

In [4]:
# Directory that receives the result files.
result_path = "./Results2"
# exist_ok=True tolerates an already existing directory, but still raises
# FileExistsError when the path is occupied by something that is not a
# directory (the previous try/except silently accepted that case). Missing
# parent directories are created as well.
os.makedirs(result_path, exist_ok=True)

In [129]:
# Column labels of the prediction table: EF entries follow the order of
# ef_predictions_dict (fused models get an "EF " prefix, other keys keep their
# name), followed by the two LF models.
ef_labels = {'CyTOF&Proteomics' : 'EF CyTOF&Proteomics', 'CyTOF&Proteomics&GA' : 'EF CyTOF&Proteomics&GA'}
prediction_labels = [ef_labels.get(name, name) for name in ef_predictions_dict.keys()]
prediction_labels = prediction_labels + ['LF CyTOF&Proteomics', 'LF CyTOF&Proteomics&GA']

prediction_frames = list(ef_predictions_dict.values())
prediction_frames.append(lf_predictions_dict['CyTOF&Proteomics'])
prediction_frames.append(lf_predictions_dict['CyTOF&Proteomics&GA'])

# Join all prediction frames side by side on their index, then label the columns.
all_preds_df = pd.concat(prediction_frames, axis=1)
all_preds_df.columns = prediction_labels
all_preds_df.to_csv(Path(result_path, 'predictions.csv'))

y.to_csv(Path(result_path, 'outcomes.csv'))

# One CSV per model for each of the per-fold tables.
for model_name, table in ef_coefs.items():
    table.to_csv(Path(result_path, f"EF {model_name} coefficients.csv"))

for model_name, table in lf_coefs.items():
    table.to_csv(Path(result_path, f"LF {model_name} coefficients.csv"))

for model_name, table in lf_importances.items():
    table.to_csv(Path(result_path, f"LF {model_name} omic importances.csv"))

In [8]:
# Reload the saved predictions and outcomes from the result directory.
# No column is selected from outcomes.csv, so `y` is rebound to a DataFrame here.
all_preds_df = pd.read_csv(Path(result_path) / 'predictions.csv', index_col=0)
y = pd.read_csv(Path(result_path) / 'outcomes.csv', index_col=0)

In [16]:
# `y` may be a Series or a one-column DataFrame (the latter after reloading
# outcomes.csv); work on a Series either way without rebinding `y`.
y_true = y.iloc[:, 0] if isinstance(y, pd.DataFrame) else y

# Pair predictions and outcomes by sample ID rather than by row position.
# When the two indexes already match this is a no-op; a sample missing from the
# outcomes becomes NaN and makes the metric calls below fail loudly instead of
# silently scoring misaligned rows.
if not y_true.index.equals(all_preds_df.index):
    y_true = y_true.reindex(all_preds_df.index)

# Single-feature design matrix for the regression of predictions on outcomes;
# it is the same for every column, so it is built once.
outcome_column = y_true.to_numpy().reshape(-1, 1)

scores = pd.DataFrame(index=all_preds_df.columns, columns=['rmse', 'R2', 'Spearmanr', 'pvalue'])

for col in all_preds_df.columns:
    y_preds = all_preds_df[col]

    # R2: coefficient of determination of the least-squares line that regresses
    # the predictions on the outcomes.
    lin_reg = clone(linreg).fit(outcome_column, y_preds.to_numpy())
    y_reg = lin_reg.predict(outcome_column)
    scores.loc[col, 'R2'] = r2_score(y_preds, y_reg)

    corr, pval = spearmanr(y_preds, y_true)
    # The p-value is stored as a formatted string.
    scores.loc[col, 'Spearmanr'], scores.loc[col, 'pvalue'] = corr, f'{pval:.6e}'

    scores.loc[col, 'rmse'] = np.sqrt(mean_squared_error(y_preds, y_true))

scores.to_csv(Path(result_path, 'scores.csv'))