In [1]:
import warnings
warnings.filterwarnings('ignore')

# Libraries

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.stats import zscore
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.base import clone
from sklearn.linear_model import Lasso

from stabl.stabl import Stabl, plot_stabl_path, plot_fdr_graph, save_stabl_results, export_stabl_to_csv
from stabl.preprocessing import LowInfoFilter, remove_low_info_samples

%config InlineBackend.figure_formats=['retina']

ModuleNotFoundError: No module named 'stabl'

In [None]:
from stabl.multi_omic_pipelines import multi_omic_stabl, multi_omic_stabl_cv, late_fusion_lasso_cv
from stabl.single_omic_pipelines import single_omic_stabl, single_omic_stabl_cv
from stabl.pipelines_utils import compute_features_table

# Data

In [3]:
Val_Celldensities =     pd.read_csv('../DataValidation/Val_celldensities.csv', index_col=0)
Val_Function =          pd.read_csv('../DataValidation/Val_functional.csv', index_col=0)
Val_Metavariables =     pd.read_csv('../DataValidation/Val_metavariables.csv', index_col=0)
Val_Neighborhood =      pd.read_csv('../DataValidation/Val_neighborhood.csv', index_col=0)

val_data = {
    'Val_Celldensities': Val_Celldensities,
    'Val_Function': Val_Function,
    'Val_Metavariables': Val_Metavariables,
    'Val_Neighborhood': Val_Neighborhood
}

for data_name, data_frame in val_data.items():
    numeric_columns = data_frame.select_dtypes(include=['float64', 'int64']).columns
    val_data[data_name][numeric_columns] = val_data[data_name][numeric_columns].apply(zscore)

Val_y = pd.read_csv('../DataValidation/Val_outcome.csv',index_col=0)
Val_y['site'] = 'Stanford'
#Val_y = Val_y.grade-1

FileNotFoundError: [Errno 2] No such file or directory: '../DataValidation/Val_celldensities.csv'

In [None]:
UOP_Celldensities =     pd.read_csv('../DataTraining/UOPfinal_celldensities.csv', index_col=0)
UOP_Function =          pd.read_csv('../DataTraining/UOPfinal_functional.csv', index_col=0)
UOP_Metavariables =     pd.read_csv('../DataTraining/UOPfinal_metavariables.csv', index_col=0)
UOP_Neighborhood =      pd.read_csv('../DataTraining/UOPfinal_neighborhood.csv', index_col=0)

UOP_data = {
    'UOP_Celldensities': UOP_Celldensities,
    'UOP_Function': UOP_Function,
    'UOP_Metavariables': UOP_Metavariables,
    'UOP_Neighborhood': UOP_Neighborhood
}

for data_name, data_frame in UOP_data.items():
    numeric_columns = data_frame.select_dtypes(include=['float64', 'int64']).columns
    UOP_data[data_name][numeric_columns] = UOP_data[data_name][numeric_columns].apply(zscore)

UOP_y = pd.read_csv('../DataTraining/UOPfinal_outcome.csv',index_col=0)
UOP_y['site'] = 'UOP'

In [None]:
X_Celldensities = pd.concat([Val_Celldensities, UOP_Celldensities])
X_Function = pd.concat([Val_Function, UOP_Function])
X_Metavariables = pd.concat([Val_Metavariables, UOP_Metavariables])
X_Neighborhood = pd.concat([Val_Neighborhood, UOP_Neighborhood])
y = pd.concat([Val_y, UOP_y])
y['patient_id'] = y.index.str.split('_').str.get(0)

data = {
    'Celldensities': X_Celldensities,
    'Function': X_Function,
    'Metavariables': X_Metavariables,
    'Neighborhood': X_Neighborhood,
    'Outcome': y
}
y

Unnamed: 0,grade,site,patient_id
S01_1,1,Stanford,S01
S01_2,1,Stanford,S01
S01_3,1,Stanford,S01
S02_1,1,Stanford,S02
S02_2,1,Stanford,S02
...,...,...,...
OC27_002,1,UOP,OC27
OC27_003,1,UOP,OC27
OC28_001,1,UOP,OC28
OC28_002,1,UOP,OC28


In [None]:
unique_patients = pd.DataFrame(y['patient_id'].unique(), columns=['patient_id'])
unique_patients = unique_patients.merge(y, on='patient_id', how = 'left') 
unique_patients = unique_patients.drop_duplicates(subset=['patient_id', 'grade', 'site'])
unique_patients

Unnamed: 0,patient_id,grade,site
0,S01,1,Stanford
3,S02,1,Stanford
6,S03,2,Stanford
9,S04,2,Stanford
12,S05,2,Stanford
15,S06,2,Stanford
18,S07,2,Stanford
21,S08,1,Stanford
24,S09,2,Stanford
27,S10,2,Stanford


In [None]:
# Split the dataframe into training and validation sets
train_df, val_df = train_test_split(unique_patients, test_size=0.4, stratify=unique_patients[['site', 'grade']], random_state=1)
train_df
# Print the shapes of the training and validation sets
print("Training set shape:", train_df.shape)
print("Validation set shape:", val_df.shape)
train_indices = y[y['patient_id'].isin(train_df['patient_id'])].index
train_indices

Training set shape: (28, 3)
Validation set shape: (20, 3)


Index(['S01_1', 'S01_2', 'S01_3', 'S02_1', 'S02_2', 'S02_3', 'S03_1', 'S03_2',
       'S03_3', 'S04_1', 'S04_2', 'S04_3', 'S06_1', 'S06_2', 'S06_3', 'S08_1',
       'S08_2', 'S08_3', 'S09_1', 'S09_2', 'S09_3', 'S11_1', 'S11_2', 'S11_3',
       'S12_1', 'S12_2', 'S12_3', 'S13_1', 'S13_2', 'S13_3', 'S15_1', 'S15_2',
       'S15_3', 'S16_1', 'S16_2', 'S16_3', 'S18_1', 'S18_2', 'S18_3', 'S23_1',
       'S23_2', 'S23_3', 'OC01_001', 'OC01_002', 'OC01_003', 'OC04_001',
       'OC04_002', 'OC04_003', 'OC05_001', 'OC05_002', 'OC05_003', 'OC07_001',
       'OC07_002', 'OC07_003', 'OC08_001', 'OC08_002', 'OC08_003', 'OC09_001',
       'OC09_002', 'OC09_003', 'OC11_001', 'OC11_002', 'OC11_003', 'OC12_001',
       'OC12_002', 'OC12_003', 'OC17_001', 'OC17_002', 'OC17_003', 'OC18_001',
       'OC18_002', 'OC18_003', 'OC19_001', 'OC19_002', 'OC19_003', 'OC21_001',
       'OC21_002', 'OC21_003', 'OC26_001', 'OC26_002', 'OC26_003', 'OC27_001',
       'OC27_002', 'OC27_003'],
      dtype='object')

In [None]:
# Split each dataframe in the data dictionary into train and test
train_data_dict = {}
test_data_dict = {}

for key, df in data.items():
    train_df = df.loc[train_indices]  # Select rows from the dataframe based on train indices
    test_df = df.drop(train_indices)  # Drop rows from the dataframe based on train indices
    
    train_data_dict[key] = train_df
    test_data_dict[key] = test_df


train_outcome = train_data_dict.pop('Outcome')
train_outcome = train_outcome.grade-1
test_outcome = test_data_dict.pop('Outcome')
test_outcome = test_outcome.grade-1

In [None]:
from scipy.stats import mannwhitneyu

# Perform Mann-Whitney U test for each numeric variable in the train_data_dict
train_results = {}
for key, df in train_data_dict.items():
    train_results[key] = {}
    for column in df.select_dtypes(include=['float64', 'int64']):
        x = df[column].dropna()  # Drop NaN values from df[column]
        y = train_outcome.loc[df[column].dropna().index]  # Match indices of train_outcome with non-NaN values in df[column]
        statistic, p_value = mannwhitneyu(x, y)
        train_results[key][column] = {'Statistic': statistic, 'p-value': p_value}

# Perform Mann-Whitney U test for each numeric variable in the test_data_dict
test_results = {}
for key, df in test_data_dict.items():
    test_results[key] = {}
    for column in df.select_dtypes(include=['float64', 'int64']):
        df[column] = df[column].fillna(df[column].mean())
        x = df[column].dropna()  # Drop NaN values from df[column]
        y = test_outcome.loc[df[column].dropna().index]  # Match indices of test_outcome with non-NaN values in df[column]
        statistic, p_value = mannwhitneyu(x, y)
        test_results[key][column] = {'Statistic': statistic, 'p-value': p_value}

# Create dataframes for train and test results
train_results_df = pd.DataFrame(train_results)
test_results_df = pd.DataFrame(test_results)

# Export train results to CSV
train_results_df.to_csv('train_mannwhitneyu_results.csv')

# Export test results to CSV
test_results_df.to_csv('test_mannwhitneyu_results.csv')

In [None]:
from scipy.stats import mannwhitneyu

# Function to perform Mann-Whitney U test and return p-value
def perform_mannwhitneyu(data1, data2):
    _, p_value = mannwhitneyu(data1, data2)
    return p_value

# Initialize a list to store the results
univariate = []

# Iterate over the variables (columns) in the training data
for column in train_data_dict[key].columns:
    
    # Perform Mann-Whitney U test on the variable and 'grade' column of train_outcome
    p_value = perform_mannwhitneyu(train_data_dict[key][column], np.array(train_outcome.grade))
    
    # Add the results to the list
    univariate.append([column, p_value])

# Create a DataFrame from the results list
univariate_df = pd.DataFrame(univariate, columns=['Variable', 'P-value'])

# Save the results to a CSV file
#univariate_df.to_csv('mannwhitneyu_results.csv', index=False)


KeyError: 'Outcome'

# Results folder

In [None]:
result_folder = "./RS_MC_RP"

# Main script

In [None]:
for omic_name, X_omic in train_data_dict.items():
    X_omic = remove_low_info_samples(X_omic)
    train_data_dict[omic_name] = X_omic

In [None]:
stabl = Stabl(
    lambda_name='C',
    lambda_grid=np.linspace(0.01, 5, 10),
    n_bootstraps=500,
    artificial_type="random_permutation",
    artificial_proportion=1.,
    replace=False,
    fdr_threshold_range=np.arange(0.2, 1, 0.01),
    sample_fraction=.7,
    random_state=111
 )

outer_splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=20, random_state=1)

stability_selection = clone(stabl).set_params(artificial_type=None, hard_threshold=0.5)

# Multi-omic Training-CV

In [None]:
np.random.seed(111)

In [None]:
predictions_dict = multi_omic_stabl_cv(
    data_dict=train_data_dict,
    y=train_outcome,
    outer_splitter=outer_splitter,
    stabl=stabl,
    stability_selection=stability_selection,
    task_type="binary",
    save_path=Path(result_folder)
)

# Multiomic Training to derive coefficients

In [None]:
np.random.seed(111)

In [None]:
stabl_multi = Stabl(
    lambda_grid=np.linspace(0.01, 5, 30),
    n_bootstraps=5000,
    artificial_proportion=1.,
    artificial_type="random_permutation",
    hard_threshold=None,
    replace=False,
    fdr_threshold_range=np.arange(0.2, 1, 0.01),
    sample_fraction=.7,
    random_state=111
)

stability_selection = clone(stabl_multi).set_params(artificial_type=None, hard_threshold=.3)

In [None]:
predictions_dict = multi_omic_stabl(
    data_dict=train_data_dict,
    y=train_outcome,
    stabl=stabl_multi,
    stability_selection=stability_selection,
    task_type="binary",
    save_path=Path(result_folder),
    X_test=pd.concat(test_data_dict.values(),axis=1),
    y_test=test_outcome
)

# Late fusion lasso

In [None]:
late_fusion_lasso_cv(
    train_data_dict=train_data_dict,
    y=train_outcome,
    outer_splitter=outer_splitter,
    task_type="binary",
    save_path=result_folder,
    groups=None
)

# Features Table

In [None]:
selected_features_dict = dict()
for model in ["STABL", "EF Lasso", "SS 03", "SS 05", "SS 08"]:
    path = Path(result_folder, "Training-Validation", f"{model} coefficients.csv")
    try:
        selected_features_dict[model] = list(pd.read_csv(path, index_col=0).iloc[:, 0].index)
    except:
        selected_features_dict[model] = []

In [None]:
features_table = compute_features_table(
    selected_features_dict,
    X_train=pd.concat(train_data_dict.values(), axis=1),
    y_train=train_outcome,
    task_type="binary"
)

In [None]:
features_table.to_csv(Path(result_folder, "Training-Validation", "Table of features.csv"))