In [8]:
import numpy as np
import pandas as pd
from scipy.stats import zscore
import os

from scipy.stats import ttest_ind, mannwhitneyu
from statsmodels.stats.multitest import fdrcorrection

In [9]:
alpha = 0.05
num_permutations = 1000

In [10]:
UOP_Celldensities =     pd.read_csv('../DataUOP/UOPfinal_celldensities.csv', index_col=0)
UOP_Function =          pd.read_csv('../DataUOP/UOPfinal_functional.csv', index_col=0)
UOP_Metavariables =     pd.read_csv('../DataUOP/UOPfinal_metavariables.csv', index_col=0)
UOP_Neighborhood =      pd.read_csv('../DataUOP/UOPfinal_neighborhood.csv', index_col=0)

UOP_data = {
    'UOP_Celldensities': UOP_Celldensities,
    'UOP_Function': UOP_Function,
    'UOP_Metavariables': UOP_Metavariables,
    'UOP_Neighborhood': UOP_Neighborhood
}

UOP_y = pd.read_csv('../DataUOP/UOPfinal_outcome.csv',index_col=0)
UOP_y = UOP_y.grade-1

In [11]:
train_means = {}
train_stds = {}
for data_name, data_frame in UOP_data.items():
    numeric_columns = data_frame.select_dtypes(include=['float64', 'int64']).columns
    train_means[data_name] = data_frame[numeric_columns].mean()
    train_stds[data_name] = data_frame[numeric_columns].std()


In [12]:
STApatient_Celldensities =     pd.read_csv('../../Patientlevel/DataStanford/STApatient_celldensities.csv', index_col=0)
STApatient_Function =          pd.read_csv('../../Patientlevel/DataStanford/STApatient_functional.csv', index_col=0)
STApatient_Metavariables =     pd.read_csv('../../Patientlevel/DataStanford/STApatient_metavariables.csv', index_col=0)
STApatient_Neighborhood =      pd.read_csv('../../Patientlevel/DataStanford/STApatient_neighborhood.csv', index_col=0)
#/Users/jeinhaus/Documents/Stabl/UOP/Patientlevel/DataStanford/STApatient_outcome.csv
STApatient_data = {
    'STA_Celldensities': STApatient_Celldensities,
    'STA_Function': STApatient_Function,
    'STA_Metavariables': STApatient_Metavariables,
    'STA_Neighborhood': STApatient_Neighborhood
}

# Normalize each data layer in STA_data using the mean and standard deviation from UOP_data
for data_name, data_frame in STApatient_data.items():
    if data_name in UOP_data:
        numeric_columns = data_frame.select_dtypes(include=['float64', 'int64']).columns
        data_frame[numeric_columns] = (data_frame[numeric_columns] - train_means[data_name]) / train_stds[data_name]


STA_y = pd.read_csv('../../Patientlevel/DataStanford/STApatient_grade_clinicaloutcomes.csv',index_col=0)
STA_grade = STA_y.grade-1
STA_death = STA_y.death
STA_cancerdeath = STA_y.cancerdeath
STA_recurrence = STA_y.recurrence

outcomes = {
        'STA_grade': STA_grade,
        'STA_death': STA_death,
        'STA_cancerdeath': STA_cancerdeath,
        'STA_recurrence': STA_recurrence,
    }

AttributeError: 'DataFrame' object has no attribute 'cancerdeath'

In [None]:
input_filenames = [
    './SS 05 coefficients Attempt01_0874.csv',
    './SS 05 coefficients Attempt02_0879.csv',
    './SS 08 coefficients Attempt04_0839.csv',
    './STABL coefficients (2).csv',
    './STABL coefficients Attempt01_0834.csv',
    './STABL coefficients Attempt02_0889.csv',
    './STABL coefficients Attempt04_0853.csv',
    './SS 08 coefficients Attempt07_0841.csv',
    './STABL coefficients Attempt06_0853.csv',
    './SS 08 coefficients Attempt04withCapping_0883.csv'
]


In [None]:
for input_filename in input_filenames:
    features = pd.read_csv(input_filename)

    #Concatenate the dataframes in STA_data
    concatenated_df = pd.concat(STApatient_data.values(), axis=1)

    # Filter the columns based on the "Feature" column of featureStabilityselection
    feature_columns = features['Feature'].tolist()
    feature_columns = feature_columns

    # Filter the columns and remove NaN values
    filtered_df = concatenated_df[feature_columns]

    # Prepare UOP_df
    concatenated_df_UOP = pd.concat(UOP_data.values(), axis=1)
    filtered_df_UOP = concatenated_df_UOP[feature_columns]

    # Modify the output filename to include the input file name
    input_filename_without_extension = os.path.splitext(os.path.basename(input_filename))[0]
    output_filename = f'Validation Patientlevel {input_filename_without_extension}.csv'

    univariate = pd.DataFrame()
    permutation_p_values = pd.DataFrame()

    for column in filtered_df_UOP.columns:
        x = filtered_df_UOP[column].dropna()
        outcome = UOP_y

        group1 = x[outcome == 0]  # Values in x corresponding to y.grade == 1
        group2 = x[outcome == 1]  # Values in x corresponding to y.grade == 2

        meandifference = group2.mean() - group1.mean()
        statistic, p_value_ttest = ttest_ind(group1, group2)
        statistic, p_value_mannwhitneyu = mannwhitneyu(group1, group2)

        univariate.loc[column, 'UOP meandifference higher-lower'] = meandifference
        univariate.loc[column, 'UOP p-value ttest'] = p_value_ttest
        univariate.loc[column, 'UOP p-value mannwhitneyu'] = p_value_mannwhitneyu

    for column in filtered_df.columns:
        x = filtered_df[column].dropna()
        univariate.loc[column, 'Feature'] = column

        for outcome_name, outcome in outcomes.items():
            group1 = x[outcome == 0]  # Values in x corresponding to outcome == 0
            group2 = x[outcome == 1]  # Values in x corresponding to outcome == 1

            meandifference = group2.mean() - group1.mean()
            statistic, p_value_ttest = ttest_ind(group1, group2)
            statistic, p_value_mannwhitneyu = mannwhitneyu(group1, group2)

            # Permutation testing
            num_permutations = 1000  # Define the number of permutations
            perm_p_values = []
            for _ in range(num_permutations):
                perm_outcome = np.random.permutation(outcome)
                perm_group1 = x[perm_outcome == 0]
                perm_group2 = x[perm_outcome == 1]

                perm_statistic, _ = ttest_ind(perm_group1, perm_group2)  # Use the same test as the original

                perm_p_values.append(perm_statistic)

            # Calculate permutation p-value
            perm_p_value = (np.abs(perm_p_values) >= np.abs(statistic)).mean()

            univariate.loc[column, f'{outcome_name} meandifference higher-lower'] = meandifference
            univariate.loc[column, f'{outcome_name} p-value ttest'] = p_value_ttest
            univariate.loc[column, f'{outcome_name} p-value mannwhitneyu'] = p_value_mannwhitneyu
            permutation_p_values.loc[column, f'{outcome_name} Permutation p-value'] = perm_p_value
            
    # Replace NaN values with 1 for FDR correction
    univariate['STA_grade p-value ttest'].fillna(1, inplace=True)
    #univariate['STA_death p-value ttest'].fillna(1, inplace=True)
    #univariate['STA_recurrence p-value ttest'].fillna(1, inplace=True)

    # Apply FDR correction
    alpha_0_05 = 0.05  # Define the desired significance level
    alpha_0_1 = 0.1  # Define the desired significance level
    fdr_adjusted_p_values_0_05 = fdrcorrection(univariate['STA_grade p-value ttest'], alpha=alpha_0_05)
    fdr_adjusted_p_values_0_1 = fdrcorrection(univariate['STA_grade p-value ttest'], alpha=alpha_0_1)
    univariate['STA_grade FDR-adjusted p-value'] = fdr_adjusted_p_values_0_05[1]
    univariate['STA_grade FDR-adjusted p-value 0.05'] = fdr_adjusted_p_values_0_05[0]
    univariate['STA_grade FDR-adjusted p-value 0.1'] = fdr_adjusted_p_values_0_1[0]

    fdr_adjusted_p_values_0_05 = fdrcorrection(univariate['STA_death p-value ttest'], alpha=alpha_0_05)
    fdr_adjusted_p_values_0_1 = fdrcorrection(univariate['STA_death p-value ttest'], alpha=alpha_0_1)
    univariate['STA_death FDR-adjusted p-value'] = fdr_adjusted_p_values_0_05[1]
    univariate['STA_death FDR-adjusted p-value 0.05'] = fdr_adjusted_p_values_0_05[0]
    univariate['STA_death FDR-adjusted p-value 0.1'] = fdr_adjusted_p_values_0_1[0]

    fdr_adjusted_p_values_0_05 = fdrcorrection(univariate['STA_recurrence p-value ttest'], alpha=alpha_0_05)
    fdr_adjusted_p_values_0_1 = fdrcorrection(univariate['STA_recurrence p-value ttest'], alpha=alpha_0_1)
    univariate['STA_recurrence FDR-adjusted p-value'] = fdr_adjusted_p_values_0_05[1]
    univariate['STA_recurrence FDR-adjusted p-value 0.05'] = fdr_adjusted_p_values_0_05[0]
    univariate['STA_recurrence FDR-adjusted p-value 0.1'] = fdr_adjusted_p_values_0_1[0]

    univariate.to_csv(output_filename)
