This notebook aims to find the best ordering of targets, from a random subset of permutations

new objective: get the predictions of all chains and average them out and measure the performance -- Does an ensemble of chains lead to a clear improvement over just one chain?

In [92]:
import numpy as np
import pandas as pd
import itertools
import random

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, accuracy_score, mean_squared_error as mse, brier_score_loss
from predict_proba import Chain
import os
from scipy.stats import mode

In [29]:
def generate_permutations_with_order(variables, pairs_or_groups, num_permutations, constrained_elements=None, random_state=None):
    random.seed(random_state)
    permutations_list = [list(variables)]  # Add the original order only once as a list of strings
    
    while len(permutations_list) < num_permutations + 1: #+1 because the origin order is counted too
        perm = list(random.sample(variables, len(variables)))
        valid = True
        
        for pair_or_group in pairs_or_groups:
            idxs = [perm.index(var) for var in pair_or_group]
            if sorted(idxs) != idxs:
                valid = False
                break
        
        if valid:
            if constrained_elements:
            # Check if all constrained elements are present in the first positions of the permutation
                if all(elem in perm[:len(constrained_elements)] for elem in constrained_elements):
                    permutations_list.append(perm)
        
    
    return permutations_list

In [30]:
# TRY 2
def generate_permutations_with_order_2(variables, pairs_or_groups, num_permutations, constrained_elements=None, random_state=None):
    random.seed(random_state)
    
    # Generate all permutations of the variables
    permutations = itertools.permutations(variables)
    
    # Filter permutations according to the specified pairs or groups
    ordered_permutations = []
    for perm in permutations:
        valid = True
        for pair_or_group in pairs_or_groups:
            idxs = [perm.index(var) for var in pair_or_group]
            if sorted(idxs) != idxs:
                valid = False
                break
        if valid:
            ordered_permutations.append(perm)
    
    # If constrained elements are specified, ensure they appear at the beginning of each permutation
    if constrained_elements:
        constrained_permutations = []
        for perm in ordered_permutations:
            # Check if all constrained elements are present in the first positions of the permutation
            if all(elem in perm[:len(constrained_elements)] for elem in constrained_elements):
                constrained_permutations.append(perm)
        ordered_permutations = constrained_permutations
    
    # Select random permutations from the filtered permutations
    random_permutations = random.sample(ordered_permutations, min(num_permutations, len(ordered_permutations)))
    
    # Convert permutations to list of strings
    permutations_list = [list(variables)]  # Add the original order only once as a list of strings
    for perm in random_permutations:
        permutations_list.append(list(perm))  # Convert each permutation to a list of strings
    
    return permutations_list

In [31]:
def missingness_stratified_cv(df, N_FOLDS=5, random_state=None):
    # Add seed for reproducibility of the predictions (to get the same scores each time we run the code)
    np.random.seed(random_state)

    # Initial complete-case test fold assignment
    cv = pd.Series(np.nan, index=df.index)
    i_cc = (df.isna().sum(axis=1) == 0) # Complete cases
    cv.iloc[i_cc] = np.random.randint(low=0, high=N_FOLDS, size=i_cc.sum())

    # Go over columns from most missing to least missing
    for j in df.isna().sum().argsort()[::-1]:
        # Instances i that are not assigned yet but for which df[i,j] is observed
        i_tbf = (cv.isna()) & (~df.iloc[:,j].isna()) # to be filled
        # Fill them randomly
        cv.iloc[i_tbf] = np.random.randint(low=0, high=N_FOLDS, size=i_tbf.sum())

    return cv

In [32]:
possible_paths = [
    'C:/Users/lenne/OneDrive/Documenten/Master of Statistics and Data Science/2023-2024/Master thesis/Thesis_Sofia_Lennert/new_data',
    'C:/Users/anaso/Desktop/SOFIA MENDES/KU Leuven/Master Thesis/Thesis_Sofia_Lennert/new_data'
]

# Define file names
file = 'merged_data.csv'

# Find full paths to the CSV files
path = next((f'{path}/{file}' for path in possible_paths if os.path.exists(f'{path}/{file}')), None)

# Resulting DataFrame will have aggregated data from all four datasets based on the specific_column
pd.set_option('display.max_columns', None)

data = pd.read_csv(path)

def bin_column(value):
    if value in [0, 1, 2, 3]:
        return str(value)
    else:
        return '4+'
data['NRELAP'] = data['NRELAP'].apply(bin_column)

data

Unnamed: 0,USUBJID,AGE,SEX,RACE,CONTINENT,CESEV,CECONTRT,TOTRELAP,MHCONTRT,MHDIAGN,CARDIO,URINARY,MUSCKELET,FATIGUE,SMSTDY,NRELAP,NHPT-before,NHPT-2y,NHPT-after_2y,PASAT_2s-before,PASAT_2s-2y,PASAT_2s-after_2y,PASAT_3s-before,PASAT_3s-2y,PASAT_3s-after_2y,SDMT-before,SDMT-2y,T25FW-before,T25FW-2y,T25FW-after_2y,T-before,T-after,P-before,P-after,N-before,N-after,SLEC_before,SLEC_after,SES_after,SES_before,VAA,BDI-before,BDI-after,EDSS-before,EDSS-2y,EDSS-after_2y,KFSS1-Sensory-2y,KFSS1-Sensory-after_2y,KFSS1-Sensory-before,KFSS1-Brain-2y,KFSS1-Brain-after_2y,KFSS1-Brain-before,KFSS1-Bowel-2y,KFSS1-Bowel-after_2y,KFSS1-Bowel-before,KFSS1-Pyramidal-2y,KFSS1-Pyramidal-after_2y,KFSS1-Pyramidal-before,KFSS1-Cerebral-2y,KFSS1-Cerebral-after_2y,KFSS1-Cerebral-before,KFSS1-Visual-2y,KFSS1-Visual-after_2y,KFSS1-Visual-before,KFSS1-Cerebellar-2y,KFSS1-Cerebellar-after_2y,KFSS1-Cerebellar-before,KFSS_M-2y,KFSS_M-after_2y,KFSS_M-before,KFSS_P-2y,KFSS_P-after_2y,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,M_R36-SF12-after,P_R36-SF12-after,R36-SF12-after_Ind
0,MSOAC/0014,46.0,F,,,,,,,RRMS,0,0,0,0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,MSOAC/0016,,M,WHITE,NORTH AMERICA,,,,Y,SPMS,1,1,0,1,,0.0,,,,,,,,,,,,8.55,6.60,,0.0,0.0,,,,,,,,,,,,6.00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,MSOAC/0019,44.0,M,NON-WHITE,,,,,,PPMS,1,1,0,0,,0.0,23.65,21.30,20.15,34.5,35.5,43.0,43.5,51.0,53.0,,,6.30,6.15,5.85,0.0,0.0,0.0,0.0,,,,,,,,,,3.75,3.50,3.0,0.333333,0.166667,0.500000,0.2,0.0,0.2,0.000000,0.166667,0.083333,0.333333,0.5,0.416667,0.0,0.0,0.0,0.333333,0.0,0.333333,0.0,0.2,0.0,0.185185,0.185185,0.240741,0.166667,0.083333,0.208333,0.828571,0.772152,1.0,0.857143,0.721519,1.0
3,MSOAC/0024,60.0,M,WHITE,NORTH AMERICA,,,,,SPMS,1,1,1,1,,0.0,34.45,37.50,,55.0,54.0,,60.0,60.0,,,,4.50,5.25,,0.0,0.0,0.0,0.0,1.0,0.0,,,,,,0.031746,0.023810,4.00,3.75,,0.333333,,0.333333,0.0,,0.1,0.583333,,0.666667,0.166667,,0.250000,0.0,,0.0,0.000000,,0.083333,0.2,,0.5,0.129630,,0.240741,0.291667,,0.375000,0.885714,0.569620,1.0,0.857143,0.716216,1.0
4,MSOAC/0030,28.0,F,WHITE,EUROPE,,,,,RRMS,1,1,0,1,,0.0,16.55,17.90,,,,,58.0,60.0,,63.5,69.0,4.85,4.70,,0.0,0.0,0.0,0.0,0.0,0.0,26.0,24.0,1.25,1.25,,0.063492,0.039683,2.00,1.50,,0.166667,,0.166667,0.2,,0.2,0.166667,,0.166667,0.166667,,0.333333,0.0,,0.2,0.166667,,0.083333,0.0,,0.1,0.111111,,0.203704,0.166667,,0.125000,0.933333,0.846154,0.0,0.833333,0.730769,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,MSOAC/9986,46.0,M,WHITE,OCEANIA,,,,,RRMS,1,1,0,1,,0.0,19.35,18.95,,,,,58.0,60.0,,51.0,60.0,3.90,3.80,,0.0,0.0,0.0,0.0,0.0,0.0,36.0,35.0,1.25,1.25,,0.047619,0.063492,2.75,2.50,,0.333333,,0.166667,0.0,,0.0,0.333333,,0.250000,0.166667,,0.333333,0.0,,0.0,0.000000,,0.000000,0.0,,0.2,0.111111,,0.148148,0.166667,,0.125000,0.833333,0.730769,0.0,0.800000,0.750000,0.0
2461,MSOAC/9987,18.0,F,,,,,,,RRMS,0,0,0,0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2462,MSOAC/9995,38.0,F,,,MILD,,4.0,,RRMS,0,0,0,0,142.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2463,MSOAC/9998,40.0,F,WHITE,,,Y,2.0,Y,PPMS,0,1,0,1,79.0,1.0,23.80,22.40,22.50,21.5,30.5,33.5,31.5,39.5,40.5,,,6.15,6.00,6.20,0.0,0.0,0.0,0.0,,,,,,,,,,4.50,3.75,4.0,0.166667,0.250000,0.333333,0.4,0.6,0.6,0.166667,0.166667,0.166667,0.333333,0.5,0.500000,0.0,0.0,0.4,0.166667,0.0,0.166667,0.6,0.6,0.6,0.314815,0.351852,0.481481,0.166667,0.083333,0.166667,0.728571,0.658228,1.0,0.757143,0.594937,1.0


In [33]:
variables = ['KFSS_M-2y', 'KFSS_P-2y', 'EDSS-2y', 'T25FW-2y', 'NHPT-2y', 'P_R36-SF12-after', 'M_R36-SF12-after', 
             'SES_after', 'SLEC_after', 'KFSS_M-after_2y', 'KFSS_P-after_2y', 'EDSS-after_2y', 'NRELAP', 'CESEV']


columns_to_keep = ['AGE', 'SEX', 'RACE', 'CONTINENT', 'MHDIAGN', 'CARDIO', 'URINARY', 'MUSCKELET', 'FATIGUE', 
                    'NHPT-before', 'PASAT_2s-before', 'PASAT_3s-before', 'SDMT-before', 'T25FW-before', 'SLEC_before','SES_before',
                    'BDI-before', 'EDSS-before', 'KFSS_M-before', 'KFSS_P-before', 'M_R36-SF12-before',
                	'P_R36-SF12-before', 'R36-SF12-before_Ind', 'T-before','P-before','N-before']
# still need to change in OE dataframe the SLEC and SES so name is consistent with the others

features = data[columns_to_keep]
features

Unnamed: 0,AGE,SEX,RACE,CONTINENT,MHDIAGN,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before
0,46.0,F,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,,,
1,,M,WHITE,NORTH AMERICA,SPMS,1,1,0,1,,,,,8.55,,,,6.00,,,,,,0.0,,
2,44.0,M,NON-WHITE,,PPMS,1,1,0,0,23.65,34.5,43.5,,6.30,,,,3.75,0.240741,0.208333,0.828571,0.772152,1.0,0.0,0.0,
3,60.0,M,WHITE,NORTH AMERICA,SPMS,1,1,1,1,34.45,55.0,60.0,,4.50,,,0.031746,4.00,0.240741,0.375000,0.885714,0.569620,1.0,0.0,0.0,1.0
4,28.0,F,WHITE,EUROPE,RRMS,1,1,0,1,16.55,,58.0,63.5,4.85,26.0,1.25,0.063492,2.00,0.203704,0.125000,0.933333,0.846154,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,46.0,M,WHITE,OCEANIA,RRMS,1,1,0,1,19.35,,58.0,51.0,3.90,36.0,1.25,0.047619,2.75,0.148148,0.125000,0.833333,0.730769,0.0,0.0,0.0,0.0
2461,18.0,F,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,,,
2462,38.0,F,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,,,
2463,40.0,F,WHITE,,PPMS,0,1,0,1,23.80,21.5,31.5,,6.15,,,,4.50,0.481481,0.166667,0.728571,0.658228,1.0,0.0,0.0,


In [34]:
object_columns = features.select_dtypes(include=['object'])
features = pd.get_dummies(features, columns=object_columns.columns, dtype=int)
features.head()

Unnamed: 0,AGE,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before,SEX_F,SEX_M,RACE_NON-WHITE,RACE_WHITE,CONTINENT_ASIA,CONTINENT_EURASIA,CONTINENT_EUROPE,CONTINENT_NORTH AMERICA,CONTINENT_OCEANIA,CONTINENT_SOUTH AMERICA,MHDIAGN_PPMS,MHDIAGN_RRMS,MHDIAGN_SPMS
0,46.0,0,0,0,0,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,1,0
1,,1,1,0,1,,,,,8.55,,,,6.0,,,,,,0.0,,,0,1,0,1,0,0,0,1,0,0,0,0,1
2,44.0,1,1,0,0,23.65,34.5,43.5,,6.3,,,,3.75,0.240741,0.208333,0.828571,0.772152,1.0,0.0,0.0,,0,1,1,0,0,0,0,0,0,0,1,0,0
3,60.0,1,1,1,1,34.45,55.0,60.0,,4.5,,,0.031746,4.0,0.240741,0.375,0.885714,0.56962,1.0,0.0,0.0,1.0,0,1,0,1,0,0,0,1,0,0,0,0,1
4,28.0,1,1,0,1,16.55,,58.0,63.5,4.85,26.0,1.25,0.063492,2.0,0.203704,0.125,0.933333,0.846154,0.0,0.0,0.0,0.0,1,0,0,1,0,0,1,0,0,0,0,1,0


In [35]:
#variables = ['KFSS_M-2y', 'KFSS_P-2y', 'EDSS-2y', 'T25FW-2y', 'NHPT-2y', 'P_R36-SF12-after', 
             #'M_R36-SF12-after', 'SES_after', 'EDSS-after_2y', 'NRELAP', 'CESEV']

pairs_or_groups = [['KFSS_M-2y', 'EDSS-2y'], ['KFSS_P-2y', 'EDSS-2y'], ['KFSS_M-after_2y', 'EDSS-after_2y'], ['KFSS_P-after_2y', 'EDSS-after_2y']]  # Specify the pairs or groups
order_constraint = ['KFSS_M-2y', 'KFSS_P-2y', 'EDSS-2y', 'T25FW-2y', 'NHPT-2y']
num_permutations = 3  # Specify how many random permutations you want
random_state = 42

random_permutations = generate_permutations_with_order(variables, pairs_or_groups, num_permutations, order_constraint, random_state)

# Print the original order followed by all the random permutations
for idx, perm in enumerate(random_permutations, start=0):
    print(f"Permutation {idx}: {', '.join(perm)}")

Permutation 0: KFSS_M-2y, KFSS_P-2y, EDSS-2y, T25FW-2y, NHPT-2y, P_R36-SF12-after, M_R36-SF12-after, SES_after, SLEC_after, KFSS_M-after_2y, KFSS_P-after_2y, EDSS-after_2y, NRELAP, CESEV
Permutation 1: T25FW-2y, KFSS_M-2y, NHPT-2y, KFSS_P-2y, EDSS-2y, NRELAP, KFSS_M-after_2y, CESEV, SLEC_after, M_R36-SF12-after, P_R36-SF12-after, KFSS_P-after_2y, EDSS-after_2y, SES_after
Permutation 2: NHPT-2y, KFSS_P-2y, KFSS_M-2y, T25FW-2y, EDSS-2y, M_R36-SF12-after, SES_after, CESEV, SLEC_after, KFSS_P-after_2y, P_R36-SF12-after, KFSS_M-after_2y, NRELAP, EDSS-after_2y
Permutation 3: NHPT-2y, KFSS_P-2y, KFSS_M-2y, T25FW-2y, EDSS-2y, KFSS_P-after_2y, CESEV, M_R36-SF12-after, NRELAP, P_R36-SF12-after, SES_after, SLEC_after, KFSS_M-after_2y, EDSS-after_2y


In [36]:
ordered_targets = random_permutations[0]

In [37]:
# Set random state for reproducibility
random_state = 42
N_FOLDS = 5

In [38]:
# Generate CV folds
cv=missingness_stratified_cv(features, N_FOLDS, random_state)
cv = cv.to_frame(name="CV Fold")

Is it a problem that not all folds have the exact same number?

---

## Chain with *predicted* values propagated

In [13]:
y_pred_chains = []
y_test_list = [[] for _ in range(N_FOLDS)]  # Initialize y_test_list with empty lists for each fold index

# Iterate over each chain ordering
for ordered_targets_chain in random_permutations:
    y_pred_list_chain = []  # List to store predictions for this chain
    
    features_cv = pd.merge(features, pd.DataFrame(cv), left_index=True, right_index=True)
    targets_cv = pd.merge(data[ordered_targets_chain], pd.DataFrame(cv), left_index=True, right_index=True)

    # Fit and predict for each fold for this chain
    for i in range(0, N_FOLDS): 
        Xi_train = features_cv[features_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
        Xi_test = features_cv[features_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)
        yi_train = targets_cv[targets_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
        yi_test = targets_cv[targets_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)
        y_test_list_chain = []  # List to store y_test for this fold

        chain = Chain(
            model_reg=RandomForestRegressor(random_state=random_state),
            model_clf=RandomForestClassifier(random_state=random_state),
            propagate="pred",
        )
        chain.fit(Xi_train, yi_train, target_types=None)
        y_pred = chain.predict(Xi_test)
        y_pred_list_chain.append(pd.DataFrame(y_pred, columns=yi_test.columns, index=yi_test.index))

        # Append yi_test to the corresponding fold index in y_test_list
        y_test_list[i].append(yi_test)  # Append yi_test for fold i

    y_pred_chains.append(y_pred_list_chain)
    print("Permutation done")

Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done


In [14]:
#y_pred_chains[1][0]

In [15]:
#y_test_list[2][0]

In [16]:
transposed_list = list(zip(*y_pred_chains))

# Each element of transposed_list is a tuple containing dataframes from the same position in each inner list
# Now, create a list of lists from these tuples
reorganized_list = [list(df_tuple) for df_tuple in transposed_list]

In [17]:
#reorganized_list[0][1]

In [18]:
transposed_test_list = list(zip(*y_test_list))

# Each element of transposed_list is a tuple containing dataframes from the same position in each inner list
# Now, create a list of lists from these tuples
reorganized_test_list = [list(df_tuple) for df_tuple in transposed_test_list]

In [19]:
#reorganized_test_list[0][3]

In [70]:
# Define a function to reorder columns of dataframes
def reorder_columns(dataframes):
    # Get the column order from the first dataframe
    column_order = dataframes[0].columns
    # Reorder columns for each dataframe in the list
    reordered_dataframes = [df[column_order] for df in dataframes]
    return reordered_dataframes

# Apply the function to each list in reorganized_list
reordered_reorganized_list = [reorder_columns(dataframes) for dataframes in reorganized_list]

In [71]:
#reordered_reorganized_list[0][0]

In [72]:
def average_dataframes(dataframes):
    # Concatenate dataframes within the list
    concatenated_df = pd.concat(dataframes)
    # Group by index and calculate the mode for object columns and mean for other types
    averaged_df = concatenated_df.groupby(concatenated_df.index).agg(lambda x: x.mode()[0] if x.dtype == 'O' else x.mean())
    return averaged_df

# Apply the function to each list in reordered_reorganized_list
averaged_dataframes_list = [average_dataframes(dataframes) for dataframes in reordered_reorganized_list]

In [23]:
#averaged_dataframes_list[2]

In [24]:
y_pred_list = averaged_dataframes_list.copy()
y_test_list = reorganized_test_list[0]

y_test_cv = []
y_pred_cv = []

for j in range(len(y_test_list)):  # 5
    y_test_targ = []
    y_pred_targ = []
    nvar=y_test_list[0].shape[1]

    for i in range(0, nvar):  # or (1, 5)
        missing_rows_mask = y_test_list[j].iloc[:, i].isna()
        y_test = y_test_list[j].iloc[:, i][~missing_rows_mask]
        y_pred = y_pred_list[j].iloc[:, i][~missing_rows_mask]
        
        y_test_targ.append(y_test)
        y_pred_targ.append(y_pred)
    
    y_test_cv.append(y_test_targ)
    y_pred_cv.append(y_pred_targ)
# y_test_cv[fold][outcome]

In [25]:
# Initialize a list to store scores
scores = []
scores_with_std = []

# Iterate over each outcome variable in the folds
for variable_name in variables: 
    variable_scores = []
    
    # Compute scores for the variable across all folds
    for fold_index in range(len(y_test_cv)):
        y_test = y_test_cv[fold_index][variables.index(variable_name)] 
        y_pred = y_pred_cv[fold_index][variables.index(variable_name)] 
        
        # Check if the target variable is numerical or categorical
        if y_test.dtype.kind in 'bifc':
            score = r2_score(y_test, y_pred)
        else:
            score = accuracy_score(y_test, y_pred)
                  
        variable_scores.append(score)
    
    # Compute the average score for the variable across all folds
    variable_avg_score = np.mean(variable_scores)
    # Compute the standard deviation for the variable across all folds
    variable_std_score = np.std(variable_scores)
    scores_with_std.append((variable_name, variable_avg_score, variable_std_score))

# Print the scores with average and standard deviation along with variable names
print(f"Scores for each outcome (ensemble with {int(len(random_permutations))} chains - propagate predicted values)")
for variable_name, avg_score, std_score in scores_with_std:
    print(f"{variable_name}: {avg_score:.2f} (± {std_score:.2f})")

Scores for each outcome (ensemble with 6 chains - propagate predicted values)
KFSS_M-2y: 0.81 (± 0.02)
KFSS_P-2y: 0.75 (± 0.03)
EDSS-2y: 0.88 (± 0.02)
T25FW-2y: 0.71 (± 0.07)
NHPT-2y: 0.58 (± 0.17)
P_R36-SF12-after: 0.69 (± 0.05)
M_R36-SF12-after: 0.57 (± 0.02)
SES_after: 0.69 (± 0.04)
EDSS-after_2y: 0.74 (± 0.05)
NRELAP: 0.64 (± 0.02)
CESEV: 0.49 (± 0.03)


---

## Chain with *true* values propagated

In [180]:
y_pred_chains = []
y_test_list = [[] for _ in range(N_FOLDS)]  # Initialize y_test_list with empty lists for each fold index
y_pred_prob_list_chain = []
yi_test_dummies_list = [[] for _ in range(N_FOLDS)]


# Iterate over each chain ordering
for ordered_targets_chain in random_permutations:
    y_pred_list = []  # List to store predictions for this chain
    y_pred_prob_list = []
    
    features_cv = pd.merge(features, pd.DataFrame(cv), left_index=True, right_index=True)
    targets_cv = pd.merge(data[ordered_targets_chain], pd.DataFrame(cv), left_index=True, right_index=True)

    # Fit and predict for each fold for this chain
    for i in range(0, N_FOLDS): 
        Xi_train = features_cv[features_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
        Xi_test = features_cv[features_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)
        yi_train = targets_cv[targets_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
        yi_test = targets_cv[targets_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)
        y_test_list_chain = []  # List to store y_test for this fold

        # One hot encode categorical targets of test set to be able to compute brier score
        subset_yi_test = yi_test.select_dtypes(include=['object'])
        yi_test_dummies = pd.get_dummies(subset_yi_test, columns=subset_yi_test.columns, dtype=int)

        chain = Chain(
            model_reg=RandomForestRegressor(random_state=random_state),
            model_clf=RandomForestClassifier(random_state=random_state),
            propagate="true",
        )
        chain.fit(Xi_train, yi_train, target_types=None)
        y_pred = chain.predict(Xi_test)
        y_pred_prob = chain.predict_proba(Xi_test)
        y_pred_list.append(pd.DataFrame(y_pred, columns=yi_test.columns, index=yi_test.index))
        y_pred_prob_list.append(y_pred_prob)
    
        # Append yi_test to the corresponding fold index in y_test_list
        y_test_list[i].append(yi_test)  # Append yi_test for fold i
        yi_test_dummies_list[i].append(yi_test_dummies)

    y_pred_chains.append(y_pred_list)
    y_pred_prob_list_chain.append(y_pred_prob_list)
    print("Permutation done")

Permutation done
Permutation done
Permutation done
Permutation done


In [181]:
yi_test_dummies_list[3][3]

Unnamed: 0,CESEV_MILD,CESEV_MODERATE,CESEV_SEVERE,NRELAP_0.0,NRELAP_1.0,NRELAP_2.0,NRELAP_3.0,NRELAP_4+
4,0,0,0,1,0,0,0,0
12,0,0,1,0,0,1,0,0
14,0,0,1,0,1,0,0,0
19,0,1,0,0,1,0,0,0
24,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...
2436,0,0,0,1,0,0,0,0
2439,1,0,0,0,1,0,0,0
2445,0,0,0,1,0,0,0,0
2462,1,0,0,0,0,1,0,0


In [182]:
y_test_list

[[      KFSS_M-2y  KFSS_P-2y  EDSS-2y  T25FW-2y  NHPT-2y  P_R36-SF12-after  \
  0           NaN        NaN      NaN       NaN      NaN               NaN   
  3      0.129630   0.291667     3.75      5.25    37.50          0.716216   
  8           NaN        NaN     6.50     22.90    31.55          0.500000   
  13          NaN        NaN     8.00       NaN    37.90          0.468354   
  20     0.259259   0.166667     4.00      6.50    30.55          0.582278   
  ...         ...        ...      ...       ...      ...               ...   
  2449   0.333333   0.000000     4.00     12.25    27.25          0.346154   
  2454   0.240741   0.541667     6.50     11.55    28.50          0.443038   
  2457   0.259259   0.000000     3.50      5.30    23.00          0.807692   
  2458        NaN        NaN     4.00      7.95    28.50          0.683544   
  2460   0.111111   0.166667     2.50      3.80    18.95          0.750000   
  
        M_R36-SF12-after  SES_after  SLEC_after  KFSS_M-after

In [193]:
yi_test_dummies_list[2][0]

Unnamed: 0,NRELAP_0.0,NRELAP_1.0,NRELAP_2.0,NRELAP_3.0,NRELAP_4+,CESEV_MILD,CESEV_MODERATE,CESEV_SEVERE
2,1,0,0,0,0,0,0,0
5,0,0,1,0,0,1,0,0
15,1,0,0,0,0,0,0,0
23,1,0,0,0,0,0,0,0
31,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...
2441,1,0,0,0,0,0,0,0
2446,0,0,1,0,0,0,1,0
2450,1,0,0,0,0,0,0,0
2452,1,0,0,0,0,0,0,0


In [195]:
# Initialize an empty list to store the concatenated DataFrames
concatenated_dfs_chain = []

for y_pred_prob_list in y_pred_prob_list_chain:
    concatenated_dfs_fold = []
    # Iterate over each pair of arrays
    for j, fold in enumerate(y_pred_prob_list):
        dfs = []
        len_array = 0
        
        for i, array in enumerate(fold):
            # Convert array to DataFrame
            col = yi_test_dummies_list[j][0].columns[len_array:len_array+len(array[0])]
            df = pd.DataFrame(array, columns=col, index=yi_test_dummies_list[j][0].index)
            dfs.append(df)
            len_array += len(array[0])
        
        # Concatenate DataFrames
        concatenated_df = pd.concat(dfs, axis=1)
        concatenated_dfs_fold.append(concatenated_df)

    concatenated_dfs_chain.append(concatenated_dfs_fold)
    # Now you should have a list of concatenated DataFrames

In [199]:
concatenated_dfs_chain[0][2]

Unnamed: 0,NRELAP_0.0,NRELAP_1.0,NRELAP_2.0,NRELAP_3.0,NRELAP_4+,CESEV_MILD,CESEV_MODERATE,CESEV_SEVERE
2,0.870000,0.080000,0.030000,0.02,0.000000,0.140000,0.780000,0.080000
5,0.515652,0.173478,0.250870,0.03,0.030000,0.560000,0.410000,0.030000
15,0.570000,0.350000,0.020000,0.02,0.040000,0.050000,0.860000,0.090000
23,0.610000,0.230000,0.120000,0.02,0.020000,0.180000,0.610000,0.210000
31,0.640000,0.210000,0.080000,0.03,0.040000,0.290000,0.600000,0.110000
...,...,...,...,...,...,...,...,...
2441,0.890000,0.020000,0.040000,0.02,0.030000,0.080000,0.800000,0.120000
2446,0.412012,0.154821,0.141248,0.07,0.221919,0.178956,0.686607,0.134437
2450,0.830000,0.080000,0.050000,0.01,0.030000,0.173095,0.654177,0.172727
2452,0.640000,0.180000,0.140000,0.02,0.020000,0.280000,0.680000,0.040000


In [200]:
transposed_list_cat = list(zip(*concatenated_dfs_chain))

# Each element of transposed_list is a tuple containing dataframes from the same position in each inner list
# Now, create a list of lists from these tuples
reorganized_list_cat = [list(df_tuple) for df_tuple in transposed_list_cat]

In [201]:
# Apply the function to each list in reorganized_list
reordered_reorganized_list_cat = [reorder_columns(dataframes) for dataframes in reorganized_list_cat]

In [202]:
# Apply the function to each list in reordered_reorganized_list
averaged_dataframes_list_cat = [average_dataframes(dataframes) for dataframes in reordered_reorganized_list_cat]

In [203]:
averaged_dataframes_list_cat[4]

Unnamed: 0,NRELAP_0.0,NRELAP_1.0,NRELAP_2.0,NRELAP_3.0,NRELAP_4+,CESEV_MILD,CESEV_MODERATE,CESEV_SEVERE
6,0.405844,0.428488,0.133168,0.263202,0.161381,0.198155,0.317262,0.092500
9,0.392114,0.412720,0.151928,0.229337,0.173538,0.246933,0.295251,0.098180
10,0.327500,0.595000,0.067500,0.305000,0.162500,0.070000,0.402500,0.070000
16,0.447897,0.337879,0.171162,0.293037,0.130349,0.237161,0.246791,0.135723
17,0.432500,0.492500,0.072500,0.387500,0.075000,0.067500,0.405000,0.067500
...,...,...,...,...,...,...,...,...
2438,0.268750,0.618750,0.097500,0.245000,0.145000,0.125000,0.457500,0.042500
2443,0.505000,0.427500,0.057500,0.440000,0.040000,0.072500,0.410000,0.047500
2451,0.417500,0.362500,0.145000,0.347500,0.125000,0.205000,0.287500,0.110000
2453,0.400000,0.470000,0.115000,0.355000,0.067500,0.125000,0.407500,0.060000


In [204]:
transposed_list_all = list(zip(*y_pred_chains))

# Each element of transposed_list is a tuple containing dataframes from the same position in each inner list
# Now, create a list of lists from these tuples
reorganized_list_all = [list(df_tuple) for df_tuple in transposed_list_all]

# Apply the function to each list in reorganized_list
reordered_reorganized_list_all = [reorder_columns(dataframes) for dataframes in reorganized_list_all]

# Apply the function to each list in reordered_reorganized_list
averaged_dataframes_list_all = [average_dataframes(dataframes) for dataframes in reordered_reorganized_list_all]

In [205]:
transposed_test_list = list(zip(*y_test_list))

# Each element of transposed_list is a tuple containing dataframes from the same position in each inner list
# Now, create a list of lists from these tuples
reorganized_test_list = [list(df_tuple) for df_tuple in transposed_test_list]

In [206]:
reorganized_test_list[0]

[      KFSS_M-2y  KFSS_P-2y  EDSS-2y  T25FW-2y  NHPT-2y  P_R36-SF12-after  \
 0           NaN        NaN      NaN       NaN      NaN               NaN   
 3      0.129630   0.291667     3.75      5.25    37.50          0.716216   
 8           NaN        NaN     6.50     22.90    31.55          0.500000   
 13          NaN        NaN     8.00       NaN    37.90          0.468354   
 20     0.259259   0.166667     4.00      6.50    30.55          0.582278   
 ...         ...        ...      ...       ...      ...               ...   
 2449   0.333333   0.000000     4.00     12.25    27.25          0.346154   
 2454   0.240741   0.541667     6.50     11.55    28.50          0.443038   
 2457   0.259259   0.000000     3.50      5.30    23.00          0.807692   
 2458        NaN        NaN     4.00      7.95    28.50          0.683544   
 2460   0.111111   0.166667     2.50      3.80    18.95          0.750000   
 
       M_R36-SF12-after  SES_after  SLEC_after  KFSS_M-after_2y  \
 0     

In [207]:
y_pred_list = averaged_dataframes_list_all.copy()
y_test_list = reorganized_test_list[0]

y_test_cv = []
y_pred_cv = []

for j in range(len(y_test_list)):  # 5
    y_test_targ = []
    y_pred_targ = []
    nvar=y_test_list[0].shape[1]

    for i in range(0, nvar):  # or (1, 5)
        missing_rows_mask = y_test_list[j].iloc[:, i].isna()
        y_test = y_test_list[j].iloc[:, i][~missing_rows_mask]
        y_pred = y_pred_list[j].iloc[:, i][~missing_rows_mask]
        
        y_test_targ.append(y_test)
        y_pred_targ.append(y_pred)
    
    y_test_cv.append(y_test_targ)
    y_pred_cv.append(y_pred_targ)
# y_test_cv[fold][outcome]

In [208]:
def normalized_mean_squared_error(true, pred):
    num = mse(true, pred)
    mean_value = np.mean(true)
    mean = np.full_like(true, mean_value)
    den = mse(true, mean)
    nmse_loss = num/den
    #rrmse_loss = np.sqrt(squared_error)
    return nmse_loss

In [209]:
# Initialize a list to store scores
scores_with_std = []

# Iterate over each outcome variable in the folds
for variable_name in variables: 
    variable_scores = []
    
    # Check if the target variable is numerical or categorical
    if y_test_cv[0][variables.index(variable_name)].dtype.kind in 'bifc':
        # Compute scores for the variable across all folds
        for fold_index in range(len(y_test_cv)):
            y_test = y_test_cv[fold_index][variables.index(variable_name)] 
            y_pred = y_pred_cv[fold_index][variables.index(variable_name)] 
            
            score = normalized_mean_squared_error(y_test, y_pred)
            variable_scores.append(score)
        
        # Compute the average score for the variable across all folds
        variable_avg_score = np.mean(variable_scores)
        
        # Compute the standard deviation for the variable across all folds
        variable_std_score = np.std(variable_scores)
        
        # Append the tuple with three elements to the scores_with_std list
        scores_with_std.append((variable_name, variable_avg_score, variable_std_score))

num_normalized_brier=[]
num_std_brier=[]
# Print the scores with average and standard deviation along with variable names
print("Scores for each outcome (chain - true values):")
for variable_name, avg_score, std_score in scores_with_std:
    print(f"{variable_name}: {avg_score:.2f} (± {std_score:.2f})")
    num_normalized_brier.append(avg_score)
    num_std_brier.append(std_score)

Scores for each outcome (chain - true values):
KFSS_M-2y: 0.19 (± 0.02)
KFSS_P-2y: 0.25 (± 0.03)
EDSS-2y: 0.12 (± 0.02)
T25FW-2y: 0.29 (± 0.08)
NHPT-2y: 0.41 (± 0.17)
P_R36-SF12-after: 0.31 (± 0.05)
M_R36-SF12-after: 0.43 (± 0.02)
SES_after: 0.31 (± 0.05)
SLEC_after: 0.35 (± 0.03)
KFSS_M-after_2y: 0.34 (± 0.03)
KFSS_P-after_2y: 0.48 (± 0.06)
EDSS-after_2y: 0.24 (± 0.04)


In [210]:
yi_test_dummies_avg = []

for yi_test_dummies_fold in yi_test_dummies_list:
    # Calculate the percentage of 1s in each column
    yi_test_dummies_avg_fold=[]
    for yi_test_dummies_chain in yi_test_dummies_fold:

        percentages = yi_test_dummies_chain.sum() / len(yi_test_dummies_chain)

        yi_test_dummies_avg_chain = pd.DataFrame(0, index=yi_test_dummies_chain.index, columns=yi_test_dummies_chain.columns)

        # Replace values in each column with the corresponding percentage
        for col in yi_test_dummies_avg_chain.columns:
            yi_test_dummies_avg_chain[col] = yi_test_dummies_chain[col].apply(lambda x: percentages[col])

        yi_test_dummies_avg_fold.append(yi_test_dummies_avg_chain)
    yi_test_dummies_avg.append(yi_test_dummies_avg_fold)

In [211]:
transposed_dummy_avg_list = list(zip(*yi_test_dummies_avg))

# Each element of transposed_list is a tuple containing dataframes from the same position in each inner list
# Now, create a list of lists from these tuples
reorganized_dummy_avg_list = [list(df_tuple) for df_tuple in transposed_dummy_avg_list]

In [214]:
reorganized_dummy_avg_list[2][1]

Unnamed: 0,CESEV_MILD,CESEV_MODERATE,CESEV_SEVERE,NRELAP_0.0,NRELAP_1.0,NRELAP_2.0,NRELAP_3.0,NRELAP_4+
1,0.105051,0.212121,0.060606,0.646465,0.20404,0.086869,0.038384,0.024242
7,0.105051,0.212121,0.060606,0.646465,0.20404,0.086869,0.038384,0.024242
11,0.105051,0.212121,0.060606,0.646465,0.20404,0.086869,0.038384,0.024242
21,0.105051,0.212121,0.060606,0.646465,0.20404,0.086869,0.038384,0.024242
30,0.105051,0.212121,0.060606,0.646465,0.20404,0.086869,0.038384,0.024242
...,...,...,...,...,...,...,...,...
2437,0.105051,0.212121,0.060606,0.646465,0.20404,0.086869,0.038384,0.024242
2447,0.105051,0.212121,0.060606,0.646465,0.20404,0.086869,0.038384,0.024242
2455,0.105051,0.212121,0.060606,0.646465,0.20404,0.086869,0.038384,0.024242
2459,0.105051,0.212121,0.060606,0.646465,0.20404,0.086869,0.038384,0.024242


In [215]:
transposed_dummy_list = list(zip(*yi_test_dummies_list))

# Each element of transposed_list is a tuple containing dataframes from the same position in each inner list
# Now, create a list of lists from these tuples
reorganized_dummy_list = [list(df_tuple) for df_tuple in transposed_dummy_list]

In [220]:
reorganized_dummy_list[2][0]

Unnamed: 0,CESEV_MILD,CESEV_MODERATE,CESEV_SEVERE,NRELAP_0.0,NRELAP_1.0,NRELAP_2.0,NRELAP_3.0,NRELAP_4+
0,0,0,0,1,0,0,0,0
3,0,0,0,1,0,0,0,0
8,0,0,0,1,0,0,0,0
13,0,1,0,1,0,0,0,0
20,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...
2449,0,0,0,1,0,0,0,0
2454,0,0,0,1,0,0,0,0
2457,0,0,0,1,0,0,0,0
2458,0,0,0,1,0,0,0,0


In [216]:
reorganized_dummy_avg_list_first=reorganized_dummy_avg_list[0]
reorganized_dummy_list_first=reorganized_dummy_list[0]

In [226]:
reorganized_dummy_list_first[3]

Unnamed: 0,NRELAP_0.0,NRELAP_1.0,NRELAP_2.0,NRELAP_3.0,NRELAP_4+,CESEV_MILD,CESEV_MODERATE,CESEV_SEVERE
4,1,0,0,0,0,0,0,0
12,0,0,1,0,0,0,0,1
14,0,1,0,0,0,0,0,1
19,0,1,0,0,0,0,1,0
24,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
2436,1,0,0,0,0,0,0,0
2439,0,1,0,0,0,1,0,0
2445,1,0,0,0,0,0,0,0
2462,0,0,1,0,0,1,0,0


In [227]:
# Initialize a list to store scores
scores_with_std = []
variables_cat = reorganized_dummy_list_first[0].columns

# Create a dictionary to store the scores for variables with the same letters before the '_'
variable_scores_dict = {}

# Iterate over each outcome variable in the folds
for level_name in variables_cat: 
    variable_scores = []
    
    # Compute scores for the variable across all folds
    for fold_index in range(len(yi_test_dummies_list)):
        y_test = reorganized_dummy_list_first[fold_index][level_name] 
        y_prob = averaged_dataframes_list_cat[fold_index][level_name] 
        y_prob_avg = reorganized_dummy_avg_list_first[fold_index][level_name] 
        
        # Compute the Brier score and the normalized Brier score
        brier_score = brier_score_loss(y_test, y_prob)
        brier_baseline = brier_score_loss(y_test, y_prob_avg)
        normalized_brier = brier_score / brier_baseline

        # Append the normalized Brier score to the variable scores list
        variable_scores.append(normalized_brier)
    
    # Check if the variable name has letters before the '_'
    prefix = level_name.split('_')[0]
    
    # Add the normalized Brier scores to the dictionary based on the prefix
    if prefix in variable_scores_dict:
        variable_scores_dict[prefix].extend(variable_scores)
    else:
        variable_scores_dict[prefix] = variable_scores

# Compute the average and standard deviation of normalized Brier score for each prefix
for prefix, scores in variable_scores_dict.items():
    avg_score = np.mean(scores)
    std_score = np.std(scores)
    scores_with_std.append((prefix, avg_score, std_score))

cat_normalized_brier = []
cat_std_brier = []
# Print the scores with average and standard deviation along with variable names
print("Normalized Brier scores for each categorical variable:")
for prefix, avg_score, std_score in scores_with_std:
    print(f"{prefix}: {avg_score:.2f} (± {std_score:.2f})")
    cat_normalized_brier.append(avg_score)
    cat_std_brier.append(std_score)

Normalized Brier scores for each categorical variable:
NRELAP: 1.80 (± 1.20)
CESEV: 1.05 (± 0.03)


In [228]:
combined_normalized_brier = np.concatenate((num_normalized_brier, cat_normalized_brier))
combined_normalized_brier

array([0.19200925, 0.25370331, 0.11688389, 0.28923067, 0.41057452,
       0.308628  , 0.43361167, 0.30858897, 0.35212073, 0.34410103,
       0.47547495, 0.24397494, 1.79906181, 1.04557615])

In [229]:
# Compute the average
average_normalized_brier = np.mean(combined_normalized_brier)
print("Average normalized Brier score:", average_normalized_brier)

Average normalized Brier score: 0.46953856377500724
