# Ensemble of 50 Model Chains

This notebook compares the scores of an ensemble of 50 Model Chains in which either the predicted target values or the true target values are propagated along the chain.

In [1]:
import numpy as np
import pandas as pd
import itertools
import random

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, accuracy_score
from chaining import Chain
import os
from scipy.stats import mode

In [2]:
def generate_permutations_with_order(variables, pairs_or_groups, num_permutations, constrained_elements=None, shuffle_at_end=None, random_state=None):
    """Draw random orderings of `variables` that satisfy ordering constraints.

    Orderings are produced by rejection sampling: a uniformly random permutation
    is drawn and kept only if it passes every check below. No de-duplication is
    done, so the same ordering can appear more than once. If the constraints
    cannot be satisfied the loop never terminates.

    Parameters
    ----------
    variables : sequence of str
        The names to permute. The unmodified order is always returned first.
    pairs_or_groups : iterable of sequences of str, or None
        Each group lists names that must appear in that relative order
        (not necessarily adjacent) in every generated permutation.
    num_permutations : int
        Number of random permutations to generate in addition to the original order.
    constrained_elements : sequence of str, optional
        If given, all of these names must occupy the first
        ``len(constrained_elements)`` positions (in any relative order).
        If None or empty, no such restriction is applied.
    shuffle_at_end : sequence of str, optional
        Names that are removed from each generated permutation and re-inserted
        at a uniformly random position *after* validation. The result is not
        re-validated, so these names may end up among the leading positions.
    random_state : int or None
        Seed passed to ``random.seed`` (this seeds the global `random` module).

    Returns
    -------
    list of list of str
        ``num_permutations + 1`` orderings; element 0 is ``list(variables)``.
    """
    random.seed(random_state)
    permutations_list = [list(variables)]  # Add the original order only once as a list of strings

    groups = pairs_or_groups or []
    n_leading = len(constrained_elements) if constrained_elements else 0

    while len(permutations_list) < num_permutations + 1:  # +1 because the original order is counted too
        perm = list(random.sample(variables, len(variables)))

        # Reject the draw if any group is not in its prescribed relative order
        valid = True
        for pair_or_group in groups:
            idxs = [perm.index(var) for var in pair_or_group]
            if sorted(idxs) != idxs:
                valid = False
                break
        if not valid:
            continue

        # Reject the draw if the constrained elements do not fill the leading positions.
        # Without constrained elements every order-valid draw is accepted (previously such
        # draws were silently discarded, which made the loop run forever).
        if constrained_elements and not all(elem in perm[:n_leading] for elem in constrained_elements):
            continue

        permutations_list.append(perm)

    # Shuffle the positions of variables specified to be shuffled at the end
    if shuffle_at_end:
        for perm in permutations_list[1:]:  # Skip index 0: the original order shouldn't be shuffled
            for variable in shuffle_at_end:
                if variable in perm:
                    perm.remove(variable)
                    # randint is inclusive on both ends, so appending at the end is possible too
                    perm.insert(random.randint(0, len(perm)), variable)

    return permutations_list

In [3]:
def missingness_stratified_cv(df, N_FOLDS=5, random_state=None):
    """Assign each row of `df` to a test fold, handling rows by missingness pattern.

    Complete cases are assigned first. Then the columns are visited from the one
    with the most missing values to the one with the fewest, and every row that is
    still unassigned but observed in that column receives a random fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose missingness pattern drives the assignment.
    N_FOLDS : int
        Number of folds; fold ids are drawn from ``0 .. N_FOLDS - 1``.
    random_state : int or None
        Seed passed to ``np.random.seed`` (this seeds NumPy's global generator).

    Returns
    -------
    pandas.Series
        Float series indexed like `df` holding the fold id of each row. Rows in
        which every column is missing are never assigned and stay NaN.
    """
    # Add seed for reproducibility of the predictions (to get the same scores each time we run the code)
    np.random.seed(random_state)

    # Initial complete-case test fold assignment
    cv = pd.Series(np.nan, index=df.index)
    # Masks are converted to plain NumPy arrays: .iloc is positional and does not accept
    # an index-carrying boolean Series when reading, so do not depend on it when writing.
    i_cc = (df.isna().sum(axis=1) == 0).to_numpy()  # Complete cases
    cv.iloc[i_cc] = np.random.randint(low=0, high=N_FOLDS, size=i_cc.sum())

    # Go over columns from most missing to least missing
    for j in df.isna().sum().argsort()[::-1]:
        # Instances i that are not assigned yet but for which df[i,j] is observed
        i_tbf = (cv.isna() & ~df.iloc[:, j].isna()).to_numpy()  # to be filled
        # Fill them randomly
        cv.iloc[i_tbf] = np.random.randint(low=0, high=N_FOLDS, size=i_tbf.sum())

    return cv

In [4]:
def missingness_and_categorical_stratified_cv(df, N_FOLDS=5, random_state=None):
    """Assign rows to test folds using missingness patterns and categorical strata.

    The assignment proceeds in three passes; a row keeps the first fold it gets:

    1. Complete cases receive a uniformly random fold.
    2. For every column of dtype ``category`` and every category in it, the rows
       that are still unassigned are spread over the folds as evenly as possible
       (fold counts within a category differ by at most one).
    3. Remaining rows are handled column by column, from the column with the most
       missing values to the one with the fewest: rows observed in that column
       receive a uniformly random fold.

    Pass 2 previously passed a NumPy array to ``Series.fillna`` (which raises a
    TypeError) and used row positions rather than fold ids as fill values, so it
    could never succeed. For inputs without ``category`` columns the pass is
    skipped and the result is identical to before.
    NOTE(review): "balanced folds within each category" is the interpretation
    chosen for the intended stratification - confirm it matches the design.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose missingness pattern and categorical columns drive the assignment.
    N_FOLDS : int
        Number of folds; fold ids lie in ``0 .. N_FOLDS - 1``.
    random_state : int or None
        Seed passed to ``np.random.seed`` (this seeds NumPy's global generator).

    Returns
    -------
    pandas.Series
        Float series indexed like `df` holding the fold id of each row. Rows in
        which every column is missing are never assigned and stay NaN.
    """
    # Add seed for reproducibility of the predictions (to get the same scores each time we run the code)
    np.random.seed(random_state)

    # Initial complete-case test fold assignment
    cv = pd.Series(np.nan, index=df.index)
    # Positional NumPy masks are used with .iloc throughout (it is a positional indexer)
    i_cc = (df.isna().sum(axis=1) == 0).to_numpy()  # Complete cases
    cv.iloc[i_cc] = np.random.randint(low=0, high=N_FOLDS, size=i_cc.sum())

    # Stratify categorical variables
    for col in df.select_dtypes(include=['category']):
        for category in df[col].cat.categories:
            # Unassigned rows belonging to this category (a missing category never matches)
            i_cat = (cv.isna() & (df[col] == category)).to_numpy()
            n_cat = int(i_cat.sum())
            if n_cat == 0:
                continue
            # 0, 1, ..., N_FOLDS-1, 0, 1, ... in random order: balanced fold sizes
            cv.iloc[i_cat] = np.random.permutation(np.arange(n_cat) % N_FOLDS)

    # Go over columns from most missing to least missing
    for j in df.isna().sum().argsort()[::-1]:
        # Instances i that are not assigned yet but for which df[i,j] is observed
        i_tbf = (cv.isna() & ~df.iloc[:, j].isna()).to_numpy()  # to be filled
        # Fill them randomly
        cv.iloc[i_tbf] = np.random.randint(low=0, high=N_FOLDS, size=i_tbf.sum())

    return cv

In [5]:
# Define a function to reorder columns of dataframes
def reorder_columns(dataframes):
    """Return the dataframes with their columns arranged like the first one.

    Every dataframe is indexed with the column labels of ``dataframes[0]``, so
    all returned frames share the same column order. The inputs are not modified.
    """
    reference_columns = dataframes[0].columns
    aligned = []
    for frame in dataframes:
        aligned.append(frame[reference_columns])
    return aligned


In [6]:
# Define a function to average dataframes
def average_dataframes(dataframes):
    """Combine several dataframes into one by aggregating rows that share an index label.

    The frames are stacked and grouped by index label. Within each group an
    object-dtype column is reduced to its first mode, any other column to its mean.
    """
    def _combine(column):
        # Object columns: most frequent value; everything else: arithmetic mean
        if column.dtype == 'O':
            return column.mode()[0]
        return column.mean()

    stacked = pd.concat(dataframes)
    per_row_groups = stacked.groupby(stacked.index)
    return per_row_groups.agg(_combine)


---

In [7]:
# Candidate locations of the data folder; the first one that contains the file is used
possible_paths = [
    'C:/Users/lenne/OneDrive/Documenten/Master of Statistics and Data Science/2023-2024/Master thesis/Thesis_Sofia_Lennert/new_data',
    'C:/Users/anaso/Desktop/SOFIA MENDES/KU Leuven/Master Thesis/Thesis_Sofia_Lennert/new_data'
]

# Define file names
file = 'merged_data.csv'

# Find the full path to the CSV file (None if it exists in none of the candidate folders)
candidate_files = [f'{directory}/{file}' for directory in possible_paths]
path = next((candidate for candidate in candidate_files if os.path.exists(candidate)), None)

# Fail with an actionable message instead of handing None to pd.read_csv
if path is None:
    raise FileNotFoundError(
        f"'{file}' was not found in any of the candidate folders: {possible_paths}"
    )

data = pd.read_csv(path)

# Bin the number of relapses into 0, 1, 2, 3 and 4+ 
def bin_column(value):
    """Map a relapse count to one of the labels '0', '1', '2', '3' or '4+'.

    Values equal to 0, 1, 2 or 3 give the matching integer label, also when the
    count is stored as a float (``2.0`` -> ``'2'``). Missing values are returned
    as NaN so they stay missing rather than being counted as '4+'. Every other
    value gives '4+'.
    """
    # A missing count carries no information about the number of relapses
    if pd.isna(value):
        return np.nan
    if value in [0, 1, 2, 3]:
        # int() first so that float-coded counts do not produce labels like '1.0'
        return str(int(value))
    else:
        return '4+'
# Replace the raw relapse counts by their bin labels
data['NRELAP'] = data['NRELAP'].map(bin_column)

# Never truncate the column list when a dataframe is displayed
pd.options.display.max_columns = None

In [8]:
# Target variables; the list order is also the order of the first (reference) chain
variables = [
    'KFSS_M-2y',
    'KFSS_P-2y',
    'EDSS-2y',
    'T25FW-2y',
    'NHPT-2y',
    'P_R36-SF12-after',
    'M_R36-SF12-after',
    'SES_after',
    'SLEC_after',
    'KFSS_M-after_2y',
    'KFSS_P-after_2y',
    'EDSS-after_2y',
    'NRELAP',
    'CESEV',
]

# Input variables, kept in this column order
columns_to_keep = [
    'AGE',
    'SEX',
    'RACE',
    'CONTINENT',
    'MHDIAGN',
    'CARDIO',
    'URINARY',
    'MUSCKELET',
    'FATIGUE',
    'NHPT-before',
    'PASAT_2s-before',
    'PASAT_3s-before',
    'SDMT-before',
    'T25FW-before',
    'SLEC_before',
    'SES_before',
    'BDI-before',
    'EDSS-before',
    'KFSS_M-before',
    'KFSS_P-before',
    'M_R36-SF12-before',
    'P_R36-SF12-before',
    'R36-SF12-before_Ind',
    'T-before',
    'P-before',
    'N-before',
]

# Feature matrix: the selected input columns of the full dataset
features = data.loc[:, columns_to_keep]

In [9]:
# One-hot encode every object-dtype (categorical or binary) input column as 0/1 integers
object_columns = features.select_dtypes(include=['object'])
columns_to_encode = list(object_columns.columns)
features = pd.get_dummies(features, columns=columns_to_encode, dtype=int)

In [10]:
# Chain orders of the ensemble: the original target order plus 49 random permutations

# Within each group the variables must keep this relative order in every chain
pairs_or_groups = [
    ['KFSS_M-2y', 'EDSS-2y'],
    ['KFSS_P-2y', 'EDSS-2y'],
    ['KFSS_M-after_2y', 'EDSS-after_2y'],
    ['KFSS_P-after_2y', 'EDSS-after_2y'],
]
# Variables that must fill the leading positions of a chain before the end-shuffle is applied
order_constraint = ['KFSS_M-2y', 'KFSS_P-2y', 'EDSS-2y', 'T25FW-2y', 'NHPT-2y']
# Variables that are re-inserted at a random position once a permutation has been accepted
shuffle_at_end = ['NRELAP', 'CESEV']
# Number of random permutations drawn on top of the original order
num_permutations = 49
random_state = 42

random_permutations = generate_permutations_with_order(
    variables,
    pairs_or_groups,
    num_permutations,
    constrained_elements=order_constraint,
    shuffle_at_end=shuffle_at_end,
    random_state=random_state,
)

# Print the original order (index 0) followed by all the random permutations
for idx in range(len(random_permutations)):
    perm = random_permutations[idx]
    print(f"Permutation {idx}: {', '.join(perm)}")

Permutation 0: KFSS_M-2y, KFSS_P-2y, EDSS-2y, T25FW-2y, NHPT-2y, P_R36-SF12-after, M_R36-SF12-after, SES_after, SLEC_after, KFSS_M-after_2y, KFSS_P-after_2y, EDSS-after_2y, NRELAP, CESEV
Permutation 1: T25FW-2y, KFSS_M-2y, NHPT-2y, KFSS_P-2y, EDSS-2y, NRELAP, KFSS_M-after_2y, SLEC_after, M_R36-SF12-after, CESEV, P_R36-SF12-after, KFSS_P-after_2y, EDSS-after_2y, SES_after
Permutation 2: NHPT-2y, NRELAP, KFSS_P-2y, KFSS_M-2y, T25FW-2y, CESEV, EDSS-2y, M_R36-SF12-after, SES_after, SLEC_after, KFSS_P-after_2y, P_R36-SF12-after, KFSS_M-after_2y, EDSS-after_2y
Permutation 3: NHPT-2y, KFSS_P-2y, KFSS_M-2y, T25FW-2y, EDSS-2y, KFSS_P-after_2y, M_R36-SF12-after, CESEV, P_R36-SF12-after, SES_after, SLEC_after, KFSS_M-after_2y, EDSS-after_2y, NRELAP
Permutation 4: KFSS_P-2y, KFSS_M-2y, NHPT-2y, T25FW-2y, EDSS-2y, NRELAP, KFSS_M-after_2y, P_R36-SF12-after, CESEV, KFSS_P-after_2y, SES_after, SLEC_after, EDSS-after_2y, M_R36-SF12-after
Permutation 5: T25FW-2y, KFSS_M-2y, KFSS_P-2y, EDSS-2y, NHPT-2y, 

In [11]:
# Chain order 0 is the original target order (the generator always returns it first)
ordered_targets = random_permutations[0]

In [12]:
# Seed handed to the fold assignment and to the random forests in the cells below
# (same value as the seed used above to draw the chain orders)
random_state = 42
# Number of cross-validation folds
N_FOLDS = 5

In [13]:
# Assign every row to a test fold based on the target columns, and store the
# assignment as a one-column dataframe named "CV Fold"
fold_assignment = missingness_and_categorical_stratified_cv(
    data[variables], N_FOLDS=N_FOLDS, random_state=random_state
)
cv = fold_assignment.to_frame(name="CV Fold")

---

## Chain with *predicted* values propagated

In [29]:
# Out-of-fold predictions, indexed as y_pred_chains[chain][fold]
y_pred_chains = []
# True targets of each test fold, indexed as y_test_list[fold][chain]
y_test_list = [[] for _ in range(N_FOLDS)]

# The inputs are identical for every chain order, so the fold labels are attached once
# here instead of being re-merged on every chain iteration
features_cv = pd.merge(features, cv, left_index=True, right_index=True)

# Iterate over each chain ordering
for ordered_targets_chain in random_permutations:
    y_pred_list_chain = []  # Predictions of this chain, one dataframe per fold

    # Targets in this chain's order, with the fold label attached
    targets_cv = pd.merge(data[ordered_targets_chain], cv, left_index=True, right_index=True)

    # Fit and predict for each fold for this chain
    for i in range(N_FOLDS):
        # Rows without a fold label (NaN) compare unequal to every i: they are always in
        # the training split and never in a test split
        Xi_train = features_cv[features_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
        Xi_test = features_cv[features_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)
        yi_train = targets_cv[targets_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
        yi_test = targets_cv[targets_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)

        # Fresh, identically seeded base models for every fit
        chain = Chain(
            model_reg=RandomForestRegressor(random_state=random_state),
            model_clf=RandomForestClassifier(random_state=random_state),
            propagate="pred",
        )
        chain.fit(Xi_train, yi_train, target_types=None)
        y_pred = chain.predict(Xi_test)
        # Label the predictions with this chain's column order and the test rows' index
        y_pred_list_chain.append(pd.DataFrame(y_pred, columns=yi_test.columns, index=yi_test.index))

        # Append yi_test to the corresponding fold index in y_test_list
        y_test_list[i].append(yi_test)

    y_pred_chains.append(y_pred_list_chain)
    print("Permutation done")

Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done


In [30]:
# y_pred_chains is indexed [chain][fold]; zipping the inner lists regroups it as [fold][chain]
transposed_list = [*zip(*y_pred_chains)]

# zip yields tuples, so turn each per-fold tuple of dataframes back into a list
reorganized_list = list(map(list, transposed_list))

In [31]:
# y_test_list is indexed [fold][chain]; zipping the inner lists regroups it as [chain][fold]
transposed_test_list = [*zip(*y_test_list)]

# zip yields tuples, so turn each per-chain tuple of dataframes back into a list
reorganized_test_list = list(map(list, transposed_test_list))

In [32]:
# Per fold, give the prediction frames of all chains the column order of the first chain
reordered_reorganized_list = list(map(reorder_columns, reorganized_list))

In [33]:
# Per fold, combine the (column-aligned) predictions of all chains into one ensemble prediction
averaged_dataframes_list = list(map(average_dataframes, reordered_reorganized_list))

In [34]:
# Remove rows in y_test and y_pred where the variable in question is missing in y_test (since without it, it is not possible to calculate the score)
y_pred_list = averaged_dataframes_list.copy()
# True targets of the first chain, one dataframe per fold
y_test_list = reorganized_test_list[0]

# Both are indexed as [fold][outcome]
y_test_cv = []
y_pred_cv = []

for j in range(len(y_test_list)):
    y_test_targ = []
    y_pred_targ = []

    for target in y_test_list[j].columns:
        # Observed true values of this outcome in fold j
        y_test = y_test_list[j][target].dropna()
        # Pick the predictions by row label and column name so that they are in exactly
        # the same row order as y_test: the averaged frame comes out of a groupby and is
        # sorted by index, and the metrics later compare the two series by position.
        y_pred = y_pred_list[j].loc[y_test.index, target]

        y_test_targ.append(y_test)
        y_pred_targ.append(y_pred)

    y_test_cv.append(y_test_targ)
    y_pred_cv.append(y_pred_targ)

In [35]:
# Result containers
scores = []
scores_with_std = []

# The position of an outcome in `variables` is also its position inside each fold's list
for outcome_position, variable_name in enumerate(variables):
    variable_scores = []

    # One score per cross-validation fold
    for fold_truth, fold_pred in zip(y_test_cv, y_pred_cv):
        y_test = fold_truth[outcome_position]
        y_pred = fold_pred[outcome_position]

        # Numeric outcomes (bool/int/float/complex dtype kinds) are scored with R^2,
        # all other outcomes with accuracy
        metric = r2_score if y_test.dtype.kind in 'bifc' else accuracy_score
        score = metric(y_test, y_pred)
        variable_scores.append(score)

    # Mean and standard deviation (NumPy default, ddof=0) of the fold scores
    variable_avg_score = np.mean(variable_scores)
    variable_std_score = np.std(variable_scores)
    scores_with_std.append((variable_name, variable_avg_score, variable_std_score))

# Report the average score and its standard deviation per outcome
print(f"Scores for each outcome (ensemble with {int(len(random_permutations))} chains - propagate predicted values)")
for variable_name, avg_score, std_score in scores_with_std:
    print(f"{variable_name}: {avg_score:.6f} (± {std_score:.6f})")

Scores for each outcome (ensemble with 50 chains - propagate predicted values)
KFSS_M-2y: 0.81 (± 0.03)
KFSS_P-2y: 0.75 (± 0.04)
EDSS-2y: 0.88 (± 0.01)
T25FW-2y: 0.73 (± 0.09)
NHPT-2y: 0.51 (± 0.10)
P_R36-SF12-after: 0.70 (± 0.01)
M_R36-SF12-after: 0.57 (± 0.03)
SES_after: 0.69 (± 0.06)
SLEC_after: 0.66 (± 0.05)
KFSS_M-after_2y: 0.65 (± 0.05)
KFSS_P-after_2y: 0.51 (± 0.06)
EDSS-after_2y: 0.76 (± 0.02)
NRELAP: 0.64 (± 0.01)
CESEV: 0.50 (± 0.04)


---

## Chain with *true* values propagated

In [14]:
# Out-of-fold predictions, indexed as y_pred_chains[chain][fold]
y_pred_chains = []
# True targets of each test fold, indexed as y_test_list[fold][chain]
y_test_list = [[] for _ in range(N_FOLDS)]

# The inputs are identical for every chain order, so the fold labels are attached once
# here instead of being re-merged on every chain iteration
features_cv = pd.merge(features, cv, left_index=True, right_index=True)

# Iterate over each chain ordering
for ordered_targets_chain in random_permutations:
    y_pred_list_chain = []  # Predictions of this chain, one dataframe per fold

    # Targets in this chain's order, with the fold label attached
    targets_cv = pd.merge(data[ordered_targets_chain], cv, left_index=True, right_index=True)

    # Fit and predict for each fold for this chain
    for i in range(N_FOLDS):
        # Rows without a fold label (NaN) compare unequal to every i: they are always in
        # the training split and never in a test split
        Xi_train = features_cv[features_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
        Xi_test = features_cv[features_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)
        yi_train = targets_cv[targets_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
        yi_test = targets_cv[targets_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)

        # Fresh, identically seeded base models for every fit
        chain = Chain(
            model_reg=RandomForestRegressor(random_state=random_state),
            model_clf=RandomForestClassifier(random_state=random_state),
            propagate="true",
        )
        chain.fit(Xi_train, yi_train, target_types=None)
        y_pred = chain.predict(Xi_test)
        # Label the predictions with this chain's column order and the test rows' index
        y_pred_list_chain.append(pd.DataFrame(y_pred, columns=yi_test.columns, index=yi_test.index))

        # Append yi_test to the corresponding fold index in y_test_list
        y_test_list[i].append(yi_test)

    y_pred_chains.append(y_pred_list_chain)
    print("Permutation done")

Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done
Permutation done


In [15]:
# y_pred_chains is indexed [chain][fold]; zipping the inner lists regroups it as [fold][chain]
transposed_list = [*zip(*y_pred_chains)]

# zip yields tuples, so turn each per-fold tuple of dataframes back into a list
reorganized_list = list(map(list, transposed_list))

In [16]:
# y_test_list is indexed [fold][chain]; zipping the inner lists regroups it as [chain][fold]
transposed_test_list = [*zip(*y_test_list)]

# zip yields tuples, so turn each per-chain tuple of dataframes back into a list
reorganized_test_list = list(map(list, transposed_test_list))

In [20]:
# Per fold, give the prediction frames of all chains the column order of the first chain
reordered_reorganized_list = list(map(reorder_columns, reorganized_list))

In [21]:
# Per fold, combine the (column-aligned) predictions of all chains into one ensemble prediction
averaged_dataframes_list = list(map(average_dataframes, reordered_reorganized_list))

In [22]:
# Remove rows in y_test and y_pred where the variable in question is missing in y_test (since without it, it is not possible to calculate the score)
y_pred_list = averaged_dataframes_list.copy()
# True targets of the first chain, one dataframe per fold
y_test_list = reorganized_test_list[0]

# Both are indexed as [fold][outcome]
y_test_cv = []
y_pred_cv = []

for j in range(len(y_test_list)):
    y_test_targ = []
    y_pred_targ = []

    for target in y_test_list[j].columns:
        # Observed true values of this outcome in fold j
        y_test = y_test_list[j][target].dropna()
        # Pick the predictions by row label and column name so that they are in exactly
        # the same row order as y_test: the averaged frame comes out of a groupby and is
        # sorted by index, and the metrics later compare the two series by position.
        y_pred = y_pred_list[j].loc[y_test.index, target]

        y_test_targ.append(y_test)
        y_pred_targ.append(y_pred)

    y_test_cv.append(y_test_targ)
    y_pred_cv.append(y_pred_targ)
# y_test_cv[fold][outcome]

In [23]:
# Result containers
scores = []
scores_with_std = []

# The position of an outcome in `variables` is also its position inside each fold's list
for outcome_position, variable_name in enumerate(variables):
    variable_scores = []

    # One score per cross-validation fold
    for fold_truth, fold_pred in zip(y_test_cv, y_pred_cv):
        y_test = fold_truth[outcome_position]
        y_pred = fold_pred[outcome_position]

        # Numeric outcomes (bool/int/float/complex dtype kinds) are scored with R^2,
        # all other outcomes with accuracy
        metric = r2_score if y_test.dtype.kind in 'bifc' else accuracy_score
        score = metric(y_test, y_pred)
        variable_scores.append(score)

    # Mean and standard deviation (NumPy default, ddof=0) of the fold scores
    variable_avg_score = np.mean(variable_scores)
    variable_std_score = np.std(variable_scores)
    scores_with_std.append((variable_name, variable_avg_score, variable_std_score))

# Report the average score and its standard deviation per outcome
print(f"Scores for each outcome (ensemble with {int(len(random_permutations))} chains - propagate true values)")
for variable_name, avg_score, std_score in scores_with_std:
    print(f"{variable_name}: {avg_score:.6f} (± {std_score:.6f})")

Scores for each outcome (ensemble with 50 chains - propagate true values)
KFSS_M-2y: 0.807907 (± 0.025321)
KFSS_P-2y: 0.750440 (± 0.035443)
EDSS-2y: 0.883660 (± 0.008828)
T25FW-2y: 0.730929 (± 0.086771)
NHPT-2y: 0.533410 (± 0.106331)
P_R36-SF12-after: 0.703185 (± 0.012984)
M_R36-SF12-after: 0.572973 (± 0.033236)
SES_after: 0.681308 (± 0.058940)
SLEC_after: 0.649600 (± 0.043132)
KFSS_M-after_2y: 0.660131 (± 0.057459)
KFSS_P-after_2y: 0.512346 (± 0.066684)
EDSS-after_2y: 0.761977 (± 0.024816)
NRELAP: 0.643790 (± 0.012098)
CESEV: 0.559332 (± 0.043531)
