This notebook aims to find the best ordering of targets, from a random subset of permutations

new objective: get the predictions of all chains and average them out and measure the performance -- Does an ensemble of chains lead to a clear improvement over just one chain?

In [38]:
import numpy as np
import pandas as pd
import itertools
import random

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, accuracy_score
from chaining import Chain
import os
from scipy.stats import mode

In [39]:
def generate_permutations_with_order(variables, pairs_or_groups, num_permutations, random_state=None):
    random.seed(random_state)
    permutations = itertools.permutations(variables)
    ordered_permutations = []

    for perm in permutations:
        valid = True
        for pair_or_group in pairs_or_groups:
            idxs = [perm.index(var) for var in pair_or_group]
            if sorted(idxs) != idxs:
                valid = False
                break
        if valid:
            ordered_permutations.append(perm)

    random_permutations = random.sample(ordered_permutations, min(num_permutations, len(ordered_permutations)))
    
    permutations_list = [list(variables)]  # Add the original order only once as a list of strings
    
    for perm in random_permutations:
        permutations_list.append(list(perm))  # Convert each permutation to a list of strings
    
    return permutations_list

In [40]:
def missingness_stratified_cv(df, N_FOLDS=5, random_state=None):
    # Add seed for reproducibility of the predictions (to get the same scores each time we run the code)
    np.random.seed(random_state)

    # Initial complete-case test fold assignment
    cv = pd.Series(np.nan, index=df.index)
    i_cc = (df.isna().sum(axis=1) == 0) # Complete cases
    cv.iloc[i_cc] = np.random.randint(low=0, high=N_FOLDS, size=i_cc.sum())

    # Go over columns from most missing to least missing
    for j in df.isna().sum().argsort()[::-1]:
        # Instances i that are not assigned yet but for which df[i,j] is observed
        i_tbf = (cv.isna()) & (~df.iloc[:,j].isna()) # to be filled
        # Fill them randomly
        cv.iloc[i_tbf] = np.random.randint(low=0, high=N_FOLDS, size=i_tbf.sum())

    return cv

In [41]:
possible_paths = [
    'C:/Users/lenne/OneDrive/Documenten/Master of Statistics and Data Science/2023-2024/Master thesis/Thesis_Sofia_Lennert/new_data',
    'C:/Users/anaso/Desktop/SOFIA MENDES/KU Leuven/Master Thesis/Thesis_Sofia_Lennert/new_data'
]

# Define file names
file = 'merged_data.csv'

# Find full paths to the CSV files
path = next((f'{path}/{file}' for path in possible_paths if os.path.exists(f'{path}/{file}')), None)

data = pd.read_csv(path)

# Assume missing means 0 relapses
#data['NRELAP'] = data['NRELAP'].fillna(0)
# Bin the number of relapses into 0, 1, 2, 3 and 4+ 
#def bin_column(value):
#    if value in [0, 1, 2, 3]:
#        return str(value)
#    else:
#        return '4+'
#data['NRELAP'] = data['NRELAP'].apply(bin_column)

# Resulting DataFrame will have aggregated data from all four datasets based on the specific_column
pd.set_option('display.max_columns', None)
data

Unnamed: 0,USUBJID,AGE,SEX,RACE,CONTINENT,CESEV,CECONTRT,TOTRELAP,MHCONTRT,MHDIAGN,CARDIO,URINARY,MUSCKELET,FATIGUE,SMSTDY,NRELAP,NHPT-before,NHPT-2y,NHPT-after_2y,PASAT_2s-before,PASAT_2s-2y,PASAT_2s-after_2y,PASAT_3s-before,PASAT_3s-2y,PASAT_3s-after_2y,SDMT-before,SDMT-2y,T25FW-before,T25FW-2y,T25FW-after_2y,T-before,T-after,P-before,P-after,N-before,N-after,DS,DS_L,DS_R,SLEC_before,SLEC_after,SES_after,SES_before,VAA,BDI-before,BDI-after,EDSS-before,EDSS-2y,EDSS-after_2y,KFSS1-Sensory-2y,KFSS1-Sensory-after_2y,KFSS1-Sensory-before,KFSS1-Brain-2y,KFSS1-Brain-after_2y,KFSS1-Brain-before,KFSS1-Bowel-2y,KFSS1-Bowel-after_2y,KFSS1-Bowel-before,KFSS1-Pyramidal-2y,KFSS1-Pyramidal-after_2y,KFSS1-Pyramidal-before,KFSS1-Cerebral-2y,KFSS1-Cerebral-after_2y,KFSS1-Cerebral-before,KFSS1-Visual-2y,KFSS1-Visual-after_2y,KFSS1-Visual-before,KFSS1-Cerebellar-2y,KFSS1-Cerebellar-after_2y,KFSS1-Cerebellar-before,KFSS_M-2y,KFSS_M-after_2y,KFSS_M-before,KFSS_P-2y,KFSS_P-after_2y,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,M_R36-SF12-after,P_R36-SF12-after,R36-SF12-after_Ind
0,MSOAC/0014,46.0,F,,,,,,,RRMS,0,0,0,0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,MSOAC/0016,,M,WHITE,NORTH AMERICA,,,,Y,SPMS,1,1,0,1,,0.0,,,,,,,,,,,,8.55,6.60,,0.0,0.0,,,,,,,,,,,,,,,6.00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,MSOAC/0019,44.0,M,NON-WHITE,,,,,,PPMS,1,1,0,0,,0.0,23.65,21.30,20.15,34.5,35.5,43.0,43.5,51.0,53.0,,,6.30,6.15,5.85,0.0,0.0,0.0,0.0,,,,,,,,,,,,,3.75,3.50,3.0,0.333333,0.166667,0.500000,0.2,0.0,0.2,0.000000,0.166667,0.083333,0.333333,0.5,0.416667,0.0,0.0,0.0,0.333333,0.0,0.333333,0.0,0.2,0.0,0.185185,0.185185,0.240741,0.166667,0.083333,0.208333,0.657143,0.708861,1.0,0.685714,0.734177,1.0
3,MSOAC/0024,60.0,M,WHITE,NORTH AMERICA,,,,,SPMS,1,1,1,1,,0.0,34.45,37.50,,55.0,54.0,,60.0,60.0,,,,4.50,5.25,,0.0,0.0,0.0,0.0,1.0,0.0,,,,,,,,,0.031746,0.023810,4.00,3.75,,0.333333,,0.333333,0.0,,0.1,0.583333,,0.666667,0.166667,,0.250000,0.0,,0.0,0.000000,,0.083333,0.2,,0.5,0.129630,,0.240741,0.291667,,0.375000,0.657143,0.481013,1.0,0.632353,0.594937,1.0
4,MSOAC/0030,28.0,F,WHITE,EUROPE,,,,,RRMS,1,1,0,1,,0.0,16.55,17.90,,,,,58.0,60.0,,63.5,69.0,4.85,4.70,,0.0,0.0,0.0,0.0,0.0,0.0,,,,26.0,24.0,1.25,1.25,,0.063492,0.039683,2.00,1.50,,0.166667,,0.166667,0.2,,0.2,0.166667,,0.166667,0.166667,,0.333333,0.0,,0.2,0.166667,,0.083333,0.0,,0.1,0.111111,,0.203704,0.166667,,0.125000,0.733333,0.692308,0.0,0.700000,0.615385,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,MSOAC/9986,46.0,M,WHITE,OCEANIA,,,,,RRMS,1,1,0,1,,0.0,19.35,18.95,,,,,58.0,60.0,,51.0,60.0,3.90,3.80,,0.0,0.0,0.0,0.0,0.0,0.0,,,,36.0,35.0,1.25,1.25,,0.047619,0.063492,2.75,2.50,,0.333333,,0.166667,0.0,,0.0,0.333333,,0.250000,0.166667,,0.333333,0.0,,0.0,0.000000,,0.000000,0.0,,0.2,0.111111,,0.148148,0.166667,,0.125000,0.833333,0.576923,0.0,0.866667,0.634615,0.0
2461,MSOAC/9987,18.0,F,,,,,,,RRMS,0,0,0,0,,0.0,,,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2462,MSOAC/9995,38.0,F,,,MILD,,4.0,,RRMS,0,0,0,0,142.0,2.0,,,,,,,,,,,,,,,,,,,,,1.2,1.0,1.0,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2463,MSOAC/9998,40.0,F,WHITE,,,Y,2.0,Y,PPMS,0,1,0,1,79.0,1.0,23.80,22.40,22.50,21.5,30.5,33.5,31.5,39.5,40.5,,,6.15,6.00,6.20,0.0,0.0,0.0,0.0,,,,,,,,,,,,,4.50,3.75,4.0,0.166667,0.250000,0.333333,0.4,0.6,0.6,0.166667,0.166667,0.166667,0.333333,0.5,0.500000,0.0,0.0,0.4,0.166667,0.0,0.166667,0.6,0.6,0.6,0.314815,0.351852,0.481481,0.166667,0.083333,0.166667,0.642857,0.569620,1.0,0.642857,0.632911,1.0


In [42]:
#variables = ['KFSS_M-2y', 'EDSS-2y', 'T25FW-2y', 'NRELAP']# removed KFSS_P-2y for now -- ('SMSTDY' gave a score of -0.03)
variables = ['KFSS_M-2y', 'KFSS_P-2y', 'EDSS-2y', 'T25FW-2y', 'NHPT-2y', 'P_R36-SF12-after', 
            'M_R36-SF12-after', 'SES_after', 'EDSS-after_2y', 'NRELAP', 'CESEV']

# Extract targets
#targets = data[variables]

# Extract features by dropping the target columns
#features = data.drop(variables, axis=1)

columns_to_keep = ['AGE', 'SEX', 'RACE', 'CONTINENT', 'MHDIAGN', 'CARDIO', 'URINARY', 'MUSCKELET', 'FATIGUE', 
                    'NHPT-before', 'PASAT_2s-before', 'PASAT_3s-before', 'SDMT-before', 'T25FW-before', 'SLEC_before','SES_before',
                    'BDI-before', 'EDSS-before', 'KFSS_M-before', 'KFSS_P-before', 'M_R36-SF12-before',
                	'P_R36-SF12-before', 'R36-SF12-before_Ind', 'T-before','P-before','N-before']
# still need to change in OE dataframe the SLEC and SES so name is consistent with the others

features = data[columns_to_keep]
features

Unnamed: 0,AGE,SEX,RACE,CONTINENT,MHDIAGN,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before
0,46.0,F,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,,,
1,,M,WHITE,NORTH AMERICA,SPMS,1,1,0,1,,,,,8.55,,,,6.00,,,,,,0.0,,
2,44.0,M,NON-WHITE,,PPMS,1,1,0,0,23.65,34.5,43.5,,6.30,,,,3.75,0.240741,0.208333,0.657143,0.708861,1.0,0.0,0.0,
3,60.0,M,WHITE,NORTH AMERICA,SPMS,1,1,1,1,34.45,55.0,60.0,,4.50,,,0.031746,4.00,0.240741,0.375000,0.657143,0.481013,1.0,0.0,0.0,1.0
4,28.0,F,WHITE,EUROPE,RRMS,1,1,0,1,16.55,,58.0,63.5,4.85,26.0,1.25,0.063492,2.00,0.203704,0.125000,0.733333,0.692308,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,46.0,M,WHITE,OCEANIA,RRMS,1,1,0,1,19.35,,58.0,51.0,3.90,36.0,1.25,0.047619,2.75,0.148148,0.125000,0.833333,0.576923,0.0,0.0,0.0,0.0
2461,18.0,F,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,,,
2462,38.0,F,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,,,
2463,40.0,F,WHITE,,PPMS,0,1,0,1,23.80,21.5,31.5,,6.15,,,,4.50,0.481481,0.166667,0.642857,0.569620,1.0,0.0,0.0,


In [43]:
object_columns = features.select_dtypes(include=['object'])
features = pd.get_dummies(features, columns=object_columns.columns, dtype=int)
features.head()

Unnamed: 0,AGE,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before,SEX_F,SEX_M,RACE_NON-WHITE,RACE_WHITE,CONTINENT_ASIA,CONTINENT_EURASIA,CONTINENT_EUROPE,CONTINENT_NORTH AMERICA,CONTINENT_OCEANIA,CONTINENT_SOUTH AMERICA,MHDIAGN_PPMS,MHDIAGN_RRMS,MHDIAGN_SPMS
0,46.0,0,0,0,0,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,1,0
1,,1,1,0,1,,,,,8.55,,,,6.0,,,,,,0.0,,,0,1,0,1,0,0,0,1,0,0,0,0,1
2,44.0,1,1,0,0,23.65,34.5,43.5,,6.3,,,,3.75,0.240741,0.208333,0.657143,0.708861,1.0,0.0,0.0,,0,1,1,0,0,0,0,0,0,0,1,0,0
3,60.0,1,1,1,1,34.45,55.0,60.0,,4.5,,,0.031746,4.0,0.240741,0.375,0.657143,0.481013,1.0,0.0,0.0,1.0,0,1,0,1,0,0,0,1,0,0,0,0,1
4,28.0,1,1,0,1,16.55,,58.0,63.5,4.85,26.0,1.25,0.063492,2.0,0.203704,0.125,0.733333,0.692308,0.0,0.0,0.0,0.0,1,0,0,1,0,0,1,0,0,0,0,1,0


In [44]:
#variables = ['KFSS_M-2y', 'KFSS_P-2y', 'EDSS-2y', 'T25FW-2y', 'NHPT-2y', 'P_R36-SF12-after', 
             #'M_R36-SF12-after', 'SES_after', 'EDSS-after_2y', 'NRELAP', 'CESEV']

pairs_or_groups = [['EDSS-2y', 'EDSS-after_2y'], ['KFSS_M-2y', 'EDSS-2y'], ['KFSS_P-2y', 'EDSS-2y']]  # Specify the pairs or groups

num_permutations = 5  # Specify how many random permutations you want
random_state = 42

random_permutations = generate_permutations_with_order(variables, pairs_or_groups, num_permutations, random_state)

# Print the original order followed by all the random permutations
for idx, perm in enumerate(random_permutations, start=0):
    print(f"Permutation {idx}: {', '.join(perm)}")

Permutation 0: KFSS_M-2y, KFSS_P-2y, EDSS-2y, T25FW-2y, NHPT-2y, P_R36-SF12-after, M_R36-SF12-after, SES_after, EDSS-after_2y, NRELAP, CESEV
Permutation 1: SES_after, NRELAP, P_R36-SF12-after, KFSS_M-2y, M_R36-SF12-after, T25FW-2y, KFSS_P-2y, CESEV, EDSS-2y, EDSS-after_2y, NHPT-2y
Permutation 2: KFSS_M-2y, SES_after, M_R36-SF12-after, T25FW-2y, CESEV, KFSS_P-2y, P_R36-SF12-after, EDSS-2y, NHPT-2y, EDSS-after_2y, NRELAP
Permutation 3: KFSS_M-2y, KFSS_P-2y, M_R36-SF12-after, EDSS-2y, NRELAP, EDSS-after_2y, T25FW-2y, CESEV, SES_after, NHPT-2y, P_R36-SF12-after
Permutation 4: CESEV, KFSS_P-2y, T25FW-2y, SES_after, M_R36-SF12-after, KFSS_M-2y, EDSS-2y, P_R36-SF12-after, EDSS-after_2y, NHPT-2y, NRELAP
Permutation 5: KFSS_P-2y, CESEV, KFSS_M-2y, EDSS-2y, NRELAP, T25FW-2y, EDSS-after_2y, SES_after, P_R36-SF12-after, NHPT-2y, M_R36-SF12-after


In [45]:
ordered_targets = random_permutations[0]

In [46]:
targets = data[ordered_targets] ### CHANGE VAR LIST HERE
targets

Unnamed: 0,KFSS_M-2y,KFSS_P-2y,EDSS-2y,T25FW-2y,NHPT-2y,P_R36-SF12-after,M_R36-SF12-after,SES_after,EDSS-after_2y,NRELAP,CESEV
0,,,,,,,,,,0.0,
1,,,,6.60,,,,,,0.0,
2,0.185185,0.166667,3.50,6.15,21.30,0.734177,0.685714,,3.0,0.0,
3,0.129630,0.291667,3.75,5.25,37.50,0.594937,0.632353,,,0.0,
4,0.111111,0.166667,1.50,4.70,17.90,0.615385,0.700000,1.25,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...
2460,0.111111,0.166667,2.50,3.80,18.95,0.634615,0.866667,1.25,,0.0,
2461,,,,,,,,,,0.0,
2462,,,,,,,,,,2.0,MILD
2463,0.314815,0.166667,3.75,6.00,22.40,0.632911,0.642857,,4.0,1.0,


In [47]:
# Set random state for reproducibility
random_state = 42
N_FOLDS = 5

In [48]:
# Generate CV folds
cv=missingness_stratified_cv(features, N_FOLDS, random_state)
cv = cv.to_frame(name="CV Fold")

features_cv = pd.merge(features, pd.DataFrame(cv), left_index=True, right_index=True)
targets_cv = pd.merge(targets, pd.DataFrame(cv), left_index=True, right_index=True)

features_cv['CV Fold'].value_counts()

CV Fold
4.0    510
3.0    502
0.0    500
1.0    495
2.0    458
Name: count, dtype: int64

Is it a problem that not all folds have the exact same number?

In [61]:
features_cv.head()

Unnamed: 0,AGE,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before,SEX_F,SEX_M,RACE_NON-WHITE,RACE_WHITE,CONTINENT_ASIA,CONTINENT_EURASIA,CONTINENT_EUROPE,CONTINENT_NORTH AMERICA,CONTINENT_OCEANIA,CONTINENT_SOUTH AMERICA,MHDIAGN_PPMS,MHDIAGN_RRMS,MHDIAGN_SPMS,CV Fold
0,46.0,0,0,0,0,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,1,0,0.0
1,,1,1,0,1,,,,,8.55,,,,6.0,,,,,,0.0,,,0,1,0,1,0,0,0,1,0,0,0,0,1,1.0
2,44.0,1,1,0,0,23.65,34.5,43.5,,6.3,,,,3.75,0.240741,0.208333,0.657143,0.708861,1.0,0.0,0.0,,0,1,1,0,0,0,0,0,0,0,1,0,0,2.0
3,60.0,1,1,1,1,34.45,55.0,60.0,,4.5,,,0.031746,4.0,0.240741,0.375,0.657143,0.481013,1.0,0.0,0.0,1.0,0,1,0,1,0,0,0,1,0,0,0,0,1,0.0
4,28.0,1,1,0,1,16.55,,58.0,63.5,4.85,26.0,1.25,0.063492,2.0,0.203704,0.125,0.733333,0.692308,0.0,0.0,0.0,0.0,1,0,0,1,0,0,1,0,0,0,0,1,0,3.0


In [62]:
targets_cv.head()

Unnamed: 0,KFSS_M-2y,KFSS_P-2y,EDSS-2y,T25FW-2y,NHPT-2y,P_R36-SF12-after,M_R36-SF12-after,SES_after,EDSS-after_2y,NRELAP,CESEV,CV Fold
0,,,,,,,,,,0.0,,0.0
1,,,,6.6,,,,,,0.0,,1.0
2,0.185185,0.166667,3.5,6.15,21.3,0.734177,0.685714,,3.0,0.0,,2.0
3,0.12963,0.291667,3.75,5.25,37.5,0.594937,0.632353,,,0.0,,0.0
4,0.111111,0.166667,1.5,4.7,17.9,0.615385,0.7,1.25,,0.0,,3.0


In [64]:
# Check for rows where all columns except 'NRELAP' are missing
missing_rows = targets_cv.drop(columns=['NRELAP', 'CV Fold']).isnull().all(axis=1)

# Count the number of rows where all columns except 'NRELAP' are missing
num_missing_rows_except_NRELAP = missing_rows.sum()

print(f"Number of rows where all columns except 'NRELAP' are missing: {num_missing_rows_except_NRELAP}")

Number of rows where all columns except 'NRELAP' are missing: 309


In [66]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, accuracy_score
from scipy.stats import mode
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Initialize a list to store predictions for each chain
y_pred_chains = []
y_test_list = [[] for _ in range(N_FOLDS)]  # Initialize y_test_list with empty lists for each fold index

# Iterate over each chain ordering
for ordered_targets_chain in random_permutations:
    y_pred_list_chain = []  # List to store predictions for this chain
    
    # Fit and predict for each fold for this chain
    for i in range(0, N_FOLDS): 
        Xi_train = features_cv[features_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
        Xi_test = features_cv[features_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)
        yi_train = targets_cv[targets_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
        yi_test = targets_cv[targets_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)
        y_test_list_chain = []  # List to store y_test for this fold
        
        chain = Chain(
            model_reg=RandomForestRegressor(random_state=random_state),
            model_clf=RandomForestClassifier(random_state=random_state),
            propagate="pred",
        )
        chain.fit(Xi_train, yi_train, target_types=None)
        y_pred = chain.predict(Xi_test)
        y_pred_list_chain.append(pd.DataFrame(y_pred, columns=yi_test.columns, index=yi_test.index))
        
        # Append yi_test to the corresponding fold index in y_test_list
        y_test_list[i].append(yi_test)  # Append yi_test for fold i
    
    y_pred_chains.append(y_pred_list_chain)

# Initialize lists to store averaged predictions and targets
y_pred_avg = [[] for _ in range(N_FOLDS)]  # List of lists to store averaged predictions for each fold
y_test_avg = [[] for _ in range(N_FOLDS)]  # List of lists to store targets for each fold

# Average predictions across chains for each fold
for fold_index in range(N_FOLDS):
    for variable_index, variable_name in enumerate(ordered_targets):
        y_test_fold_variable = []
        y_pred_fold_variable = []

        for chain_index in range(len(y_pred_chains)):
            y_test_chain = y_test_list[fold_index][chain_index][variable_name]
            y_pred_chain = y_pred_chains[chain_index][fold_index][variable_name]
            
            # Handle missing values
            y_test_chain = y_test_chain.dropna()
            y_pred_chain = y_pred_chain.loc[y_test_chain.index]

            y_test_fold_variable.append(y_test_chain)
            y_pred_fold_variable.append(y_pred_chain)

        # Concatenate lists for each chain within the fold
        y_test_fold_concat = pd.concat(y_test_fold_variable)
        y_pred_fold_concat = pd.concat(y_pred_fold_variable)

        # Compute mean for numerical targets, mode for categorical targets
        if y_test_fold_concat.dtype.kind in 'bifc':  # Check if target is numerical
            y_pred_variable = y_pred_fold_concat.mean()
        else:
            y_pred_variable = y_pred_fold_concat.mode().iloc[0]

        y_pred_avg[fold_index].append(y_pred_variable)
        y_test_avg[fold_index].append(y_test_fold_concat)

# Initialize lists to store scores
scores_with_std = []

# Compute scores for the averaged predictions across all folds
for variable_index, variable_name in enumerate(ordered_targets):
    variable_scores = []
    for fold_index in range(N_FOLDS):
        y_test = y_test_avg[fold_index][variable_index]
        y_pred = y_pred_avg[fold_index][variable_index]

        if y_test.dtype.kind in 'bifc':
            score = r2_score(y_test, y_pred)
        else:
            score = accuracy_score(y_test, y_pred)

        variable_scores.append(score)

    variable_avg_score = np.mean(variable_scores)
    variable_std_score = np.std(variable_scores)
    scores_with_std.append((variable_name, variable_avg_score, variable_std_score))

# Print the scores with average and standard deviation along with variable names
print("Scores for each outcome (ensemble of chains):")
for variable_name, avg_score, std_score in scores_with_std:
    print(f"{variable_name}: {avg_score:.2f} (± {std_score:.2f})")

InvalidParameterError: The 'y_pred' parameter of r2_score must be an array-like. Got 0.23550325559516733 instead.

In [58]:
y_pred_chains = []
y_test_list = [[] for _ in range(N_FOLDS)]  # Initialize y_test_list with empty lists for each fold index

# Iterate over each chain ordering
for ordered_targets_chain in random_permutations:
    y_pred_list_chain = []  # List to store predictions for this chain

    # Fit and predict for each fold for this chain
    for i in range(0, N_FOLDS): 
        Xi_train = features_cv[features_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
        Xi_test = features_cv[features_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)
        yi_train = targets_cv[targets_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
        yi_test = targets_cv[targets_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)
        y_test_list_chain = []  # List to store y_test for this fold

        chain = Chain(
            model_reg=RandomForestRegressor(random_state=random_state),
            model_clf=RandomForestClassifier(random_state=random_state),
            propagate="pred",
        )
        chain.fit(Xi_train, yi_train, target_types=None)
        y_pred = chain.predict(Xi_test)
        y_pred_list_chain.append(pd.DataFrame(y_pred, columns=yi_test.columns, index=yi_test.index))

        # Append yi_test to the corresponding fold index in y_test_list
        y_test_list[i].append(yi_test)  # Append yi_test for fold i

    y_pred_chains.append(y_pred_list_chain)

# Initialize lists to store averaged predictions and targets
y_pred_avg = [[] for _ in range(N_FOLDS)]  # List of lists to store averaged predictions for each fold
y_test_avg = [[] for _ in range(N_FOLDS)]  # List of lists to store targets for each fold

# Average predictions across chains for each fold
for fold_index in range(N_FOLDS):
    for variable_index, variable_name in enumerate(ordered_targets):
        y_test_fold_variable = []
        y_pred_fold_variable = []

        for chain_index in range(len(y_pred_chains)):
            y_test_chain = y_test_list[fold_index][chain_index][variable_name]
            y_pred_chain = y_pred_chains[chain_index][fold_index][variable_name]
            
            # Handle missing values
            y_test_chain = y_test_chain.dropna()
            y_pred_chain = y_pred_chain.loc[y_test_chain.index]

            y_test_fold_variable.append(y_test_chain)
            y_pred_fold_variable.append(y_pred_chain)

        # Concatenate lists for each chain within the fold
        y_test_fold_concat = pd.concat(y_test_fold_variable)
        y_pred_fold_concat = pd.concat(y_pred_fold_variable)

        # Compute mean for numerical targets, mode for categorical targets
        if y_test_fold_concat.dtype.kind in 'bifc':  # Check if target is numerical
            y_pred_variable = y_pred_fold_concat.mean()
        else:
            y_pred_variable = y_pred_fold_concat.mode().iloc[0]

        y_pred_avg[fold_index].append(y_pred_variable)
        y_test_avg[fold_index].append(y_test_fold_concat)

# Initialize lists to store scores
scores_with_std = []

# Compute scores for the averaged predictions across all folds
for variable_index, variable_name in enumerate(ordered_targets):
    variable_scores = []
    for fold_index in range(N_FOLDS):
        y_test = y_test_avg[fold_index][variable_index]
        y_pred = y_pred_avg[fold_index][variable_index]

        if y_test.dtype.kind in 'bifc':
            score = r2_score(y_test, y_pred)
        else:
            score = accuracy_score(y_test, y_pred)

        variable_scores.append(score)

    variable_avg_score = np.mean(variable_scores)
    variable_std_score = np.std(variable_scores)
    scores_with_std.append((variable_name, variable_avg_score, variable_std_score))

# Print the scores with average and standard deviation along with variable names
print("Scores for each outcome (ensemble of chains):")
for variable_name, avg_score, std_score in scores_with_std:
    print(f"{variable_name}: {avg_score:.2f} (± {std_score:.2f})")

InvalidParameterError: The 'y_pred' parameter of r2_score must be an array-like. Got 0.23550325559516733 instead.

1st index: fold, 2nd index: outcome

*Questions/Notes*: 
- using pred="true" gives always score 1 (weird)

---