This notebook compares the scores of the local models and the model chain

In [176]:
import numpy as np
import pandas as pd
#import itertools
#import random

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import r2_score, accuracy_score
from chaining import Chain
import os

In [177]:
def missingness_stratified_cv(df, N_FOLDS=5, random_state=None):
    # Add seed for reproducibility of the predictions (to get the same scores each time we run the code)
    np.random.seed(random_state)

    # Initial complete-case test fold assignment
    cv = pd.Series(np.nan, index=df.index)
    i_cc = (df.isna().sum(axis=1) == 0) # Complete cases
    cv.iloc[i_cc] = np.random.randint(low=0, high=N_FOLDS, size=i_cc.sum())

    # Go over columns from most missing to least missing
    for j in df.isna().sum().argsort()[::-1]:
        # Instances i that are not assigned yet but for which df[i,j] is observed
        i_tbf = (cv.isna()) & (~df.iloc[:,j].isna()) # to be filled
        # Fill them randomly
        cv.iloc[i_tbf] = np.random.randint(low=0, high=N_FOLDS, size=i_tbf.sum())

    return cv

In [178]:
possible_paths = [
    'C:/Users/lenne/OneDrive/Documenten/Master of Statistics and Data Science/2023-2024/Master thesis/Thesis_Sofia_Lennert/new_data',
    'C:/Users/anaso/Desktop/SOFIA MENDES/KU Leuven/Master Thesis/Thesis_Sofia_Lennert/new_data'
]

# Define file names
file = 'merged_data.csv'

# Find full paths to the CSV files
path = next((f'{path}/{file}' for path in possible_paths if os.path.exists(f'{path}/{file}')), None)

data = pd.read_csv(path)

# Assume missing means 0 relapses
#data['NRELAP'] = data['NRELAP'].fillna(0)
# Bin the number of relapses into 0, 1, 2, 3 and 4+ 
#def bin_column(value):
#    if value in [0, 1, 2, 3]:
#        return str(value)
#    else:
#        return '4+'
#data['NRELAP'] = data['NRELAP'].apply(bin_column)

# Resulting DataFrame will have aggregated data from all four datasets based on the specific_column
pd.set_option('display.max_columns', None)
data

Unnamed: 0,USUBJID,AGE,SEX,RACE,CONTINENT,CESEV,CECONTRT,TOTRELAP,MHCONTRT,MHDIAGN,CARDIO,URINARY,MUSCKELET,FATIGUE,SMSTDY,NRELAP,NHPT-before,NHPT-2y,NHPT-after_2y,PASAT_2s-before,PASAT_2s-2y,PASAT_2s-after_2y,PASAT_3s-before,PASAT_3s-2y,PASAT_3s-after_2y,SDMT-before,SDMT-2y,T25FW-before,T25FW-2y,T25FW-after_2y,T-before,T-after,P-before,P-after,N-before,N-after,DS,DS_L,DS_R,SLEC_before,SLEC_after,SES_after,SES_before,VAA,BDI-before,BDI-after,EDSS-before,EDSS-2y,EDSS-after_2y,KFSS1-Sensory-2y,KFSS1-Sensory-after_2y,KFSS1-Sensory-before,KFSS1-Brain-2y,KFSS1-Brain-after_2y,KFSS1-Brain-before,KFSS1-Bowel-2y,KFSS1-Bowel-after_2y,KFSS1-Bowel-before,KFSS1-Pyramidal-2y,KFSS1-Pyramidal-after_2y,KFSS1-Pyramidal-before,KFSS1-Cerebral-2y,KFSS1-Cerebral-after_2y,KFSS1-Cerebral-before,KFSS1-Visual-2y,KFSS1-Visual-after_2y,KFSS1-Visual-before,KFSS1-Cerebellar-2y,KFSS1-Cerebellar-after_2y,KFSS1-Cerebellar-before,KFSS_M-2y,KFSS_M-after_2y,KFSS_M-before,KFSS_P-2y,KFSS_P-after_2y,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,M_R36-SF12-after,P_R36-SF12-after,R36-SF12-after_Ind
0,MSOAC/0014,46.0,F,,,,,,,RRMS,0,0,0,0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,MSOAC/0016,,M,WHITE,NORTH AMERICA,,,,Y,SPMS,1,1,0,1,,0.0,,,,,,,,,,,,8.55,6.60,,0.0,0.0,,,,,,,,,,,,,,,6.00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,MSOAC/0019,44.0,M,NON-WHITE,,,,,,PPMS,1,1,0,0,,0.0,23.65,21.30,20.15,34.5,35.5,43.0,43.5,51.0,53.0,,,6.30,6.15,5.85,0.0,0.0,0.0,0.0,,,,,,,,,,,,,3.75,3.50,3.0,0.333333,0.166667,0.500000,0.2,0.0,0.2,0.000000,0.166667,0.083333,0.333333,0.5,0.416667,0.0,0.0,0.0,0.333333,0.0,0.333333,0.0,0.2,0.0,0.185185,0.185185,0.240741,0.166667,0.083333,0.208333,0.657143,0.708861,1.0,0.685714,0.734177,1.0
3,MSOAC/0024,60.0,M,WHITE,NORTH AMERICA,,,,,SPMS,1,1,1,1,,0.0,34.45,37.50,,55.0,54.0,,60.0,60.0,,,,4.50,5.25,,0.0,0.0,0.0,0.0,1.0,0.0,,,,,,,,,0.031746,0.023810,4.00,3.75,,0.333333,,0.333333,0.0,,0.1,0.583333,,0.666667,0.166667,,0.250000,0.0,,0.0,0.000000,,0.083333,0.2,,0.5,0.129630,,0.240741,0.291667,,0.375000,0.657143,0.481013,1.0,0.632353,0.594937,1.0
4,MSOAC/0030,28.0,F,WHITE,EUROPE,,,,,RRMS,1,1,0,1,,0.0,16.55,17.90,,,,,58.0,60.0,,63.5,69.0,4.85,4.70,,0.0,0.0,0.0,0.0,0.0,0.0,,,,26.0,24.0,1.25,1.25,,0.063492,0.039683,2.00,1.50,,0.166667,,0.166667,0.2,,0.2,0.166667,,0.166667,0.166667,,0.333333,0.0,,0.2,0.166667,,0.083333,0.0,,0.1,0.111111,,0.203704,0.166667,,0.125000,0.733333,0.692308,0.0,0.700000,0.615385,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,MSOAC/9986,46.0,M,WHITE,OCEANIA,,,,,RRMS,1,1,0,1,,0.0,19.35,18.95,,,,,58.0,60.0,,51.0,60.0,3.90,3.80,,0.0,0.0,0.0,0.0,0.0,0.0,,,,36.0,35.0,1.25,1.25,,0.047619,0.063492,2.75,2.50,,0.333333,,0.166667,0.0,,0.0,0.333333,,0.250000,0.166667,,0.333333,0.0,,0.0,0.000000,,0.000000,0.0,,0.2,0.111111,,0.148148,0.166667,,0.125000,0.833333,0.576923,0.0,0.866667,0.634615,0.0
2461,MSOAC/9987,18.0,F,,,,,,,RRMS,0,0,0,0,,0.0,,,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2462,MSOAC/9995,38.0,F,,,MILD,,4.0,,RRMS,0,0,0,0,142.0,2.0,,,,,,,,,,,,,,,,,,,,,1.2,1.0,1.0,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2463,MSOAC/9998,40.0,F,WHITE,,,Y,2.0,Y,PPMS,0,1,0,1,79.0,1.0,23.80,22.40,22.50,21.5,30.5,33.5,31.5,39.5,40.5,,,6.15,6.00,6.20,0.0,0.0,0.0,0.0,,,,,,,,,,,,,4.50,3.75,4.0,0.166667,0.250000,0.333333,0.4,0.6,0.6,0.166667,0.166667,0.166667,0.333333,0.5,0.500000,0.0,0.0,0.4,0.166667,0.0,0.166667,0.6,0.6,0.6,0.314815,0.351852,0.481481,0.166667,0.083333,0.166667,0.642857,0.569620,1.0,0.642857,0.632911,1.0


In [179]:
#variables = ['KFSS_M-2y', 'EDSS-2y', 'T25FW-2y', 'NRELAP']# removed KFSS_P-2y for now -- ('SMSTDY' gave a score of -0.03)
variables = ['KFSS_M-2y', 'KFSS_P-2y', 'EDSS-2y', 'T25FW-2y', 'NHPT-2y', 'P_R36-SF12-after', 
            'M_R36-SF12-after', 'SES_after', 'EDSS-after_2y', 'NRELAP', 'CESEV']

Note: once we obtain the best ordering, change the order here!

In [180]:
# Extract targets
targets = data[variables]

# Extract features by dropping the target columns
#features = data.drop(variables, axis=1)

columns_to_keep = ['AGE', 'SEX', 'RACE', 'CONTINENT', 'MHDIAGN', 'CARDIO', 'URINARY', 'MUSCKELET', 'FATIGUE', 
                    'NHPT-before', 'PASAT_2s-before', 'PASAT_3s-before', 'SDMT-before', 'T25FW-before', 'SLEC_before','SES_before',
                    'BDI-before', 'EDSS-before', 'KFSS_M-before', 'KFSS_P-before', 'M_R36-SF12-before',
                	'P_R36-SF12-before', 'R36-SF12-before_Ind', 'T-before','P-before','N-before']
# still need to change in OE dataframe the SLEC and SES so name is consistent with the others

features = data[columns_to_keep]
features

Unnamed: 0,AGE,SEX,RACE,CONTINENT,MHDIAGN,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before
0,46.0,F,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,,,
1,,M,WHITE,NORTH AMERICA,SPMS,1,1,0,1,,,,,8.55,,,,6.00,,,,,,0.0,,
2,44.0,M,NON-WHITE,,PPMS,1,1,0,0,23.65,34.5,43.5,,6.30,,,,3.75,0.240741,0.208333,0.657143,0.708861,1.0,0.0,0.0,
3,60.0,M,WHITE,NORTH AMERICA,SPMS,1,1,1,1,34.45,55.0,60.0,,4.50,,,0.031746,4.00,0.240741,0.375000,0.657143,0.481013,1.0,0.0,0.0,1.0
4,28.0,F,WHITE,EUROPE,RRMS,1,1,0,1,16.55,,58.0,63.5,4.85,26.0,1.25,0.063492,2.00,0.203704,0.125000,0.733333,0.692308,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,46.0,M,WHITE,OCEANIA,RRMS,1,1,0,1,19.35,,58.0,51.0,3.90,36.0,1.25,0.047619,2.75,0.148148,0.125000,0.833333,0.576923,0.0,0.0,0.0,0.0
2461,18.0,F,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,,,
2462,38.0,F,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,,,
2463,40.0,F,WHITE,,PPMS,0,1,0,1,23.80,21.5,31.5,,6.15,,,,4.50,0.481481,0.166667,0.642857,0.569620,1.0,0.0,0.0,


In [181]:
object_columns = features.select_dtypes(include=['object'])
features = pd.get_dummies(features, columns=object_columns.columns, dtype=int)
features.head()

Unnamed: 0,AGE,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before,SEX_F,SEX_M,RACE_NON-WHITE,RACE_WHITE,CONTINENT_ASIA,CONTINENT_EURASIA,CONTINENT_EUROPE,CONTINENT_NORTH AMERICA,CONTINENT_OCEANIA,CONTINENT_SOUTH AMERICA,MHDIAGN_PPMS,MHDIAGN_RRMS,MHDIAGN_SPMS
0,46.0,0,0,0,0,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,1,0
1,,1,1,0,1,,,,,8.55,,,,6.0,,,,,,0.0,,,0,1,0,1,0,0,0,1,0,0,0,0,1
2,44.0,1,1,0,0,23.65,34.5,43.5,,6.3,,,,3.75,0.240741,0.208333,0.657143,0.708861,1.0,0.0,0.0,,0,1,1,0,0,0,0,0,0,0,1,0,0
3,60.0,1,1,1,1,34.45,55.0,60.0,,4.5,,,0.031746,4.0,0.240741,0.375,0.657143,0.481013,1.0,0.0,0.0,1.0,0,1,0,1,0,0,0,1,0,0,0,0,1
4,28.0,1,1,0,1,16.55,,58.0,63.5,4.85,26.0,1.25,0.063492,2.0,0.203704,0.125,0.733333,0.692308,0.0,0.0,0.0,0.0,1,0,0,1,0,0,1,0,0,0,0,1,0


In [182]:
targets.dtypes

KFSS_M-2y           float64
KFSS_P-2y           float64
EDSS-2y             float64
T25FW-2y            float64
NHPT-2y             float64
P_R36-SF12-after    float64
M_R36-SF12-after    float64
SES_after           float64
EDSS-after_2y       float64
NRELAP               object
CESEV                object
dtype: object

Run MICE

In [183]:
featuresM=features.copy()

#missing_mask = featuresM.isna()
imputer = IterativeImputer(max_iter=10, random_state=42)
imputed_values = imputer.fit_transform(featuresM)

featuresM = pd.DataFrame(imputed_values, columns=featuresM.columns)
featuresM



Unnamed: 0,AGE,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before,SEX_F,SEX_M,RACE_NON-WHITE,RACE_WHITE,CONTINENT_ASIA,CONTINENT_EURASIA,CONTINENT_EUROPE,CONTINENT_NORTH AMERICA,CONTINENT_OCEANIA,CONTINENT_SOUTH AMERICA,MHDIAGN_PPMS,MHDIAGN_RRMS,MHDIAGN_SPMS
0,46.00000,0.0,0.0,0.0,0.0,25.129556,34.472656,43.649637,42.890290,10.621517,18.315635,3.936186,-0.204252,3.305555,0.179351,0.123189,0.657292,0.648993,0.902009,0.013356,0.017199,0.020860,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,49.78346,1.0,1.0,0.0,1.0,32.304754,34.670991,45.065841,42.655445,8.550000,18.538146,9.942287,0.163624,6.000000,0.411783,0.304485,0.674465,0.554717,0.810738,0.000000,0.021968,0.031019,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,44.00000,1.0,1.0,0.0,0.0,23.650000,34.500000,43.500000,35.356598,6.300000,19.632975,-30.205362,-0.217055,3.750000,0.240741,0.208333,0.657143,0.708861,1.000000,0.000000,0.000000,0.017082,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,60.00000,1.0,1.0,1.0,1.0,34.450000,55.000000,60.000000,128.542360,4.500000,18.923486,340.718009,0.031746,4.000000,0.240741,0.375000,0.657143,0.481013,1.000000,0.000000,0.000000,1.000000,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,28.00000,1.0,1.0,0.0,1.0,16.550000,47.959512,58.000000,63.500000,4.850000,26.000000,1.250000,0.063492,2.000000,0.203704,0.125000,0.733333,0.692308,0.000000,0.000000,0.000000,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,46.00000,1.0,1.0,0.0,1.0,19.350000,44.700890,58.000000,51.000000,3.900000,36.000000,1.250000,0.047619,2.750000,0.148148,0.125000,0.833333,0.576923,0.000000,0.000000,0.000000,0.000000,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2461,18.00000,0.0,0.0,0.0,0.0,24.185994,35.028844,44.186816,50.606156,10.804256,22.162562,-2.182545,-0.198088,3.025797,0.160476,0.114455,0.677600,0.680226,0.853403,0.005934,0.009012,0.008918,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2462,38.00000,0.0,0.0,0.0,0.0,24.859967,34.631567,43.803117,45.094823,10.673728,19.414757,2.187977,-0.202491,3.225624,0.173958,0.120694,0.663094,0.657916,0.888121,0.011235,0.014860,0.017448,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2463,40.00000,0.0,1.0,0.0,1.0,23.800000,21.500000,31.500000,20.323392,6.150000,25.086784,-74.799829,-0.087345,4.500000,0.481481,0.166667,0.642857,0.569620,1.000000,0.000000,0.000000,0.020802,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [184]:
model_data = pd.concat([featuresM, targets], axis=1)
model_data

Unnamed: 0,AGE,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before,SEX_F,SEX_M,RACE_NON-WHITE,RACE_WHITE,CONTINENT_ASIA,CONTINENT_EURASIA,CONTINENT_EUROPE,CONTINENT_NORTH AMERICA,CONTINENT_OCEANIA,CONTINENT_SOUTH AMERICA,MHDIAGN_PPMS,MHDIAGN_RRMS,MHDIAGN_SPMS,KFSS_M-2y,KFSS_P-2y,EDSS-2y,T25FW-2y,NHPT-2y,P_R36-SF12-after,M_R36-SF12-after,SES_after,EDSS-after_2y,NRELAP,CESEV
0,46.00000,0.0,0.0,0.0,0.0,25.129556,34.472656,43.649637,42.890290,10.621517,18.315635,3.936186,-0.204252,3.305555,0.179351,0.123189,0.657292,0.648993,0.902009,0.013356,0.017199,0.020860,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,,,,,,,,,0.0,
1,49.78346,1.0,1.0,0.0,1.0,32.304754,34.670991,45.065841,42.655445,8.550000,18.538146,9.942287,0.163624,6.000000,0.411783,0.304485,0.674465,0.554717,0.810738,0.000000,0.021968,0.031019,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,,,,6.60,,,,,,0.0,
2,44.00000,1.0,1.0,0.0,0.0,23.650000,34.500000,43.500000,35.356598,6.300000,19.632975,-30.205362,-0.217055,3.750000,0.240741,0.208333,0.657143,0.708861,1.000000,0.000000,0.000000,0.017082,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.185185,0.166667,3.50,6.15,21.30,0.734177,0.685714,,3.0,0.0,
3,60.00000,1.0,1.0,1.0,1.0,34.450000,55.000000,60.000000,128.542360,4.500000,18.923486,340.718009,0.031746,4.000000,0.240741,0.375000,0.657143,0.481013,1.000000,0.000000,0.000000,1.000000,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.129630,0.291667,3.75,5.25,37.50,0.594937,0.632353,,,0.0,
4,28.00000,1.0,1.0,0.0,1.0,16.550000,47.959512,58.000000,63.500000,4.850000,26.000000,1.250000,0.063492,2.000000,0.203704,0.125000,0.733333,0.692308,0.000000,0.000000,0.000000,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.111111,0.166667,1.50,4.70,17.90,0.615385,0.700000,1.25,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,46.00000,1.0,1.0,0.0,1.0,19.350000,44.700890,58.000000,51.000000,3.900000,36.000000,1.250000,0.047619,2.750000,0.148148,0.125000,0.833333,0.576923,0.000000,0.000000,0.000000,0.000000,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.111111,0.166667,2.50,3.80,18.95,0.634615,0.866667,1.25,,0.0,
2461,18.00000,0.0,0.0,0.0,0.0,24.185994,35.028844,44.186816,50.606156,10.804256,22.162562,-2.182545,-0.198088,3.025797,0.160476,0.114455,0.677600,0.680226,0.853403,0.005934,0.009012,0.008918,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,,,,,,,,,0.0,
2462,38.00000,0.0,0.0,0.0,0.0,24.859967,34.631567,43.803117,45.094823,10.673728,19.414757,2.187977,-0.202491,3.225624,0.173958,0.120694,0.663094,0.657916,0.888121,0.011235,0.014860,0.017448,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,,,,,,,,,2.0,MILD
2463,40.00000,0.0,1.0,0.0,1.0,23.800000,21.500000,31.500000,20.323392,6.150000,25.086784,-74.799829,-0.087345,4.500000,0.481481,0.166667,0.642857,0.569620,1.000000,0.000000,0.000000,0.020802,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.314815,0.166667,3.75,6.00,22.40,0.632911,0.642857,,4.0,1.0,


In [185]:
missing_entries = model_data.isnull().any().any()

if missing_entries:
    print("DataFrame has missing entries.")
else:
    print("DataFrame does not have missing entries.")

DataFrame has missing entries.


In [186]:
#columns_to_encode = ['NRELAP', 'CESEV']

le1 = LabelEncoder()
le2 = LabelEncoder()

cesev = le1.fit_transform(np.array(model_data['CESEV']))
nrelap = le2.fit_transform(np.array(model_data['NRELAP']))

model_data['CESEV'] = cesev
model_data["CESEV"] = model_data["CESEV"].replace(3, np.nan)

model_data['NRELAP']=nrelap

# Impute missing values
imputer = IterativeImputer(max_iter=10, random_state=42)
imputed_values = imputer.fit_transform(model_data)

# Convert imputed values back to DataFrame
encoded_data = pd.DataFrame(imputed_values, columns=model_data.columns)



In [187]:
np.array(model_data["CESEV"])

array([nan, nan, nan, ...,  0., nan,  0.])

In [188]:
missing_entries = encoded_data.isnull().any().any()

if missing_entries:
    print("DataFrame has missing entries.")
else:
    print("DataFrame does not have missing entries.")

DataFrame does not have missing entries.


In [189]:
model_data['CESEV']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
2460    NaN
2461    NaN
2462    0.0
2463    NaN
2464    0.0
Name: CESEV, Length: 2465, dtype: float64

In [190]:
#cesev = np.array(model_data['CESEV'])#.astype(int)
#nrelap = np.array(model_data['NRELAP'])#.astype(int)

#out1 = le1.inverse_transform(np.array(encoded_data['CESEV']).astype('int64'))
#out2 = le2.inverse_transform(np.array(encoded_data['NRELAP']).astype('int64'))

#print(out1)
#print(out2)


In [191]:
targetsM = encoded_data[targets.columns]
targetsM

Unnamed: 0,KFSS_M-2y,KFSS_P-2y,EDSS-2y,T25FW-2y,NHPT-2y,P_R36-SF12-after,M_R36-SF12-after,SES_after,EDSS-after_2y,NRELAP,CESEV
0,0.216944,0.115467,3.597226,10.937190,24.246184,0.630938,0.681019,3.201814,3.731700,0.0,0.538955
1,0.385144,0.283564,5.825027,6.600000,31.713411,0.553411,0.675406,7.763440,5.901814,0.0,0.896500
2,0.185185,0.166667,3.500000,6.150000,21.300000,0.734177,0.685714,-22.530100,3.000000,0.0,0.183366
3,0.129630,0.291667,3.750000,5.250000,37.500000,0.594937,0.632353,258.071263,4.079338,0.0,0.335621
4,0.111111,0.166667,1.500000,4.700000,17.900000,0.615385,0.700000,1.250000,1.678746,0.0,0.596577
...,...,...,...,...,...,...,...,...,...,...,...
2460,0.111111,0.166667,2.500000,3.800000,18.950000,0.634615,0.866667,1.250000,2.792831,0.0,0.445007
2461,0.199337,0.104137,3.279002,12.827422,24.813813,0.649747,0.684164,-1.429551,3.198359,0.0,0.506656
2462,0.232058,0.120093,3.590650,17.772469,30.068766,0.642956,0.675964,1.899127,3.230507,2.0,0.000000
2463,0.314815,0.166667,3.750000,6.000000,22.400000,0.632911,0.642857,-56.313127,4.000000,1.0,0.600772


5-Fold CV

In [192]:
# Set random state for reproducibility
random_state = 42
N_FOLDS = 5

In [193]:
# Generate CV folds
cv=missingness_stratified_cv(features, N_FOLDS, random_state)
cv = cv.to_frame(name="CV Fold")

featuresM_cv = pd.merge(featuresM, pd.DataFrame(cv), left_index=True, right_index=True)
targetsM_cv = pd.merge(targetsM, pd.DataFrame(cv), left_index=True, right_index=True)
targets_cv = pd.merge(targets, pd.DataFrame(cv), left_index=True, right_index=True)

featuresM_cv['CV Fold'].value_counts()

CV Fold
4.0    510
3.0    502
0.0    500
1.0    495
2.0    458
Name: count, dtype: int64

In [194]:
targetsM_cv.shape

(2465, 12)

In [195]:
targets_cv.shape

(2465, 12)

Is it a problem that not all folds have the exact same number?

---

# Local Models

In [196]:
y_pred_list = []
y_test_list = []

for i in range(0, N_FOLDS): 
    Xi_train = featuresM_cv[featuresM_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
    Xi_test = featuresM_cv[featuresM_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)
    yi_train = targetsM_cv[targetsM_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
    yi_test = targets_cv[targets_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)
    y_test_list.append(pd.DataFrame(yi_test, columns=yi_test.columns, index=yi_test.index))

    chain = Chain(
        model_reg=RandomForestRegressor(random_state=random_state),
        model_clf=RandomForestClassifier(random_state=random_state),
        propagate=False, #RUN LOCAL MODELS
    )
    chain.fit(Xi_train, yi_train, target_types=None) #["reg","reg","reg","reg","reg","reg","reg","reg","reg","clf","clf"]
    y_pred = chain.predict(Xi_test)
    y_pred_list.append(pd.DataFrame(y_pred, columns=yi_test.columns, index=yi_test.index))
    print("Done with evaluating on CV Fold {}".format(i+1))

Done with evaluating on CV Fold 1
Done with evaluating on CV Fold 2
Done with evaluating on CV Fold 3
Done with evaluating on CV Fold 4
Done with evaluating on CV Fold 5


In [199]:
y_pred_list

[      KFSS_M-2y  KFSS_P-2y   EDSS-2y   T25FW-2y    NHPT-2y  P_R36-SF12-after  \
 0      0.223585   0.119832  3.669871  11.417733  24.795613          0.629540   
 3      0.275034   0.307723  4.167500   4.908500  38.682706          0.571908   
 8      0.365746   0.244455  6.450000  22.084997  26.599980          0.518952   
 13     0.412939   0.282586  6.528940  27.986700  34.795162          0.551873   
 20     0.176259   0.174238  2.757500   6.199500  25.820155          0.707571   
 ...         ...        ...       ...        ...        ...               ...   
 2449   0.326471   0.026667  4.300231  20.087500  27.007029          0.577854   
 2454   0.264501   0.363720  6.650230  20.277077  27.287091          0.544068   
 2457   0.282107   0.056250  3.518117   4.746838  21.886674          0.626731   
 2458   0.279444   0.194611  4.357009   8.259519  32.085498          0.572171   
 2460   0.181993   0.113078  2.754178   4.236135  19.402508          0.595454   
 
       M_R36-SF12-after   

In [197]:
y_test_cv = []
y_pred_cv = []

for j in range(len(y_test_list)):  # 5
    y_test_targ = []
    y_pred_targ = []
    nvar=y_test_list[0].shape[1]

    for i in range(0, nvar):  # or (1, 5)
        missing_rows_mask = y_test_list[j].iloc[:, i].isna()
        y_test = y_test_list[j].iloc[:, i][~missing_rows_mask]
        y_pred = y_pred_list[j].iloc[:, i][~missing_rows_mask]
        
        y_test_targ.append(y_test)
        y_pred_targ.append(y_pred)
    
    y_test_cv.append(y_test_targ)
    y_pred_cv.append(y_pred_targ)
# y_test_cv[fold][outcome]

1st index: fold, 2nd index: outcome

In [198]:
# Initialize a list to store scores
scores = []
scores_with_std = []

# Iterate over each outcome variable in the folds
for variable_name in variables: #CHANGE VAR LIST HERE
    variable_scores = []
    
    # Compute scores for the variable across all folds
    for fold_index in range(len(y_test_cv)):
        y_test = y_test_cv[fold_index][variables.index(variable_name)] #CHANGE VAR LIST HERE
        y_pred = y_pred_cv[fold_index][variables.index(variable_name)] #CHANGE VAR LIST HERE
        
        # Check if the target variable is numerical or categorical
        if y_test.dtype.kind in 'bifc':
            score = r2_score(y_test, y_pred)
        else:
            score = accuracy_score(y_test, y_pred)
                  
        variable_scores.append(score)
    
    # Compute the average score for the variable across all folds
    variable_avg_score = np.mean(variable_scores)
    # Compute the standard deviation for the variable across all folds
    variable_std_score = np.std(variable_scores)
    scores_with_std.append((variable_name, variable_avg_score, variable_std_score))

# Print the scores with average and standard deviation along with variable names
print("Scores for each outcome (local):")
for variable_name, avg_score, std_score in scores_with_std:
    print(f"{variable_name}: {avg_score:.2f} (± {std_score:.2f})")

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

*Questions/Notes*: 
- using pred="true" gives always score 1 (weird)

---

# Model Chain

### Propagate predicted values

In [29]:
y_pred_list = []
y_test_list = []

for i in range(0, N_FOLDS): 
    Xi_train = features_cv[features_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
    Xi_test = features_cv[features_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)
    yi_train = targets_cv[targets_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
    yi_test = targets_cv[targets_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)
    y_test_list.append(pd.DataFrame(yi_test, columns=yi_test.columns, index=yi_test.index))

    chain = Chain(
        model_reg=RandomForestRegressor(random_state=random_state),
        model_clf=RandomForestClassifier(random_state=random_state),
        propagate="pred", #RUN MODELS IN A CHAIN
    )
    chain.fit(Xi_train, yi_train, target_types=None) #["reg","reg","reg","reg","reg","reg","reg","reg","reg","clf","clf"]
    y_pred = chain.predict(Xi_test)
    y_pred_list.append(pd.DataFrame(y_pred, columns=yi_test.columns, index=yi_test.index))
    print("Done with evaluating on CV Fold {}".format(i+1))

Done with evaluating on CV Fold 1
Done with evaluating on CV Fold 2
Done with evaluating on CV Fold 3
Done with evaluating on CV Fold 4
Done with evaluating on CV Fold 5


In [30]:
y_test_cv = []
y_pred_cv = []

for j in range(len(y_test_list)):  # 5
    y_test_targ = []
    y_pred_targ = []
    nvar=y_test_list[0].shape[1]

    for i in range(0, nvar):  # or (1, 5)
        missing_rows_mask = y_test_list[j].iloc[:, i].isna()
        y_test = y_test_list[j].iloc[:, i][~missing_rows_mask]
        y_pred = y_pred_list[j].iloc[:, i][~missing_rows_mask]
        
        y_test_targ.append(y_test)
        y_pred_targ.append(y_pred)
    
    y_test_cv.append(y_test_targ)
    y_pred_cv.append(y_pred_targ)
# y_test_cv[fold][outcome]

In [31]:
# Initialize a list to store scores
scores = []
scores_with_std = []

# Iterate over each outcome variable in the folds
for variable_name in variables: 
    variable_scores = []
    
    # Compute scores for the variable across all folds
    for fold_index in range(len(y_test_cv)):
        y_test = y_test_cv[fold_index][variables.index(variable_name)] 
        y_pred = y_pred_cv[fold_index][variables.index(variable_name)] 
        
        # Check if the target variable is numerical or categorical
        if y_test.dtype.kind in 'bifc':
            score = r2_score(y_test, y_pred)
        else:
            score = accuracy_score(y_test, y_pred)
                  
        variable_scores.append(score)
    
    # Compute the average score for the variable across all folds
    variable_avg_score = np.mean(variable_scores)
    # Compute the standard deviation for the variable across all folds
    variable_std_score = np.std(variable_scores)
    scores_with_std.append((variable_name, variable_avg_score, variable_std_score))

# Print the scores with average and standard deviation along with variable names
print("Scores for each outcome (chain - predicted values):")
for variable_name, avg_score, std_score in scores_with_std:
    print(f"{variable_name}: {avg_score:.2f} (± {std_score:.2f})")

Scores for each outcome (chain - predicted values):
KFSS_M-2y: 0.81 (± 0.02)
KFSS_P-2y: 0.75 (± 0.03)
EDSS-2y: 0.88 (± 0.02)
T25FW-2y: 0.72 (± 0.06)
NHPT-2y: 0.58 (± 0.18)
P_R36-SF12-after: 0.60 (± 0.04)
M_R36-SF12-after: 0.35 (± 0.03)
SES_after: 0.71 (± 0.04)
EDSS-after_2y: 0.74 (± 0.05)
NRELAP: 0.64 (± 0.02)
CESEV: 0.51 (± 0.03)


### Propagate true values

Still need to fix clf in chaining.py for propagate="true" -- do not run the code below, it won't work

In [32]:
y_pred_list = []
y_test_list = []

for i in range(0, N_FOLDS): 
    Xi_train = features_cv[features_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
    Xi_test = features_cv[features_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)
    yi_train = targets_cv[targets_cv['CV Fold'] != i].drop(["CV Fold"], axis=1)
    yi_test = targets_cv[targets_cv['CV Fold'] == i].drop(["CV Fold"], axis=1)
    y_test_list.append(pd.DataFrame(yi_test, columns=yi_test.columns, index=yi_test.index))

    chain = Chain(
        model_reg=RandomForestRegressor(random_state=random_state),
        model_clf=RandomForestClassifier(random_state=random_state),
        propagate="true", #RUN MODELS IN A CHAIN
    )
    chain.fit(Xi_train, yi_train, target_types=None) #["reg","reg","reg","reg","reg","reg","reg","reg","reg","clf","clf"]
    y_pred = chain.predict(Xi_test)
    y_pred_list.append(pd.DataFrame(y_pred, columns=yi_test.columns, index=yi_test.index))
    print("Done with evaluating on CV Fold {}".format(i+1))

Done with evaluating on CV Fold 1
Done with evaluating on CV Fold 2
Done with evaluating on CV Fold 3
Done with evaluating on CV Fold 4
Done with evaluating on CV Fold 5


In [33]:
y_test_cv = []
y_pred_cv = []

for j in range(len(y_test_list)):  # 5
    y_test_targ = []
    y_pred_targ = []
    nvar=y_test_list[0].shape[1]

    for i in range(0, nvar):  # or (1, 5)
        missing_rows_mask = y_test_list[j].iloc[:, i].isna()
        y_test = y_test_list[j].iloc[:, i][~missing_rows_mask]
        y_pred = y_pred_list[j].iloc[:, i][~missing_rows_mask]
        
        y_test_targ.append(y_test)
        y_pred_targ.append(y_pred)
    
    y_test_cv.append(y_test_targ)
    y_pred_cv.append(y_pred_targ)
# y_test_cv[fold][outcome]

In [34]:
# Initialize a list to store scores
scores = []
scores_with_std = []

# Iterate over each outcome variable in the folds
for variable_name in variables: 
    variable_scores = []
    
    # Compute scores for the variable across all folds
    for fold_index in range(len(y_test_cv)):
        y_test = y_test_cv[fold_index][variables.index(variable_name)] 
        y_pred = y_pred_cv[fold_index][variables.index(variable_name)] 
        
        # Check if the target variable is numerical or categorical
        if y_test.dtype.kind in 'bifc':
            score = r2_score(y_test, y_pred)
        else:
            score = accuracy_score(y_test, y_pred)
                  
        variable_scores.append(score)
    
    # Compute the average score for the variable across all folds
    variable_avg_score = np.mean(variable_scores)
    # Compute the standard deviation for the variable across all folds
    variable_std_score = np.std(variable_scores)
    scores_with_std.append((variable_name, variable_avg_score, variable_std_score))

# Print the scores with average and standard deviation along with variable names
print("Scores for each outcome (chain - true values):")
for variable_name, avg_score, std_score in scores_with_std:
    print(f"{variable_name}: {avg_score:.2f} (± {std_score:.2f})")

Scores for each outcome (chain - true values):
KFSS_M-2y: 0.81 (± 0.02)
KFSS_P-2y: 0.75 (± 0.03)
EDSS-2y: 0.88 (± 0.02)
T25FW-2y: 0.72 (± 0.07)
NHPT-2y: 0.60 (± 0.18)
P_R36-SF12-after: 0.60 (± 0.03)
M_R36-SF12-after: 0.35 (± 0.04)
SES_after: 0.71 (± 0.05)
EDSS-after_2y: 0.75 (± 0.04)
NRELAP: 0.64 (± 0.02)
CESEV: 0.56 (± 0.03)
