This notebook compares the scores of the local models and the model chain

In [1]:
import numpy as np
import pandas as pd
#import itertools
#import random

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import r2_score, accuracy_score
from chaining import Chain
import os

In [2]:
def missingness_stratified_cv(df, N_FOLDS=5, random_state=None):
    """Randomly assign rows of ``df`` to test folds, stratified by missingness.

    Complete cases (rows without any missing value) are assigned first. The
    columns are then visited from most missing to least missing, and every
    still-unassigned row whose value in the visited column is observed gets a
    random fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose missingness pattern drives the assignment.
    N_FOLDS : int, default 5
        Number of folds; labels are drawn uniformly from ``0 .. N_FOLDS - 1``.
    random_state : int or None, default None
        Seed that makes the assignment reproducible.

    Returns
    -------
    pandas.Series
        Float fold label per row, indexed like ``df``. Rows in which every
        column is missing are never assigned and stay NaN.
    """
    # A private RandomState draws the same numbers as seeding the legacy global
    # NumPy generator with `random_state`, but does not reseed np.random for
    # the rest of the notebook.
    rng = np.random.RandomState(random_state)

    missing = df.isna()

    # Initial complete-case test fold assignment
    cv = pd.Series(np.nan, index=df.index)
    # Plain boolean arrays are used as masks: .iloc is positional and does not
    # reliably accept a boolean Series as indexer.
    i_cc = (missing.sum(axis=1) == 0).to_numpy()  # Complete cases
    cv.iloc[i_cc] = rng.randint(low=0, high=N_FOLDS, size=i_cc.sum())

    # Go over columns from most missing to least missing
    for j in missing.sum().to_numpy().argsort()[::-1]:
        # Instances i that are not assigned yet but for which df[i,j] is observed
        i_tbf = (cv.isna() & ~missing.iloc[:, j]).to_numpy()  # to be filled
        # Fill them randomly
        cv.iloc[i_tbf] = rng.randint(low=0, high=N_FOLDS, size=i_tbf.sum())

    return cv

In [3]:
# Candidate directories that may hold the data file; the first one in which the
# file exists is used.
possible_paths = [
    'C:/Users/lenne/OneDrive/Documenten/Master of Statistics and Data Science/2023-2024/Master thesis/Thesis_Sofia_Lennert/new_data',
    'C:/Users/anaso/Desktop/SOFIA MENDES/KU Leuven/Master Thesis/Thesis_Sofia_Lennert/new_data'
]

# Define file names
file = 'merged_data.csv'

# Find the full path to the CSV file
path = next(
    (f'{directory}/{file}' for directory in possible_paths if os.path.exists(f'{directory}/{file}')),
    None,
)
if path is None:
    # Fail early with an actionable message instead of passing None to pd.read_csv
    raise FileNotFoundError(
        f"'{file}' was not found in any of the candidate directories: {possible_paths}"
    )

data = pd.read_csv(path)

# Bin the number of relapses into 0, 1, 2, 3 and 4+
def bin_column(value):
    """Return ``str(value)`` if ``value`` equals 0, 1, 2 or 3, otherwise ``'4+'``.

    Every other value ends up in the ``'4+'`` bin, including NaN and negative
    numbers. The label is the plain string form of the input, so a float such
    as ``2.0`` is returned as ``'2.0'``.
    """
    if value in [0, 1, 2, 3]:
        return str(value)
    else:
        return '4+'
data['NRELAP'] = data['NRELAP'].apply(bin_column)

# Do not truncate the column list when a wide DataFrame is displayed
pd.set_option('display.max_columns', None)
data

Unnamed: 0,USUBJID,AGE,SEX,RACE,CONTINENT,CESEV,CECONTRT,TOTRELAP,MHCONTRT,MHDIAGN,CARDIO,URINARY,MUSCKELET,FATIGUE,SMSTDY,NRELAP,NHPT-before,NHPT-2y,NHPT-after_2y,PASAT_2s-before,PASAT_2s-2y,PASAT_2s-after_2y,PASAT_3s-before,PASAT_3s-2y,PASAT_3s-after_2y,SDMT-before,SDMT-2y,T25FW-before,T25FW-2y,T25FW-after_2y,T-before,T-after,P-before,P-after,N-before,N-after,SLEC_before,SLEC_after,SES_after,SES_before,VAA,BDI-before,BDI-after,EDSS-before,EDSS-2y,EDSS-after_2y,KFSS1-Sensory-2y,KFSS1-Sensory-after_2y,KFSS1-Sensory-before,KFSS1-Brain-2y,KFSS1-Brain-after_2y,KFSS1-Brain-before,KFSS1-Bowel-2y,KFSS1-Bowel-after_2y,KFSS1-Bowel-before,KFSS1-Pyramidal-2y,KFSS1-Pyramidal-after_2y,KFSS1-Pyramidal-before,KFSS1-Cerebral-2y,KFSS1-Cerebral-after_2y,KFSS1-Cerebral-before,KFSS1-Visual-2y,KFSS1-Visual-after_2y,KFSS1-Visual-before,KFSS1-Cerebellar-2y,KFSS1-Cerebellar-after_2y,KFSS1-Cerebellar-before,KFSS_M-2y,KFSS_M-after_2y,KFSS_M-before,KFSS_P-2y,KFSS_P-after_2y,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,M_R36-SF12-after,P_R36-SF12-after,R36-SF12-after_Ind
0,MSOAC/0014,46.0,F,,,,,,,RRMS,0,0,0,0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,MSOAC/0016,,M,WHITE,NORTH AMERICA,,,,Y,SPMS,1,1,0,1,,0.0,,,,,,,,,,,,8.55,6.60,,0.0,0.0,,,,,,,,,,,,6.00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,MSOAC/0019,44.0,M,NON-WHITE,,,,,,PPMS,1,1,0,0,,0.0,23.65,21.30,20.15,34.5,35.5,43.0,43.5,51.0,53.0,,,6.30,6.15,5.85,0.0,0.0,0.0,0.0,,,,,,,,,,3.75,3.50,3.0,0.333333,0.166667,0.500000,0.2,0.0,0.2,0.000000,0.166667,0.083333,0.333333,0.5,0.416667,0.0,0.0,0.0,0.333333,0.0,0.333333,0.0,0.2,0.0,0.185185,0.185185,0.240741,0.166667,0.083333,0.208333,0.828571,0.772152,1.0,0.857143,0.721519,1.0
3,MSOAC/0024,60.0,M,WHITE,NORTH AMERICA,,,,,SPMS,1,1,1,1,,0.0,34.45,37.50,,55.0,54.0,,60.0,60.0,,,,4.50,5.25,,0.0,0.0,0.0,0.0,1.0,0.0,,,,,,0.031746,0.023810,4.00,3.75,,0.333333,,0.333333,0.0,,0.1,0.583333,,0.666667,0.166667,,0.250000,0.0,,0.0,0.000000,,0.083333,0.2,,0.5,0.129630,,0.240741,0.291667,,0.375000,0.885714,0.569620,1.0,0.857143,0.716216,1.0
4,MSOAC/0030,28.0,F,WHITE,EUROPE,,,,,RRMS,1,1,0,1,,0.0,16.55,17.90,,,,,58.0,60.0,,63.5,69.0,4.85,4.70,,0.0,0.0,0.0,0.0,0.0,0.0,26.0,24.0,1.25,1.25,,0.063492,0.039683,2.00,1.50,,0.166667,,0.166667,0.2,,0.2,0.166667,,0.166667,0.166667,,0.333333,0.0,,0.2,0.166667,,0.083333,0.0,,0.1,0.111111,,0.203704,0.166667,,0.125000,0.933333,0.846154,0.0,0.833333,0.730769,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,MSOAC/9986,46.0,M,WHITE,OCEANIA,,,,,RRMS,1,1,0,1,,0.0,19.35,18.95,,,,,58.0,60.0,,51.0,60.0,3.90,3.80,,0.0,0.0,0.0,0.0,0.0,0.0,36.0,35.0,1.25,1.25,,0.047619,0.063492,2.75,2.50,,0.333333,,0.166667,0.0,,0.0,0.333333,,0.250000,0.166667,,0.333333,0.0,,0.0,0.000000,,0.000000,0.0,,0.2,0.111111,,0.148148,0.166667,,0.125000,0.833333,0.730769,0.0,0.800000,0.750000,0.0
2461,MSOAC/9987,18.0,F,,,,,,,RRMS,0,0,0,0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2462,MSOAC/9995,38.0,F,,,MILD,,4.0,,RRMS,0,0,0,0,142.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2463,MSOAC/9998,40.0,F,WHITE,,,Y,2.0,Y,PPMS,0,1,0,1,79.0,1.0,23.80,22.40,22.50,21.5,30.5,33.5,31.5,39.5,40.5,,,6.15,6.00,6.20,0.0,0.0,0.0,0.0,,,,,,,,,,4.50,3.75,4.0,0.166667,0.250000,0.333333,0.4,0.6,0.6,0.166667,0.166667,0.166667,0.333333,0.5,0.500000,0.0,0.0,0.4,0.166667,0.0,0.166667,0.6,0.6,0.6,0.314815,0.351852,0.481481,0.166667,0.083333,0.166667,0.728571,0.658228,1.0,0.757143,0.594937,1.0


In [4]:
# Outcome (target) variables. The list order sets the column order of `targets`.
# Earlier, shorter selection kept for reference (KFSS_P-2y was removed at the
# time; 'SMSTDY' gave a score of -0.03):
#   ['KFSS_M-2y', 'EDSS-2y', 'T25FW-2y', 'NRELAP']
variables = [
    'KFSS_M-2y',
    'KFSS_P-2y',
    'EDSS-2y',
    'T25FW-2y',
    'NHPT-2y',
    'P_R36-SF12-after',
    'M_R36-SF12-after',
    'SES_after',
    'SLEC_after',
    'KFSS_M-after_2y',
    'KFSS_P-after_2y',
    'EDSS-after_2y',
    'NRELAP',
    'CESEV',
]

Note: once we obtain the best ordering of the outcomes, update the order of `variables` in the cell above!

In [5]:
# Extract targets
targets = data[variables]

# Extract features by dropping the target columns
#features = data.drop(variables, axis=1)

columns_to_keep = ['AGE', 'SEX', 'RACE', 'CONTINENT', 'MHDIAGN', 'CARDIO', 'URINARY', 'MUSCKELET', 'FATIGUE', 
                    'NHPT-before', 'PASAT_2s-before', 'PASAT_3s-before', 'SDMT-before', 'T25FW-before', 'SLEC_before','SES_before',
                    'BDI-before', 'EDSS-before', 'KFSS_M-before', 'KFSS_P-before', 'M_R36-SF12-before',
                	'P_R36-SF12-before', 'R36-SF12-before_Ind', 'T-before','P-before','N-before']
# still need to change in OE dataframe the SLEC and SES so name is consistent with the others

features = data[columns_to_keep]
features

Unnamed: 0,AGE,SEX,RACE,CONTINENT,MHDIAGN,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before
0,46.0,F,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,,,
1,,M,WHITE,NORTH AMERICA,SPMS,1,1,0,1,,,,,8.55,,,,6.00,,,,,,0.0,,
2,44.0,M,NON-WHITE,,PPMS,1,1,0,0,23.65,34.5,43.5,,6.30,,,,3.75,0.240741,0.208333,0.828571,0.772152,1.0,0.0,0.0,
3,60.0,M,WHITE,NORTH AMERICA,SPMS,1,1,1,1,34.45,55.0,60.0,,4.50,,,0.031746,4.00,0.240741,0.375000,0.885714,0.569620,1.0,0.0,0.0,1.0
4,28.0,F,WHITE,EUROPE,RRMS,1,1,0,1,16.55,,58.0,63.5,4.85,26.0,1.25,0.063492,2.00,0.203704,0.125000,0.933333,0.846154,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,46.0,M,WHITE,OCEANIA,RRMS,1,1,0,1,19.35,,58.0,51.0,3.90,36.0,1.25,0.047619,2.75,0.148148,0.125000,0.833333,0.730769,0.0,0.0,0.0,0.0
2461,18.0,F,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,,,
2462,38.0,F,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,,,
2463,40.0,F,WHITE,,PPMS,0,1,0,1,23.80,21.5,31.5,,6.15,,,,4.50,0.481481,0.166667,0.728571,0.658228,1.0,0.0,0.0,


In [6]:
# Display the object-dtype feature columns (the ones that need encoding before imputation)
features.select_dtypes(include=['object'])

Unnamed: 0,SEX,RACE,CONTINENT,MHDIAGN
0,F,,,RRMS
1,M,WHITE,NORTH AMERICA,SPMS
2,M,NON-WHITE,,PPMS
3,M,WHITE,NORTH AMERICA,SPMS
4,F,WHITE,EUROPE,RRMS
...,...,...,...,...
2460,M,WHITE,OCEANIA,RRMS
2461,F,,,RRMS
2462,F,,,RRMS
2463,F,WHITE,,PPMS


In [7]:
#object_columns = features.select_dtypes(include=['object'])
#features = pd.get_dummies(features, columns=object_columns.columns, dtype=int)
#features.head()

In [8]:
# Data type of every target column
targets.dtypes

KFSS_M-2y           float64
KFSS_P-2y           float64
EDSS-2y             float64
T25FW-2y            float64
NHPT-2y             float64
P_R36-SF12-after    float64
M_R36-SF12-after    float64
SES_after           float64
SLEC_after          float64
KFSS_M-after_2y     float64
KFSS_P-after_2y     float64
EDSS-after_2y       float64
NRELAP               object
CESEV                object
dtype: object

Run MICE

In [9]:
# Encode and impute a copy, so the un-imputed `features` frame stays available
featuresM = features.copy()

In [10]:
# One encoder per categorical feature; the fitted encoders are used again
# further down to map the imputed integer codes back to the original labels.
le1 = LabelEncoder()
le2 = LabelEncoder()
le3 = LabelEncoder()
le4 = LabelEncoder()

# Encode only the observed labels and keep missing entries as NaN, so that the
# imputer treats them as missing. Fitting on the observed values avoids relying
# on a hard-coded integer code for the "missing" class (previously 2, 2, 6 and
# 3), which breaks as soon as the number of categories in a column changes.
for column, encoder in [('SEX', le1), ('RACE', le2), ('CONTINENT', le3), ('MHDIAGN', le4)]:
    observed = featuresM[column].notna()
    codes = pd.Series(np.nan, index=featuresM.index)
    codes.loc[observed] = encoder.fit_transform(featuresM.loc[observed, column].to_numpy())
    featuresM[column] = codes

featuresM

Unnamed: 0,AGE,SEX,RACE,CONTINENT,MHDIAGN,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before
0,46.0,0,,,1.0,0,0,0,0,,,,,,,,,,,,,,,,,
1,,1,1.0,3.0,2.0,1,1,0,1,,,,,8.55,,,,6.00,,,,,,0.0,,
2,44.0,1,0.0,,0.0,1,1,0,0,23.65,34.5,43.5,,6.30,,,,3.75,0.240741,0.208333,0.828571,0.772152,1.0,0.0,0.0,
3,60.0,1,1.0,3.0,2.0,1,1,1,1,34.45,55.0,60.0,,4.50,,,0.031746,4.00,0.240741,0.375000,0.885714,0.569620,1.0,0.0,0.0,1.0
4,28.0,0,1.0,2.0,1.0,1,1,0,1,16.55,,58.0,63.5,4.85,26.0,1.25,0.063492,2.00,0.203704,0.125000,0.933333,0.846154,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,46.0,1,1.0,4.0,1.0,1,1,0,1,19.35,,58.0,51.0,3.90,36.0,1.25,0.047619,2.75,0.148148,0.125000,0.833333,0.730769,0.0,0.0,0.0,0.0
2461,18.0,0,,,1.0,0,0,0,0,,,,,,,,,,,,,,,,,
2462,38.0,0,,,1.0,0,0,0,0,,,,,,,,,,,,,,,,,
2463,40.0,0,1.0,,0.0,0,1,0,1,23.80,21.5,31.5,,6.15,,,,4.50,0.481481,0.166667,0.728571,0.658228,1.0,0.0,0.0,


In [11]:
#missing_mask = featuresM.isna()
imputer = IterativeImputer(max_iter=10, random_state=42)
imputed_values = imputer.fit_transform(featuresM)

featuresM = pd.DataFrame(imputed_values, columns=featuresM.columns)
featuresM

Unnamed: 0,AGE,SEX,RACE,CONTINENT,MHDIAGN,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before
0,46.000000,0.0,0.906648,2.178526,1.0,0.0,0.0,0.0,0.0,23.583228,35.355541,45.543851,39.493228,9.428920,0.424081,0.868421,0.166259,3.478709,0.209635,0.124440,0.707628,0.682450,0.737487,0.013708,0.016078,0.017242
1,51.920685,1.0,1.000000,3.000000,2.0,1.0,1.0,0.0,1.0,32.123394,34.149892,44.467077,64.603808,8.550000,128.630778,1.863812,0.144201,6.000000,0.408076,0.288009,0.713088,0.579112,0.744787,0.000000,0.022695,0.031485
2,44.000000,1.0,0.000000,2.305725,0.0,1.0,1.0,0.0,0.0,23.650000,34.500000,43.500000,7.646610,6.300000,-138.965348,-0.162132,0.118500,3.750000,0.240741,0.208333,0.828571,0.772152,1.000000,0.000000,0.000000,0.016365
3,60.000000,1.0,1.000000,3.000000,2.0,1.0,1.0,1.0,1.0,34.450000,55.000000,60.000000,954.702789,4.500000,4045.554050,28.647053,0.031746,4.000000,0.240741,0.375000,0.885714,0.569620,1.000000,0.000000,0.000000,1.000000
4,28.000000,0.0,1.000000,2.000000,1.0,1.0,1.0,0.0,1.0,16.550000,48.229470,58.000000,63.500000,4.850000,26.000000,1.250000,0.063492,2.000000,0.203704,0.125000,0.933333,0.846154,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,46.000000,1.0,1.000000,4.000000,1.0,1.0,1.0,0.0,1.0,19.350000,44.895998,58.000000,51.000000,3.900000,36.000000,1.250000,0.047619,2.750000,0.148148,0.125000,0.833333,0.730769,0.000000,0.000000,0.000000,0.000000
2461,18.000000,0.0,0.792585,1.487481,1.0,0.0,0.0,0.0,0.0,22.309843,36.315230,46.243364,46.275864,7.755247,-8.178015,0.826172,0.202962,2.441138,0.150251,0.068084,0.677956,0.720873,0.368105,0.004346,0.007923,0.002264
2462,38.000000,0.0,0.874059,1.981085,1.0,0.0,0.0,0.0,0.0,23.219404,35.629738,45.743712,41.431124,8.950728,-2.033660,0.856350,0.176746,3.182260,0.192668,0.108338,0.699150,0.693428,0.631949,0.011033,0.013748,0.012963
2463,40.000000,0.0,1.000000,2.457433,0.0,0.0,1.0,0.0,1.0,23.800000,21.500000,31.500000,-175.250700,6.150000,-923.204200,-5.476727,0.236563,4.500000,0.481481,0.166667,0.728571,0.658228,1.000000,0.000000,0.000000,0.018916


In [12]:
# The imputer returns real-valued codes; snap each categorical column to the
# nearest integer code.
for column in ['SEX', 'RACE', 'CONTINENT', 'MHDIAGN']:
    featuresM[column] = featuresM[column].to_numpy().round().astype(int)

In [13]:
# Lowest and highest integer code allowed for each categorical column.
# Codes that fall more than 0.5 outside this interval are moved to the nearest bound.
category_code_bounds = {
    'SEX': (0, 1),
    'RACE': (0, 1),
    'CONTINENT': (0, 5),
    'MHDIAGN': (0, 2),
}

for column, (lowest, highest) in category_code_bounds.items():
    featuresM.loc[featuresM[column] < lowest - 0.5, column] = lowest
    featuresM.loc[featuresM[column] > highest + 0.5, column] = highest

In [14]:
# Translate the integer codes back into the original category labels, using
# the encoder that was fitted on each column.
for column, encoder in zip(['SEX', 'RACE', 'CONTINENT', 'MHDIAGN'], [le1, le2, le3, le4]):
    featuresM[column] = encoder.inverse_transform(featuresM[column])

In [15]:
# Check which labels are present in each categorical column after decoding
for column in ('SEX', 'RACE', 'CONTINENT', 'MHDIAGN'):
    print(featuresM[column].unique())

['F' 'M']
['WHITE' 'NON-WHITE']
['EUROPE' 'NORTH AMERICA' 'ASIA' 'EURASIA' 'SOUTH AMERICA' 'OCEANIA']
['RRMS' 'SPMS' 'PPMS']


In [16]:
#selected_columns = featuresM.iloc[:, :-2]

# Compute the range for each column
#ranges = selected_columns.apply(lambda x: x.max() - x.min())

#print("Range of values for each column (except last two):")
#print(ranges)

In [17]:
# Display the object-dtype columns of the imputed features
featuresM.select_dtypes(include=['object'])

Unnamed: 0,SEX,RACE,CONTINENT,MHDIAGN
0,F,WHITE,EUROPE,RRMS
1,M,WHITE,NORTH AMERICA,SPMS
2,M,NON-WHITE,EUROPE,PPMS
3,M,WHITE,NORTH AMERICA,SPMS
4,F,WHITE,EUROPE,RRMS
...,...,...,...,...
2460,M,WHITE,OCEANIA,RRMS
2461,F,WHITE,EURASIA,RRMS
2462,F,WHITE,EUROPE,RRMS
2463,F,WHITE,EUROPE,PPMS


In [18]:
# One-hot encode every object-dtype column as 0/1 integer indicator columns
object_columns = featuresM.select_dtypes(include=['object'])
columns_to_dummy = object_columns.columns
featuresM = pd.get_dummies(featuresM, columns=columns_to_dummy, dtype=int)
featuresM.head()

Unnamed: 0,AGE,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before,SEX_F,SEX_M,RACE_NON-WHITE,RACE_WHITE,CONTINENT_ASIA,CONTINENT_EURASIA,CONTINENT_EUROPE,CONTINENT_NORTH AMERICA,CONTINENT_OCEANIA,CONTINENT_SOUTH AMERICA,MHDIAGN_PPMS,MHDIAGN_RRMS,MHDIAGN_SPMS
0,46.0,0.0,0.0,0.0,0.0,23.583228,35.355541,45.543851,39.493228,9.42892,0.424081,0.868421,0.166259,3.478709,0.209635,0.12444,0.707628,0.68245,0.737487,0.013708,0.016078,0.017242,1,0,0,1,0,0,1,0,0,0,0,1,0
1,51.920685,1.0,1.0,0.0,1.0,32.123394,34.149892,44.467077,64.603808,8.55,128.630778,1.863812,0.144201,6.0,0.408076,0.288009,0.713088,0.579112,0.744787,0.0,0.022695,0.031485,0,1,0,1,0,0,0,1,0,0,0,0,1
2,44.0,1.0,1.0,0.0,0.0,23.65,34.5,43.5,7.64661,6.3,-138.965348,-0.162132,0.1185,3.75,0.240741,0.208333,0.828571,0.772152,1.0,0.0,0.0,0.016365,0,1,1,0,0,0,1,0,0,0,1,0,0
3,60.0,1.0,1.0,1.0,1.0,34.45,55.0,60.0,954.702789,4.5,4045.55405,28.647053,0.031746,4.0,0.240741,0.375,0.885714,0.56962,1.0,0.0,0.0,1.0,0,1,0,1,0,0,0,1,0,0,0,0,1
4,28.0,1.0,1.0,0.0,1.0,16.55,48.22947,58.0,63.5,4.85,26.0,1.25,0.063492,2.0,0.203704,0.125,0.933333,0.846154,0.0,0.0,0.0,0.0,1,0,0,1,0,0,1,0,0,0,0,1,0


In [19]:
# Place the imputed features and the (still incomplete) targets side by side
model_data = pd.concat([featuresM, targets], axis="columns")
model_data

Unnamed: 0,AGE,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before,SEX_F,SEX_M,RACE_NON-WHITE,RACE_WHITE,CONTINENT_ASIA,CONTINENT_EURASIA,CONTINENT_EUROPE,CONTINENT_NORTH AMERICA,CONTINENT_OCEANIA,CONTINENT_SOUTH AMERICA,MHDIAGN_PPMS,MHDIAGN_RRMS,MHDIAGN_SPMS,KFSS_M-2y,KFSS_P-2y,EDSS-2y,T25FW-2y,NHPT-2y,P_R36-SF12-after,M_R36-SF12-after,SES_after,SLEC_after,KFSS_M-after_2y,KFSS_P-after_2y,EDSS-after_2y,NRELAP,CESEV
0,46.000000,0.0,0.0,0.0,0.0,23.583228,35.355541,45.543851,39.493228,9.428920,0.424081,0.868421,0.166259,3.478709,0.209635,0.124440,0.707628,0.682450,0.737487,0.013708,0.016078,0.017242,1,0,0,1,0,0,1,0,0,0,0,1,0,,,,,,,,,,,,,0.0,
1,51.920685,1.0,1.0,0.0,1.0,32.123394,34.149892,44.467077,64.603808,8.550000,128.630778,1.863812,0.144201,6.000000,0.408076,0.288009,0.713088,0.579112,0.744787,0.000000,0.022695,0.031485,0,1,0,1,0,0,0,1,0,0,0,0,1,,,,6.60,,,,,,,,,0.0,
2,44.000000,1.0,1.0,0.0,0.0,23.650000,34.500000,43.500000,7.646610,6.300000,-138.965348,-0.162132,0.118500,3.750000,0.240741,0.208333,0.828571,0.772152,1.000000,0.000000,0.000000,0.016365,0,1,1,0,0,0,1,0,0,0,1,0,0,0.185185,0.166667,3.50,6.15,21.30,0.721519,0.857143,,,0.185185,0.083333,3.0,0.0,
3,60.000000,1.0,1.0,1.0,1.0,34.450000,55.000000,60.000000,954.702789,4.500000,4045.554050,28.647053,0.031746,4.000000,0.240741,0.375000,0.885714,0.569620,1.000000,0.000000,0.000000,1.000000,0,1,0,1,0,0,0,1,0,0,0,0,1,0.129630,0.291667,3.75,5.25,37.50,0.716216,0.857143,,,,,,0.0,
4,28.000000,1.0,1.0,0.0,1.0,16.550000,48.229470,58.000000,63.500000,4.850000,26.000000,1.250000,0.063492,2.000000,0.203704,0.125000,0.933333,0.846154,0.000000,0.000000,0.000000,0.000000,1,0,0,1,0,0,1,0,0,0,0,1,0,0.111111,0.166667,1.50,4.70,17.90,0.730769,0.833333,1.25,24.0,,,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,46.000000,1.0,1.0,0.0,1.0,19.350000,44.895998,58.000000,51.000000,3.900000,36.000000,1.250000,0.047619,2.750000,0.148148,0.125000,0.833333,0.730769,0.000000,0.000000,0.000000,0.000000,0,1,0,1,0,0,0,0,1,0,0,1,0,0.111111,0.166667,2.50,3.80,18.95,0.750000,0.800000,1.25,35.0,,,,0.0,
2461,18.000000,0.0,0.0,0.0,0.0,22.309843,36.315230,46.243364,46.275864,7.755247,-8.178015,0.826172,0.202962,2.441138,0.150251,0.068084,0.677956,0.720873,0.368105,0.004346,0.007923,0.002264,1,0,0,1,0,1,0,0,0,0,0,1,0,,,,,,,,,,,,,0.0,
2462,38.000000,0.0,0.0,0.0,0.0,23.219404,35.629738,45.743712,41.431124,8.950728,-2.033660,0.856350,0.176746,3.182260,0.192668,0.108338,0.699150,0.693428,0.631949,0.011033,0.013748,0.012963,1,0,0,1,0,0,1,0,0,0,0,1,0,,,,,,,,,,,,,2.0,MILD
2463,40.000000,0.0,1.0,0.0,1.0,23.800000,21.500000,31.500000,-175.250700,6.150000,-923.204200,-5.476727,0.236563,4.500000,0.481481,0.166667,0.728571,0.658228,1.000000,0.000000,0.000000,0.018916,1,0,0,1,0,0,1,0,0,0,1,0,0,0.314815,0.166667,3.75,6.00,22.40,0.594937,0.757143,,,0.351852,0.083333,4.0,1.0,


In [20]:
#missing_entries = model_data.isnull().any().any()

#if missing_entries:
#    print("DataFrame has missing entries.")
#else:
#    print("DataFrame does not have missing entries.")

In [21]:
#columns_to_encode = ['NRELAP', 'CESEV']

le5 = LabelEncoder()
le6 = LabelEncoder()

# CESEV: encode only the observed labels and keep missing entries as NaN so the
# imputer fills them. Fitting on the observed values avoids relying on a
# hard-coded integer code for the "missing" class (previously 3), which breaks
# if the number of severity levels changes.
cesev_observed = model_data['CESEV'].notna()
cesev = pd.Series(np.nan, index=model_data.index)
cesev.loc[cesev_observed] = le5.fit_transform(model_data.loc[cesev_observed, 'CESEV'].to_numpy())

# NRELAP is encoded as-is (no missing-value handling is applied to it)
nrelap = le6.fit_transform(np.array(model_data['NRELAP']))

model_data['CESEV'] = cesev
model_data['NRELAP'] = nrelap

# Impute missing values
# NOTE(review): the imputer is fitted on all rows, before any CV split, so the
# imputed targets of one fold are informed by rows of the other folds.
imputer = IterativeImputer(max_iter=10, random_state=42)
imputed_values = imputer.fit_transform(model_data)

# Convert imputed values back to DataFrame, keeping the original row index so
# the rows stay aligned with `targets` and `features`
encoded_data = pd.DataFrame(imputed_values, columns=model_data.columns, index=model_data.index)

In [22]:
#np.unique(encoded_data["CESEV"])

In [23]:
# Rows whose imputed CESEV code would round to a value outside 0..2
cesev_imputed = encoded_data["CESEV"]
cesev_out_of_range = (cesev_imputed <= -0.5) | (cesev_imputed >= 2.5)
encoded_data[cesev_out_of_range]

Unnamed: 0,AGE,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before,SEX_F,SEX_M,RACE_NON-WHITE,RACE_WHITE,CONTINENT_ASIA,CONTINENT_EURASIA,CONTINENT_EUROPE,CONTINENT_NORTH AMERICA,CONTINENT_OCEANIA,CONTINENT_SOUTH AMERICA,MHDIAGN_PPMS,MHDIAGN_RRMS,MHDIAGN_SPMS,KFSS_M-2y,KFSS_P-2y,EDSS-2y,T25FW-2y,NHPT-2y,P_R36-SF12-after,M_R36-SF12-after,SES_after,SLEC_after,KFSS_M-after_2y,KFSS_P-after_2y,EDSS-after_2y,NRELAP,CESEV
186,42.0,1.0,1.0,1.0,1.0,58.35,5.5,8.0,265.944778,38.15,1107.392571,8.312002,0.31746,6.5,0.5,0.291667,0.5,0.43038,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.481481,0.333333,6.5,107.0,69.5,0.696203,0.592857,6.950464,772.745981,0.426602,0.29815,5.980064,0.0,-1.183764
224,64.0,0.0,1.0,1.0,1.0,106.2,21.0,32.0,-264.433028,106.4,-1261.243466,-8.653155,0.198133,6.5,0.462963,0.333333,0.785714,0.607595,1.0,0.0,0.0,0.159496,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.481481,0.5,8.0,180.0,113.8,0.481013,0.571429,-7.902161,-894.956654,0.481481,0.625,8.0,0.0,-0.667688
326,39.0,0.0,1.0,0.0,0.0,17.2,37.004733,45.0,49.971544,4.35,19.5,1.25,0.129722,1.5,0.111111,0.0,0.742857,0.810127,1.0,0.008045,0.013578,0.005398,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.55,17.5,0.879747,0.807143,1.25,19.5,0.0,0.0,0.0,0.0,-0.533861
615,45.0,0.0,0.0,1.0,0.0,28.65,38.5,46.0,356.517111,19.95,1416.15193,10.588093,0.242584,6.0,0.361657,0.249727,0.528571,0.56962,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.357259,0.282879,6.5,64.45,50.35,0.563291,0.507143,8.978358,995.400494,0.355303,0.273233,6.604929,0.0,-0.530759
773,62.0,1.0,1.0,1.0,1.0,29.05,46.0,55.5,286.525415,5.65,1089.432387,8.255187,0.201682,4.0,0.277778,0.291667,0.685714,0.620253,1.0,0.0,0.0,0.034751,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.314815,0.416667,4.0,6.45,37.5,0.493671,0.571429,7.133929,772.251002,0.314815,0.541667,8.0,0.0,2.774559
815,36.0,0.0,1.0,0.0,1.0,25.15,3.490644,11.0,11.5,26.05,9.0,0.625,0.126984,2.0,0.111111,0.083333,0.866667,0.653846,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.111111,0.0,3.0,107.0,28.0,0.557692,0.666667,0.625,0.0,0.121548,-0.093805,2.678693,0.0,-0.679945
820,52.0,0.0,1.0,1.0,0.0,30.6,35.0,53.5,-853.232335,3.95,-3986.87048,-26.415581,0.065469,3.0,0.185185,0.166667,0.885714,0.898734,1.0,0.0,0.0,0.032603,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.166667,0.166667,3.0,4.7,26.8,0.822785,0.871429,-22.127615,-2790.809651,0.12963,0.083333,2.0,0.0,-0.562165
1161,31.0,0.0,0.0,1.0,0.0,19.7,49.356366,60.0,65.105706,3.85,43.0,1.25,0.065903,1.25,0.055556,0.041667,0.814286,0.835443,1.0,0.009591,0.004044,0.031035,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.037037,0.041667,1.5,4.05,17.25,0.924051,0.878571,1.25,25.5,0.018519,0.0,0.5,0.0,-0.927544
1180,38.0,0.0,1.0,1.0,1.0,70.55,45.146805,59.0,72.864631,26.85,115.870412,1.656322,0.151217,6.5,0.436082,0.313425,0.742857,0.531646,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.496493,0.468972,8.0,89.862467,300.0,0.424051,0.578571,1.148505,56.833566,0.638897,0.326258,8.5,0.0,-4.489466
1206,29.0,0.0,0.0,0.0,0.0,19.65,47.872,60.0,58.670448,4.45,25.5,1.25,-0.007886,1.5,0.037037,0.083333,0.914286,0.962025,1.0,0.008838,0.003124,0.007556,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.055556,0.083333,1.5,4.3,20.75,0.955696,0.964286,1.25,3.5,0.074074,0.125,1.5,0.0,-0.53498


In [24]:
#ind = encoded_data[encoded_data["CESEV"] < -0.5].index
#model_data.loc[ind]

In [25]:
#missing_entries = encoded_data.isnull().any().any()

#if missing_entries:
#    print("DataFrame has missing entries.")
#else:
#    print("DataFrame does not have missing entries.")

In [26]:
#model_data['CESEV']

In [27]:
# Imputed CESEV codes below -0.5 are set to the lowest code, 0
cesev_too_low = encoded_data['CESEV'] < -0.5
encoded_data.loc[cesev_too_low, 'CESEV'] = 0

# Round both categorical targets to integer codes
cesev = encoded_data['CESEV'].to_numpy().round().astype(int)
nrelap = encoded_data['NRELAP'].to_numpy().round().astype(int)

In [28]:
# Distinct CESEV codes, then how often the two special values occur
print(np.unique(cesev))
for special_code in (-2147483648, 3):
    print(np.count_nonzero(cesev == special_code))

[0 1 2 3]
0
3


In [29]:
def replace_negative(arr):
    """Return a new array in which every element equal to -2147483648 is set to 0.

    -2147483648 is the smallest 32-bit signed integer.
    NOTE(review): presumably this guards against NaN having been cast to a
    32-bit int - confirm.
    """
    int32_min = np.iinfo(np.int32).min  # -2147483648
    is_sentinel = arr == int32_min
    return np.where(is_sentinel, 0, arr)

def replace_three(arr):
    """Return a new array in which every element equal to 3 is replaced by 2."""
    equals_three = arr == 3
    return np.where(equals_three, 2, arr)

# Apply both replacements to the rounded CESEV codes
cesev = replace_three(replace_negative(cesev))

# Re-check the distinct codes and the counts of the two replaced values
print(np.unique(cesev))
for special_code in (-2147483648, 3):
    print(np.count_nonzero(cesev == special_code))

[0 1 2]
0
0


In [30]:
# Map the integer codes of both categorical targets back to their labels
for target_name, target_encoder, target_codes in (('CESEV', le5, cesev), ('NRELAP', le6, nrelap)):
    encoded_data[target_name] = target_encoder.inverse_transform(target_codes)

#print(encoded_data['CESEV'])
#print(encoded_data['NRELAP'])

In [31]:
# Labels present in CESEV after imputation and decoding
encoded_data['CESEV'].unique()

array(['MODERATE', 'MILD', 'SEVERE'], dtype=object)

In [32]:
#np.unique(cesev)

In [33]:
# Imputed targets: the target columns of the imputed data, in the order of `targets`
targetsM = encoded_data[targets.columns]
targetsM

Unnamed: 0,KFSS_M-2y,KFSS_P-2y,EDSS-2y,T25FW-2y,NHPT-2y,P_R36-SF12-after,M_R36-SF12-after,SES_after,SLEC_after,KFSS_M-after_2y,KFSS_P-after_2y,EDSS-after_2y,NRELAP,CESEV
0,0.198944,0.125347,3.452108,10.004314,23.009106,0.686041,0.709839,0.880378,6.373640,0.234475,0.171561,3.683032,0.0,MODERATE
1,0.384485,0.270120,5.853841,6.600000,30.783050,0.583039,0.713720,1.800244,98.654474,0.357438,0.241766,5.917891,0.0,MODERATE
2,0.185185,0.166667,3.500000,6.150000,21.300000,0.721519,0.857143,0.153143,-88.197641,0.185185,0.083333,3.000000,0.0,MILD
3,0.129630,0.291667,3.750000,5.250000,37.500000,0.716216,0.857143,24.644451,2848.937961,0.087278,0.136659,3.809612,0.0,MILD
4,0.111111,0.166667,1.500000,4.700000,17.900000,0.730769,0.833333,1.250000,24.000000,0.210672,0.271593,2.102681,0.0,MODERATE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,0.111111,0.166667,2.500000,3.800000,18.950000,0.750000,0.800000,1.250000,35.000000,0.153083,0.098404,2.511082,0.0,MILD
2461,0.139146,0.088575,2.484261,7.803386,22.201075,0.766499,0.731665,0.786889,-0.017664,0.202866,0.218388,2.663094,0.0,MODERATE
2462,0.204019,0.116849,3.286239,16.773824,28.874719,0.716686,0.702209,0.805998,-0.456734,0.258826,0.163237,3.093793,2.0,MILD
2463,0.314815,0.166667,3.750000,6.000000,22.400000,0.594937,0.757143,-4.409678,-641.960134,0.351852,0.083333,4.000000,1.0,MODERATE


In [34]:
# Every target column except the last two
selected_columns = targetsM.iloc[:, :-2]

# Range (max - min) of each selected column
ranges = selected_columns.max() - selected_columns.min()

print("Range of values for each column (except last two):")
print(ranges)

Range of values for each column (except last two):
KFSS_M-2y               0.759259
KFSS_P-2y               0.750000
EDSS-2y                 8.000000
T25FW-2y              177.949331
NHPT-2y               289.100000
P_R36-SF12-after        0.750000
M_R36-SF12-after        1.000000
SES_after             143.636870
SLEC_after          17447.375373
KFSS_M-after_2y         0.865754
KFSS_P-after_2y         1.793652
EDSS-after_2y          12.364511
dtype: float64


5-Fold CV

In [35]:
# Cross-validation settings: number of folds, and the seed used for the fold
# assignment and the random forests (for reproducibility)
N_FOLDS = 5
random_state = 42

In [36]:
# Generate CV folds from the un-imputed features and name the column "CV Fold"
cv = missingness_stratified_cv(features, N_FOLDS, random_state).to_frame(name="CV Fold")

# Attach the fold label to each frame by matching on the row index
index_join = {"left_index": True, "right_index": True}
featuresM_cv = pd.merge(featuresM, cv, **index_join)
targetsM_cv = pd.merge(targetsM, cv, **index_join)
targets_cv = pd.merge(targets, cv, **index_join)

featuresM_cv['CV Fold'].value_counts()

CV Fold
4.0    510
3.0    502
0.0    500
1.0    495
2.0    458
Name: count, dtype: int64

In [37]:
# (rows, columns) of the imputed targets plus the fold column
targetsM_cv.shape

(2465, 15)

In [38]:
# (rows, columns) of the original targets plus the fold column
targets_cv.shape

(2465, 15)

Is it a problem that the folds do not all contain exactly the same number of instances?

---

# Local Models

In [39]:
# Per-fold predictions and the matching held-out targets
y_pred_list = []
y_test_list = []

for i in range(N_FOLDS):
    # Fold i is held out; every other row is used for training
    held_out = featuresM_cv['CV Fold'] == i
    Xi_train = featuresM_cv.loc[~held_out].drop(columns="CV Fold")
    Xi_test = featuresM_cv.loc[held_out].drop(columns="CV Fold")
    # Train on the imputed targets, test against the original (non-imputed) targets
    yi_train = targetsM_cv.loc[targetsM_cv['CV Fold'] != i].drop(columns="CV Fold")
    yi_test = targets_cv.loc[targets_cv['CV Fold'] == i].drop(columns="CV Fold")
    y_test_list.append(pd.DataFrame(yi_test, columns=yi_test.columns, index=yi_test.index))

    chain = Chain(
        model_reg=RandomForestRegressor(random_state=random_state),
        model_clf=RandomForestClassifier(random_state=random_state),
        propagate=False,  # RUN LOCAL MODELS
    )
    # target_types=None; an explicit list would look like
    # ["reg","reg","reg","reg","reg","reg","reg","reg","reg","clf","clf"]
    chain.fit(Xi_train, yi_train, target_types=None)
    y_pred = chain.predict(Xi_test)
    y_pred_list.append(pd.DataFrame(y_pred, columns=yi_test.columns, index=yi_test.index))
    print("Done with evaluating on CV Fold {}".format(i+1))

Done with evaluating on CV Fold 1
Done with evaluating on CV Fold 2
Done with evaluating on CV Fold 3
Done with evaluating on CV Fold 4
Done with evaluating on CV Fold 5


In [40]:
#y_pred_list

In [41]:
# For every fold and every outcome, keep only the rows whose true value is
# observed, so that scoring never compares against a missing target.
y_test_cv = []
y_pred_cv = []

for j, fold_truth in enumerate(y_test_list):
    fold_pred = y_pred_list[j]
    nvar = y_test_list[0].shape[1]  # number of outcome columns
    y_test_targ = []
    y_pred_targ = []

    for i in range(nvar):
        truth_column = fold_truth.iloc[:, i]
        observed = truth_column.notna()
        y_test_targ.append(truth_column[observed])
        y_pred_targ.append(fold_pred.iloc[:, i][observed])

    y_test_cv.append(y_test_targ)
    y_pred_cv.append(y_pred_targ)
# y_test_cv[fold][outcome]

Indexing convention for `y_test_cv` and `y_pred_cv`: the first index is the fold, the second index is the outcome.

In [42]:
# Per-outcome score across folds: mean and standard deviation of R^2 (numeric
# outcomes) or accuracy (all other outcomes).
scores = []
scores_with_std = []

# NOTE: change the variable list here when the outcome set changes. The position
# of a name in `variables` is used as the outcome position in the per-fold lists.
for variable_name in variables:
    position = variables.index(variable_name)
    variable_scores = []

    for fold_truth, fold_pred in zip(y_test_cv, y_pred_cv):
        y_test = fold_truth[position]
        y_pred = fold_pred[position]

        # dtype kinds b/i/f/c (bool, int, float, complex) are scored as regression
        metric = r2_score if y_test.dtype.kind in 'bifc' else accuracy_score
        variable_scores.append(metric(y_test, y_pred))

    # (name, mean over folds, standard deviation over folds)
    scores_with_std.append(
        (variable_name, np.mean(variable_scores), np.std(variable_scores))
    )

# Report one line per outcome
print("Scores for each outcome (local):")
for variable_name, avg_score, std_score in scores_with_std:
    print(f"{variable_name}: {avg_score:.2f} (± {std_score:.2f})")

Scores for each outcome (local):
KFSS_M-2y: 0.80 (± 0.03)
KFSS_P-2y: 0.74 (± 0.03)
EDSS-2y: 0.88 (± 0.01)
T25FW-2y: 0.70 (± 0.09)
NHPT-2y: 0.55 (± 0.16)
P_R36-SF12-after: 0.69 (± 0.05)
M_R36-SF12-after: 0.56 (± 0.02)
SES_after: 0.70 (± 0.04)
SLEC_after: 0.66 (± 0.04)
KFSS_M-after_2y: 0.66 (± 0.05)
KFSS_P-after_2y: 0.50 (± 0.07)
EDSS-after_2y: 0.75 (± 0.05)
NRELAP: 0.63 (± 0.02)
CESEV: 0.53 (± 0.02)


*Questions/Notes*:
- Using pred="true" always gives a score of 1 (weird)

---

# Model Chain

### Propagate predicted values

In [43]:
# Cross-validated predictions with propagate="pred" (models run in a chain).
# After the loop, y_test_list[k] / y_pred_list[k] hold the observed and predicted
# target tables for the rows of fold k.
y_pred_list = []
y_test_list = []

for i in range(N_FOLDS):
    # Rows of fold i are held out; every other row is used for fitting.
    # Training targets come from targetsM_cv, evaluation targets from targets_cv.
    Xi_train = featuresM_cv.loc[featuresM_cv['CV Fold'] != i].drop(columns="CV Fold")
    Xi_test = featuresM_cv.loc[featuresM_cv['CV Fold'] == i].drop(columns="CV Fold")
    yi_train = targetsM_cv.loc[targetsM_cv['CV Fold'] != i].drop(columns="CV Fold")
    yi_test = targets_cv.loc[targets_cv['CV Fold'] == i].drop(columns="CV Fold")

    # Predictions are stored with the same column labels and row index as the truth
    layout = dict(columns=yi_test.columns, index=yi_test.index)
    y_test_list.append(pd.DataFrame(yi_test, **layout))

    chain = Chain(
        model_reg=RandomForestRegressor(random_state=random_state),
        model_clf=RandomForestClassifier(random_state=random_state),
        propagate="pred",  # run models in a chain
    )
    # target_types=None is used here; an explicit list was tried as an alternative:
    # ["reg","reg","reg","reg","reg","reg","reg","reg","reg","clf","clf"]
    chain.fit(Xi_train, yi_train, target_types=None)
    y_pred = chain.predict(Xi_test)
    y_pred_list.append(pd.DataFrame(y_pred, **layout))
    print("Done with evaluating on CV Fold {}".format(i+1))

Done with evaluating on CV Fold 1
Done with evaluating on CV Fold 2
Done with evaluating on CV Fold 3
Done with evaluating on CV Fold 4
Done with evaluating on CV Fold 5


In [44]:
# For every fold and every outcome, keep only the rows whose true value is
# observed, so that scoring never compares against a missing target.
y_test_cv = []
y_pred_cv = []

for j, fold_truth in enumerate(y_test_list):
    fold_pred = y_pred_list[j]
    nvar = y_test_list[0].shape[1]  # number of outcome columns
    y_test_targ = []
    y_pred_targ = []

    for i in range(nvar):
        truth_column = fold_truth.iloc[:, i]
        observed = truth_column.notna()
        y_test_targ.append(truth_column[observed])
        y_pred_targ.append(fold_pred.iloc[:, i][observed])

    y_test_cv.append(y_test_targ)
    y_pred_cv.append(y_pred_targ)
# y_test_cv[fold][outcome]

In [45]:
# Per-outcome score across folds: mean and standard deviation of R^2 (numeric
# outcomes) or accuracy (all other outcomes).
scores = []
scores_with_std = []

# The position of a name in `variables` is used as the outcome position in the
# per-fold lists.
for variable_name in variables:
    position = variables.index(variable_name)
    variable_scores = []

    for fold_truth, fold_pred in zip(y_test_cv, y_pred_cv):
        y_test = fold_truth[position]
        y_pred = fold_pred[position]

        # dtype kinds b/i/f/c (bool, int, float, complex) are scored as regression
        metric = r2_score if y_test.dtype.kind in 'bifc' else accuracy_score
        variable_scores.append(metric(y_test, y_pred))

    # (name, mean over folds, standard deviation over folds)
    scores_with_std.append(
        (variable_name, np.mean(variable_scores), np.std(variable_scores))
    )

# Report one line per outcome
print("Scores for each outcome (chain - predicted values):")
for variable_name, avg_score, std_score in scores_with_std:
    print(f"{variable_name}: {avg_score:.2f} (± {std_score:.2f})")

Scores for each outcome (chain - predicted values):
KFSS_M-2y: 0.80 (± 0.03)
KFSS_P-2y: 0.74 (± 0.03)
EDSS-2y: 0.88 (± 0.02)
T25FW-2y: 0.72 (± 0.07)
NHPT-2y: 0.58 (± 0.18)
P_R36-SF12-after: 0.69 (± 0.05)
M_R36-SF12-after: 0.54 (± 0.02)
SES_after: 0.71 (± 0.04)
SLEC_after: 0.66 (± 0.04)
KFSS_M-after_2y: 0.65 (± 0.04)
KFSS_P-after_2y: 0.50 (± 0.07)
EDSS-after_2y: 0.74 (± 0.04)
NRELAP: 0.63 (± 0.02)
CESEV: 0.53 (± 0.03)


### Propagate true values

**TODO:** the classifier (`clf`) handling in `chaining.py` still needs to be fixed for `propagate="true"` — do not run the code below; it won't work.

In [46]:
# Cross-validated predictions with propagate="true" (models run in a chain).
# After the loop, y_test_list[k] / y_pred_list[k] hold the observed and predicted
# target tables for the rows of fold k.
y_pred_list = []
y_test_list = []

for i in range(N_FOLDS):
    # Rows of fold i are held out; every other row is used for fitting.
    # Training targets come from targetsM_cv, evaluation targets from targets_cv.
    Xi_train = featuresM_cv.loc[featuresM_cv['CV Fold'] != i].drop(columns="CV Fold")
    Xi_test = featuresM_cv.loc[featuresM_cv['CV Fold'] == i].drop(columns="CV Fold")
    yi_train = targetsM_cv.loc[targetsM_cv['CV Fold'] != i].drop(columns="CV Fold")
    yi_test = targets_cv.loc[targets_cv['CV Fold'] == i].drop(columns="CV Fold")

    # Predictions are stored with the same column labels and row index as the truth
    layout = dict(columns=yi_test.columns, index=yi_test.index)
    y_test_list.append(pd.DataFrame(yi_test, **layout))

    chain = Chain(
        model_reg=RandomForestRegressor(random_state=random_state),
        model_clf=RandomForestClassifier(random_state=random_state),
        propagate="true",  # run models in a chain
    )
    # target_types=None is used here; an explicit list was tried as an alternative:
    # ["reg","reg","reg","reg","reg","reg","reg","reg","reg","clf","clf"]
    chain.fit(Xi_train, yi_train, target_types=None)
    y_pred = chain.predict(Xi_test)
    y_pred_list.append(pd.DataFrame(y_pred, **layout))
    print("Done with evaluating on CV Fold {}".format(i+1))

Done with evaluating on CV Fold 1
Done with evaluating on CV Fold 2
Done with evaluating on CV Fold 3
Done with evaluating on CV Fold 4
Done with evaluating on CV Fold 5


In [47]:
# For every fold and every outcome, keep only the rows whose true value is
# observed, so that scoring never compares against a missing target.
y_test_cv = []
y_pred_cv = []

for j, fold_truth in enumerate(y_test_list):
    fold_pred = y_pred_list[j]
    nvar = y_test_list[0].shape[1]  # number of outcome columns
    y_test_targ = []
    y_pred_targ = []

    for i in range(nvar):
        truth_column = fold_truth.iloc[:, i]
        observed = truth_column.notna()
        y_test_targ.append(truth_column[observed])
        y_pred_targ.append(fold_pred.iloc[:, i][observed])

    y_test_cv.append(y_test_targ)
    y_pred_cv.append(y_pred_targ)
# y_test_cv[fold][outcome]

In [48]:
# Per-outcome score across folds: mean and standard deviation of R^2 (numeric
# outcomes) or accuracy (all other outcomes).
scores = []
scores_with_std = []

# The position of a name in `variables` is used as the outcome position in the
# per-fold lists.
for variable_name in variables:
    position = variables.index(variable_name)
    variable_scores = []

    for fold_truth, fold_pred in zip(y_test_cv, y_pred_cv):
        y_test = fold_truth[position]
        y_pred = fold_pred[position]

        # dtype kinds b/i/f/c (bool, int, float, complex) are scored as regression
        metric = r2_score if y_test.dtype.kind in 'bifc' else accuracy_score
        variable_scores.append(metric(y_test, y_pred))

    # (name, mean over folds, standard deviation over folds)
    scores_with_std.append(
        (variable_name, np.mean(variable_scores), np.std(variable_scores))
    )

# Report one line per outcome
print("Scores for each outcome (chain - true values):")
for variable_name, avg_score, std_score in scores_with_std:
    print(f"{variable_name}: {avg_score:.2f} (± {std_score:.2f})")

Scores for each outcome (chain - true values):
KFSS_M-2y: 0.80 (± 0.03)
KFSS_P-2y: 0.74 (± 0.03)
EDSS-2y: 0.88 (± 0.02)
T25FW-2y: 0.72 (± 0.08)
NHPT-2y: 0.58 (± 0.19)
P_R36-SF12-after: 0.68 (± 0.05)
M_R36-SF12-after: 0.55 (± 0.02)
SES_after: 0.70 (± 0.05)
SLEC_after: 0.66 (± 0.03)
KFSS_M-after_2y: 0.65 (± 0.03)
KFSS_P-after_2y: 0.51 (± 0.05)
EDSS-after_2y: 0.72 (± 0.05)
NRELAP: 0.60 (± 0.02)
CESEV: 0.52 (± 0.03)
