In [1]:
import numpy as np
import pandas as pd
import itertools
import random

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, accuracy_score
from chaining import Chain
import os

from scipy.stats import mode
from sksurv.util import Surv # pip install scikit-survival
from sksurv.metrics import concordance_index_censored
from sksurv.ensemble import RandomSurvivalForest

In [3]:
def missingness_stratified_cv(df, N_FOLDS=5, random_state=None):
    # Add seed for reproducibility of the predictions (to get the same scores each time we run the code)
    np.random.seed(random_state)

    # Initial complete-case test fold assignment
    cv = pd.Series(np.nan, index=df.index)
    i_cc = (df.isna().sum(axis=1) == 0) # Complete cases
    cv.iloc[i_cc] = np.random.randint(low=0, high=N_FOLDS, size=i_cc.sum())

    # Go over columns from most missing to least missing
    for j in df.isna().sum().argsort()[::-1]:
        # Instances i that are not assigned yet but for which df[i,j] is observed
        i_tbf = (cv.isna()) & (~df.iloc[:,j].isna()) # to be filled
        # Fill them randomly
        cv.iloc[i_tbf] = np.random.randint(low=0, high=N_FOLDS, size=i_tbf.sum())

    return cv

In [6]:
possible_paths = [
    'C:/Users/lenne/OneDrive/Documenten/Master of Statistics and Data Science/2023-2024/Master thesis/Thesis_Sofia_Lennert/new_data',
    'C:/Users/anaso/Desktop/SOFIA MENDES/KU Leuven/Master Thesis/Thesis_Sofia_Lennert/new_data'
]

# Define file names
file = 'merged_data_modeling.csv'

# Find full paths to the CSV files
path = next((f'{path}/{file}' for path in possible_paths if os.path.exists(f'{path}/{file}')), None)

# Resulting DataFrame will have aggregated data from all four datasets based on the specific_column
pd.set_option('display.max_columns', None)

data = pd.read_csv(path)
data

Unnamed: 0,USUBJID,AGE,SEX,RACE,CONTINENT,CESEV,CECONTRT,TOTRELAP,MHCONTRT,MHDIAGN,CARDIO,URINARY,MUSCKELET,FATIGUE,SMSTDY,NRELAP,NHPT-before,NHPT-2y,NHPT-after_2y,PASAT_2s-before,PASAT_2s-2y,PASAT_2s-after_2y,PASAT_3s-before,PASAT_3s-2y,PASAT_3s-after_2y,SDMT-before,SDMT-2y,T25FW-before,T25FW-2y,T25FW-after_2y,T-before,T-after,P-before,P-after,N-before,N-after,DS,DS_L,DS_R,SLEC_before,SLEC_after,SES_after,SES_before,VAA,BDI-before,BDI-after,EDSS-before,EDSS-2y,EDSS-after_2y,KFSS1-Sensory-2y,KFSS1-Sensory-after_2y,KFSS1-Sensory-before,KFSS1-Brain-2y,KFSS1-Brain-after_2y,KFSS1-Brain-before,KFSS1-Bowel-2y,KFSS1-Bowel-after_2y,KFSS1-Bowel-before,KFSS1-Pyramidal-2y,KFSS1-Pyramidal-after_2y,KFSS1-Pyramidal-before,KFSS1-Cerebral-2y,KFSS1-Cerebral-after_2y,KFSS1-Cerebral-before,KFSS1-Visual-2y,KFSS1-Visual-after_2y,KFSS1-Visual-before,KFSS1-Cerebellar-2y,KFSS1-Cerebellar-after_2y,KFSS1-Cerebellar-before,KFSS_M-2y,KFSS_M-after_2y,KFSS_M-before,KFSS_P-2y,KFSS_P-after_2y,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,M_R36-SF12-after,P_R36-SF12-after,R36-SF12-after_Ind
0,MSOAC/0014,46.0,F,,,,,,,RRMS,0,0,0,0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,MSOAC/0016,,M,WHITE,NORTH AMERICA,,,,Y,SPMS,1,1,0,1,,0.0,,,,,,,,,,,,8.55,6.60,,0.0,0.0,,,,,,,,,,,,,,,6.00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,MSOAC/0019,44.0,M,NON-WHITE,,,,,,PPMS,1,1,0,0,,0.0,23.65,21.30,20.15,34.5,35.5,43.0,43.5,51.0,53.0,,,6.30,6.15,5.85,0.0,0.0,0.0,0.0,,,,,,,,,,,,,3.75,3.50,3.0,0.333333,0.166667,0.500000,0.2,0.0,0.2,0.000000,0.166667,0.083333,0.333333,0.5,0.416667,0.0,0.0,0.0,0.333333,0.0,0.333333,0.0,0.2,0.0,0.185185,0.185185,0.240741,0.166667,0.083333,0.208333,0.657143,0.708861,1.0,0.685714,0.734177,1.0
3,MSOAC/0024,60.0,M,WHITE,NORTH AMERICA,,,,,SPMS,1,1,1,1,,0.0,34.45,37.50,,55.0,54.0,,60.0,60.0,,,,4.50,5.25,,0.0,0.0,0.0,0.0,1.0,0.0,,,,,,,,,0.031746,0.023810,4.00,3.75,,0.333333,,0.333333,0.0,,0.1,0.583333,,0.666667,0.166667,,0.250000,0.0,,0.0,0.000000,,0.083333,0.2,,0.5,0.129630,,0.240741,0.291667,,0.375000,0.657143,0.481013,1.0,0.632353,0.594937,1.0
4,MSOAC/0030,28.0,F,WHITE,EUROPE,,,,,RRMS,1,1,0,1,,0.0,16.55,17.90,,,,,58.0,60.0,,63.5,69.0,4.85,4.70,,0.0,0.0,0.0,0.0,0.0,0.0,,,,26.0,24.0,1.25,1.25,,0.063492,0.039683,2.00,1.50,,0.166667,,0.166667,0.2,,0.2,0.166667,,0.166667,0.166667,,0.333333,0.0,,0.2,0.166667,,0.083333,0.0,,0.1,0.111111,,0.203704,0.166667,,0.125000,0.733333,0.692308,0.0,0.700000,0.615385,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,MSOAC/9986,46.0,M,WHITE,OCEANIA,,,,,RRMS,1,1,0,1,,0.0,19.35,18.95,,,,,58.0,60.0,,51.0,60.0,3.90,3.80,,0.0,0.0,0.0,0.0,0.0,0.0,,,,36.0,35.0,1.25,1.25,,0.047619,0.063492,2.75,2.50,,0.333333,,0.166667,0.0,,0.0,0.333333,,0.250000,0.166667,,0.333333,0.0,,0.0,0.000000,,0.000000,0.0,,0.2,0.111111,,0.148148,0.166667,,0.125000,0.833333,0.576923,0.0,0.866667,0.634615,0.0
2461,MSOAC/9987,18.0,F,,,,,,,RRMS,0,0,0,0,,0.0,,,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2462,MSOAC/9995,38.0,F,,,MILD,,4.0,,RRMS,0,0,0,0,142.0,2.0,,,,,,,,,,,,,,,,,,,,,1.2,1.0,1.0,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2463,MSOAC/9998,40.0,F,WHITE,,,Y,2.0,Y,PPMS,0,1,0,1,79.0,1.0,23.80,22.40,22.50,21.5,30.5,33.5,31.5,39.5,40.5,,,6.15,6.00,6.20,0.0,0.0,0.0,0.0,,,,,,,,,,,,,4.50,3.75,4.0,0.166667,0.250000,0.333333,0.4,0.6,0.6,0.166667,0.166667,0.166667,0.333333,0.5,0.500000,0.0,0.0,0.4,0.166667,0.0,0.166667,0.6,0.6,0.6,0.314815,0.351852,0.481481,0.166667,0.083333,0.166667,0.642857,0.569620,1.0,0.642857,0.632911,1.0


In [8]:
y = pd.DataFrame(np.nan, index=data.index, columns=["cens", "time"])
y["time"] = data["SMSTDY"] / 365.25
y["cens"] = (~y["time"].isna()) # censoring status: 0 = censored, 1 = observed
# - If it does not exist, I put a time to censoring at the last EDSS visit
to_fill = y.index[(~y["cens"]).values.flatten()]
#y.loc[to_fill,"time"] = edss.groupby("USUBJID").QSDY.max().loc[to_fill]
y = y.loc[y.time > 0] # TODO 78 anomaly cases with non-positive time to event
y = y.dropna() # NOTE Still some missing values, drop for now
y_surv = Surv().from_arrays(y.cens, y.time) # Convert to recarray (required for sksurv)
y_surv

array([( True, 7.33744011e-01), ( True, 3.94250513e-01),
       ( True, 1.13347023e+00), ( True, 7.52908966e-01),
       ( True, 5.94113621e-01), ( True, 1.01574264e+00),
       ( True, 2.87474333e-01), ( True, 1.06776181e-01),
       ( True, 4.07939767e-01), ( True, 3.58658453e-01),
       ( True, 1.06776181e-01), ( True, 9.17180014e-01),
       ( True, 1.01848049e+00), ( True, 1.63449692e+00),
       ( True, 6.02327173e-02), ( True, 6.84462697e-02),
       ( True, 1.13620808e+00), ( True, 6.21492129e-01),
       ( True, 9.03490760e-02), ( True, 2.62833676e-01),
       ( True, 8.21355236e-03), ( True, 1.88637919e+00),
       ( True, 2.62833676e-01), ( True, 1.51950719e+00),
       ( True, 9.58247775e-02), ( True, 1.48391513e+00),
       ( True, 6.05065024e-01), ( True, 9.63723477e-01),
       ( True, 1.89185489e+00), ( True, 1.64271047e-02),
       ( True, 5.17453799e-01), ( True, 7.09103354e-01),
       ( True, 1.24572211e+00), ( True, 2.57357974e-01),
       ( True, 1.77686516e+00),

In [9]:
#variables = ['KFSS_M-2y', 'EDSS-2y', 'T25FW-2y', 'NRELAP']# removed KFSS_P-2y for now -- ('SMSTDY' gave a score of -0.03)
variables = ['KFSS_M-2y', 'KFSS_P-2y', 'EDSS-2y', 'T25FW-2y', 'NHPT-2y', 'P_R36-SF12-after', 
            'M_R36-SF12-after', 'SES_after', 'EDSS-after_2y', 'NRELAP', 'CESEV']

# Extract targets
#targets = data[variables]

# Extract features by dropping the target columns
#features = data.drop(variables, axis=1)

columns_to_keep = ['AGE', 'SEX', 'RACE', 'CONTINENT', 'MHDIAGN', 'CARDIO', 'URINARY', 'MUSCKELET', 'FATIGUE', 
                    'NHPT-before', 'PASAT_2s-before', 'PASAT_3s-before', 'SDMT-before', 'T25FW-before', 'SLEC_before','SES_before',
                    'BDI-before', 'EDSS-before', 'KFSS_M-before', 'KFSS_P-before', 'M_R36-SF12-before',
                	'P_R36-SF12-before', 'R36-SF12-before_Ind', 'T-before','P-before','N-before']
# still need to change in OE dataframe the SLEC and SES so name is consistent with the others

features = data[columns_to_keep]
features

Unnamed: 0,AGE,SEX,RACE,CONTINENT,MHDIAGN,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before
0,46.0,F,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,,,
1,,M,WHITE,NORTH AMERICA,SPMS,1,1,0,1,,,,,8.55,,,,6.00,,,,,,0.0,,
2,44.0,M,NON-WHITE,,PPMS,1,1,0,0,23.65,34.5,43.5,,6.30,,,,3.75,0.240741,0.208333,0.657143,0.708861,1.0,0.0,0.0,
3,60.0,M,WHITE,NORTH AMERICA,SPMS,1,1,1,1,34.45,55.0,60.0,,4.50,,,0.031746,4.00,0.240741,0.375000,0.657143,0.481013,1.0,0.0,0.0,1.0
4,28.0,F,WHITE,EUROPE,RRMS,1,1,0,1,16.55,,58.0,63.5,4.85,26.0,1.25,0.063492,2.00,0.203704,0.125000,0.733333,0.692308,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,46.0,M,WHITE,OCEANIA,RRMS,1,1,0,1,19.35,,58.0,51.0,3.90,36.0,1.25,0.047619,2.75,0.148148,0.125000,0.833333,0.576923,0.0,0.0,0.0,0.0
2461,18.0,F,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,,,
2462,38.0,F,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,,,
2463,40.0,F,WHITE,,PPMS,0,1,0,1,23.80,21.5,31.5,,6.15,,,,4.50,0.481481,0.166667,0.642857,0.569620,1.0,0.0,0.0,


In [10]:
object_columns = features.select_dtypes(include=['object'])
features = pd.get_dummies(features, columns=object_columns.columns, dtype=int)
features.head()

Unnamed: 0,AGE,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before,SEX_F,SEX_M,RACE_NON-WHITE,RACE_WHITE,CONTINENT_ASIA,CONTINENT_EURASIA,CONTINENT_EUROPE,CONTINENT_NORTH AMERICA,CONTINENT_OCEANIA,CONTINENT_SOUTH AMERICA,MHDIAGN_PPMS,MHDIAGN_RRMS,MHDIAGN_SPMS
0,46.0,0,0,0,0,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,1,0
1,,1,1,0,1,,,,,8.55,,,,6.0,,,,,,0.0,,,0,1,0,1,0,0,0,1,0,0,0,0,1
2,44.0,1,1,0,0,23.65,34.5,43.5,,6.3,,,,3.75,0.240741,0.208333,0.657143,0.708861,1.0,0.0,0.0,,0,1,1,0,0,0,0,0,0,0,1,0,0
3,60.0,1,1,1,1,34.45,55.0,60.0,,4.5,,,0.031746,4.0,0.240741,0.375,0.657143,0.481013,1.0,0.0,0.0,1.0,0,1,0,1,0,0,0,1,0,0,0,0,1
4,28.0,1,1,0,1,16.55,,58.0,63.5,4.85,26.0,1.25,0.063492,2.0,0.203704,0.125,0.733333,0.692308,0.0,0.0,0.0,0.0,1,0,0,1,0,0,1,0,0,0,0,1,0


In [11]:
features = features.copy()
features = features.loc[y.index]
features

Unnamed: 0,AGE,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,M_R36-SF12-before,P_R36-SF12-before,R36-SF12-before_Ind,T-before,P-before,N-before,SEX_F,SEX_M,RACE_NON-WHITE,RACE_WHITE,CONTINENT_ASIA,CONTINENT_EURASIA,CONTINENT_EUROPE,CONTINENT_NORTH AMERICA,CONTINENT_OCEANIA,CONTINENT_SOUTH AMERICA,MHDIAGN_PPMS,MHDIAGN_RRMS,MHDIAGN_SPMS
5,35.0,0,1,1,0,16.80,,58.0,,3.75,11.5,1.00,,1.0,0.018519,0.041667,0.671429,0.683544,1.0,,,,1,0,0,1,0,0,1,0,0,0,0,1,0
7,18.0,0,0,0,0,,,,,,,,,,,,,,,,,,0,1,0,0,0,0,0,0,0,0,0,1,0
9,48.0,0,0,0,0,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,1,0
10,34.0,0,0,0,1,25.85,,30.0,31.0,6.55,16.0,0.80,0.31746,2.5,0.222222,0.083333,0.600000,0.576923,0.0,0.0,0.0,0.0,1,0,0,1,0,0,1,0,0,0,0,1,0
11,28.0,0,1,0,0,21.20,,56.5,,6.50,33.5,1.25,,2.0,0.074074,0.000000,0.700000,0.721519,1.0,,,,1,0,0,1,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2448,48.0,1,1,1,1,46.10,23.5,34.5,,93.85,,,,6.5,,,0.728571,0.493671,1.0,0.0,0.0,0.0,1,0,0,1,0,0,0,0,0,0,0,0,1
2451,35.0,0,0,1,0,19.45,,45.0,,5.20,19.0,1.25,,1.5,0.129630,0.000000,0.614286,0.759494,1.0,,,,0,1,0,1,0,0,1,0,0,0,0,1,0
2462,38.0,0,0,0,0,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,1,0
2463,40.0,0,1,0,1,23.80,21.5,31.5,,6.15,,,,4.5,0.481481,0.166667,0.642857,0.569620,1.0,0.0,0.0,,1,0,0,1,0,0,0,0,0,0,1,0,0


In [12]:
rsf = RandomSurvivalForest(n_jobs=-1, random_state=42)
X_train = features # TODO use the CV split here + first preprocess (impute missing values etc)
X_test  = features # TODO use the CV split here + first preprocess (impute missing values etc)
y_train = y_surv # TODO use the CV split here
y_test  = y_surv # TODO use the CV split here
rsf.fit(X_train, y_train)
y_pred = rsf.predict(X_test)
hci = concordance_index_censored(y_test["event"], y_test["time"], y_pred)[0]
print(f"Harrell's C-index = {hci:.3f}")

ValueError: Input X contains NaN.
RandomSurvivalForest does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Set random state for reproducibility
random_state = 42
N_FOLDS = 5

In [None]:
# Generate CV folds
cv=missingness_stratified_cv(features, N_FOLDS, random_state)
cv = cv.to_frame(name="CV Fold")