In [181]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error as mse, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from chaining import Chain
import os

In [182]:
# Toy two-target problem built from iris: the last measurement column becomes a
# regression target and the species label stays as the classification target.
X, y = load_iris(return_X_y=True)
# Right-hand side is evaluated before assignment, so X[:, -1] still refers to the full matrix.
X, y = X[:, :-1], np.column_stack((X[:, -1], y))

In [183]:
# Sanity check of the Chain wrapper on the iris toy problem:
# one regressor target followed by one classifier target.
chain = Chain(
    model_reg=RandomForestRegressor(random_state=42),
    model_clf=RandomForestClassifier(random_state=42),
    propagate="true",
    )
chain.fit(X,y, target_types=["reg","clf"])
# NOTE(review): predictions are made on the same X, y that were used for fitting,
# so the numbers below are training-set errors, not an estimate of generalisation.
y_pred = chain.predict(X,y)
# NOTE(review): the second target is a class label; MSE on labels is hard to
# interpret -- accuracy_score (already imported) may be the better metric here.
scores = [ 
    mse(y[:,0], y_pred.iloc[:,0]),
    mse(y[:,1], y_pred.iloc[:,1]),
]
print(f"MSE = {scores[0]:.2f}, {scores[1]:.2f}")


MSE = 0.00, 0.00


Our model

In [184]:
# Candidate folders that may hold the dataset (presumably one per collaborator machine)
possible_paths = [
    'C:/Users/lenne/OneDrive/Documenten/Master of Statistics and Data Science/2023-2024/Master thesis/Thesis_Sofia_Lennert/new_data',
    'C:/Users/anaso/Desktop/SOFIA MENDES/KU Leuven/Master Thesis/Thesis_Sofia_Lennert/new_data'
]

# Name of the CSV file to load
file = 'merged_data.csv'

# Take the first candidate folder that actually contains the file;
# `path` stays None when the file is found in none of them.
path = None
for folder in possible_paths:
    candidate = f'{folder}/{file}'
    if os.path.exists(candidate):
        path = candidate
        break

In [185]:
# Show every column when a DataFrame is displayed (no truncation of wide frames)
pd.set_option('display.max_columns', None)

In [186]:
# `path` is None when the file was found in none of the candidate folders.
# Fail here with a clear message instead of the obscure error that
# pd.read_csv(None) would raise.
if path is None:
    raise FileNotFoundError(
        f"'{file}' was not found in any of the folders listed in possible_paths"
    )

# Load the merged dataset and display it
data = pd.read_csv(path)
data

Unnamed: 0,USUBJID,AGE,SEX,RACE,CONTINENT,CESEV,CECONTRT,TOTRELAP,MHCONTRT,MHDIAGN,CARDIO,URINARY,MUSCKELET,FATIGUE,SMSTDY,NRELAP,NHPT-before,NHPT-2y,NHPT-after_2y,PASAT_2s-before,PASAT_2s-2y,PASAT_2s-after_2y,PASAT_3s-before,PASAT_3s-2y,PASAT_3s-after_2y,SDMT-before,SDMT-2y,T25FW-before,T25FW-2y,T25FW-after_2y,T-before,T-after,P-before,P-after,N-before,N-after,DS,DS_L,DS_R,SLEC_before,SLEC_after,SES_after,SES_before,VAA,BDI-before,BDI-after,EDSS-before,EDSS-2y,EDSS-after_2y,KFSS1-Sensory-2y,KFSS1-Sensory-after_2y,KFSS1-Sensory-before,KFSS1-Brain-2y,KFSS1-Brain-after_2y,KFSS1-Brain-before,KFSS1-Bowel-2y,KFSS1-Bowel-after_2y,KFSS1-Bowel-before,KFSS1-Pyramidal-2y,KFSS1-Pyramidal-after_2y,KFSS1-Pyramidal-before,KFSS1-Cerebral-2y,KFSS1-Cerebral-after_2y,KFSS1-Cerebral-before,KFSS1-Visual-2y,KFSS1-Visual-after_2y,KFSS1-Visual-before,KFSS1-Cerebellar-2y,KFSS1-Cerebellar-after_2y,KFSS1-Cerebellar-before,KFSS_M-2y,KFSS_M-after_2y,KFSS_M-before,KFSS_P-2y,KFSS_P-after_2y,KFSS_P-before,RAND36_M-before,RAND36_M-after,RAND36_P-before,RAND36_P-after,SF12_M-before,SF12_M-after,SF12_P-before,SF12_P-after
0,MSOAC/0649,,F,WHITE,NORTH AMERICA,,,,Y,SPMS,0,1,0,1,,,,,,,,,,,,,,11.35,10.40,,,,,,,,,,,,,,,,,,3.00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,MSOAC/2224,38.0,F,WHITE,EUROPE,,,,,RRMS,0,1,0,0,,,27.80,26.55,,,,,55.0,56.0,,38.0,42.0,8.90,8.85,,0.0,0.0,0.0,0.0,0.0,0.0,,,,30.0,30.0,1.25,1.25,,0.380952,0.388889,3.00,3.0,,0.000000,,0.000000,0.2,,0.2,0.000000,,0.000000,0.5,,0.500000,0.0,,0.0,0.000000,,0.0,0.0,,0.0,0.148148,,0.148148,0.000000,,0.000000,,,,,0.8,0.7,0.5,0.5
2,MSOAC/0576,50.0,F,WHITE,,,,,Y,PPMS,0,1,0,1,,,17.75,18.65,17.90,51.0,56.5,56.5,59.0,59.0,59.5,,,4.55,4.70,5.15,0.0,0.0,0.0,0.0,,,,,,,,,,,,,3.75,4.0,4.25,0.333333,0.250000,0.333333,0.0,0.0,0.1,0.250000,0.166667,0.333333,0.5,0.500000,0.500000,0.0,0.2,0.0,0.083333,0.083333,0.0,0.4,0.4,0.3,0.240741,0.277778,0.259259,0.166667,0.125000,0.166667,0.628571,0.671429,0.594937,0.531646,,,,
3,MSOAC/4961,44.0,F,WHITE,,,,,Y,PPMS,0,1,1,1,,,19.75,19.45,20.60,38.5,52.0,54.0,55.5,59.0,58.0,,,4.00,4.90,5.60,0.0,0.0,0.0,0.0,,,,,,,,,,,,,3.50,3.5,3.50,0.333333,0.250000,0.333333,0.0,0.1,0.0,0.166667,0.250000,0.250000,0.5,0.500000,0.416667,0.0,0.2,0.0,0.000000,0.000000,0.0,0.2,0.3,0.3,0.185185,0.277778,0.222222,0.083333,0.125000,0.125000,0.614286,0.614286,0.582278,0.645570,,,,
4,MSOAC/5990,52.0,F,WHITE,,,,,Y,PPMS,1,1,0,1,,,21.45,21.35,22.75,22.0,23.5,22.0,35.0,41.5,44.0,,,11.40,16.80,30.20,0.0,0.0,0.0,0.0,,,,,,,,,,,,,6.00,6.0,6.50,0.166667,0.333333,0.166667,0.2,0.2,0.2,0.166667,0.166667,0.166667,0.5,0.666667,0.500000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.4,0.0,0.222222,0.333333,0.185185,0.083333,0.166667,0.083333,0.600000,0.742857,0.506329,0.531646,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,MSOAC/2501,46.0,F,WHITE,,,,12.0,,SPMS,0,0,0,0,,,29.40,25.65,,14.0,12.0,,17.5,16.0,,,,9.30,8.10,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,6.00,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.642857,0.614286,0.645570,0.588608,,,,
2461,MSOAC/8672,43.0,F,,,MODERATE,,1.0,,RRMS,0,0,0,0,25.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2462,MSOAC/5705,30.0,M,,,MILD,,2.0,,RRMS,0,0,0,0,515.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2463,MSOAC/8255,42.0,M,,,MODERATE,,1.0,,RRMS,0,0,0,0,259.0,1.0,,,,,,,,,,,,,,,,,,,,,0.9,0.0,1.0,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [187]:
# Replace missing NRELAP values with 0 before binning.
# NOTE(review): this assumes a missing relapse count means "no relapse" -- confirm.
data['NRELAP'] = data['NRELAP'].fillna(0)

def bin_column(value):
    """Map a relapse count to one of the class labels '0', '1', '2', '3' or '4+'.

    Parameters
    ----------
    value : int, float or str
        A count (floats such as 2.0 are accepted), or a label previously
        produced by this function.

    Returns
    -------
    str
        '0'..'3' for counts equal to 0..3, '4+' for everything else.
        Labels that are already binned are returned unchanged, so applying
        the function twice gives the same result as applying it once.
    """
    # Already-binned labels pass through; without this, re-running the cell
    # would turn every label into '4+'.
    if isinstance(value, str) and value in ('0', '1', '2', '3', '4+'):
        return value
    if value in [0, 1, 2, 3]:
        # int() drops the trailing '.0' of float counts so the label is '2', not '2.0'
        return str(int(value))
    else:
        return '4+'

# Replace each NRELAP value with the class label returned by bin_column
data['NRELAP'] = data['NRELAP'].apply(bin_column)

In [188]:
# Outcome columns to predict, listed in the same order as the target_types given to chain.fit.
# ('CESEV' is currently left out; 'SMSTDY' gave a score of -0.03)
variables = ['EDSS-2y', 'T25FW-2y', 'RAND36_P-after', 'NRELAP']

# Targets: only the outcome columns
targets = data.loc[:, variables]

# Features: every remaining column
features = data.drop(columns=variables)

In [189]:
# Predictor columns retained for modelling
columns_to_keep = [
    'AGE', 'SEX', 'RACE', 'CONTINENT', 'MHDIAGN',
    'CARDIO', 'URINARY', 'MUSCKELET', 'FATIGUE',
    'NHPT-before', 'PASAT_2s-before', 'PASAT_3s-before', 'SDMT-before', 'T25FW-before',
    'SLEC_before', 'SES_before',
    'BDI-before', 'EDSS-before', 'KFSS_M-before', 'KFSS_P-before',
    'RAND36_M-before', 'RAND36_P-before', 'SF12_M-before', 'SF12_P-before',
]
# TODO: rename SLEC and SES in the OE dataframe so their names are consistent with the other columns

features = features.loc[:, columns_to_keep]
features

Unnamed: 0,AGE,SEX,RACE,CONTINENT,MHDIAGN,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,RAND36_M-before,RAND36_P-before,SF12_M-before,SF12_P-before
0,,F,WHITE,NORTH AMERICA,SPMS,0,1,0,1,,,,,11.35,,,,3.00,,,,,,
1,38.0,F,WHITE,EUROPE,RRMS,0,1,0,0,27.80,,55.0,38.0,8.90,30.0,1.25,0.380952,3.00,0.148148,0.000000,,,0.8,0.5
2,50.0,F,WHITE,,PPMS,0,1,0,1,17.75,51.0,59.0,,4.55,,,,3.75,0.259259,0.166667,0.628571,0.594937,,
3,44.0,F,WHITE,,PPMS,0,1,1,1,19.75,38.5,55.5,,4.00,,,,3.50,0.222222,0.125000,0.614286,0.582278,,
4,52.0,F,WHITE,,PPMS,1,1,0,1,21.45,22.0,35.0,,11.40,,,,6.00,0.185185,0.083333,0.600000,0.506329,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2460,46.0,F,WHITE,,SPMS,0,0,0,0,29.40,14.0,17.5,,9.30,,,,6.00,,,0.642857,0.645570,,
2461,43.0,F,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,
2462,30.0,M,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,
2463,42.0,M,,,RRMS,0,0,0,0,,,,,,,,,,,,,,,


In [190]:
# One-hot encode every object-dtype feature column into 0/1 integer indicator columns.
# No indicator is created for missing values, so a row whose category is missing
# gets 0 in all indicator columns of that variable.
object_columns = features.select_dtypes(include=['object'])
features = pd.get_dummies(features, columns=object_columns.columns, dtype=int)

In [191]:
# NOTE(review): disabled experiment -- label encoding of object-typed targets is not
# used; NRELAP reaches the chain as string labels. If this is ever re-enabled, the
# `[np.nan]` index below is not a valid array index (presumably `[0]` was intended).
# Initialize LabelEncoder
#label_encoder = LabelEncoder()

#for col in targets.columns:
#    # Check if the column contains non-numeric values
#    if targets[col].dtype == 'object':
#        # Fill missing values with a placeholder
#        targets[col].fillna('__missing__', inplace=True)
#        # Fit and transform the column with label encoding
#        targets[col] = label_encoder.fit_transform(targets[col])
#        # Find the label for missing values
#        missing_label = label_encoder.transform(['__missing__'])[np.nan]
#        # Replace encoded missing values with NaN
#        #targets[col] = np.where(targets[col] == missing_label, np.nan, targets[col])

#targets

In [192]:
# Check target dtypes: numeric columns are modelled as "reg", the object column (NRELAP) as "clf"
targets.dtypes

EDSS-2y           float64
T25FW-2y          float64
RAND36_P-after    float64
NRELAP             object
dtype: object

In [193]:
# Randomly split into 80% training / 20% test rows.
# random_state is fixed (same seed as the models) so the split, and therefore
# every score reported further down, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=42
)
print(len(X_train), len(y_train), len(X_test), len(y_test))

1972 1972 493 493


In [194]:
# Merge the RAND-36 and SF-12 baseline scores of the training features.
# For each prefix ('M' and 'P') this adds:
#   '<prefix>_R36-SF12-before' : the RAND36 value, falling back to the SF12 value when RAND36 is missing
#   '<prefix>_R36-SF12'        : source indicator -- 1 = RAND36, 0 = SF12, NaN = neither available
# and then drops the original SF12 column (the RAND36 column is kept).

# Work on an independent copy so the column assignments below never write
# into a slice of `features` (guards against SettingWithCopyWarning).
X_train = X_train.copy()

for prefix in ('M', 'P'):
    rand36 = X_train[f'RAND36_{prefix}-before']
    sf12 = X_train[f'SF12_{prefix}-before']

    # Combined score: RAND36 where present, otherwise SF12
    X_train[f'{prefix}_R36-SF12-before'] = rand36.fillna(sf12)

    # Vectorised source indicator (replaces the slow row-wise apply)
    X_train[f'{prefix}_R36-SF12'] = np.where(
        rand36.notna(), 1.0, np.where(sf12.notna(), 0.0, np.nan)
    )

    # The SF12 column is now fully represented by the two columns above
    X_train = X_train.drop([f'SF12_{prefix}-before'], axis=1)

X_train

Unnamed: 0,AGE,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,RAND36_M-before,RAND36_P-before,SEX_F,SEX_M,RACE_NON-WHITE,RACE_WHITE,CONTINENT_ASIA,CONTINENT_EURASIA,CONTINENT_EUROPE,CONTINENT_NORTH AMERICA,CONTINENT_OCEANIA,CONTINENT_SOUTH AMERICA,MHDIAGN_PPMS,MHDIAGN_RRMS,MHDIAGN_SPMS,M_R36-SF12-before,M_R36-SF12,P_R36-SF12-before,P_R36-SF12
229,40.0,0,1,1,1,22.35,39.5,46.0,,6.10,,,,4.0,0.425926,0.208333,0.714286,0.607595,0,1,0,1,0,0,0,0,0,0,1,0,0,0.714286,1.0,0.607595,1.0
434,33.0,0,0,0,0,,,,,,,,,,,,,,0,1,0,0,0,0,0,0,0,0,0,1,0,,,,
1673,38.0,0,0,0,0,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,1,0,,,,
2381,45.0,1,1,0,1,23.95,,25.5,79.5,10.50,0.0,0.40,0.238095,1.5,0.074074,0.083333,,,0,1,0,1,0,0,1,0,0,0,0,1,0,0.633333,0.0,0.730769,0.0
1679,26.0,0,0,0,0,23.70,,48.0,51.5,4.90,26.5,1.00,0.396825,1.0,0.000000,0.083333,,,0,1,0,1,0,1,0,0,0,0,0,1,0,0.533333,0.0,0.576923,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2140,32.0,0,0,0,0,25.95,,51.0,,3.25,25.0,1.25,,4.0,0.351852,0.166667,0.585714,0.721519,0,1,0,1,0,0,1,0,0,0,0,1,0,0.585714,1.0,0.721519,1.0
2062,37.0,0,0,0,0,,,,,,,,,,,,,,0,1,0,0,0,0,0,0,0,0,0,1,0,,,,
2177,19.0,1,1,0,0,20.05,,37.5,47.5,4.30,9.0,1.25,0.031746,1.0,0.037037,0.000000,,,1,0,1,0,0,0,1,0,0,0,0,1,0,0.700000,0.0,0.730769,0.0
188,44.0,0,1,1,1,27.30,28.0,38.0,,11.30,,,,6.5,0.388889,0.166667,0.585714,0.531646,1,0,0,1,0,0,0,0,0,0,1,0,0,0.585714,1.0,0.531646,1.0


In [195]:
# Merge the RAND-36 and SF-12 baseline scores of the test features
# (same transformation as the one applied to X_train).
# For each prefix ('M' and 'P') this adds:
#   '<prefix>_R36-SF12-before' : the RAND36 value, falling back to the SF12 value when RAND36 is missing
#   '<prefix>_R36-SF12'        : source indicator -- 1 = RAND36, 0 = SF12, NaN = neither available
# and then drops the original SF12 column (the RAND36 column is kept).

# Work on an independent copy so the column assignments below never write
# into a slice of `features` (guards against SettingWithCopyWarning).
X_test = X_test.copy()

for prefix in ('M', 'P'):
    rand36 = X_test[f'RAND36_{prefix}-before']
    sf12 = X_test[f'SF12_{prefix}-before']

    # Combined score: RAND36 where present, otherwise SF12
    X_test[f'{prefix}_R36-SF12-before'] = rand36.fillna(sf12)

    # Vectorised source indicator (replaces the slow row-wise apply)
    X_test[f'{prefix}_R36-SF12'] = np.where(
        rand36.notna(), 1.0, np.where(sf12.notna(), 0.0, np.nan)
    )

    # The SF12 column is now fully represented by the two columns above
    X_test = X_test.drop([f'SF12_{prefix}-before'], axis=1)

X_test

Unnamed: 0,AGE,CARDIO,URINARY,MUSCKELET,FATIGUE,NHPT-before,PASAT_2s-before,PASAT_3s-before,SDMT-before,T25FW-before,SLEC_before,SES_before,BDI-before,EDSS-before,KFSS_M-before,KFSS_P-before,RAND36_M-before,RAND36_P-before,SEX_F,SEX_M,RACE_NON-WHITE,RACE_WHITE,CONTINENT_ASIA,CONTINENT_EURASIA,CONTINENT_EUROPE,CONTINENT_NORTH AMERICA,CONTINENT_OCEANIA,CONTINENT_SOUTH AMERICA,MHDIAGN_PPMS,MHDIAGN_RRMS,MHDIAGN_SPMS,M_R36-SF12-before,M_R36-SF12,P_R36-SF12-before,P_R36-SF12
1266,49.0,0,1,0,1,16.75,48.0,56.0,,4.85,,,,4.0,0.296296,0.166667,0.642857,0.582278,1,0,0,1,0,0,0,0,0,0,1,0,0,0.642857,1.0,0.582278,1.0
437,40.0,1,0,0,0,32.65,,38.0,43.5,5.00,16.5,1.250000,0.000000,2.0,0.148148,0.083333,,,1,0,0,1,0,0,1,0,0,0,0,1,0,0.566667,0.0,0.653846,0.0
106,48.0,1,1,0,1,20.40,46.0,55.0,,3.15,,,,5.0,0.259259,0.166667,0.671429,0.721519,0,1,0,1,0,0,0,0,0,0,1,0,0,0.671429,1.0,0.721519,1.0
2365,38.0,1,0,0,0,44.55,,43.5,41.0,22.10,25.0,0.800000,0.015873,3.0,0.203704,0.041667,,,1,0,1,0,1,0,0,0,0,0,0,1,0,0.633333,0.0,0.576923,0.0
12,45.0,1,0,1,0,25.00,,30.5,,6.25,32.0,0.888889,,2.5,0.185185,0.083333,0.714286,0.594937,1,0,0,1,0,0,1,0,0,0,0,1,0,0.714286,1.0,0.594937,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724,38.0,0,0,0,0,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,1,0,,,,
1318,,1,1,1,1,,,,,8.05,,,,4.0,,,,,0,1,0,1,0,0,0,1,0,0,0,0,1,,,,
2456,37.0,0,0,1,0,36.35,,32.0,34.5,6.00,11.5,0.888889,,2.5,0.148148,0.000000,,,0,1,0,1,0,0,1,0,0,0,0,1,0,0.566667,0.0,0.653846,0.0
1707,33.0,0,0,0,0,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,1,0,,,,


In [196]:
# Fit the chain on the thesis data: three regression targets followed by one
# classification target, in the column order of y_train.
chain = Chain(
        model_reg=RandomForestRegressor(random_state=42),
        model_clf=RandomForestClassifier(random_state=42),
        propagate="pred",
    )
print("Done 1")   
# target_types must list one entry per target column
chain.fit(X_train, y_train, target_types=["reg","reg","reg","clf"]) #,"clf"
print("Done 2")
# NOTE(review): y_test is handed to predict(); confirm that with propagate="pred"
# the chain does not use the true test targets when producing predictions.
y_pred = chain.predict(X_test,y_test)
print("Done 3")


Done 1
Done 2
Done 3


In [197]:
# Does the first predicted target contain any missing values?
y_pred.iloc[:, 0].isna().any()

False

In [198]:
# Does the first true test target contain any missing values? (such rows must be excluded before scoring)
y_test.iloc[:, 0].isna().any()

True

In [199]:
# For each of the four targets keep only the rows whose true test value is
# observed, applying the same row filter to the matching prediction column.
kept = []
for position in range(4):
    truth = y_test.iloc[:, position]
    missing_rows_mask = truth.isna()
    kept.append((truth[~missing_rows_mask], y_pred.iloc[:, position][~missing_rows_mask]))

# Unpack into the per-target names used by the scoring cell
(y_test1, y_pred1), (y_test2, y_pred2), (y_test3, y_pred3), (y_test4, y_pred4) = kept

#missing_rows_mask = y_test.iloc[:, 4].isna()
#y_test5 = y_test.iloc[:, 4][~missing_rows_mask]
#y_pred5 = y_pred.iloc[:, 4][~missing_rows_mask]

In [200]:
# R^2 for the three regression targets, accuracy for the classification target
regression_pairs = [(y_test1, y_pred1), (y_test2, y_pred2), (y_test3, y_pred3)]
scores = [r2_score(truth, guess) for truth, guess in regression_pairs]
scores.append(accuracy_score(y_test4, y_pred4))
print(f"Scores = {scores[0]:.2f}, {scores[1]:.2f}, {scores[2]:.2f}, {scores[3]:.2f}")

Scores = 0.90, 0.80, 0.64, 0.62


These are the results when predicting y_train again (which was already used to train the model)

*Questions/Notes*: 
- Why don't we do chain.predict(X, y)? How is the chain using y_test in this case?
- Using propagate="true" always gives a score of 1 (weird)
- When a "clf" target is included, the chain has problems predicting the targets that follow it (numeric or categorical)
- when trying to predict SMSTDY it gave a score of -0.03 :/