In [11]:
import pickle
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sksurv.preprocessing import OneHotEncoder as SurvOneHotEncoder
from sksurv.util import Surv

from sksurv.column import encode_categorical
from sksurv.column import standardize
from sksurv.util import Surv

from sklearn.model_selection import train_test_split
from sksurv.ensemble import RandomSurvivalForest
from sksurv.linear_model import CoxnetSurvivalAnalysis

from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)

def evaluate_model_uno_c(model, test_X, test_y, train_y, times):
    """Score ``model`` on held-out data with ``concordance_index_ipcw``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    test_X
        Feature rows handed to ``model.predict``.
    test_y
        Survival outcomes belonging to ``test_X``.
    train_y
        Survival outcomes of the training samples; forwarded as the first
        argument of ``concordance_index_ipcw``.
    times
        Indexable sequence of time points; only its last element is used,
        as the truncation time ``tau``.

    Returns
    -------
    The value returned by ``concordance_index_ipcw``, unmodified.
    """
    risk_scores = model.predict(test_X)
    truncation_time = times[-1]
    return concordance_index_ipcw(train_y, test_y, risk_scores, tau=truncation_time)

In [12]:
# Alternative (non-experimental) input: 'data/COX_DATA_FULL_LIVING.pkl'
pickle_file = '../data/COX_DATA_FULL_LIVING_EXPERIMENTAL.pkl'

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open(pickle_file, 'rb') as handle:
    dataset = pickle.load(handle)

# Discard the DIAG_KI and COD_KI columns before any feature selection.
dataset = dataset.drop(columns=['DIAG_KI', 'COD_KI'])

In [13]:
# Categorical covariates fed to the model.
yes_categorical = [
    "PRE_TX_TXFUS",
    "GENDER",
    "ON_DIALYSIS",
    "ETHCAT",
    "ETHCAT_DON",
    "DIAB",
    "HCV_SEROSTATUS",
    "LIV_DON_TY",
    "ABO_MAT",
    "HBV_CORE",
]

# Numerical covariates fed to the model.
# "SERUM_CREAT" is deliberately left out: it might be data leakage, as it is
# measured after the transplant.
yes_numerical = [
    "AGE",
    "AGE_DON",
    "DIALYSIS_TIME",
    "KI_CREAT_PREOP",  # negative importance
    "NPKID",  # negative importance
    "HGT_CM_CALC",  # negative importance
    "BMI_DON_CALC",  # negative importance
]

In [14]:
# Outcome columns: they define the survival target and must never be used as
# model inputs. Every feature selection below goes through the filtered lists,
# so adding a target column to `yes_numerical` / `yes_categorical` by mistake
# cannot leak it into X.
target_columns = ("PTIME", "PSTATUS")
numeric_features = [col for col in yes_numerical if col not in target_columns]
categorical_features = [col for col in yes_categorical if col not in target_columns]

# scikit-learn preprocessing pipeline, kept available as an alternative to the
# scikit-survival helpers used further down.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # maybe it's better to use not ignore
])

# Combine transformations for all features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Set up the final pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Alternative feature matrix built with the scikit-learn pipeline:
# X = pipeline.fit_transform(dataset[categorical_features + numeric_features])

# For calculating feature importance: the scikit-survival helpers return
# DataFrames, so X keeps named columns.
# NOTE(review): `standardize` is applied to the whole dataset before the
# train/test split, so scaling statistics include the test rows — confirm this
# is acceptable for the reported scores.
categorical_x = encode_categorical(dataset[categorical_features])
numerical_x = standardize(dataset[numeric_features])
X = pd.concat([numerical_x, categorical_x], axis=1)

# Structured survival target: boolean event flag plus observed time.
survival_time = dataset["PTIME"].astype(np.float64)
event = dataset["PSTATUS"].astype(float).astype(bool)

y = Surv.from_arrays(event=event, time=survival_time, name_event="Status", name_time="Days")

In [15]:
# Hold out 20% of the samples, stratified on the event indicator, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y["Status"],
    random_state=42,
)

In [16]:
# Random survival forest with 3 trees, using all available cores and a fixed seed.
rsf = RandomSurvivalForest(
    n_estimators=3,
    n_jobs=-1,
    random_state=42,
    low_memory=True,
)

# Fit on the training rows from position 1000 onward; the first 1000 are skipped.
fit_rows = slice(1000, None)
rsf.fit(X_train[fit_rows], y_train[fit_rows])

In [None]:
# Time grid spanning the 10th to 90th percentile of the observed times, in steps of 1.
day_percentiles = np.percentile(y["Days"], [10, 90])
lower, upper = day_percentiles
times = np.arange(lower, upper + 1)

# Score on the test rows from position 500 onward, using the same training
# rows (position 1000 onward) that the forest was fitted on.
eval_rows = slice(500, None)
train_rows = slice(1000, None)

# evaluate_model(rsf, X_test, y_test, y_train, times)
evaluate_model_uno_c(rsf, X_test[eval_rows], y_test[eval_rows], y_train[train_rows], times)

(0.6765997616352277, 11806774, 5596229, 4195, 3547) -- all features
(0.6926788145550595, 12066401, 5338051, 2746, 3547) -- minus 4 features
(0.6976510749974301, 12242994, 5160898, 3306, 3547) -- minus npkid
(0.7005440984901211, 12156021, 5248164, 3013, 3547) -- minus SERUM_CREAT

In [17]:
from sklearn.inspection import permutation_importance

# Permutation importance of every column of X_test for the fitted forest.
# X_test must be the DataFrame built above, not data processed by `pipeline`.
result = permutation_importance(
    rsf,
    X_test,
    y_test,
    n_repeats=10,
    random_state=0,
    n_jobs=-1,
)

In [18]:
# Lift the row limit so the complete table is printed.
pd.set_option('display.max_rows', None)

# Mean permutation importance per feature column, largest first.
importances_df = pd.DataFrame(
    {'Importance': result.importances_mean},
    index=X_train.columns,
).sort_values(by='Importance', ascending=False)

# Print out feature importances
print(importances_df)

                     Importance
AGE                8.611559e-02
DIAB=3.0           1.688482e-02
DIAB=5.0           1.378395e-02
ON_DIALYSIS=Y      1.165772e-02
DIAB=2.0           8.748764e-03
DIALYSIS_TIME      7.906618e-03
LIV_DON_TY=2.0     7.867650e-03
ETHCAT_DON=4.0     4.425040e-03
AGE_DON            3.802213e-03
GENDER=M           2.272702e-03
BMI_DON_CALC       2.229051e-03
LIV_DON_TY=4.0     2.221719e-03
KI_CREAT_PREOP     2.076924e-03
ETHCAT=5           1.648426e-03
HGT_CM_CALC        1.594232e-03
LIV_DON_TY=7.0     1.573311e-03
HCV_SEROSTATUS=P   1.525862e-03
NPKID              1.358497e-03
HBV_CORE=P         9.638380e-04
ETHCAT_DON=5.0     9.541697e-04
PRE_TX_TXFUS=Y     8.942728e-04
LIV_DON_TY=999.0   4.428517e-04
ETHCAT_DON=2.0     4.313275e-04
ETHCAT=4           2.667603e-04
LIV_DON_TY=9.0     2.500209e-04
HBV_CORE=ND        2.477265e-04
LIV_DON_TY=5.0     2.303398e-04
ETHCAT=6           2.064666e-04
DIAB=998.0         1.870126e-04
ETHCAT_DON=6.0     5.614606e-05
ETHCAT_D