In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sksurv.preprocessing import OneHotEncoder as SurvOneHotEncoder
from sksurv.util import Surv

from sksurv.column import encode_categorical
from sksurv.column import standardize
from sksurv.util import Surv

from sklearn.model_selection import train_test_split
from sksurv.ensemble import RandomSurvivalForest
from sksurv.linear_model import CoxnetSurvivalAnalysis

from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)

def evaluate_model_uno_c(model, test_X, test_y, train_y, times):
    """Score a fitted survival model on held-out data via ``concordance_index_ipcw``.

    Parameters
    ----------
    model
        Fitted estimator; only its ``predict`` method is used.
    test_X
        Features handed to ``model.predict``.
    test_y
        Survival outcomes that correspond to ``test_X``.
    train_y
        Survival outcomes passed as the first argument to
        ``concordance_index_ipcw``.
    times
        Indexable sequence of time points; only the last element is read and
        it is used as the ``tau`` argument.

    Returns
    -------
    The value returned by ``concordance_index_ipcw``, passed through unchanged.
    NOTE(review): per the scikit-survival docs this is a tuple whose first
    element is the concordance index -- confirm callers expect the tuple.
    """
    risk_scores = model.predict(test_X)
    truncation_time = times[-1]
    return concordance_index_ipcw(train_y, test_y, risk_scores, tau=truncation_time)

In [5]:
# Source of the modelling table. Alternative location used previously:
#   'data/COX_DATA_FULL_LIVING.pkl'
pickle_file = '../data/COX_DATA_FULL_LIVING_EXPERIMENTAL.pkl'

# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
with open(pickle_file, 'rb') as handle:
    dataset = pickle.load(handle)

# Remove the two columns that are not used by the rest of the notebook.
dataset = dataset.drop(columns=['DIAG_KI', 'COD_KI'])

In [6]:
# Categorical covariates (encoded later in the notebook).
yes_categorical = [
    "PRE_TX_TXFUS",
    "GENDER",
    "ON_DIALYSIS",
    "ETHCAT",
    "ETHCAT_DON",
    "DIAB",
    "HCV_SEROSTATUS",
    "LIV_DON_TY",
    "ABO_MAT",
    "HBV_CORE",
]

# Numerical covariates (scaled later in the notebook).
# "SERUM_CREAT" is deliberately left out: it might be data leakage, as it is
# measured after the transplant.
yes_numerical = [
    "AGE",
    "AGE_DON",
    "DIALYSIS_TIME",
    "KI_CREAT_PREOP",  # negative importance
    "NPKID",           # negative importance
    "HGT_CM_CALC",     # negative importance
    "BMI_DON_CALC",    # negative importance
]

In [7]:
# Numeric columns with the outcome columns filtered out.
numeric_features = [col for col in yes_numerical if col not in ("PTIME", "PSTATUS")]

# --- scikit-learn preprocessing pipeline -----------------------------------
# Built here but not applied in this cell (the fit_transform call is disabled
# below); X is instead produced by the scikit-survival helpers.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

categorical_transformer = Pipeline(
    # maybe it's better to use not ignore
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, yes_numerical),
        ("cat", categorical_transformer, yes_categorical),
    ]
)

pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

# Disabled alternative that applies the pipeline above:
# X = pipeline.fit_transform(dataset[yes_categorical + yes_numerical])

# --- feature matrix used for feature importance ----------------------------
# NOTE(review): standardize() runs on the whole dataset, i.e. before the
# train/test split done later in the notebook, so held-out rows presumably
# influence the scaling statistics -- confirm this is acceptable.
numerical_x = standardize(dataset[yes_numerical])
categorical_x = encode_categorical(dataset[yes_categorical])
X = pd.concat([numerical_x, categorical_x], axis=1)

# --- survival outcome -------------------------------------------------------
# NOTE(review): a NaN in PSTATUS would become True after the float -> bool
# cast; verify the column has no missing values.
event = dataset["PSTATUS"].astype(float).astype(bool)
survival_time = dataset["PTIME"].astype(np.float64)

y = Surv.from_arrays(
    event=event,
    time=survival_time,
    name_event="Status",
    name_time="Days",
)

In [8]:
# Hold out 20% of the rows, stratified on the event indicator, with a fixed
# seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y["Status"],
    random_state=42,
)

In [9]:
from sksurv.ensemble import GradientBoostingSurvivalAnalysis

# Small gradient-boosted survival model (3 boosting stages, learning rate 1).
# random_state is pinned so that re-running the notebook reproduces the same
# fitted model and score; the split and the permutation-importance step are
# already seeded, this estimator was the only unseeded step.
est = GradientBoostingSurvivalAnalysis(n_estimators=3, learning_rate=1, random_state=42)
est.fit(X_train, y_train)

# Last expression of the cell: the model's score on the held-out split
# (per the scikit-survival docs, `score` reports a concordance index).
est.score(X_test, y_test)

0.6968632652197185

In [10]:
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted model on the held-out split.
# The input data must not be processed by the sklearn pipeline.
result = permutation_importance(
    est,
    X_test,
    y_test,
    n_repeats=10,
    random_state=0,
    n_jobs=-1,
)

In [11]:
# Lift the row limit so the full table is printed.
pd.set_option("display.max_rows", None)

# One row per feature column, ranked from most to least important by the mean
# importance over the permutation repeats.
importances_df = pd.DataFrame(
    {"Importance": result.importances_mean},
    index=X_train.columns,
).sort_values(by="Importance", ascending=False)

# Print out feature importances
print(importances_df)

                   Importance
AGE                  0.130638
DIAB=3.0             0.018859
DIAB=5.0             0.011912
DIAB=2.0             0.007063
DIALYSIS_TIME        0.005974
ON_DIALYSIS=Y        0.005330
HGT_CM_CALC          0.000043
LIV_DON_TY=4.0       0.000000
HCV_SEROSTATUS=ND    0.000000
HCV_SEROSTATUS=P     0.000000
LIV_DON_TY=10.0      0.000000
LIV_DON_TY=11.0      0.000000
LIV_DON_TY=12.0      0.000000
LIV_DON_TY=2.0       0.000000
LIV_DON_TY=3.0       0.000000
NPKID                0.000000
DIAB=998.0           0.000000
LIV_DON_TY=6.0       0.000000
LIV_DON_TY=7.0       0.000000
LIV_DON_TY=8.0       0.000000
LIV_DON_TY=9.0       0.000000
LIV_DON_TY=999.0     0.000000
ABO_MAT=2.0          0.000000
ABO_MAT=3.0          0.000000
HBV_CORE=ND          0.000000
LIV_DON_TY=5.0       0.000000
DIAB=4.0             0.000000
KI_CREAT_PREOP       0.000000
AGE_DON              0.000000
BMI_DON_CALC         0.000000
PRE_TX_TXFUS=Y       0.000000
GENDER=M             0.000000
ETHCAT=2  