In [122]:
import pickle
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sksurv.preprocessing import OneHotEncoder as SurvOneHotEncoder
from sksurv.util import Surv

from sksurv.column import encode_categorical
from sksurv.column import standardize

# Train data
# pickle_file = 'data/DATA_COXNET_DECEASED.pickle'
# with open(pickle_file, 'rb') as f:
#     X, y = pickle.load(f)
pd.set_option('display.max_rows', 500)

In [123]:
pickle_file = 'data/DATA_DECEASED.pkl'

with open(pickle_file, 'rb') as f:
    dataset = pickle.load(f)

In [124]:
dataset.head()

Unnamed: 0_level_0,NPKID,PTIME,DIAB,RDR2,PSTATUS,BMI_CALC,AGE_DON,DR2,PRE_TX_TXFUS,ON_DIALYSIS,ETHCAT,COLD_ISCH_KI,HCV_SEROSTATUS,DIABETES_DON,CREAT_TRR,AGE,CREAT_DON,DR53,GENDER
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
191,0.0,3501.0,5.0,103.0,1.0,21.1,65.0,13.0,N,N,4,20.4,P,N,10.7,47.0,1.2,0.0,M
1891,0.0,6479.0,1.0,97.0,1.0,28.7,20.0,0.0,Y,N,2,25.0,N,N,22.2,40.0,1.0,0.0,M
2420,1.0,5460.0,1.0,11.0,0.0,34.4,20.0,11.0,N,N,2,16.0,N,N,23.6,28.0,1.0,0.0,M
3705,0.0,6008.0,5.0,10.0,1.0,35.7,39.0,10.0,Y,N,1,20.0,N,N,11.8,42.0,0.8,0.0,F
4702,0.0,5277.0,1.0,14.0,0.0,21.9,44.0,14.0,N,N,4,21.3,N,N,5.5,42.0,2.6,0.0,F


In [125]:
numeric_features = [
    "AGE",
    "BMI_CALC",
    "AGE_DON",
    "CREAT_TRR",
    "NPKID",
    "COLD_ISCH_KI",
]
categorical_features = [
    "ON_DIALYSIS",
    "PRE_TX_TXFUS",
    "GENDER",
    "ETHCAT",
    "DIABETES_DON",
    "DIAB",
    "HCV_SEROSTATUS",
]


In [126]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformations for all features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Set up the final pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Apply preprocessing to X
X = pipeline.fit_transform(dataset[categorical_features + numeric_features ])

# categorical_x = encode_categorical(dataset[categorical_features])
# numerical_x = standardize(dataset[numeric_features])
# X = pd.concat([numerical_x, categorical_x], axis=1)

survival_time = dataset["PTIME"].astype(np.float64)
event = dataset["PSTATUS"].astype(float).astype(bool)

y = Surv.from_arrays(event, survival_time, "Status", "Days")

with open('pickle/trained_pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [127]:
from sksurv.column import encode_categorical
from sksurv.column import standardize
from sksurv.util import Surv

from surv_data_pipeline.columns import COLUMNS
from sklearn.model_selection import train_test_split
from sksurv.ensemble import RandomSurvivalForest

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y["Status"], random_state=42)


In [128]:
from sksurv.linear_model import CoxnetSurvivalAnalysis

cox = CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.01, max_iter=1000)
cox.fit(X_train, y_train)

estimated_alphas = cox.alphas_
l1_ratios = [0.8]

In [129]:
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)

def evaluate_model_uno_c(model, test_X, test_y, train_y, times):
    pred = model.predict(test_X)
    uno_concordance = concordance_index_ipcw(train_y, test_y, pred, tau=times[-1])
    return uno_concordance

In [130]:
from tqdm import tqdm
import numpy as np

best_params = None
highest_cindex=0

lower, upper = np.percentile(y["Days"], [10, 90])
times = np.arange(lower, upper + 1)

cox_grid=CoxnetSurvivalAnalysis()

pbar = tqdm(total = len(estimated_alphas)*len(l1_ratios), desc='Hyperparameter Tuning')

for ratio in l1_ratios:
    for alpha in estimated_alphas:
        cox_grid.set_params(l1_ratio=ratio, alphas=[alpha])

        cox_grid.fit(X_train, y_train)
        uno_score = evaluate_model_uno_c(cox_grid, X_test, y_test, y_train, times)

        uno = float(uno_score[0])
        if uno > highest_cindex:
            highest_cindex = uno
            best_params = (ratio, alpha)
            best_rsf_model = cox_grid

        # Update the progress bar
        pbar.update()

Hyperparameter Tuning: 100%|██████████| 100/100 [14:07<00:00,  8.47s/it]


In [131]:
highest_cindex

0.6887661166608358

In [132]:
best_params

(0.8, 0.0027421014675726627)

In [133]:
coxnet_pred = CoxnetSurvivalAnalysis(l1_ratio=best_params[0],alphas=[best_params[1]], fit_baseline_model=True)
coxnet_pred.fit(X_train, y_train)

In [134]:
lower, upper = np.percentile(y["Days"], [10, 90])
times = np.arange(lower, upper + 1)

evaluate_model_uno_c(coxnet_pred, X_test, y_test, y_train, times)

(0.6887661166608358, 101924831, 46456787, 0, 22418)

0.689 without HLA. without scikit pipeline
0.688 with HLA with scikit pipeline

In [135]:
import pickle

with open('pickle/COXNET_DECEASED_MODEL.pickle', 'wb') as f:
    pickle.dump(coxnet_pred, f)

In [136]:
prediction = coxnet_pred.predict_survival_function(X_test)

Exception ignored in: <function tqdm.__del__ at 0x17f05ad40>
Traceback (most recent call last):
  File "/Users/kyrylo/anaconda3/envs/kidney-life/lib/python3.11/site-packages/tqdm/std.py", line 1145, in __del__
    self.close()
  File "/Users/kyrylo/anaconda3/envs/kidney-life/lib/python3.11/site-packages/tqdm/notebook.py", line 283, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'
Exception ignored in: <function tqdm.__del__ at 0x17f05ad40>
Traceback (most recent call last):
  File "/Users/kyrylo/anaconda3/envs/kidney-life/lib/python3.11/site-packages/tqdm/std.py", line 1145, in __del__
    self.close()
  File "/Users/kyrylo/anaconda3/envs/kidney-life/lib/python3.11/site-packages/tqdm/notebook.py", line 283, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


In [137]:
prediction.shape

(23514,)

In [138]:
from sklearn.inspection import permutation_importance


result = permutation_importance(coxnet_pred, X_test, y_test, n_repeats=10, random_state=0, n_jobs=1)

In [139]:
pd.set_option('display.max_rows', 500)

columns = numeric_features + categorical_features

importances_df = pd.DataFrame(result.importances_mean, index=X.columns)
importances_df.columns = ['Importance']
importances_df.sort_values(by='Importance', ascending=False, inplace=True)

# Print out feature importances
importances_df

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
dataset.head()

Unnamed: 0_level_0,NPKID,PTIME,DIAB,RDR2,PSTATUS,BMI_CALC,AGE_DON,DR2,PRE_TX_TXFUS,ON_DIALYSIS,ETHCAT,COLD_ISCH_KI,HCV_SEROSTATUS,DIABETES_DON,CREAT_TRR,AGE,CREAT_DON,DR53,GENDER
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
191,0.0,3501.0,5.0,103.0,1.0,21.1,65.0,13.0,N,N,4,20.4,P,N,10.7,47.0,1.2,0.0,M
1891,0.0,6479.0,1.0,97.0,1.0,28.7,20.0,0.0,Y,N,2,25.0,N,N,22.2,40.0,1.0,0.0,M
2420,1.0,5460.0,1.0,11.0,0.0,34.4,20.0,11.0,N,N,2,16.0,N,N,23.6,28.0,1.0,0.0,M
3705,0.0,6008.0,5.0,10.0,1.0,35.7,39.0,10.0,Y,N,1,20.0,N,N,11.8,42.0,0.8,0.0,F
4702,0.0,5277.0,1.0,14.0,0.0,21.9,44.0,14.0,N,N,4,21.3,N,N,5.5,42.0,2.6,0.0,F
