# Feature selection ;)

In [32]:
import pickle
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sksurv.preprocessing import OneHotEncoder as SurvOneHotEncoder
from sksurv.util import Surv

from sksurv.column import encode_categorical
from sksurv.column import standardize
from sksurv.util import Surv

from sklearn.model_selection import train_test_split
from sksurv.ensemble import RandomSurvivalForest
from sksurv.linear_model import CoxnetSurvivalAnalysis


In [33]:
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)

def evaluate_model_uno_c(model, test_X, test_y, train_y, times):
    pred = model.predict(test_X)
    uno_concordance = concordance_index_ipcw(train_y, test_y, pred, tau=times[-1])
    return uno_concordance

In [34]:
pickle_file = 'data/COX_DATA_FULL_LIVING.pkl'

with open(pickle_file, 'rb') as f:
    dataset = pickle.load(f)

dataset.drop(['DIAG_KI', 'COD_KI'], axis=1, inplace=True)

In [35]:
dataset["HBV_CORE"].value_counts()

N     32646
ND     4997
P      1806
Name: HBV_CORE, dtype: int64

In [36]:
yes_categorical=["PRE_TX_TXFUS", 
                "GENDER",
                "ON_DIALYSIS", 
                "ABO_MAT", 
                "ETHCAT", 
                "ETHCAT_DON",
                'HBV_CORE', 
                'DIAB',
                'HBV_SUR_ANTIGEN', 
                'HCV_SEROSTATUS',  
                'LIV_DON_TY',
                # "ABO", # negative importance
                # "ABO_DON", # negative importance
                # 'HBV_SUR_ANTIGEN_DON',  # negative importance
                # "GENDER_DON", # negative importance
                # "DIABETES_DON", # zero importance
                
            ]
yes_numerical = [   
                "KI_CREAT_PREOP",
                "SERUM_CREAT",
                'BMIS', 
                "NPKID", 
                "AGE", 
                "HGT_CM_CALC", 
                "BMI_DON_CALC",
                "AGE_DON", 
                'DR1', 
                'BW6', 
                'BW4', 
                'RA1',
                'A2', 
                'DR53', 
                'C1', 
                'C2', 
                'A1',
                'RA2', 
                'DR51',
                'DR52', 
                'DQ2', 
                'RDR1',
                # 'AMIS', # negative importance    
                # "WGT_KG_CALC", # negative importance
                # 'DR52_2', # zero importance
                # 'DR53_2', # zero importance
                # 'B2', # negative importance
                # "CREAT_TRR", # negative importance
                # 'DR2', # negative importance
                # 'RB1',# negative importance
                # 'B1', # negative importance
                # "BMI_CALC", # negative importance 
                # "WGT_KG_DON_CALC", # negative importance
                # "HGT_CM_DON_CALC", # negative importance
                # 'DQ1',# negative importance
                # 'RB2',# negative importance
                # 'RDR2',# negative importance
                ]

In [37]:
dataset.columns[dataset.isna().any()].tolist()

[]

In [38]:
numeric_features = [x for x in yes_numerical if x != "PTIME" and x != "PSTATUS"]

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformations for all features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, yes_numerical),
        ('cat', categorical_transformer, yes_categorical)
    ]
)

# Set up the final pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Apply preprocessing to X
X = pipeline.fit_transform(dataset[yes_numerical + yes_categorical])

# categorical_x = encode_categorical(dataset[yes_categorical])
# numerical_x = standardize(dataset[yes_numerical])

# X = pd.concat([numerical_x, categorical_x], axis=1)

survival_time = dataset["PTIME"].astype(np.float64)
event = dataset["PSTATUS"].astype(float).astype(bool)

y = Surv.from_arrays(event, survival_time, "Status", "Days")

In [39]:
# save the preprocessor
with open('pickle/pipeline_coxnet_living.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y["Status"], random_state=42)

In [41]:
from tqdm import tqdm
import numpy as np

best_params = None
highest_cindex=0

lower, upper = np.percentile(y["Days"], [10, 90])
times = np.arange(lower, upper + 1)

cox = CoxnetSurvivalAnalysis(l1_ratio=0.9, alpha_min_ratio=0.01, max_iter=100)
cox.fit(X_train, y_train)

estimated_alphas = cox.alphas_
l1_ratios = [0.001]

cox_grid=CoxnetSurvivalAnalysis()

pbar = tqdm(total = len(estimated_alphas)*len(l1_ratios), desc='Hyperparameter Tuning')

for ratio in l1_ratios:
    for alpha in estimated_alphas:
        cox_grid.set_params(l1_ratio=ratio, alphas=[alpha])

        cox_grid.fit(X_train, y_train)
        uno_score = evaluate_model_uno_c(cox_grid, X_test, y_test, y_train, times)

        uno = float(uno_score[0])
        if uno > highest_cindex:
            highest_cindex = uno
            best_params = (ratio, alpha)
            best_rsf_model = cox_grid

        # Update the progress bar
        pbar.update()

  cox.fit(X_train, y_train)
Hyperparameter Tuning: 100%|██████████| 100/100 [24:12:25<00:00, 871.45s/it]


In [42]:
highest_cindex # start: 0.7240850453830636 // 0.7174403712287055

0.7329622178077422

In [43]:
best_params

(0.001, 0.002029078035659028)

In [44]:
coxnet_pred = CoxnetSurvivalAnalysis(l1_ratio=best_params[0],alphas=[best_params[1]], fit_baseline_model=True)
coxnet_pred.fit(X_train, y_train)

In [45]:
lower, upper = np.percentile(y["Days"], [10, 90])
times = np.arange(lower, upper + 1)

evaluate_model_uno_c(coxnet_pred, X_test, y_test, y_train, times)

(0.7329622178077422, 7095493, 2623564, 0, 2389)

Interesting: the use of scikit-learn transformers alone increases model's c-index from 0.717 to o.733

In [48]:
import pickle

# Specify the file path where you want to save the pickle file
pickle_file = 'pickle/COXNET_LIVING_MODEL.pickle'

# Save the coxnet_pred object into the pickle file
with open(pickle_file, 'wb') as f:
    pickle.dump(coxnet_pred, f)


In [None]:
n

In [47]:
from sklearn.inspection import permutation_importance

result = permutation_importance(coxnet_pred, X_test, y_test, n_repeats=10, random_state=0, n_jobs=-1)

KeyboardInterrupt: 

In [None]:
pd.set_option('display.max_rows', None)

columns = numeric_features + categorical_features

importances_df = pd.DataFrame(result.importances_mean, index=X_test.columns)
importances_df.columns = ['Importance']
importances_df.sort_values(by='Importance', ascending=False, inplace=True)

# Print out feature importances
print(importances_df)

NameError: name 'categorical_features' is not defined

In [None]:
# TODO: transform data with the transformer

In [None]:
# TODO: train model with transformed data