In [None]:
import os 
import re
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from itertools import product
from tqdm import tqdm
import time

from sksurv.linear_model import CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest, GradientBoostingSurvivalAnalysis

from sksurv.metrics import cumulative_dynamic_auc, concordance_index_censored, integrated_brier_score

from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from scipy.stats import sem

import importlib.util
spec = importlib.util.spec_from_file_location('script_utils', '/PHShome/jpc91/clinical_text_project/python_scripts/script_utils.py') 
script_utils = importlib.util.module_from_spec(spec)	 
spec.loader.exec_module(script_utils)

import warnings
warnings.filterwarnings("ignore")

# --- Directory layout for the survival data and the batched note datasets ---
data_path = '/data/gusev/USERS/jpconnor/clinical_text_project/data/'
surv_path = data_path + 'survival_data/'
notes_path = data_path + 'batched_datasets/VTE_data/processed_datasets/'

# --- Prediction frames: time-decayed embeddings and mean embeddings ---
time_decayed_events_df = pd.read_csv(
    surv_path + 'time-to-icd/full_vte_decayed_embeddings_pred_df.csv',
    index_col=0,
)
# cancer_stage_type_df = time_decayed_events_df[['DFCI_MRN', 'CANCER_STAGE', 'CANCER_TYPE']]

# Rows with any missing value are dropped from the mean-embedding frame only.
# mean_decayed_events_df = pd.read_csv(surv_path + 'time-to-icd/full_vte_mean_embeddings_pred_df.csv', index_col=0).drop(columns=['CANCER_STAGE', 'CANCER_TYPE']).dropna()
mean_decayed_events_df = pd.read_csv(
    surv_path + 'time-to-icd/full_vte_mean_embeddings_pred_df.csv',
    index_col=0,
).dropna()

# --- Keep only the patients (DFCI_MRN) that appear in both frames ---
time_decayed_mrns = set(time_decayed_events_df['DFCI_MRN'].unique())
mean_decayed_mrns = set(mean_decayed_events_df['DFCI_MRN'].unique())
mrns_to_analyze = list(time_decayed_mrns & mean_decayed_mrns)

mean_decayed_events_df = mean_decayed_events_df[mean_decayed_events_df['DFCI_MRN'].isin(mrns_to_analyze)]
time_decayed_events_df = time_decayed_events_df[time_decayed_events_df['DFCI_MRN'].isin(mrns_to_analyze)]

# cancer_stage_type_df = cancer_stage_type_df.loc[cancer_stage_type_df['DFCI_MRN'].isin(mrns_to_analyze)]

# --- Entries of the ICD prediction results directory ---
# (data_path is already bound to this root above, so it is not reassigned here.)
results_path = data_path + 'survival_data/results/ICD_predictions_v2/'
events = os.listdir(results_path)

In [None]:
# Number of distinct DFCI_MRN values present in both the time-decayed and the
# mean-embedding frames (the cohort both frames were filtered down to).
len(mrns_to_analyze)

19479

In [None]:
# Load script_utils from its file path again, repeating the steps of the first
# cell and rebinding `spec` and `script_utils`.
# NOTE(review): presumably kept so edits to script_utils.py can be picked up
# without restarting the kernel — confirm, or drop this duplicate cell.
import importlib.util
spec = importlib.util.spec_from_file_location('script_utils', '/PHShome/jpc91/clinical_text_project/python_scripts/script_utils.py') 
script_utils = importlib.util.module_from_spec(spec)	 
spec.loader.exec_module(script_utils)

In [None]:
def _columns_containing(frame, token):
    """Return the column labels of ``frame`` that contain ``token``, in column order."""
    return [label for label in frame.columns if token in label]


# Complete cases only: every row with at least one missing value is removed.
events_df_w_stage = time_decayed_events_df.dropna()

# One-hot encode cancer stage and type; the first level of each is dropped.
events_df_w_stage_dummies = pd.get_dummies(
    events_df_w_stage,
    columns=['CANCER_STAGE', 'CANCER_TYPE'],
    drop_first=True,
)

# Column groups, selected by substring match on the column label.
type_cols = _columns_containing(events_df_w_stage_dummies, 'CANCER_TYPE')
stage_cols = _columns_containing(events_df_w_stage_dummies, 'CANCER_STAGE')
embed_cols = _columns_containing(events_df_w_stage_dummies, 'EMBEDDING')

In [None]:
# Display the complete-case frame (time_decayed_events_df after dropna()).
# NOTE(review): the recorded output of this cell shows patient-level values
# (DFCI_MRN, first_treatment_date, ...) — clear or redact the output before
# sharing or committing the notebook.
events_df_w_stage

Unnamed: 0,DFCI_MRN,AGE_AT_TREATMENTSTART,GENDER,first_treatment_date,tt_death,death,tt_vte,vte,tt_R97.0,R97.0,...,PATHOLOGY_EMBEDDING_758,PATHOLOGY_EMBEDDING_759,PATHOLOGY_EMBEDDING_760,PATHOLOGY_EMBEDDING_761,PATHOLOGY_EMBEDDING_762,PATHOLOGY_EMBEDDING_763,PATHOLOGY_EMBEDDING_764,PATHOLOGY_EMBEDDING_765,PATHOLOGY_EMBEDDING_766,PATHOLOGY_EMBEDDING_767
3,107014,52,1.0,2015-08-12,272.0,1,57.0,1,272.0,0.0,...,0.450859,0.243804,-0.206363,0.194647,-0.169599,-0.044956,-0.419032,0.086813,-0.133701,-0.297517
27,125735,39,0.0,2018-12-20,2238.0,1,2238.0,0,2238.0,0.0,...,0.527641,0.337430,-0.359544,0.198400,-0.217727,-0.004087,-0.485390,0.107006,-0.100065,-0.299856
45,131032,65,0.0,2022-04-14,76.0,1,76.0,0,76.0,0.0,...,0.472568,0.274600,-0.238545,0.190206,-0.161633,-0.039321,-0.452198,0.083708,-0.068232,-0.351894
50,132040,44,1.0,2021-01-14,1384.0,0,1384.0,0,1384.0,0.0,...,0.446520,0.224585,-0.199595,0.186995,-0.162346,-0.039413,-0.416169,0.102057,-0.086084,-0.361926
52,133178,44,1.0,2017-12-01,2616.0,0,159.0,1,2616.0,0.0,...,0.438667,0.217678,-0.197282,0.174760,-0.163654,-0.028163,-0.424951,0.103675,-0.065650,-0.387339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27849,1119055,78,0.0,2018-08-14,2388.0,0,2388.0,0,2388.0,0.0,...,0.465862,0.290659,-0.268181,0.177764,-0.160730,-0.012528,-0.446080,0.087141,-0.061920,-0.334991
27850,1119086,67,0.0,2023-02-21,731.0,0,731.0,0,731.0,0.0,...,0.440540,0.257709,-0.222845,0.169254,-0.160611,-0.014457,-0.410675,0.083980,-0.044364,-0.357464
27857,1119217,18,0.0,2023-02-09,707.0,0,707.0,0,707.0,0.0,...,0.436461,0.247039,-0.216796,0.202496,-0.169259,-0.025037,-0.420010,0.102314,-0.066606,-0.360219
27931,1122152,72,0.0,2023-02-13,447.0,1,28.0,1,447.0,0.0,...,0.460997,0.241586,-0.177401,0.176394,-0.179976,0.003976,-0.378216,0.072007,-0.072245,-0.372534


In [None]:
# Baseline Cox proportional-hazards model on gender, age and the one-hot cancer
# type/stage columns, scored with 5-fold cross-validation on the 80%
# train+validation split. The 20% test split is held out and not used here.
Xt = events_df_w_stage_dummies[['GENDER', 'AGE_AT_TREATMENTSTART'] + type_cols + stage_cols]

# Structured outcome array: boolean event indicator ('death') and follow-up
# time ('tt_death'), in the layout expected by scikit-survival estimators.
y = np.asarray(list(zip(events_df_w_stage_dummies['death'], events_df_w_stage_dummies['tt_death'])),
               dtype=[('Status', '?'), ('Survival_in_days', '<f8')])
continuous_vars = ['AGE_AT_TREATMENTSTART']

# Evaluation time grid: unit steps from the 25th to the 75th percentile of the
# observed times.
lower, upper = np.percentile(y['Survival_in_days'], [25, 75])
eval_times = np.arange(lower, upper + 1)

X_train_plus_val, X_test, y_train_plus_val, y_test = train_test_split(Xt, y, test_size=0.2, random_state=1234)

cv = KFold(n_splits=5, shuffle=True, random_state=1234)

results_list = []

# One entry per fold; NaN marks a fold whose evaluation failed.
c_index_vals = []
mean_auc_t_vals = []
ibs_vals = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_plus_val, y_train_plus_val)):
    # KFold.split yields positions relative to the arrays it was given, so the
    # folds must be taken from the train+validation split. Indexing the full
    # Xt / y here would select the wrong rows and leak held-out test rows
    # into cross-validation.
    X_train, y_train = X_train_plus_val.iloc[train_idx], y_train_plus_val[train_idx]
    X_val, y_val = X_train_plus_val.iloc[val_idx], y_train_plus_val[val_idx]

    X_train, X_val = script_utils.scale_model_data(X_train, X_val, continuous_vars)
    cox_model = CoxPHSurvivalAnalysis().fit(X_train, y_train)

    try:
        mean_auc_t, ibs, c_index = script_utils.evaluate_surv_model(cox_model, X_val, y_train, y_val, eval_times)
    except Exception as exc:
        # Best effort: keep going with NaN for this fold, but report why.
        # Warnings are globally silenced in this notebook, hence print().
        # KeyboardInterrupt / SystemExit are intentionally not caught.
        print(f'Fold {fold}: evaluation failed ({exc!r}); recording NaN')
        mean_auc_t = ibs = c_index = np.nan

    c_index_vals.append(c_index)
    mean_auc_t_vals.append(mean_auc_t)
    ibs_vals.append(ibs)

In [None]:
# Mean of the per-fold time-dependent AUC values, printed with two decimals
# (the space flag in the format spec pads non-negative numbers with a blank).
# np.mean returns NaN if any fold recorded NaN.
print(f'Average mean_auc_t = {np.mean(mean_auc_t_vals) : 0.2f}')

Average mean_auc_t =  0.68


In [None]:
# Hyper-parameter grid for script_utils.run_grid_CoxPH: 10 alphas evenly spaced
# from 3 down to 1e-3, crossed with 3 l1_ratios from 1e-4 to 1.0
# (30 combinations; the recorded run took about 9 minutes).
# NOTE(review): run_grid_CoxPH is defined outside this notebook. The three
# positional column lists are presumably (base covariates, continuous columns
# to scale, embedding columns) — confirm against script_utils.
# Only the second return value (the results table) is kept.
alphas_to_test=np.linspace(3, 1e-3, 10)
l1_ratios=np.linspace(0.0001, 1.0, 3)
_, grid_results = script_utils.run_grid_CoxPH(events_df_w_stage_dummies, ['GENDER', 'AGE_AT_TREATMENTSTART'], ['AGE_AT_TREATMENTSTART'] + embed_cols, 
                                              embed_cols, l1_ratios, alphas_to_test, event_col='death', tstop_col='tt_death')

100%|██████████| 30/30 [09:20<00:00, 18.68s/it]


In [None]:
# Rank the grid-search configurations by mean time-dependent AUC, best first.
# NOTE(review): the recorded mean_ibs values range from about 2 up to ~2e8,
# whereas an integrated Brier score is normally well below 1 — check how the
# IBS is computed inside script_utils before relying on that column.
grid_results.sort_values(by='mean_auc(t)', ascending=False)

Unnamed: 0,l1_ratio,alpha,mean_c_index,sem_c_index,mean_auc(t),sem_auc(t),mean_ibs,sem_ibs
29,1.0,0.001,0.818388,0.003807,0.878525,0.00389,2039.093,1003.587
19,0.50005,0.001,0.811898,0.004723,0.87179,0.004839,52235.83,37406.27
8,0.0001,0.334222,0.805344,0.004029,0.865126,0.004438,8.433093,1.33418
7,0.0001,0.667444,0.795905,0.004008,0.854884,0.004474,4.595429,0.4571256
9,0.0001,0.001,0.795216,0.012613,0.852934,0.014351,211649700.0,199524800.0
6,0.0001,1.000667,0.789553,0.003944,0.84808,0.00455,3.42888,0.24244
5,0.0001,1.333889,0.784394,0.003898,0.842768,0.004512,2.848706,0.1541263
4,0.0001,1.667111,0.780215,0.003934,0.838185,0.004604,2.503558,0.1067111
3,0.0001,2.000333,0.776599,0.003883,0.834484,0.004531,2.262248,0.0891349
2,0.0001,2.333556,0.773529,0.003931,0.831253,0.004615,2.076169,0.06584107
