### Import necesssary libraries

In [1]:
import pandas as pd
import numpy as np
import gc
import glob
import os
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn_pandas import DataFrameMapper
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, GroupKFold, KFold
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.metrics import brier_score_loss, make_scorer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.utils import shuffle
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek, SMOTEENN
import torch
import torchtuples as tt
from pycox.models import CoxPH
from pycox.evaluation import EvalSurv
from pycox.preprocessing.feature_transforms import OrderedCategoricalLong
from pycox.models.loss import NLLMTLRLoss
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.model_selection import KFold, GroupShuffleSplit
from sksurv.metrics import brier_score, integrated_brier_score, concordance_index_censored, cumulative_dynamic_auc
from sksurv.util import Surv
from scipy.stats import randint, uniform
from sklearn.model_selection import ParameterSampler
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import Bunch
import traceback
import pickle
gc.collect()

20

### Load the imputed training data set

In [2]:
# gc.collect()
# def load_imputed_datasets_hdf5(base_filename):
#     imputed_datasets = {}
#     with pd.HDFStore(f"{base_filename}.h5", 'r') as store:
#         for key in store.keys():
#             estimator_name, dataset_name = key.strip('/').split('/')
#             if estimator_name not in imputed_datasets:
#                 imputed_datasets[estimator_name] = []
#             imputed_datasets[estimator_name].append(store[key])

#     # Convert lists to arrays if necessary, or process them as needed
#     return imputed_datasets

# gc.collect()

0

In [2]:
def read_imputed_datasets_hdf5(base_filename):
    datasets = {}
    with pd.HDFStore(f"{base_filename}.h5", 'r') as store:
        for key in store.keys():
            # Split the key to get estimator_name and dataset identifier
            parts = key.strip('/').split('/')
            estimator_name = parts[0]
            dataset_name = '/'.join(parts[1:])
            if estimator_name not in datasets:
                datasets[estimator_name] = []
            datasets[estimator_name].append(store[key])
    return datasets

class RowFilter(BaseEstimator, TransformerMixin):
    def __init__(self, column, condition):
        self.column = column
        self.condition = condition
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()  # To avoid modifying the original DataFrame
        if self.column in X.columns:
            X = X[self.condition(X[self.column])]
        return X.reset_index(drop=True)

# Custom transformer for log transformation
class LogTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, offset=1e-6):
        self.offset = offset

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = pd.DataFrame(X)  # Ensure X is a DataFrame
        # Apply log transformation column-wise
        X = X.apply(lambda col: np.log(col + self.offset) if np.issubdtype(col.dtype, np.number) else col)
        return X

class DataFrameTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, column_names, dtypes=None):
        self.column_names = column_names
        self.dtypes = dtypes or {}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = pd.DataFrame(X, columns=self.column_names)
        for column, dtype in self.dtypes.items():
            X[column] = X[column].astype(dtype)
        return X
    
class DataFrameShuffler(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return shuffle(X).reset_index(drop=True)

# Define feature groups
cat_features = ['gender', 'dm', 'ht', 'sprint', 'endpoint']
log_features = ['a1c', 'po4', 'UACR_mg_g', 'Cr']
standard_features = ['age', 'alb', 'ca', 'hb', 'hco3']
impute_features = cat_features + log_features + standard_features
passthrough_features = ['key', 'date_from_sub_60']

dtypes = {
    **{name: 'float' for name in log_features + standard_features},
    **{name: 'category' for name in cat_features},
    'key': 'int'
    # 'date': 'datetime64[ns]'
}

RANDOM_SEED=12345

# Full pipeline with DataFrame preservation and shuffling
pipeline = Pipeline([
    ('impute', ColumnTransformer([
        ('imputer', IterativeImputer(estimator=BayesianRidge(), max_iter=10, random_state=RANDOM_SEED,
                                     initial_strategy='mean', n_nearest_features=None, min_value=1e-6,
                                   imputation_order='ascending'), impute_features),
        ('passthrough', 'passthrough', passthrough_features)
    ], remainder='drop')),
    ('to_df', DataFrameTransformer(impute_features + passthrough_features, dtypes)),
    ('process', ColumnTransformer([
        ('categorical', FunctionTransformer(lambda x: x.astype('category')), cat_features),
        ('log', LogTransformer(), log_features),
        ('scaler', StandardScaler(), standard_features),
        ('passthrough', 'passthrough', passthrough_features)
    ], remainder='drop')),
    ('to_df2', DataFrameTransformer(impute_features + passthrough_features, dtypes)),
    ('row_filter', RowFilter('date_from_sub_60', lambda x: x <= 1825)),  # Add this step
    ('shuffle', DataFrameShuffler())  # Shuffling step
])
X_load = read_imputed_datasets_hdf5('/mnt/d/pydatascience/g3_regress/data/X/X')
X_train = X_load['X_train_main'][0]
X_test = X_load['X_test_main'][0]
# X_train['gender'] = X_train['gender'].map({'M':1, 'F':0})
# X_test['gender'] = X_test['gender'].map({'M':1, 'F':0})

X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)

gc.collect()


238

### Load the endpoints, prepare pipeline

In [5]:
# def prepare_endpoint_data(endpoint_df):
#     # Convert date columns to datetime
#     date_columns = ['first_sub_60_date', 'first_sub_10_date', 'ot_date', 'death_date', 'first_endpoint_date']
#     for col in date_columns:
#         endpoint_df[col] = pd.to_datetime(endpoint_df[col], errors='coerce')  # Use coerce to handle any invalid date formats
    
#     # Drop the 'Unnamed: 0' column if it's just an artifact of reading from a CSV
#     if 'Unnamed: 0' in endpoint_df.columns:
#         endpoint_df = endpoint_df.drop(columns=['Unnamed: 0'])
    
#     return endpoint_df

# def merge_with_endpoint(imputed_datasets, endpoint_df):
#     # Prepare the endpoint data
#     endpoint_df = prepare_endpoint_data(endpoint_df)
#     # Dictionary to hold the merged data
#     merged_datasets = {}

#     # Loop over each estimator and its datasets
#     for estimator_name, datasets in imputed_datasets.items():
#         merged_datasets[estimator_name] = []
#         for dataset in datasets:
#             # Merge using a left join to keep all rows from the imputed dataset
#             merged_data = pd.merge(dataset, endpoint_df, on='key', how='left')
#             merged_datasets[estimator_name].append(merged_data)

#     return merged_datasets

# def process_death_key(df):
#     def process_group(group):
#         max_date = group['date'].max()
#         first_endpoint_date = group['first_endpoint_date'].iloc[0]

#         if max_date < first_endpoint_date and (group['endpoint'] == 2).any():
#             latest_row = group.loc[group['date'].idxmax()]
#             new_row = latest_row.copy()
#             new_row['date'] = first_endpoint_date
#             new_row['date_from_sub_60'] = (new_row['date'] - new_row['first_sub_60_date']).days
#             new_row['date_till_endpoint'] = (new_row['first_endpoint_date'] - new_row['date']).days
#             new_row['endpoint'] = 2
#             group['endpoint'] = 0  # Set all existing rows' endpoint to 0
#             return pd.concat([group, new_row.to_frame().T], ignore_index=True)
#         else:
#             return group

#     return df.groupby('key').apply(process_group).reset_index(drop=True)

# def prepare_for_training(merged_datasets):
#     prepared_datasets = {}

#     for estimator_name, datasets in merged_datasets.items():
#         prepared_datasets[estimator_name] = []
#         for df in datasets:
#             df = df.copy()
            
#             # Handle categorical data
#             df['gender'] = LabelEncoder().fit_transform(df['gender'])
#             # Filter rows where 'date' is greater than 'first_sub_60_date'
#             df = df[df['date'] > df['first_sub_60_date']]
#             # Calculate 'date_from_sub_60' in days
#             df.loc[:, 'date_from_sub_60'] = (df['date'] - df['first_sub_60_date']).dt.days.astype(float)
#             # Calculate 'date_till_endpoint' in days
#             df.loc[:, 'date_till_endpoint'] = (df['first_endpoint_date'] - df['date']).dt.days.astype(float)
#             # Log scaling for 'UACR_mg_g', handle 0, negative and NaN values appropriately
#             df.loc[df['UACR_mg_g'] <= 0, 'UACR_mg_g'] = 0  # Ensure no non-positive values
#             df.loc[:, 'UACR_mg_g'] = df['UACR_mg_g'] + 1  # Shift data to avoid log(0)
#             df.loc[:, 'UACR_mg_g_log'] = np.log(df['UACR_mg_g'])
#             df.loc[:, 'UACR_mg_g_log'] = df['UACR_mg_g_log'].replace(-np.inf, 0)  # Replace -inf with 0 if log(1) results in -inf
#             # df = process_death_key(df)

#             prepared_datasets[estimator_name].append(df)

#     return prepared_datasets

# # Load the datasets
# base_filename = '/mnt/d/pydatascience/g3_regress/data/X/X_imputed_mice'
# X_train_all = load_imputed_datasets_hdf5(base_filename)
# del base_filename
# endpoint_df = pd.read_csv('/mnt/d/pydatascience/g3_regress/data/pt_endpoint_ls.csv')
# X_train_all = merge_with_endpoint(X_train_all, endpoint_df)
# X_train_all = prepare_for_training(X_train_all)
# gc.collect()

588

In [3]:
X_train_transformed.info(
    
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431832 entries, 0 to 431831
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   gender            431832 non-null  category
 1   dm                431832 non-null  category
 2   ht                431832 non-null  category
 3   sprint            431832 non-null  category
 4   endpoint          431832 non-null  category
 5   a1c               431832 non-null  float64 
 6   po4               431832 non-null  float64 
 7   UACR_mg_g         431832 non-null  float64 
 8   Cr                431832 non-null  float64 
 9   age               431832 non-null  float64 
 10  alb               431832 non-null  float64 
 11  ca                431832 non-null  float64 
 12  hb                431832 non-null  float64 
 13  hco3              431832 non-null  float64 
 14  key               431832 non-null  int64   
 15  date_from_sub_60  431832 non-null  float64 
dtypes:

### Set up training model

#### 1. Find out the best imputation strategy for the cox model

In [None]:
# def write_csv(path, data):
#     os.makedirs(os.path.dirname(path), exist_ok=True)
#     data.to_csv(path, mode='a', header=not os.path.isfile(path))

# def fit_and_evaluate(model, train_data, test_data, duration_col, event_col, penalizer, l1_ratio, fold_number):
#     try:
#         # model.set_params(penalizer=penalizer, l1_ratio=l1_ratio)
#         print(f" Fitting and evaluating Fold {fold_number} with penalizer={penalizer}, l1_ratio={l1_ratio}")
#         model.fit(train_data, duration_col=duration_col, event_col=event_col)
#         predictions = model.predict_partial_hazard(test_data)
#         score = concordance_index(test_data[duration_col], -predictions, test_data[event_col])
#         return score
#     except Exception as e:
#         # Log the error details
#         error_message = f"Fold {fold_number}: Failed to fit model with penalizer={penalizer}, l1_ratio={l1_ratio}: {str(e)}"
#         print(error_message)
#         traceback.print_exc()  # Optional: to log the traceback of the exception
        
#         # You can also append the error message to a file or a list
#         with open('model_fit_errors.log', 'a') as f:
#             f.write(error_message + '\n')
        
#         return None  # Return None or a specific indicator of failure

# def perform_cross_validation(data, k, penalizer_values, l1_ratios, duration_col, event_col):
#     kf = KFold(n_splits=k, shuffle=True, random_state=42)
#     best_score = 0
#     best_model = None
#     best_config = (None, None)
#     cpu_cores = os.cpu_count()

#     for fold_number, (train_index, test_index) in enumerate(kf.split(data), start=1):
#         train_data = data.iloc[train_index]
#         test_data = data.iloc[test_index]
#         with ThreadPoolExecutor(max_workers=cpu_cores-1 if cpu_cores > 1 else 1) as executor:
#             futures = []
#             for penalizer in penalizer_values:
#                 for l1_ratio in l1_ratios:
#                     # print(f"Recent penalizer: {penalizer}, Recent L1 ratio: {l1_ratio}")
#                     model = CoxPHFitter(penalizer=penalizer, l1_ratio=l1_ratio)
#                     # Submit the task and store the future along with its parameters in a tuple
#                     future = executor.submit(fit_and_evaluate, model, train_data, test_data, duration_col, event_col, penalizer, l1_ratio, fold_number)
#                     futures.append((future, penalizer, l1_ratio))

#             # Iterate over the list of tuples containing the futures and their parameters
#             for future, penalizer, l1_ratio in futures:
#                 score = future.result()  # Obtain the result from the future
#                 if score is not None and score > best_score:
#                     best_score = score
#                     best_config = (penalizer, l1_ratio)
#                     best_model = model  # Note: This model might not reflect the exact state when best was achieved due to threading

#     return best_model, best_score, best_config

# def fit_models_to_imputed_datasets(imputed_datasets, penalizer_values, l1_ratios, k_folds=10):
#     summary_file = '/mnt/d/pydatascience/g3_regress/doc/cox_model_summaries.csv'
#     config_file = '/mnt/d/pydatascience/g3_regress/doc/cox_best_configs.csv'

#     for estimator_name, datasets in imputed_datasets.items():
#         for dataset_index, dataset in enumerate(datasets):
#             print(f"Training of Cox model with {estimator_name} number {dataset_index} started")
#             categorical_columns = ['gender', 'dm', 'ht']
#             continuous_columns = ['age', 'Cr', 'UACR_mg_g_log', 'hb', 'a1c', 'ca', 'hco3']
#             duration_col = 'date_from_sub_60'
#             event_col = 'endpoint'

#             dataset = dataset[categorical_columns + continuous_columns + [duration_col, event_col]]
#             best_model, best_score, best_config = perform_cross_validation(dataset, k_folds, penalizer_values, l1_ratios, duration_col, event_col)
            
#             summary_df = best_model.summary
#             summary_df['imputer'] = estimator_name
#             summary_df['dataset_index'] = dataset_index
#             write_csv(summary_file, summary_df)
            
#             config_df = pd.DataFrame({
#                 'imputer': [estimator_name],
#                 'dataset_index': [dataset_index],
#                 'best_penalizer': [best_config[0]],
#                 'best_l1_ratio': [best_config[1]],
#                 'best_score': [best_score]
#             })
#             write_csv(config_file, config_df)

#             print(f"Training of Cox model with {estimator_name} number {dataset_index} completed")

#     return summary_file, config_file

# # Example usage
# penalizer_values = np.logspace(-1, 2, 3)  # From 0.1 to 100 in logarithmic scale
# l1_ratios = np.linspace(0, 1, 5)
# summaries, best_penalizers = fit_models_to_imputed_datasets(X_train_all, penalizer_values, l1_ratios)


#### Best imputation for cox model:
- Best imputer for cox model: BayesianRidge
- Best penalizer: 0.1
- Best l1_ratio: 0
- Best concordance index: 0.7645822002946346

In [228]:
def model_init(df, params):
    net = tt.practical.MLPVanilla(df.shape[1], params['num_nodes'], 1, params['batch_norm'], params['dropout'])
    model = CoxPH(net, tt.optim.AdamWR(decoupled_weight_decay=1e-6,
                            cycle_eta_multiplier=0.8))
    model.optimizer.set_lr(params['lr'])
    return model

def define_medoid(X,y, duration_col, event_col):
    
    majority_mask = y == 0
    majority_indices = np.where(y == 0)[0]
    minority_indices = np.where(y == 1)[0]
    
    features = [col for col in X.columns if col not in duration_col]
    X['majority_mask'] = majority_mask
    X_majority = X.loc[X['majority_mask'] == True, features]
    X_majority_duration = X.loc[X_train['majority_mask'] == True, duration_col]
    
    n_clusters = min(np.sum(y == 1), len(X_majority))
    print(f"Setting n_neighbors to {n_clusters} (size of minority class)")
    # Initialize and fit NearestNeighbors
    nn = NearestNeighbors(n_neighbors=n_clusters + 1, algorithm='auto', n_jobs=-1)  # +1 because the first neighbor is the point itself
    nn.fit(X_majority)
    
    # Compute the distances to the n_neighbors-th nearest neighbor for each point
    # Suppose medoid should be the row that have lowest distances with all other point
    distances, _ = nn.kneighbors(X_majority)
    total_distance = distances.sum(axis=1)
    cluster_center = np.argsort(total_distance)[:n_clusters]
    X_train_medoid = X_majority.iloc[cluster_center].copy()
    X_remain = X.copy()
    X_remain.loc[:, event_col] = y
    X_remain = X_remain.drop(X_majority.index[cluster_center])
    y_remain = X_remain[event_col].values
    X_remain = X_remain.drop(columns=event_col)
    X_remain = X_remain.drop(columns=['majority_mask'])
    
    X_train_medoid.loc[:, duration_col] = X_majority_duration.iloc[cluster_center]
    y_train_majority = y[majority_indices]
    X_train_medoid.loc[:, event_col] = y_train_majority[cluster_center].copy()
    
    X_minority = X_train.loc[X_train['majority_mask'] == False]
    y_train_minority = y_train_event[minority_indices]
    X_minority.loc[:,event_col] = y_train_minority
   
    # Concat
    X_total = pd.concat([X_train_medoid, X_minority])
    X_total = X_total.sample(frac=1)
    X = X_total.drop(columns=event_col)
    X = X_total.drop(columns=['majority_mask'])
    y = X_total[event_col].values
    
    return X, y, X_remain, y_remain

def recursive_clustering(model, X, y, duration_col, event_col, feature_col, params, val, callbacks, max_repeats):
    remaining_data = X.copy()
    remaining_y = y.copy()
    repeat_count = 0
    logs = []
    while len(remaining_data) > 0 and repeat_count < max_repeats:
        print(f"Performing clustering iteration {repeat_count}")
        X_cluster, y_cluster, remaining_data, remaining_y = define_medoid(remaining_data, remaining_y, duration_col, event_col)
        
        X_train_cluster_tensor, y_train_cluster_tensor = prepare_tensor(X_cluster, y_cluster, feature_col=feature_col, duration_col=duration_col)
        lrfinder = model.lr_finder(X_train_cluster_tensor, y_train_cluster_tensor, params['batch_size'], tolerance=10)
        model.optimizer.set_lr(lrfinder.get_best_lr())
        log = model.fit(X_train_cluster_tensor, y_train_cluster_tensor, params['batch_size'], params['epochs'], callbacks, verbose=True, val_data=val)
        
        logs.append(log)
        repeat_count += 1
        gc.collect()

        # Stop if there are no more majority samples left to cluster
        if remaining_data.empty:
            break

    return model, logs

model_cluster = model_init(X[feature_col], params=params)
model_cluster, log_cluster = recursive_clustering(model=model_cluster, X=X_train, y=y_train_event, 
                                                  duration_col=duration_col,
                                                  event_col=event_col, feature_col=feature_col, params=params,
                                                  val=val, callbacks=callbacks, max_repeats=10)
gc.collect()
# X_cluster, y_event_cluster, X_cluster_remain = define_medoid(X_train, y_train_event, duration_col=duration_col, event_col=event_col)
# X_train_cluster_tensor, y_train_cluster_tensor = prepare_tensor(X_cluster, y_event_cluster, feature_col=feature_col, duration_col=duration_col)
# log_cluster = model_cluster.fit(X_train_cluster_tensor, y_train_cluster_tensor, params['batch_size'], params['epochs'], callbacks, verbose=True, val_data=val)

Performing clustering iteration 1
Setting n_neighbors to 788 (size of minority class)
0.08902150854450441
0:	[0s / 0s],		train_loss: 4.8731,	val_loss: 7.5142


AttributeError: 'TrainingLogger' object has no attribute 'get_info'

In [214]:
gc.collect()

35605

In [216]:
len(model_cluster.compute_baseline_hazards())

833

In [182]:
def process_test_batch(X_batch, y_batch, model, max_follow_up_time, time_points, feature_col=feature_col, duration_col=duration_col,):
    original_indices = X_batch.index
    # Transform and prepare tensors from the batch
    X_batch.loc[:, feature_col] = X_batch[feature_col].apply(lambda x: x.cat.codes if x.dtype.name == 'category' else x)
    X_test_batch = X_batch[feature_col + duration_col].copy()
    y_test_event_batch = y_batch
    X_test_batch_tensor, y_test_batch_tensor = prepare_tensor(X_test_batch, y_test_event_batch,
                                                        feature_col=feature_col, 
                                                        duration_col=duration_col)
    
    # max_follow_up_time = int(min(1825, X_batch_prepared[duration_col].values.max()))
    # time_points = np.linspace(0, max_follow_up_time, int(max_follow_up_time), dtype='int')
    # Predict
    y_pred = model.predict_surv_df(X_test_batch_tensor, max_duration=max_follow_up_time)
    
    # Convert predictions to DataFrame and match time points
    # y_pred_df = pd.DataFrame(y_pred.T, index=np.arange(y_pred.shape[1]))
    y_pred = y_pred.loc[time_points]
    y_pred = y_pred.T
    y_pred.index = original_indices
    return y_pred

n = 200  # Batch size
# y_fin_val = (X_fin_val[event_col] == 1).astype(int).values
# X_fin_val = X_fin_val[feature_col + duration_col].copy()
max_follow_up_time = int(min(1825, X_fin_val[duration_col].values.max()))
time_points = [i for i in range(0, max_follow_up_time)]
X_batches = [X_fin_val.iloc[i:i + n] for i in range(0, X_fin_val.shape[0], n)]
y_batches = [y_fin_val[i:i + n] for i in range(0, y_fin_val.shape[0], n)]
results_smoteenn = [process_test_batch(X_batch=X_batch, y_batch=y_batch, model=model_smoteenn,
                                       max_follow_up_time=max_follow_up_time,
                                       time_points=time_points)
                    for X_batch, y_batch in zip(X_batches, y_batches)]
final_results_smoteenn = pd.concat(results_smoteenn)

results_smotetomek = [process_test_batch(X_batch=X_batch, y_batch=y_batch, model=model_smotetomek,
                                         max_follow_up_time=max_follow_up_time,
                                         time_points=time_points)
                      for X_batch, y_batch in zip(X_batches, y_batches)] 
final_results_smotetomek = pd.concat(results_smotetomek)

results_cluster = [process_test_batch(X_batch=X_batch, y_batch=y_batch, model=model_cluster, 
                                      max_follow_up_time=max_follow_up_time,
                                      time_points=time_points)
                   for X_batch, y_batch in zip(X_batches, y_batches)]
final_results_cluster = pd.concat(results_cluster)
# Process each batch and collect the results
# results = [process_test_batch(X_batch=X_batches, model=model_smoteenn) for batch in batches]

gc.collect()

KeyError: '[4, 5, 6, 8, 9, 10, 14, 15, 17, 18, 19, 21, 23, 25, 26, 27, 30, 31, 32, 33, 34, 38, 39, 43, 44, 45, 46, 47, 48, 50, 51, 52, 53, 55, 56, 60, 61, 63, 64, 66, 67, 71, 72, 75, 76, 77, 78, 82, 86, 90, 102, 114, 121, 122, 124, 136, 138, 142, 144, 147, 150, 151, 154, 155, 157, 158, 162, 163, 164, 172, 184, 185, 186, 187, 188, 190, 193, 194, 201, 205, 213, 215, 226, 232, 235, 237, 238, 240, 243, 246, 247, 253, 256, 258, 261, 262, 265, 267, 269, 270, 272, 276, 277, 278, 282, 291, 297, 308, 309, 311, 314, 319, 321, 325, 326, 330, 338, 340, 341, 342, 347, 349, 351, 352, 353, 354, 355, 356, 360, 362, 364, 365, 366, 372, 373, 374, 375, 378, 379, 380, 387, 390, 391, 392, 395, 402, 404, 407, 410, 413, 415, 416, 418, 422, 423, 425, 432, 436, 438, 443, 445, 452, 453, 460, 463, 465, 467, 468, 470, 471, 472, 473, 479, 480, 481, 486, 487, 488, 490, 491, 493, 498, 501, 502, 504, 506, 507, 508, 509, 511, 514, 515, 517, 518, 520, 522, 523, 527, 542, 543, 545, 547, 548, 549, 551, 557, 560, 563, 564, 566, 567, 571, 574, 575, 577, 580, 582, 583, 585, 587, 588, 590, 591, 593, 594, 596, 600, 605, 606, 607, 610, 611, 612, 613, 614, 615, 617, 618, 619, 620, 624, 625, 626, 627, 628, 631, 633, 635, 637, 641, 642, 647, 648, 650, 651, 652, 653, 654, 655, 658, 660, 662, 665, 666, 667, 668, 669, 673, 674, 675, 678, 681, 684, 688, 689, 690, 693, 694, 696, 698, 700, 708, 713, 726, 727, 729, 730, 732, 735, 736, 737, 739, 743, 745, 747, 748, 751, 752, 753, 754, 760, 766, 767, 768, 775, 778, 779, 780, 781, 786, 789, 792, 793, 794, 795, 796, 802, 803, 804, 805, 808, 811, 812, 813, 816, 820, 821, 822, 825, 827, 829, 831, 833, 836, 837, 839, 843, 846, 848, 849, 850, 859, 860, 864, 865, 867, 868, 873, 874, 878, 879, 880, 881, 882, 883, 884, 885, 888, 890, 891, 892, 893, 895, 897, 898, 904, 906, 908, 911, 914, 916, 917, 920, 921, 922, 923, 925, 926, 928, 930, 931, 932, 933, 935, 939, 941, 943, 947, 948, 950, 955, 956, 959, 962, 964, 967, 969, 970, 972, 975, 976, 978, 979, 980, 982, 984, 985, 987, 988, 994, 999, 1003, 1004, 1005, 1009, 1010, 1012, 1013, 1016, 1017, 1018, 1019, 1021, 1025, 1026, 1029, 1032, 1035, 1036, 1037, 1041, 1042, 1045, 1046, 1048, 1049, 1051, 1054, 1056, 1058, 1059, 1062, 1064, 1065, 1066, 1068, 1069, 1070, 1071, 1074, 1078, 1082, 1083, 1086, 1088, 1089, 1090, 1093, 1094, 1095, 1096, 1098, 1101, 1103, 1104, 1106, 1107, 1108, 1110, 1111, 1113, 1114, 1116, 1118, 1119, 1121, 1122, 1126, 1129, 1130, 1132, 1134, 1136, 1139, 1140, 1141, 1142, 1143, 1146, 1147, 1148, 1151, 1152, 1156, 1157, 1159, 1160, 1166, 1167, 1169, 1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1183, 1185, 1186, 1191, 1192, 1194, 1198, 1208, 1209, 1210, 1211, 1213, 1215, 1217, 1219, 1221, 1222, 1225, 1226, 1227, 1229, 1235, 1236, 1237, 1238, 1240, 1241, 1244, 1247, 1248, 1249, 1253, 1255, 1258, 1259, 1263, 1268, 1269, 1270, 1271, 1273, 1274, 1275, 1276, 1277, 1278, 1282, 1283, 1284, 1285, 1286, 1288, 1289, 1290, 1292, 1293, 1296, 1299, 1300, 1301, 1302, 1303, 1305, 1307, 1309, 1310, 1312, 1313, 1315, 1316, 1317, 1319, 1320, 1321, 1322, 1323, 1325, 1326, 1327, 1328, 1331, 1332, 1336, 1338, 1339, 1342, 1344, 1347, 1349, 1350, 1353, 1354, 1356, 1358, 1359, 1360, 1361, 1362, 1364, 1365, 1366, 1367, 1369, 1372, 1373, 1374, 1375, 1376, 1379, 1380, 1381, 1382, 1384, 1385, 1387, 1388, 1389, 1391, 1393, 1395, 1396, 1397, 1398, 1399, 1400, 1402, 1406, 1408, 1410, 1411, 1412, 1413, 1417, 1420, 1423, 1425, 1426, 1427, 1428, 1429, 1430, 1432, 1434, 1435, 1437, 1439, 1442, 1444, 1447, 1449, 1451, 1457, 1458, 1459, 1460, 1462, 1463, 1464, 1467, 1472, 1475, 1478, 1479, 1481, 1483, 1484, 1486, 1487, 1490, 1493, 1495, 1496, 1500, 1502, 1506, 1507, 1508, 1509, 1510, 1511, 1514, 1515, 1516, 1517, 1518, 1519, 1520, 1521, 1523, 1526, 1527, 1530, 1532, 1535, 1536, 1537, 1539, 1541, 1546, 1547, 1550, 1551, 1552, 1553, 1555, 1557, 1558, 1559, 1560, 1563, 1565, 1566, 1569, 1570, 1574, 1576, 1577, 1581, 1584, 1586, 1587, 1588, 1591, 1593, 1597, 1598, 1599, 1600, 1601, 1602, 1603, 1605, 1606, 1607, 1611, 1612, 1615, 1619, 1621, 1622, 1623, 1625, 1626, 1627, 1628, 1629, 1630, 1631, 1632, 1633, 1634, 1635, 1638, 1642, 1644, 1647, 1648, 1649, 1650, 1653, 1654, 1658, 1659, 1662, 1663, 1666, 1668, 1669, 1672, 1673, 1675, 1676, 1678, 1679, 1680, 1682, 1683, 1686, 1687, 1688, 1689, 1690, 1691, 1692, 1694, 1695, 1697, 1698, 1700, 1701, 1702, 1703, 1705, 1707, 1709, 1713, 1714, 1716, 1719, 1720, 1724, 1725, 1726, 1728, 1729, 1730, 1732, 1733, 1734, 1735, 1739, 1741, 1743, 1744, 1745, 1746, 1748, 1751, 1753, 1754, 1755, 1759, 1762, 1763, 1765, 1766, 1769, 1770, 1773, 1774, 1775, 1776, 1777, 1779, 1780, 1783, 1785, 1786, 1788, 1789, 1791, 1793, 1795, 1796, 1798, 1800, 1801, 1804, 1809, 1810, 1816, 1817, 1819, 1822, 1823, 1824] not in index'

In [None]:
lrfinder = model.lr_finder(X_batch, y_train, batch_size, tolerance=10)
_ = lrfinder.plot()

In [194]:
X_batch = X_fin_val.iloc[0:200]
y_batch = y_fin_val[0:200]
max_follow_up_time = int(min(1825, X_fin_val[duration_col].values.max()))
time_points = [i for i in range(0, max_follow_up_time)]

X_batch.loc[:, feature_col] = X_batch[feature_col].apply(lambda x: x.cat.codes if x.dtype.name == 'category' else x)
X_test_batch = X_batch[feature_col + duration_col].copy()
y_test_event_batch = y_batch
X_test_batch_tensor, y_test_batch_tensor = prepare_tensor(X_test_batch, y_test_event_batch,
                                                    feature_col=feature_col, 
                                                    duration_col=duration_col)
y_pred = model_cluster.predict_surv_df(X_test_batch_tensor, max_duration=time_points)
# model_cluster.predict_surv
# results_cluster = [process_test_batch(X_batch=X_batch, y_batch=y_batch, model=model_cluster, 
#                                       max_follow_up_time=max_follow_up_time,
#                                       time_points=time_points)
#                    for X_batch, y_batch in zip(X_batches, y_batches)]
# final_results_cluster = pd.concat(results_cluster)

ValueError: ('Lengths must match to compare', (961,), (1825,))

In [189]:
y_pred

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
duration,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,1.000000,1.00000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.00000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
1.0,1.000000,1.00000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.00000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
2.0,1.000000,1.00000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.00000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
3.0,1.000000,1.00000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.00000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
7.0,1.000000,1.00000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.00000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1814.0,0.924087,0.86031,0.957543,0.980553,0.958949,0.951138,0.958681,0.969423,0.957123,0.951428,...,0.938489,0.91261,0.941571,0.949537,0.958695,0.952072,0.974157,0.938854,0.961965,0.942656
1815.0,0.924087,0.86031,0.957543,0.980553,0.958949,0.951138,0.958681,0.969423,0.957123,0.951428,...,0.938489,0.91261,0.941571,0.949537,0.958695,0.952072,0.974157,0.938854,0.961965,0.942656
1818.0,0.924087,0.86031,0.957543,0.980553,0.958949,0.951138,0.958681,0.969423,0.957123,0.951428,...,0.938489,0.91261,0.941571,0.949537,0.958695,0.952072,0.974157,0.938854,0.961965,0.942656
1820.0,0.924087,0.86031,0.957543,0.980553,0.958949,0.951138,0.958681,0.969423,0.957123,0.951428,...,0.938489,0.91261,0.941571,0.949537,0.958695,0.952072,0.974157,0.938854,0.961965,0.942656


In [4]:
gc.collect()

20

In [5]:


n = 200  # Batch size
batches = [X_test_transformed.iloc[i:i + n] for i in range(0, X_test_transformed.shape[0], n)]
# Process each batch and collect the results
results = [process_batch(batch) for batch in batches]
# Concatenate all results into a single DataFrame
final_results = pd.concat(results)
# X_test_transformed[feature_col] = X_test_transformed[feature_col].apply(lambda x: x.cat.codes if x.dtype.name == 'category' else x)
# X_test = X_test_transformed[feature_col + duration_col].copy()
# y_test_event = (X_test_transformed[event_col] == 1).astype(int).values
# X_test_tensor, y_test_event_tensor = prepare_tensor(X_test, y_test_event, feature_col=feature_col, duration_col=duration_col)


# y_pred = model.predict_surv(X_test_tensor, max_duration=max_follow_up_time)
# # Convert y_pred to DataFrame and match time points
# y_pred_df = pd.DataFrame(y_pred.T, index=np.arange(y_pred.shape[1]))
# # Ensure y_pred is a DataFrame and time points match
# y_pred_df = y_pred_df.loc[time_points]

# integrated_brier = integrated_brier_score(y_test_surv, y_test_surv, y_pred[:, :-2], time_points[:-1])
# display(integrated_brier)

# risk_scores = -np.log(y_pred[:, :int(max_follow_up_time)])
# cidx = concordance_index_censored(y_test_event_truncated.astype(bool), y_test_duration_truncated, risk_scores.mean(axis=1))[0]
# display(cidx)
gc.collect()


0

In [7]:
y_test_duration_truncated = np.minimum(X_test[duration_col].values, max_follow_up_time)
y_test_event = (X_test_transformed[event_col] == 1).astype(int).values
y_test_event_truncated = np.where(X_test[duration_col].values > max_follow_up_time, 0, y_test_event)
y_test_surv = Surv.from_arrays(event=y_test_event_truncated.reshape(-1), time=y_test_duration_truncated.reshape(-1))
integrated_brier = integrated_brier_score(y_test_surv, y_test_surv, final_results.iloc[:, :-1], time_points[:-1])
display(integrated_brier)

# roc_auc = cumulative_dynamic_auc(y_test_surv, y_test_surv, final_results.iloc[:, :-1], time_points[:-1])

0.024053286495431304

In [120]:
# roc_auc[0][]
days_in_yr = 365.25
max_predict_yr = 5
time_points = [round(days_in_yr * i) for i in range(max_predict_yr)]
roc_auc[0][time_points]

array([       nan, 0.81024565, 0.84366161, 0.8583991 , 0.85975026])

In [113]:
# Function to preprocess the data
def preprocess_data(df, features):
    df[features] = df[features].apply(lambda x: x.cat.codes if x.dtype.name == 'category' else x)
    return df[features].values.astype(float), df['date_from_sub_60'].values.astype(float), df['key'].values

# Function to balance the data using SMOTE and random undersampling
def balance_data(X, y_event):
    enn = EditedNearestNeighbours()
    tomek = TomekLinks()
    smote_enn = SMOTEENN(enn=enn, sampling_strategy=0.5, random_state=RANDOM_SEED, n_jobs=-1)
    smote_tomek = SMOTETomek(tomek=tomek, sampling_strategy=0.5, random_state=RANDOM_SEED, n_jobs=-1)
    
    smote = SMOTE(sampling_strategy=0.1, random_state=RANDOM_SEED)
    X_smote, y_event_smote = smote.fit_resample(X, y_event)

    rus = RandomUnderSampler(sampling_strategy=0.5, random_state=RANDOM_SEED)
    X_balanced, y_event_balanced = rus.fit_resample(X_smote, y_event_smote)
    
    return X_balanced, y_event_balanced

# Function to perform random search
def random_search(df, feature_col, duration_col, event_col, event_focus, cluster_col, y_event, y_duration, param_distributions, n_iter, n_splits, weight_brier=0.5, weight_cindex=0.5):
    best_score = -np.inf
    best_params = None

    group_kfold = GroupKFold(n_splits=n_splits)
    param_sampler = list(ParameterSampler(param_distributions, n_iter=n_iter, random_state=RANDOM_SEED))

    X = df[feature_col + duration_col].copy()
    y_event = (df[event_col] == event_focus).astype(int).values
    
    for params in param_sampler:
        combined_score = 0
        for fold, (train_idx, val_idx) in enumerate(group_kfold.split(X, y_event_rrt, groups)):
            print(f'Training fold {fold} of the test dataset:')
            X_train, X_val = X[train_idx], X[val_idx]
            y_train_event, y_val_event = y_event[train_idx], y_event[val_idx]
            print()

            X_train, y_train_event, y_train_duration = balance_data(X_train, y_train_event, y_train_duration)

            net = tt.practical.MLPVanilla(X_train.shape[1], params['num_nodes'], 1, params['batch_norm'], params['dropout'])
            model = CoxPH(net, tt.optim.Adam)
            model.optimizer.set_lr(params['lr'])

            X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
            y_train_tensor = (torch.tensor(y_train_duration, dtype=torch.float32), torch.tensor(y_train_event, dtype=torch.int64))
            X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
            y_val_tensor = (torch.tensor(y_val_duration, dtype=torch.float32), torch.tensor(y_val_event, dtype=torch.int64))
            val = X_val_tensor, y_val_tensor

            # Ensure all tensors have the same length
            assert len(X_train_tensor) == len(y_train_tensor[0]) == len(y_train_tensor[1])
            assert len(X_val_tensor) == len(y_val_tensor[0]) == len(y_val_tensor[1])

            log = model.fit(X_train_tensor, y_train_tensor, params['batch_size'], params['epochs'], callbacks, verbose=False, val_data=val)
            model.compute_baseline_hazards()

            y_pred = model.predict_surv(X_val_tensor, max_duration=int(max(y_val_duration))).detach().numpy()
            y_pred_df = pd.DataFrame(y_pred.T, index=np.arange(y_pred.shape[1]))

            y_test_surv = Surv.from_arrays(event=y_val_event.astype(bool), time=y_val_duration)
            integrated_brier = integrated_brier_score(y_test_surv, y_test_surv, y_pred_df.values, np.arange(y_pred_df.shape[0]))

            risk_scores = -np.log(y_pred[:, :int(max(y_val_duration))])
            c_index = concordance_index_censored(y_val_event.astype(bool), y_val_duration, risk_scores.mean(axis=1))[0]

            # Combine the two metrics
            combined_score += (weight_brier * (1 - integrated_brier)) + (weight_cindex * c_index)

        combined_score /= n_splits

        if combined_score > best_score:
            best_score = combined_score
            best_params = params

    return best_params, best_score

# Define features and target
features = ['gender', 'dm', 'ht', 'sprint', 'a1c', 'po4', 'UACR_mg_g', 'Cr', 'age', 'alb', 'ca', 'hb', 'hco3']
duration_col = ['date_from_sub_60']
event_col = ['endpoint']
cluster_col = ['key']
target = ['date_from_sub_60', 'endpoint']

# Convert categorical features to numerical
X_train_transformed[features] = X_train_transformed[features].apply(lambda x: x.cat.codes if x.dtype.name == 'category' else x)

# Extract input data
X = X_train_transformed[features].values
y_duration = X_train_transformed[duration_col].values
groups = X_train_transformed[cluster_col].values

# Convert endpoint to binary for RRT (1) and death (2)
y_event_rrt = (X_train_transformed['endpoint'] == 1).astype(int).values
y_event_death = (X_train_transformed['endpoint'] == 2).astype(int).values

# Define parameter distributions for random search
param_distributions = {
    'num_nodes': [[64, 32], [128, 64], [128, 64, 32]],
    'batch_norm': [True, False],
    'dropout': uniform(0.1, 0.5),
    'lr': uniform(1e-5, 1e-3),
    'batch_size': randint(128, 512),
    'epochs': randint(100, 300)
}

# Perform random search
n_iter = 20
n_splits = 5
best_params, best_score = random_search(X, y_event_rrt, y_duration, param_distributions, n_iter, n_splits)

# Train final model using best parameters
X_train_balanced, y_train_event_balanced, y_train_duration_balanced = balance_data(X, y_event_rrt, y_duration)

AssertionError: 

In [None]:

# Calculate class weights for RRT
rrt_class_counts = np.bincount(y_event_rrt)
rrt_class_weights = 1.0 / rrt_class_counts
rrt_sample_weights = rrt_class_weights[y_event_rrt]
death_class_counts = np.bincount(y_event_death)
death_class_weights = 1.0 / death_class_counts
death_sample_weights = death_class_weights[y_event_death]

# Define the number of splits for cross-validation
n_splits = 10
group_kfold = GroupKFold(n_splits=n_splits)
# Prepare the split indices
splits_rrt = list(group_kfold.split(X, y_event_rrt, groups))
splits_death = list(group_kfold.split(X, y_event_death, groups))

# Prepare the split indices
splits = list(group_kfold.split(X, y_event_rrt, groups))

# Define the DeepSurv model
in_features = X_train.shape[1]
num_nodes = [128, 64]
out_features = 1
batch_norm = True
dropout = 0.4
output_bias = False

net = tt.practical.MLPVanilla(in_features, num_nodes, out_features, batch_norm, dropout)
model_rrt = CoxPH(net, tt.optim.Adam)
model_rrt.optimizer.set_lr(0.0001)

# Set training parameters
batch_size = 256
epochs = 512
callbacks = [tt.callbacks.EarlyStopping(patience=10)]

# Train the model using cluster cross-validation for RRT
for fold, (train_idx, val_idx) in enumerate(splits):
    print(f"Training RRT model fold {fold+1}/{n_splits}")
    X_train, X_val = X[train_idx], X[val_idx]
    y_train_duration, y_val_duration = y_duration[train_idx], y_duration[val_idx]
    y_train_event_rrt, y_val_event_rrt = y_event_rrt[train_idx], y_event_rrt[val_idx]
    train_sample_weights = torch.tensor(rrt_sample_weights[train_idx], dtype=torch.float32)
    
    # Ensure all inputs are numpy arrays
    X_train, X_val = X_train.astype(float), X_val.astype(float)
    y_train_duration, y_val_duration = y_train_duration.astype(float), y_val_duration.astype(float)
    y_train_event_rrt, y_val_event_rrt = y_train_event_rrt.astype(int), y_val_event_rrt.astype(int)
    
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = (torch.tensor(y_train_duration, dtype=torch.float32), torch.tensor(y_train_event_rrt, dtype=torch.int64))
    X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
    y_val_tensor = (torch.tensor(y_val_duration, dtype=torch.float32), torch.tensor(y_val_event_rrt, dtype=torch.int64))
    val = X_val_tensor, y_val_tensor

    log = model_rrt.fit(X_train_tensor, y_train_tensor, batch_size, epochs, callbacks, verbose=True, val_data=val)
    
# Predict the risk for each time point
gc.collect()

X_test_transformed[features] = X_test_transformed[features].apply(lambda x: x.cat.codes if x.dtype.name == 'category' else x)
X_test_tensor = torch.tensor(X_test_transformed[features].values, dtype=torch.float32)

y_test_duration = X_test_transformed['date_from_sub_60'].values
y_test_event_rrt = (X_test_transformed['endpoint'] == 1).astype(int).values
y_test_event_death = (X_test_transformed['endpoint'] == 2).astype(int).values

max_follow_up_time = int(min(1825, y_test_duration.max()))
time_points = np.linspace(0, max_follow_up_time, int(max_follow_up_time), dtype='int')
y_test_duration_truncated = np.minimum(y_test_duration, max_follow_up_time)
y_test_event_truncated = np.where(y_test_duration > max_follow_up_time, 0, y_test_event_rrt)
y_test_surv = Surv.from_arrays(event=y_test_event_truncated, time=y_test_duration_truncated)

model_rrt.compute_baseline_hazards()
y_pred = model_rrt.predict_surv(X_test_tensor, max_duration=max_follow_up_time).detach().numpy()
# Convert y_pred to DataFrame and match time points
y_pred_df = pd.DataFrame(y_pred.T, index=np.arange(y_pred.shape[1]))
# Ensure y_pred is a DataFrame and time points match
y_pred_df = y_pred_df.loc[time_points]

integrated_brier = integrated_brier_score(y_test_surv, y_test_surv, y_pred[:, :-2], time_points[:-1])
display(integrated_brier)

risk_scores = -np.log(y_pred[:, :int(max_follow_up_time)])
concordance_index_censored(y_test_event_truncated.astype(bool), y_test_duration_truncated, risk_scores.mean(axis=1))[0]
gc.collect()



In [99]:
concord_index = concordance_index_ipcw(y_test_surv, y_test_surv, y_pred)

ValueError: Expected 1D array, got 2D array instead:
array=[[1.         1.         1.         ... 0.99823599 0.99823599 0.99823599]
 [1.         1.         1.         ... 0.99269117 0.99269117 0.99269117]
 [1.         1.         1.         ... 0.99330113 0.99330113 0.99330113]
 ...
 [1.         1.         1.         ... 0.99365333 0.99365333 0.99365333]
 [1.         1.         1.         ... 0.99668962 0.99668962 0.99668962]
 [1.         1.         1.         ... 0.99255937 0.99255937 0.99255937]].


In [23]:
y_test_duration = X_test_transformed['date_from_sub_60'].values

max_follow_up_time = np.max(y_test_duration_truncated)
# time_points = np.array(time_points)
# y_pred = y_pred[:,:len(time_points)]


In [32]:
y_test_event_truncated.shape

(47548,)

In [27]:
y_test_surv = Surv.from_arrays(event=y_test_event_truncated, time=y_test_duration_truncated)
integrated_brier_score(y_test_surv, y_test_surv, y_pred[:,:int(max_follow_up_time)], y_test_duration_truncated)

ValueError: all times must be within follow-up time of test data: [0.0; 1701.0[

In [38]:
days_in_yr = 365.25
max_predict_yr = 6
time_points = [round(days_in_yr * i) for i in range(max_predict_yr)]

integrated_brier_score(y_test_surv, y_test_surv, y_pred.T[time_points], y_test_duration_truncated)

ValueError: all times must be within follow-up time of test data: [0.0; 1701.0[

In [30]:
days_in_yr = 365.25
max_predict_yr = 6
time_points = [round(days_in_yr * i) for i in range(max_predict_yr)]
y_pred.T[time_points]

duration,0.0,365.0,730.0,1096.0,1461.0,1826.0
0,1.0,0.998685,0.997425,0.995957,0.994475,0.992077
1,1.0,0.999130,0.998295,0.997322,0.996340,0.994748
2,1.0,0.998947,0.997938,0.996762,0.995575,0.993652
3,1.0,0.999042,0.998124,0.997053,0.995972,0.994222
4,1.0,0.999122,0.998281,0.997299,0.996309,0.994704
...,...,...,...,...,...,...
599807,1.0,0.999327,0.998682,0.997930,0.997170,0.995939
599808,1.0,0.998571,0.997201,0.995606,0.993996,0.991390
599809,1.0,0.998499,0.997061,0.995386,0.993695,0.990959
599810,1.0,0.999102,0.998240,0.997236,0.996222,0.994580


In [21]:
ev = EvalSurv(y_pred.T, y_test_duration_truncated, y_test_event_truncated, censor_surv='km')

AttributeError: 'Series' object has no attribute 'is_monotonic'

In [11]:
y_pred

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,599802,599803,599804,599805,599806,599807,599808,599809,599810,599811
duration,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
1.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
2.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
3.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
4.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1822.0,0.992102,0.994765,0.993672,0.994240,0.994721,0.992306,0.995439,0.989502,0.992861,0.992284,...,0.992283,0.988717,0.991707,0.993422,0.992658,0.995951,0.991417,0.990988,0.994597,0.994718
1823.0,0.992102,0.994765,0.993672,0.994240,0.994721,0.992306,0.995439,0.989502,0.992861,0.992284,...,0.992283,0.988717,0.991707,0.993422,0.992658,0.995951,0.991417,0.990988,0.994597,0.994718
1824.0,0.992094,0.994759,0.993665,0.994234,0.994715,0.992298,0.995434,0.989491,0.992853,0.992276,...,0.992275,0.988706,0.991699,0.993415,0.992650,0.995947,0.991408,0.990978,0.994591,0.994712
1825.0,0.992085,0.994754,0.993659,0.994228,0.994710,0.992290,0.995430,0.989480,0.992846,0.992268,...,0.992267,0.988694,0.991690,0.993408,0.992642,0.995943,0.991399,0.990969,0.994585,0.994707


In [49]:
display(X_train.shape)
display(y_train_duration.shape)
display(y_train_event_rrt.shape)

(539830, 13)

(539830,)

(539830,)

In [None]:
y_train_duration.shape

#### Use one imputed dataset to train first
#### Due to the need of handling competing risk (rrt and death), we use Fine-Gray subdistribution hazard model to remake the event data

In [113]:
def write_csv(path, data):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    data.to_csv(path, mode='a', header=not os.path.isfile(path))


def parallel_predict(model, data, batch_size):
    """Distribute data batches across multiple threads for prediction."""
    def batch_predict(data_batch):
        return model.predict_partial_hazard(data_batch)

    num_batches = len(data) // batch_size + (1 if len(data) % batch_size else 0)
    futures = []
    results = []
    
    with ThreadPoolExecutor() as executor:
        for i in range(num_batches):
            batch_data = data.iloc[i * batch_size:(i + 1) * batch_size]
            futures.append(executor.submit(batch_predict, batch_data))
        
        for future in futures:
            results.append(future.result())
    
    return pd.concat(results)

def fit_models_to_imputed_datasets(imputed_datasets, penalizer_values, l1_ratios, k_folds=100):
    # Initialize file paths
    summary_file = '/mnt/d/pydatascience/g3_regress/doc/cox_model_summaries.csv'
    config_file = '/mnt/d/pydatascience/g3_regress/doc/cox_best_configs.csv'
    
    # Ensure directories exist
    # os.makedirs(os.path.dirname(summary_file), exist_ok=True)
    # os.makedirs(os.path.dirname(config_file), exist_ok=True)
    
    # Create or clear the files and write headers
    with open(summary_file, 'w') as f, open(config_file, 'w') as g:
        pass  # Just to create/clear files
    
    for estimator_name, datasets in imputed_datasets.items():
        for dataset_index, dataset in enumerate(datasets):
            # Handling categorical and continuous columns as before
            categorical_columns = ['gender', 'dm', 'ht']
            continuous_columns = ['age', 'Cr', 'date_from_sub_60', 'UACR_mg_g_log', 'hb', 'a1c', 'ca', 'hco3']
            duration_col = 'date_till_endpoint'
            event_col = 'endpoint'
            # cluster_col = 'key'
            
            for col in categorical_columns:
                dataset[col] = dataset[col].astype('category')
            dataset = dataset[categorical_columns + continuous_columns + [duration_col, event_col]]
            
            # Grid search over penalizer values and L1/L2 ratios
            best_score = 0
            best_config = (None, None)
            for penalizer in penalizer_values:
                for l1_ratio in l1_ratios:
                    cph = CoxPHFitter(penalizer=penalizer, l1_ratio=l1_ratio)
                    scores = k_fold_cross_validation(cph, dataset, duration_col, event_col, k=k_folds, scoring_method="concordance_index")
                    average_score = np.mean(scores)
                    if average_score > best_score:
                        best_score = average_score
                        best_config = (penalizer, l1_ratio)
            
            # Fit the model with the best configuration
            best_model = CoxPHFitter(penalizer=best_config[0], l1_ratio=best_config[1])
            best_model.fit(dataset, duration_col=duration_col, event_col=event_col)
            
            # Convert summary to DataFrame and write to CSV
            summary_df = best_model.summary
            summary_df['imputer'] = estimator_name
            summary_df['dataset_index'] = dataset_index
            write_csv(summary_file, summary_df)
            # summary_df.to_csv(summary_file, mode='a', header=not os.path.isfile(summary_file))
            
            # Write the best configuration to another CSV
            config_df = pd.DataFrame({
                'imputer': [estimator_name],
                'dataset_index': [dataset_index],
                'best_penalizer': [best_config[0]],
                'best_l1_ratio': [best_config[1]],
                'best_score': [best_score]
            })
            write_csv(config_file, config_df)
            
            print(f"Training of Cos model with {estimator_name} number {dataset_index} completed")
            # config_df.to_csv(config_file, mode='a', header=not os.path.isfile(config_file))
    
    return summary_file, config_file


penalizer_values = np.logspace(-1, 2, 3)  # From 0.1 to 100 in logarithmic scale
l1_ratios = np.linspace(0, 1, 5)
summaries, best_penalizers = fit_models_to_imputed_datasets(X_train_all, penalizer_values, l1_ratios)

AttributeError: Must call `fit` first.

In [None]:
            # Grid search over penalizer values and L1/L2 ratios
            # best_score = 0
            # best_config = (None, None)
            # for penalizer in penalizer_values:
            #     for l1_ratio in l1_ratios:
            #         cph = CoxPHFitter(penalizer=penalizer, l1_ratio=l1_ratio)
            #         scores = k_fold_cross_validation(cph, dataset, duration_col, event_col, k=k_folds, scoring_method="concordance_index")
            #         average_score = np.mean(scores)
            #         if average_score > best_score:
            #             best_score = average_score
            #             best_config = (penalizer, l1_ratio)

In [None]:
            # # Fit the model with the best configuration
            # best_model = CoxPHFitter(penalizer=best_config[0], l1_ratio=best_config[1])
            # best_model.fit(dataset, duration_col=duration_col, event_col=event_col)
            
            # # Convert summary to DataFrame and write to CSV
            # summary_df = best_model.summary
            # summary_df['imputer'] = estimator_name
            # summary_df['dataset_index'] = dataset_index
            # write_csv(summary_file, summary_df)
            # # summary_df.to_csv(summary_file, mode='a', header=not os.path.isfile(summary_file))
            
            # # Write the best configuration to another CSV
            # config_df = pd.DataFrame({
            #     'imputer': [estimator_name],
            #     'dataset_index': [dataset_index],
            #     'best_penalizer': [best_config[0]],
            #     'best_l1_ratio': [best_config[1]],
            #     'best_score': [best_score]
            # })
            # write_csv(config_file, config_df)
            
            # print(f"Training of Cos model with {estimator_name} number {dataset_index} completed")
            # config_df.to_csv(config_file, mode='a', header=not os.path.isfile(config_file))
    
    # return summary_file, config_file

In [None]:
def parallel_predict(model, data, batch_size):
    """Distribute data batches across multiple threads for prediction."""
    def batch_predict(data_batch):
        return model.predict_partial_hazard(data_batch)

    num_batches = len(data) // batch_size + (1 if len(data) % batch_size else 0)
    futures = []
    results = []
    
    with ThreadPoolExecutor() as executor:
        for i in range(num_batches):
            batch_data = data.iloc[i * batch_size:(i + 1) * batch_size]
            futures.append(executor.submit(batch_predict, batch_data))
        
        for future in futures:
            results.append(future.result())
    
    return pd.concat(results)