In [1]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
from sklearn.model_selection import ParameterGrid

In [2]:
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
RANDOM_SEED = 12345
np.random.seed(RANDOM_SEED)

In [3]:
# Label for this model variant; it equals the 'previous.' prefix of the prediction file names.
category = "previous"

In [4]:
# Root directory holding one sub-directory per dataset.
folder_path = '../../training_data'

# Every sub-directory of folder_path is treated as a dataset name.
# os.listdir returns entries in arbitrary order, so sort them to make the
# processing order (and therefore the log output) reproducible across runs and machines.
datasets = sorted(
    name
    for name in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, name))
)
# datasets.remove('detailed')

In [5]:
# Hyper-parameter search space. Every value is a list of candidates; entries with a
# single candidate are held fixed, the remaining ones are the dimensions being searched.
param_grid = dict(
    # Fixed settings: one candidate each
    objective=['survival:aft'],
    eval_metric=['aft-nloglik'],
    aft_loss_distribution=['normal'],
    aft_loss_distribution_scale=[1],
    tree_method=['hist'],
    # Searched settings: 3 learning rates x 4 depths = 12 combinations
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[6, 8, 10, 20],
)

def perform_cross_validation(params, dtrain):
    """Cross-validate one hyper-parameter setting with ``xgb.cv``.

    Args:
        params: Booster parameter dict, forwarded unchanged to ``xgb.cv``.
        dtrain: Training data object, forwarded unchanged to ``xgb.cv``.

    Returns:
        The value returned by ``xgb.cv`` (requested with ``as_pandas=True``).
    """
    cv_options = dict(
        num_boost_round=10000,      # upper limit on boosting rounds
        nfold=2,
        early_stopping_rounds=100,
        verbose_eval=100,           # report progress every 100 rounds
        as_pandas=True,
        seed=42,                    # fixed seed for reproducibility
    )
    return xgb.cv(params, dtrain, **cv_options)

# For every dataset and every held-out fold: tune the AFT hyper-parameters with
# cross-validation on the remaining folds, refit on all of those folds, and write
# log-scale predictions for the held-out sequences to predictions/.

# DataFrame.to_csv does not create missing directories, so make sure the target exists.
os.makedirs('predictions', exist_ok=True)

for dataset in datasets:
    # Load data
    dataset_dir = f'{folder_path}/{dataset}'
    folds_df = pd.read_csv(f'{dataset_dir}/folds.csv')
    features_df = pd.read_csv(f'{dataset_dir}/features.csv')
    target_df = pd.read_csv(f'{dataset_dir}/target.csv')

    # Index the targets by sequenceID so labels can be looked up in feature-row order
    # instead of relying on features.csv and target.csv sharing the same row order.
    target_by_id = target_df.set_index('sequenceID')

    # Iterate over the fold labels that are actually present rather than assuming 1..K.
    for test_fold in np.unique(folds_df['fold']):
        # Split data into training and test sets
        is_test_row = folds_df['fold'] == test_fold
        train_ids = folds_df.loc[~is_test_row, 'sequenceID']
        test_ids = folds_df.loc[is_test_row, 'sequenceID']

        features_df_train = features_df[features_df['sequenceID'].isin(train_ids)]
        features_df_test = features_df[features_df['sequenceID'].isin(test_ids)]
        # One target row per training feature row, in the same order
        # (raises KeyError if a training sequence has no target).
        target_df_train = target_by_id.loc[features_df_train['sequenceID']]

        # Create X_train and X_test
        # NOTE(review): assumes the first column of features.csv is sequenceID - confirm.
        X_train = features_df_train.iloc[:, 1:].to_numpy()
        X_test = features_df_test.iloc[:, 1:].to_numpy()

        # Get target bounds for training (stored as logs, the model is fed the exponentials)
        y_lower_bound = np.exp(target_df_train['min.log.lambda'].to_numpy())
        y_upper_bound = np.exp(target_df_train['max.log.lambda'].to_numpy())

        # Create DMatrix with interval-censored labels; reused for CV and the final fit
        dtrain = xgb.DMatrix(X_train)
        dtrain.set_float_info('label_lower_bound', y_lower_bound)
        dtrain.set_float_info('label_upper_bound', y_upper_bound)

        # Perform cross-validation to find the best parameters and round count
        best_params = None
        best_num_rounds = None
        best_cv_score = float('inf')
        for params in ParameterGrid(param_grid):
            cv_results = perform_cross_validation(params, dtrain)
            test_scores = cv_results['test-aft-nloglik-mean']
            mean_score = test_scores.min()  # Lower negative log-likelihood is better

            if mean_score < best_cv_score:
                best_cv_score = mean_score
                best_params = params
                # Row i of the CV history is the score after i + 1 boosting rounds.
                best_num_rounds = int(np.nanargmin(test_scores.to_numpy())) + 1

        if best_params is None:
            raise RuntimeError(
                f'No parameter combination produced a usable CV score '
                f'for dataset {dataset!r}, test fold {test_fold}'
            )

        # Train the final model on all training folds with the CV-selected settings.
        # The round count comes from cross-validation: early stopping against the
        # training set itself (the only data available here) would practically never
        # trigger, so the model would run to the round limit and overfit.
        bst = xgb.train(
            best_params,
            dtrain,
            num_boost_round=best_num_rounds,
            evals=[(dtrain, 'train')],  # progress logging only
            verbose_eval=100
        )

        # Create DMatrix for prediction
        dtest = xgb.DMatrix(X_test)

        # Predict, then apply the logarithm to return to the log scale of the targets
        pred_lldas_exp = bst.predict(dtest)
        pred_lldas = np.log(pred_lldas_exp)

        # Save predictions to CSV
        lldas_df = pd.DataFrame({
            'sequenceID': features_df_test['sequenceID'].to_numpy(),
            'llda': pred_lldas
        })
        lldas_df.to_csv(f'predictions/{category}.{dataset}.{test_fold}.100.csv', index=False)

[0]	train-aft-nloglik:27.62763+0.00340	test-aft-nloglik:27.62973+0.00129
[100]	train-aft-nloglik:7.01138+0.06968	test-aft-nloglik:7.18292+0.70157
[200]	train-aft-nloglik:1.94382+0.02668	test-aft-nloglik:2.21244+0.36516
[300]	train-aft-nloglik:0.98026+0.00502	test-aft-nloglik:1.43576+0.21349
[400]	train-aft-nloglik:0.75615+0.00465	test-aft-nloglik:1.29804+0.12766
[500]	train-aft-nloglik:0.69381+0.00361	test-aft-nloglik:1.27000+0.08291
[600]	train-aft-nloglik:0.67014+0.00192	test-aft-nloglik:1.27776+0.05788
[606]	train-aft-nloglik:0.66917+0.00182	test-aft-nloglik:1.27855+0.05681
[0]	train-aft-nloglik:27.62763+0.00340	test-aft-nloglik:27.62973+0.00129
[100]	train-aft-nloglik:6.99630+0.06615	test-aft-nloglik:7.25955+0.65657
[200]	train-aft-nloglik:1.92238+0.02789	test-aft-nloglik:2.21857+0.30231
[300]	train-aft-nloglik:0.96961+0.00419	test-aft-nloglik:1.42386+0.15704
[400]	train-aft-nloglik:0.74515+0.00283	test-aft-nloglik:1.29180+0.07444
[500]	train-aft-nloglik:0.68207+0.00070	test-aft-nl

In [6]:
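# NOTE: disabled (commented-out) alternative to the loop in the cell above.
# It uses fixed hyper-parameters (learning_rate=0.01, max_depth=8) instead of a grid search,
# holds out the last 20% of the training rows as a validation set for early stopping,
# and writes to predictions/{dataset}.{test_fold}.100.csv (no 'previous.' prefix).
# for dataset in datasets: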
#     # Load data
#     folds_df = pd.read_csv(f'../../training_data/{dataset}/folds.csv')
#     features_df = pd.read_csv(f'../../training_data/{dataset}/features.csv')
#     target_df = pd.read_csv(f'../../training_data/{dataset}/target.csv')

#     for test_fold in range(1, np.unique(folds_df['fold']).__len__() + 1):
#         # Split data into training and test sets
#         train_ids = folds_df[folds_df['fold'] != test_fold]['sequenceID']
#         test_ids = folds_df[folds_df['fold'] == test_fold]['sequenceID']

#         features_df_train = features_df[features_df['sequenceID'].isin(train_ids)]
#         features_df_test = features_df[features_df['sequenceID'].isin(test_ids)]
#         target_df_train = target_df[target_df['sequenceID'].isin(train_ids)]

#         # Create X_train and X_test
#         X_train = features_df_train.iloc[:, 1:].to_numpy()
#         X_test = features_df_test.iloc[:, 1:].to_numpy()

#         # Split data into training and test sets
#         train_ids = folds_df[folds_df['fold'] != test_fold]['sequenceID']
#         test_ids = folds_df[folds_df['fold'] == test_fold]['sequenceID']

#         features_df_train = features_df[features_df['sequenceID'].isin(train_ids)]
#         features_df_test = features_df[features_df['sequenceID'].isin(test_ids)]
#         target_df_train = target_df[target_df['sequenceID'].isin(train_ids)]

#         # Create X_train and X_test
#         X_train = features_df_train.iloc[:, 1:].to_numpy()
#         X_test = features_df_test.iloc[:, 1:].to_numpy()

#         # Get target bounds for training
#         y_lower_bound = np.exp(target_df_train['min.log.lambda'].to_numpy())
#         y_upper_bound = np.exp(target_df_train['max.log.lambda'].to_numpy())

#         # Create DMatrix with bounds
#         dtrain = xgb.DMatrix(X_train)
#         dtrain.set_float_info('label_lower_bound', y_lower_bound)
#         dtrain.set_float_info('label_upper_bound', y_upper_bound)

#         # Split a portion of the training data for validation
#         num_train = len(X_train)
#         split_index = int(0.8 * num_train)  # 80% training, 20% validation
#         dtrain_train = xgb.DMatrix(X_train[:split_index])
#         dtrain_train.set_float_info('label_lower_bound', y_lower_bound[:split_index])
#         dtrain_train.set_float_info('label_upper_bound', y_upper_bound[:split_index])

#         dtrain_valid = xgb.DMatrix(X_train[split_index:])
#         dtrain_valid.set_float_info('label_lower_bound', y_lower_bound[split_index:])
#         dtrain_valid.set_float_info('label_upper_bound', y_upper_bound[split_index:])

#         # Define model parameters
#         params = {
#             'objective': 'survival:aft',
#             'eval_metric': 'aft-nloglik',
#             'aft_loss_distribution': 'normal',
#             'aft_loss_distribution_scale': 1,
#             'tree_method': 'hist',
#             'learning_rate': 0.01,
#             'max_depth': 8
#         }

#         # Train the model
#         bst = xgb.train(
#             params, 
#             dtrain_train, 
#             num_boost_round=10000, 
#             evals=[(dtrain_train, 'train'), (dtrain_valid, 'valid')],
#             early_stopping_rounds=1000,
#             verbose_eval=100
#         )

#         # Create DMatrix for prediction
#         dtest = xgb.DMatrix(X_test)

#         # Predict
#         pred_lldas_exp = bst.predict(dtest)

#         # Apply logarithm transformation to predictions
#         pred_lldas = np.log(pred_lldas_exp)

#         # Save predictions to CSV
#         lldas_df = pd.DataFrame({
#             'sequenceID': features_df_test['sequenceID'],
#             'llda': pred_lldas
#         })
#         lldas_df.to_csv(f'predictions/{dataset}.{test_fold}.100.csv', index=False)