In [1]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
from sklearn.model_selection import ParameterGrid

In [2]:
# Seed NumPy's global RNG for reproducibility.
# Note: xgb.cv further down is seeded separately through its own `seed` argument.
np.random.seed(12345)

In [3]:
# Run label -- presumably identifies the model variant; it matches the prefix of
# the prediction CSV filenames written by the training loop. TODO confirm intent.
category = 'previous'

In [4]:
# Root directory of the training data; each dataset lives in its own sub-folder
# holding folds.csv, features.csv and target.csv.
folder_path = '../../training_data'
# Names of the dataset sub-folders to process.
datasets = ['cancer']

In [5]:
# Hyper-parameter search space handed to ParameterGrid. Every value is a list
# of candidates, so single-element lists behave as fixed settings.
param_grid = dict(
    # --- fixed AFT survival setup ---
    objective=['survival:aft'],
    eval_metric=['aft-nloglik'],
    aft_loss_distribution=['normal'],
    aft_loss_distribution_scale=[1],
    tree_method=['hist'],
    # --- values actually searched (3 x 4 = 12 combinations) ---
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[6, 8, 10, 20],
)

def perform_cross_validation(params, dtrain, num_boost_round=10000, nfold=2,
                             early_stopping_rounds=100, verbose_eval=100, seed=42):
    """Run k-fold cross-validation with ``xgb.cv`` for one parameter setting.

    Parameters
    ----------
    params : dict
        Booster parameters forwarded to ``xgb.cv``.
    dtrain : xgb.DMatrix
        Training data (including whatever label information the objective needs).
    num_boost_round : int, default 10000
        Upper limit on the number of boosting rounds.
    nfold : int, default 2
        Number of cross-validation folds.
    early_stopping_rounds : int, default 100
        Patience, in rounds, before cross-validation stops early.
    verbose_eval : int or bool, default 100
        Progress-logging setting forwarded to ``xgb.cv``.
    seed : int, default 42
        Seed forwarded to ``xgb.cv`` so the fold assignment is reproducible.

    Returns
    -------
    pandas.DataFrame
        The evaluation history returned by ``xgb.cv`` (``as_pandas=True``).

    The defaults reproduce the previously hard-coded settings, so existing
    two-argument calls behave exactly as before.
    """
    return xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        nfold=nfold,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=verbose_eval,
        as_pandas=True,
        seed=seed,
    )

def _make_aft_dmatrix(X, y_lower_bound, y_upper_bound):
    """Build a DMatrix from features and attach the interval label bounds."""
    dmat = xgb.DMatrix(X)
    dmat.set_float_info('label_lower_bound', y_lower_bound)
    dmat.set_float_info('label_upper_bound', y_upper_bound)
    return dmat


def _select_by_cv(dtrain, grid):
    """Grid-search `grid` with cross-validation.

    Returns a tuple ``(best_params, best_num_rounds, best_score)`` where the
    score is the lowest mean test ``aft-nloglik`` and ``best_num_rounds`` is
    the number of boosting rounds at which that minimum was reached.
    """
    best_params, best_num_rounds, best_score = None, None, float('inf')
    for params in ParameterGrid(grid):
        cv_results = perform_cross_validation(params, dtrain)
        test_curve = cv_results['test-aft-nloglik-mean'].to_numpy()
        best_idx = int(np.argmin(test_curve))
        if test_curve[best_idx] < best_score:
            best_score = test_curve[best_idx]
            best_params = params
            best_num_rounds = best_idx + 1  # rounds are 0-indexed in the history
    return best_params, best_num_rounds, best_score


# Create the output directory up front so a long run cannot fail at the final write.
os.makedirs('predictions', exist_ok=True)

for dataset in datasets:
    # Load data
    folds_df = pd.read_csv(f'{folder_path}/{dataset}/folds.csv')
    features_df = pd.read_csv(f'{folder_path}/{dataset}/features.csv')
    target_df = pd.read_csv(f'{folder_path}/{dataset}/target.csv')

    # Index targets by sequenceID so they can be aligned to the feature rows
    # explicitly instead of relying on both CSVs sharing the same row order.
    target_by_id = target_df.set_index('sequenceID')

    # Iterate over the fold labels actually present rather than assuming 1..K.
    for test_fold in sorted(folds_df['fold'].unique()):
        # Split data into training and test sets
        is_test = folds_df['fold'] == test_fold
        train_ids = folds_df.loc[~is_test, 'sequenceID']
        test_ids = folds_df.loc[is_test, 'sequenceID']

        features_df_train = features_df[features_df['sequenceID'].isin(train_ids)]
        features_df_test = features_df[features_df['sequenceID'].isin(test_ids)]
        # Row i of the targets now corresponds to row i of the training features;
        # a sequenceID missing from target.csv raises KeyError here.
        target_df_train = target_by_id.loc[features_df_train['sequenceID']]

        # Create X_train and X_test (every column except the identifier)
        X_train = features_df_train.drop(columns=['sequenceID']).to_numpy()
        X_test = features_df_test.drop(columns=['sequenceID']).to_numpy()

        # Targets are stored on the log scale; the AFT objective takes the
        # bounds on the original scale, hence exp here and log after predict.
        # NOTE(review): if open intervals are encoded as -inf/+inf, exp maps
        # them to 0/inf, XGBoost's left/right-censoring convention -- confirm.
        y_lower_bound = np.exp(target_df_train['min.log.lambda'].to_numpy())
        y_upper_bound = np.exp(target_df_train['max.log.lambda'].to_numpy())

        # One DMatrix serves both the parameter search and the final fit.
        dtrain = _make_aft_dmatrix(X_train, y_lower_bound, y_upper_bound)

        # Perform cross-validation to find the best parameters and round count
        best_params, best_num_rounds, best_cv_score = _select_by_cv(dtrain, param_grid)
        if best_params is None:
            raise RuntimeError(
                f'Cross-validation produced no finite score for dataset '
                f'{dataset!r}, test fold {test_fold}'
            )

        # Train the final model for the number of rounds chosen by CV.
        # Early stopping against the training set itself is not used: training
        # loss keeps improving, so it would not stop at the CV-selected point.
        bst = xgb.train(
            best_params,
            dtrain,
            num_boost_round=best_num_rounds,
            evals=[(dtrain, 'train')],  # progress logging only
            verbose_eval=100
        )

        # Predict on the held-out fold and move back to the log scale
        dtest = xgb.DMatrix(X_test)
        pred_lldas = np.log(bst.predict(dtest))

        # Save predictions to CSV
        lldas_df = pd.DataFrame({
            'sequenceID': features_df_test['sequenceID'].to_numpy(),
            'llda': pred_lldas
        })
        lldas_df.to_csv(f'predictions/{category}.{dataset}.{test_fold}.100.csv', index=False)

  y_upper_bound = np.exp(target_df_train['max.log.lambda'].to_numpy())


[0]	train-aft-nloglik:1.26991+0.06233	test-aft-nloglik:1.27391+0.06344
[100]	train-aft-nloglik:0.55033+0.02228	test-aft-nloglik:0.82766+0.09315
[200]	train-aft-nloglik:0.37527+0.00570	test-aft-nloglik:0.72806+0.08683
[300]	train-aft-nloglik:0.32436+0.00008	test-aft-nloglik:0.71380+0.07985
[400]	train-aft-nloglik:0.30666+0.00267	test-aft-nloglik:0.71832+0.07864
[401]	train-aft-nloglik:0.30655+0.00269	test-aft-nloglik:0.71835+0.07859
[0]	train-aft-nloglik:1.26976+0.06230	test-aft-nloglik:1.27411+0.06340
[100]	train-aft-nloglik:0.54116+0.02280	test-aft-nloglik:0.83787+0.09713
[200]	train-aft-nloglik:0.36723+0.00762	test-aft-nloglik:0.74333+0.09990
[300]	train-aft-nloglik:0.31797+0.00145	test-aft-nloglik:0.72321+0.09114
[400]	train-aft-nloglik:0.30146+0.00115	test-aft-nloglik:0.72577+0.08681
[426]	train-aft-nloglik:0.29898+0.00163	test-aft-nloglik:0.72694+0.08619
[0]	train-aft-nloglik:1.26975+0.06229	test-aft-nloglik:1.27411+0.06339
[100]	train-aft-nloglik:0.53941+0.02381	test-aft-nloglik: