In [1]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import ParameterGrid

In [2]:
# Pin NumPy's global random state so repeated runs give the same results
np.random.seed(seed=12345)

In [3]:
# Names of the sub-directories under ../../training_data that are processed in turn
datasets = [
    'detailed',
    'systematic',
]

In [4]:
# Hyper-parameter search space.
# The AFT objective, metric, loss distribution and tree method each have a single
# value; only the learning rate and the maximum tree depth are actually swept.
param_grid = dict(
    objective=['survival:aft'],
    eval_metric=['aft-nloglik'],
    aft_loss_distribution=['normal'],
    aft_loss_distribution_scale=[1],
    tree_method=['hist'],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[6, 8, 10, 20],
)

def perform_cross_validation(params, dtrain):
    """Cross-validate a single hyper-parameter combination with ``xgb.cv``.

    Parameters
    ----------
    params : dict
        Booster parameters, forwarded unchanged to ``xgb.cv``.
    dtrain : xgb.DMatrix
        Training data to cross-validate on.

    Returns
    -------
    The evaluation history produced by ``xgb.cv`` (requested with
    ``as_pandas=True``).
    """
    cv_settings = dict(
        num_boost_round=10000,      # upper limit on boosting rounds
        nfold=2,
        early_stopping_rounds=100,
        verbose_eval=100,           # log every 100th round
        as_pandas=True,
        seed=42,                    # fixed seed for reproducibility
    )
    return xgb.cv(params, dtrain, **cv_settings)

# For each dataset: load the CSVs, grid-search the booster parameters with
# cross-validation, fit a final model on the training fold, and write the
# predictions for the test fold to predictions/<dataset>.csv.
for dataset in datasets:
    # Load data
    data_dir = f'../../training_data/{dataset}'
    folds_df = pd.read_csv(f'{data_dir}/folds.csv')
    features_df = pd.read_csv(f'{data_dir}/features.csv')
    target_df = pd.read_csv(f'{data_dir}/target.csv')

    # Split by fold label: 'short' rows are trained on, 'long' rows are predicted
    train_ids = folds_df.loc[folds_df['fold'] == 'short', 'sequenceID']
    test_ids = folds_df.loc[folds_df['fold'] == 'long', 'sequenceID']

    features_df_train = features_df[features_df['sequenceID'].isin(train_ids)]
    features_df_test = features_df[features_df['sequenceID'].isin(test_ids)]

    # Pair every training feature row with its target by sequenceID instead of
    # relying on features.csv and target.csv sharing the same row order.
    has_target = features_df_train['sequenceID'].isin(target_df['sequenceID'])
    if not has_target.all():
        raise ValueError(
            f"{dataset}: {int((~has_target).sum())} training sequence(s) "
            "have features but no entry in target.csv"
        )
    # A left merge keeps the row order of the left frame; 'many_to_one' rejects
    # duplicate sequenceIDs in target.csv, which would otherwise multiply rows.
    target_df_train = features_df_train[['sequenceID']].merge(
        target_df, on='sequenceID', how='left', validate='many_to_one'
    )

    # Create X_train and X_test (drop the ID column by name, not by position)
    X_train = features_df_train.drop(columns='sequenceID').to_numpy()
    X_test = features_df_test.drop(columns='sequenceID').to_numpy()

    # The targets are stored as log-scale bounds; exponentiate them before
    # handing them to the AFT objective as interval labels.
    y_lower_bound = np.exp(target_df_train['min.log.lambda'].to_numpy())
    y_upper_bound = np.exp(target_df_train['max.log.lambda'].to_numpy())

    # Create DMatrix with interval-censored labels
    dtrain = xgb.DMatrix(X_train)
    dtrain.set_float_info('label_lower_bound', y_lower_bound)
    dtrain.set_float_info('label_upper_bound', y_upper_bound)

    # Grid search: keep the parameters AND the boosting round that give the
    # lowest mean held-out negative log-likelihood.
    best_params = None
    best_num_rounds = None
    best_cv_score = float('inf')
    for params in ParameterGrid(param_grid):
        cv_results = perform_cross_validation(params, dtrain)
        test_scores = cv_results['test-aft-nloglik-mean']
        mean_score = test_scores.min()

        # A NaN score compares False here, so it can never be selected
        if mean_score < best_cv_score:
            best_cv_score = mean_score
            best_params = params
            # Rounds are 0-indexed in the history, hence the +1
            best_num_rounds = int(np.nanargmin(test_scores.to_numpy())) + 1

    if best_params is None:
        raise RuntimeError(
            f"{dataset}: cross-validation produced no finite score for any "
            "parameter combination"
        )

    # Train the final model for exactly the number of rounds selected by CV.
    # Early stopping against the training set itself is not used: the training
    # loss keeps decreasing, so it would not stop before the round limit.
    bst = xgb.train(
        best_params,
        dtrain,
        num_boost_round=best_num_rounds,
        evals=[(dtrain, 'train')],
        verbose_eval=100
    )

    # Create DMatrix for prediction
    dtest = xgb.DMatrix(X_test)

    # The bounds were exponentiated for training, so take the log of the
    # predictions to return to the log scale of the targets.
    pred_lldas = np.log(bst.predict(dtest))

    # Save predictions to CSV (create the output directory if needed)
    lldas_df = pd.DataFrame({
        'sequenceID': features_df_test['sequenceID'].to_numpy(),
        'llda': pred_lldas
    })
    os.makedirs('predictions', exist_ok=True)
    lldas_df.to_csv(f'predictions/{dataset}.csv', index=False)

[0]	train-aft-nloglik:0.36072+0.01374	test-aft-nloglik:0.36130+0.01375
[100]	train-aft-nloglik:0.15003+0.00637	test-aft-nloglik:0.18949+0.00803
[200]	train-aft-nloglik:0.08823+0.00301	test-aft-nloglik:0.15101+0.00714
[300]	train-aft-nloglik:0.06501+0.00247	test-aft-nloglik:0.14675+0.00855
[367]	train-aft-nloglik:0.05820+0.00207	test-aft-nloglik:0.14922+0.00914
[0]	train-aft-nloglik:0.36059+0.01369	test-aft-nloglik:0.36134+0.01375
[100]	train-aft-nloglik:0.14128+0.00464	test-aft-nloglik:0.19115+0.00771
[200]	train-aft-nloglik:0.07703+0.00044	test-aft-nloglik:0.15247+0.00766
[300]	train-aft-nloglik:0.05503+0.00005	test-aft-nloglik:0.14785+0.00866
[383]	train-aft-nloglik:0.04771+0.00021	test-aft-nloglik:0.15177+0.00927
[0]	train-aft-nloglik:0.36051+0.01367	test-aft-nloglik:0.36136+0.01376
[100]	train-aft-nloglik:0.13617+0.00340	test-aft-nloglik:0.19138+0.00912
[200]	train-aft-nloglik:0.07258+0.00050	test-aft-nloglik:0.15226+0.00842
[300]	train-aft-nloglik:0.05167+0.00021	test-aft-nloglik: