In [21]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb

In [22]:
# Set random seed for reproducibility
np.random.seed(12345)

In [23]:
folder_path = '../training_data'
datasets = [name for name in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, name))]

In [24]:
chosen_feature = ['length', 'variance', 'range_value', 'sum_diff']

In [None]:
# Define hyperparameter grid
param_grid = {
    'learning_rate': [0.001, 0.01, 0.1, 1.0],
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'min_child_weight': [0.001, 0.1, 1.0, 10.0, 100.0],
    'reg_alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    'reg_lambda': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    'aft_loss_distribution_scale': [0.5, 0.8, 1.1, 1.4, 1.7, 2.0]
}

# Loop through datasets
for dataset in datasets:
    # Load data
    folds_df = pd.read_csv(f'../training_data/{dataset}/folds.csv')
    features_df = pd.read_csv(f'../training_data/{dataset}/inputs.csv')
    target_df = pd.read_csv(f'../training_data/{dataset}/outputs.csv')

    for test_fold in range(1, np.unique(folds_df['fold']).__len__() + 1):
        # Split data into training and test sets
        train_ids = folds_df[folds_df['fold'] != test_fold]['sequenceID']
        test_ids = folds_df[folds_df['fold'] == test_fold]['sequenceID']

        features_df_train = features_df[features_df['sequenceID'].isin(train_ids)]
        features_df_test = features_df[features_df['sequenceID'].isin(test_ids)]
        target_df_train = target_df[target_df['sequenceID'].isin(train_ids)]

        # Create X_train and X_test
        X_train = features_df_train[chosen_feature].to_numpy()
        X_test = features_df_test[chosen_feature].to_numpy()

        # Get target bounds for training
        y_lower_bound = np.exp(target_df_train['min.log.lambda'].to_numpy())
        y_upper_bound = np.exp(target_df_train['max.log.lambda'].to_numpy())

        # Create DMatrix with bounds
        dtrain = xgb.DMatrix(X_train)
        dtrain.set_float_info('label_lower_bound', y_lower_bound)
        dtrain.set_float_info('label_upper_bound', y_upper_bound)

        # Cross-validation for hyperparameter tuning
        best_params = None
        best_score = float('inf')

        for learning_rate in param_grid['learning_rate']:
            for max_depth in param_grid['max_depth']:
                for min_child_weight in param_grid['min_child_weight']:
                    for reg_alpha in param_grid['reg_alpha']:
                        for reg_lambda in param_grid['reg_lambda']:
                            for aft_loss_distribution_scale in param_grid['aft_loss_distribution_scale']:
                                params = {
                                    'objective': 'survival:aft',
                                    'eval_metric': 'aft-nloglik',
                                    'aft_loss_distribution': 'normal',
                                    'aft_loss_distribution_scale': aft_loss_distribution_scale,
                                    'tree_method': 'hist',
                                    'learning_rate': learning_rate,
                                    'max_depth': max_depth,
                                    'min_child_weight': min_child_weight,
                                    'reg_alpha': reg_alpha,
                                    'reg_lambda': reg_lambda
                                }

                                # Perform cross-validation
                                cv_results = xgb.cv(
                                    params,
                                    dtrain,
                                    num_boost_round=10000,
                                    nfold=5,
                                    metrics='aft-nloglik',
                                    seed=42,
                                    early_stopping_rounds=1000,
                                    verbose_eval=False
                                )

                                # Check for the best score
                                mean_score = cv_results['test-aft-nloglik-mean'].min()
                                if mean_score < best_score:
                                    best_score = mean_score
                                    best_params = params

        # Train the model with the best parameters
        bst = xgb.train(
            best_params,
            dtrain,
            num_boost_round=10000,
            verbose_eval=100
        )

        # Create DMatrix for prediction
        dtest = xgb.DMatrix(X_test)

        # Predict
        pred_lldas_exp = bst.predict(dtest)

        # Apply logarithm transformation to predictions
        pred_lldas = np.log(pred_lldas_exp)

        # Save predictions to CSV
        lldas_df = pd.DataFrame({
            'sequenceID': features_df_test['sequenceID'],
            'llda': pred_lldas
        })
        lldas_df.to_csv(f'predictions/{dataset}.{test_fold}.4.csv', index=False)

[0]	train-aft-nloglik:0.37068	valid-aft-nloglik:0.84453
[100]	train-aft-nloglik:0.16032	valid-aft-nloglik:0.27397
[200]	train-aft-nloglik:0.09964	valid-aft-nloglik:0.17222
[300]	train-aft-nloglik:0.07712	valid-aft-nloglik:0.16689
[400]	train-aft-nloglik:0.06651	valid-aft-nloglik:0.18322
[500]	train-aft-nloglik:0.06188	valid-aft-nloglik:0.19991
[600]	train-aft-nloglik:0.05901	valid-aft-nloglik:0.21560
[700]	train-aft-nloglik:0.05600	valid-aft-nloglik:0.22863
[800]	train-aft-nloglik:0.05345	valid-aft-nloglik:0.24101
[900]	train-aft-nloglik:0.05147	valid-aft-nloglik:0.24792
[1000]	train-aft-nloglik:0.04997	valid-aft-nloglik:0.25213
[1100]	train-aft-nloglik:0.04834	valid-aft-nloglik:0.25484
[1200]	train-aft-nloglik:0.04683	valid-aft-nloglik:0.25778
[1254]	train-aft-nloglik:0.04609	valid-aft-nloglik:0.25886
[0]	train-aft-nloglik:0.35673	valid-aft-nloglik:0.84133
[100]	train-aft-nloglik:0.16061	valid-aft-nloglik:0.28480
[200]	train-aft-nloglik:0.10279	valid-aft-nloglik:0.17701
[300]	train-af