In [1]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb

In [2]:
# Pin NumPy's global random state so repeated runs draw identical numbers.
np.random.seed(seed=12345)

In [3]:
# Run label; not referenced by the cells visible in this part of the notebook.
category = "previous"

In [4]:
# Root directory that holds one sub-directory per dataset.
folder_path = '../../training_data'

# Every sub-directory name is treated as a dataset. os.listdir returns entries
# in arbitrary order, so sort them to keep the processing (and log) order
# identical from run to run.
datasets = sorted(
    name
    for name in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, name))
)
# Uncomment to leave the 'detailed' dataset out of the run:
# datasets.remove('detailed')

In [5]:
# For every dataset and every held-out fold: fit an XGBoost AFT
# (interval-censored survival) model on the remaining folds and write the
# held-out predictions to predictions/<dataset>.<fold>.100.csv.

# Loop-invariant settings, defined once instead of on every iteration.
TRAIN_FRACTION = 0.8           # share of the training rows used for fitting; the rest is validation
NUM_BOOST_ROUND = 10000        # upper bound on boosting rounds
EARLY_STOPPING_ROUNDS = 1000   # stop once 'valid' has not improved for this many rounds
OUTPUT_DIR = 'predictions'

params = {
    'objective': 'survival:aft',
    'eval_metric': 'aft-nloglik',
    'aft_loss_distribution': 'normal',
    'aft_loss_distribution_scale': 1,
    'tree_method': 'hist',
    'learning_rate': 0.01,
    'max_depth': 8
}

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for dataset in datasets:
    # Load data (reuses folder_path from the dataset-discovery cell)
    dataset_dir = os.path.join(folder_path, dataset)
    folds_df = pd.read_csv(os.path.join(dataset_dir, 'folds.csv'))
    features_df = pd.read_csv(os.path.join(dataset_dir, 'features.csv'))
    target_df = pd.read_csv(os.path.join(dataset_dir, 'target.csv'))

    # NOTE(review): assumes folds are labelled 1..K — confirm against folds.csv.
    n_folds = len(np.unique(folds_df['fold']))

    for test_fold in range(1, n_folds + 1):
        # Split sequence IDs into the held-out fold and everything else
        is_test_fold = folds_df['fold'] == test_fold
        train_ids = folds_df.loc[~is_test_fold, 'sequenceID']
        test_ids = folds_df.loc[is_test_fold, 'sequenceID']

        features_df_train = features_df[features_df['sequenceID'].isin(train_ids)]
        features_df_test = features_df[features_df['sequenceID'].isin(test_ids)]
        # NOTE(review): features and targets are filtered independently, so this
        # relies on features.csv and target.csv listing sequenceIDs in the same
        # row order — confirm, or align explicitly on sequenceID.
        target_df_train = target_df[target_df['sequenceID'].isin(train_ids)]

        # Feature matrices: every column except the first one
        # (presumably sequenceID — verify against features.csv)
        X_train = features_df_train.iloc[:, 1:].to_numpy()
        X_test = features_df_test.iloc[:, 1:].to_numpy()

        # The targets are stored as log-scale bounds; exponentiate them before
        # handing them to the AFT objective as label bounds.
        y_lower_bound = np.exp(target_df_train['min.log.lambda'].to_numpy())
        y_upper_bound = np.exp(target_df_train['max.log.lambda'].to_numpy())

        # Hold out the tail of the training rows (in file order) for early stopping
        split_index = int(TRAIN_FRACTION * len(X_train))

        dtrain_train = xgb.DMatrix(X_train[:split_index])
        dtrain_train.set_float_info('label_lower_bound', y_lower_bound[:split_index])
        dtrain_train.set_float_info('label_upper_bound', y_upper_bound[:split_index])

        dtrain_valid = xgb.DMatrix(X_train[split_index:])
        dtrain_valid.set_float_info('label_lower_bound', y_lower_bound[split_index:])
        dtrain_valid.set_float_info('label_upper_bound', y_upper_bound[split_index:])

        # Train the model; the last entry of `evals` drives early stopping
        bst = xgb.train(
            params,
            dtrain_train,
            num_boost_round=NUM_BOOST_ROUND,
            evals=[(dtrain_train, 'train'), (dtrain_valid, 'valid')],
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose_eval=100
        )

        # Create DMatrix for prediction
        dtest = xgb.DMatrix(X_test)

        # xgb.train returns the booster from the LAST round, which can be up to
        # EARLY_STOPPING_ROUNDS past the best validation score. Restrict the
        # prediction to the trees up to and including the best iteration.
        pred_lldas_exp = bst.predict(
            dtest, iteration_range=(0, bst.best_iteration + 1)
        )

        # Undo the np.exp applied to the label bounds to get back to the log scale
        pred_lldas = np.log(pred_lldas_exp)

        # Save predictions to CSV
        lldas_df = pd.DataFrame({
            'sequenceID': features_df_test['sequenceID'],
            'llda': pred_lldas
        })
        lldas_df.to_csv(f'{OUTPUT_DIR}/{dataset}.{test_fold}.100.csv', index=False)

[0]	train-aft-nloglik:27.62642	valid-aft-nloglik:27.63102
[100]	train-aft-nloglik:7.00776	valid-aft-nloglik:6.64687
[200]	train-aft-nloglik:1.95354	valid-aft-nloglik:1.92078
[300]	train-aft-nloglik:1.00655	valid-aft-nloglik:1.20775
[400]	train-aft-nloglik:0.79023	valid-aft-nloglik:1.10860
[500]	train-aft-nloglik:0.72699	valid-aft-nloglik:1.10093
[600]	train-aft-nloglik:0.70259	valid-aft-nloglik:1.09331
[700]	train-aft-nloglik:0.69113	valid-aft-nloglik:1.09710
[800]	train-aft-nloglik:0.68405	valid-aft-nloglik:1.10065
[900]	train-aft-nloglik:0.68002	valid-aft-nloglik:1.10082
[1000]	train-aft-nloglik:0.67738	valid-aft-nloglik:1.09947
[1100]	train-aft-nloglik:0.67579	valid-aft-nloglik:1.09931
[1200]	train-aft-nloglik:0.67467	valid-aft-nloglik:1.10036
[1300]	train-aft-nloglik:0.67392	valid-aft-nloglik:1.10060
[1400]	train-aft-nloglik:0.67336	valid-aft-nloglik:1.10026
[1500]	train-aft-nloglik:0.67295	valid-aft-nloglik:1.09973
[1598]	train-aft-nloglik:0.67264	valid-aft-nloglik:1.09997
[0]	tra