In [1]:
import pandas as pd
import numpy as np
import random
import sys
import xgboost as xgb
import os
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import KFold
import copy

In [2]:
dataset = 'simulated.sin'
test_fold = 1

In [3]:
# Load features, target, and fold
folds_df = pd.read_csv(f'../../data/{dataset}/folds.csv')
target_df = pd.read_csv(f'../../data/{dataset}/targets.csv')
features_df = pd.read_csv(f'../../data/{dataset}/features.csv')

# Split data into training and test sets based on the fold
train_ids = folds_df[folds_df['fold'] != test_fold].index
test_ids = folds_df[folds_df['fold'] == test_fold].index

# Prepare train and test sequences as arrays
X_train = features_df.loc[train_ids].values
y_train = target_df.loc[train_ids].values
X_test = features_df.loc[test_ids].values
y_test = target_df.loc[test_ids].values

In [4]:
y_train = copy.deepcopy(np.exp(y_train))

In [5]:
# Define parameter grid
param_grid = {
    'objective': ['survival:aft'],
    'eval_metric': ['aft-nloglik'],
    'aft_loss_distribution': ['normal'],
    'learning_rate': [0.001],
    'max_depth': [10],
    'min_child_weight': [0.001],
    'reg_alpha': [0.001],
    'reg_lambda': [0.001, 1.0]
}

In [6]:
# Perform K-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_models = []
for train_idx, val_idx in kf.split(X_train):

    dtrain = xgb.DMatrix(X_train[train_idx])
    dtrain.set_float_info('label_lower_bound', y_train[train_idx][:, 0])
    dtrain.set_float_info('label_upper_bound', y_train[train_idx][:, 1])

    dval = xgb.DMatrix(X_train[val_idx])
    dval.set_float_info('label_lower_bound', y_train[val_idx][:, 0])
    dval.set_float_info('label_upper_bound', y_train[val_idx][:, 1])

    best_model = None
    best_val_loss = float('inf')
    for params in ParameterGrid(param_grid):
        model = xgb.train(params, dtrain, num_boost_round=5000, evals=[(dval, 'val')], early_stopping_rounds=20, verbose_eval=False)
        print(model.best_iteration)
        val_loss = model.best_score
        if val_loss < best_val_loss:
            best_model = copy.deepcopy(model)
            best_val_loss = val_loss
    best_models.append(best_model)

1599
1862
1350
2514
1292
1852
1328
2155
1382
2565


In [7]:
# get prediction on the train set
train_preds = np.zeros(X_train.shape[0])
for model in best_models:
    train_preds += model.predict(xgb.DMatrix(X_train))
train_preds /= len(best_models)

In [8]:
# print y_test along with its prediction
for i in range(len(y_train)):
    print(f'{y_train[i]} \t {train_preds[i]}')

[0.86157687 2.13589764] 	 1.0809584379196167
[0.84992599 2.08242692] 	 1.266771960258484
[1.11573681        inf] 	 2.173110842704773
[0.77855239        inf] 	 1.7251196503639221
[1.60731486 2.8917215 ] 	 1.8122792720794678
[1.32955859 2.39723742] 	 1.6269896268844604
[1.13606325 2.66523171] 	 1.2981778383255005
[0.21433351 0.5990645 ] 	 0.3945064961910248
[2.83874485 6.58353609] 	 2.5536781787872314
[1.21241229 3.68826811] 	 1.7227067708969117
[1.08803466 2.15886366] 	 1.3804918766021728
[0.30757787 0.76840296] 	 0.563711267709732
[0.47781866 1.27125805] 	 0.7218468427658081
[0.25102167 0.87039426] 	 0.6300823152065277
[0.24069637 0.55527211] 	 0.38476194739341735
[2.97607703 4.9217443 ] 	 2.5051972389221193
[0.22948078        inf] 	 0.8790407180786133
[0.         1.48966312] 	 0.33654133677482606
[0.45168872 1.28867557] 	 0.6594567239284516
[0.         1.02942871] 	 0.2930677503347397
[1.51403913 2.72624717] 	 1.8429232358932495
[1.03505234 3.67759866] 	 1.629229736328125
[0.86667    

In [9]:
# count the number of correct predictions from the test set
correct = 0
incorrect = 0
for i in range(len(y_train)):
    if y_train[i, 0] < train_preds[i] and y_train[i, 1] > train_preds[i]:
        correct += 1
    else:
        incorrect += 1

# print the number of correct and incorrect predictions
print(f'Correct: {correct}')
print(f'Incorrect: {incorrect}')

Correct: 156
Incorrect: 4


In [10]:
# Make predictions on the test set
dtest = xgb.DMatrix(X_test)
test_preds = np.zeros(X_test.shape[0])
for model in best_models:
    test_preds += model.predict(dtest)
test_preds /= len(best_models)
test_preds = np.log(test_preds)

In [11]:
# print y_test along with its prediction
for i in range(len(y_test)):
    print(f'{y_test[i]} \t {test_preds[i]}')

[-0.228758  0.583414] 	 0.3615355853369168
[0.157429 0.960006] 	 0.5507345153774135
[    -inf 0.582814] 	 0.46914526699786296
[0.470278 1.156113] 	 0.3801672374240819
[0.183034 1.112268] 	 -0.4035246494965674
[0.49832 1.41573] 	 -0.31795834850781735
[-0.432071  0.384011] 	 -0.30219474687009773
[    -inf 0.637887] 	 0.4850622185650307
[-1.529285 -0.523827] 	 -0.4725259669836724
[-0.807926 -0.330616] 	 -0.5786799562474638
[0.592316 1.380831] 	 0.7646765894070523
[0.234569 1.113224] 	 0.23060074267848285
[-1.705988 -0.6865  ] 	 -0.9661809264094748
[0.494488 1.229534] 	 0.45903055576266744
[0.184802 1.159551] 	 0.30190315795794653
[-0.649578  0.167296] 	 -0.555204481225148
[0.266344      inf] 	 0.7925693311942511
[-0.700897  0.807319] 	 -0.4984561773477483
[0.062691 0.747556] 	 -0.4356985605264991
[0.63216  1.592851] 	 0.4380651932705141
[-1.452828 -0.643022] 	 -0.9669616463285174
[0.73509  1.644592] 	 0.21614401395177318
[0.621593      inf] 	 0.45263883583573333
[0.558498      inf] 	 0.61

In [12]:
# count the number of correct predictions from the test set
correct = 0
incorrect = 0
for i in range(len(y_test)):
    if y_test[i, 0] < test_preds[i] and y_test[i, 1] > test_preds[i]:
        correct += 1
    else:
        incorrect += 1

# print the number of correct and incorrect predictions
print(f'Correct: {correct}')
print(f'Incorrect: {incorrect}')

Correct: 26
Incorrect: 14
