## Data Loading

In [1]:
from data.process import *

2024-07-15 20:21:37.440608: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-15 20:21:37.448182: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-15 20:21:37.458795: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-15 20:21:37.458814: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-15 20:21:37.466181: I tensorflow/core/platform/cpu_feature_gua

In [2]:
# Fetch the train / dev / test frames in one call; only the train and dev
# frames are split into inputs and targets in this cell.
train_df, dev_df, test_df = retrieve_train_dev_test_dataframe()

# Apply the same input/output split to both frames, train first, then dev.
(X_train, Y_train), (X_dev, Y_dev) = map(split_input_output, (train_df, dev_df))

## Train

In [3]:
import xgboost as xgb
from model.xgb_opt import *
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, rand
import pprint

In [4]:
# train_dataset = xgb.DMatrix(data=X_train, label=Y_train, enable_categorical=True)
# dev_dataset = xgb.DMatrix(X_dev, Y_dev, enable_categorical=True)

## XGBoost Model

In [5]:
# NOTE(review): this flag is not read by any cell visible in this part of the
# notebook; presumably it marks whether tuned hyperparameters already exist so
# the search can be skipped — confirm before relying on it.
OPTIMIZED = False

## Hyperparameters

In [6]:
# Build the search-space object from the tunable hyperparameter definitions;
# its `Space` attribute is what is handed to hyperopt's `fmin` further down.
# NOTE(review): XGBoostSearchSpace and xgb_tunable_hyperparams presumably come
# from the `model.xgb_opt` star import — verify.
space = XGBoostSearchSpace(**xgb_tunable_hyperparams)

### Optimization

In [7]:
def _dev_set_objective(params):
    """Evaluate one sampled hyperparameter set via the project `objective`.

    Binds the fixed XGBoost parameters and the train/dev splits so hyperopt
    only has to supply `params`.
    NOTE(review): the recorded trial shows a loss of about -0.877 next to AUC
    logs, so `objective` presumably returns negative dev AUC — confirm in
    `model.xgb_opt`.
    """
    return objective(params, xgb_fixed_params, X_train, Y_train, X_dev, Y_dev)


# `trials` keeps the per-evaluation records so they can be inspected afterwards.
trials = Trials()

# TPE search over `space.Space`, capped at 20 evaluations.
# NOTE(review): no `rstate` is passed to `fmin`, so the search is not seeded here.
best = fmin(
    fn=_dev_set_objective,
    space=space.Space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
)
print(best)

[0]	validation_0-auc:0.85539                          
[100]	validation_0-auc:0.87392                        
[200]	validation_0-auc:0.87787                        
[300]	validation_0-auc:0.87929                        
[377]	validation_0-auc:0.87887                        
[0]	validation_0-auc:0.85628                                                        
[100]	validation_0-auc:0.87570                                                      
[200]	validation_0-auc:0.87854                                                      
[300]	validation_0-auc:0.87958                                                      
[366]	validation_0-auc:0.87902                                                      
[0]	validation_0-auc:0.85459                                                        
[100]	validation_0-auc:0.86360                                                    
[200]	validation_0-auc:0.86958                                                    
[300]	validation_0-auc:0.87292                   

In [None]:
# Show the first recorded hyperopt trial as a sanity check of the search.
# Only the "nothing to show" cases are tolerated: `trials` not defined (the
# optimization cell was not run in this kernel) or no trial recorded yet.
# Anything else, including KeyboardInterrupt, is allowed to surface.
try:
    first_trial = trials.trials[0]
except (NameError, IndexError) as exc:
    print(f"No hyperopt trial to display: {exc!r}")
else:
    pprint.pprint(first_trial)

{'book_time': datetime.datetime(2024, 7, 14, 23, 15, 54, 973000),
 'exp_key': None,
 'misc': {'cmd': ('domain_attachment', 'FMinIter_Domain'),
          'idxs': {'gamma': [0],
                   'learning_rate': [0],
                   'max_bin': [0],
                   'max_depth': [0],
                   'reg_lambda': [0],
                   'subsample': [0]},
          'tid': 0,
          'vals': {'gamma': [0.00029970746759093393],
                   'learning_rate': [0.02978127757766048],
                   'max_bin': [0],
                   'max_depth': [15.0],
                   'reg_lambda': [0.01242645359997944],
                   'subsample': [0.7871403718638288]},
          'workdir': None},
 'owner': None,
 'refresh_time': datetime.datetime(2024, 7, 14, 23, 17, 0, 971000),
 'result': {'loss': -0.8771351580421762, 'status': 'ok'},
 'spec': None,
 'state': 2,
 'tid': 0,
 'version': 0}


In [None]:

# Convert the raw values reported by hyperopt into usable XGBoost settings:
# max_depth is reported as a float (e.g. 15.0 in the trial record), and
# max_bin is reported as an index that is looked up in MAX_BIN_CHOICE.
# NOTE(review): this mutates `best` in place, so re-running the cell would use
# the already-decoded max_bin value as an index — run it once per search.
best.update(max_depth=int(best['max_depth']))
best.update(max_bin=MAX_BIN_CHOICE[best['max_bin']])

In [None]:
# Persist the selected hyperparameters, followed by the `misc` record of every
# hyperopt trial, to a plain-text report. The encoding is set explicitly so
# the file does not depend on the platform default.
with open('xgb_best_params.txt', 'w', encoding='utf-8') as f:
    f.write('best params:\n')
    for key, value in best.items():
        f.write(f"{key}: {value}\n")

    f.write('\n--------------------------------\n')
    f.write('trials:\n')
    for index, trial in enumerate(trials.trials):
        try:
            # Format first so a formatting failure never leaves a partial record.
            record = pprint.pformat(trial.get('misc')) + '\n\n'
            f.write(record)
        except Exception as e:
            # Best effort: one bad trial record must not abort the whole report.
            print(f"Failed to write trial #{index} to file: {e}")

In [36]:
# Final training configuration: start from the shared fixed parameters, drop
# the two keys that are re-specified below, then layer the chosen
# hyperparameter values and the training-specific overrides on top.
keys_to_exclude_in_training = ['n_estimators', 'early_stopping_rounds']
filtered_xgb_params = {
    name: setting
    for name, setting in xgb_fixed_params.items()
    if name not in keys_to_exclude_in_training
}
params_for_training = dict(
    filtered_xgb_params,
    gamma=0.025068042675769814,
    learning_rate=0.06926657138388485,
    max_bin=16383,
    max_depth=11,
    reg_lambda=0.31371648373994243,
    subsample=0.8637049820225611,
    # replaces the values removed from xgb_fixed_params above
    n_estimators=10000,
    early_stopping_rounds=200,
)

In [37]:
# The dev split is the only evaluation set; its metric is logged every 100
# boosting rounds (verbose=100).
evals = [(X_dev, Y_dev)]

# Build the classifier from the assembled training parameters.
model = xgb.XGBClassifier(**params_for_training)

# `boosting` holds whatever `fit` returns; the fitted `model` is then saved
# as JSON next to the notebook.
boosting = model.fit(
    X_train,
    Y_train,
    eval_set=evals,
    verbose=100,
)
model.save_model("xgboost_model.json")

[0]	validation_0-auc:0.85825
[100]	validation_0-auc:0.87783
[200]	validation_0-auc:0.88041
[300]	validation_0-auc:0.87970
[400]	validation_0-auc:0.87916
[407]	validation_0-auc:0.87923
