In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import time

In [None]:
# Placeholder cell: both right-hand sides are intentionally left empty, so this
# cell is not valid Python until a feature table and a label are filled in.
X = # feature data: the dataset with the label column removed
y = # label: the target the model should predict

In [None]:
# Hold out 40% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)

# Every later cell passes `dtrain` / `dtest` to xgb.train and xgb.cv, so build
# the DMatrix wrappers here; without them the notebook fails on a fresh kernel.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [None]:
# Upper bound on boosting rounds shared by the training and CV cells. Those
# cells also pass early_stopping_rounds, so a run may stop before this cap.
num_boost_round = 999

In [None]:
# Baseline booster configuration. The first five entries are the knobs that
# the grid-search cells further down vary; the last two stay fixed.
params = dict(
    max_depth=6,
    min_child_weight=1,
    eta=.3,
    subsample=1,
    colsample_bytree=1,
    objective='reg:squarederror',
    # Evaluation metric is root mean squared error ("mae" was the alternative
    # the author had commented out). No tree_method is set in this cell.
    eval_metric="rmse",
)

In [None]:
# Baseline fit with the initial params, timed with a wall clock.
# Training halts once the "Test" metric has gone 10 rounds without improving.
time_begin = time.time()
model = xgb.train(
    params,
    dtrain,
    # Use the shared cap instead of repeating the literal 999, so changing the
    # constant in one place affects every training/CV call.
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)
print("Training Time: %s seconds" % (str(time.time() - time_begin)))

# Grid Search (with Cross Validation)

In [None]:
# Same baseline configuration as the earlier cell, now with the GPU tree
# method and predictor switched on. This dict replaces `params` and is the
# starting point for the cross-validation and grid-search cells that follow.
params = {
    # Parameters that we are going to tune.
    'max_depth': 6,
    'min_child_weight': 1,
    'eta': .3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective': 'reg:squarederror',
    # Evaluate with root mean squared error.
    'eval_metric': "rmse",
    # NOTE(review): these two settings presumably require a GPU-enabled
    # XGBoost build, and newer XGBoost releases appear to replace them with
    # device='cuda' -- confirm against the installed version.
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor'
}

# Timed re-fit with the GPU settings; stops once the "Test" metric has gone
# 10 rounds without improving.
time_begin = time.time()
model = xgb.train(
    params,
    dtrain,
    # Use the shared cap instead of repeating the literal 999.
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)
print("Training Time: %s seconds" % (str(time.time() - time_begin)))

In [None]:
# Baseline 5-fold cross-validation of the current params on the training
# matrix, scored with RMSE and stopped early after 10 rounds without
# improvement. The fixed seed keeps the run repeatable.
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'rmse'},
    # (swap in metrics={'mae'} to cross-validate on MAE instead)
    early_stopping_rounds=10
)
# Display the per-round CV results as the cell output.
cv_results

## max_depth & min_child_weight

In [None]:
# Candidate values for the first search. Start with wide, coarse ranges and
# narrow them over a few iterations; the ranges below are where the author
# ended up (range(9, 12) / range(5, 8) was an earlier attempt).
max_depth_candidates = range(7, 10)
min_child_weight_candidates = range(1, 4)

# Every (max_depth, min_child_weight) pair, with max_depth varying slowest.
gridsearch_params = [
    (depth, child_weight)
    for depth in max_depth_candidates
    for child_weight in min_child_weight_candidates
]

In [None]:
# Cross-validate every (max_depth, min_child_weight) pair and remember the
# pair with the lowest mean test RMSE.
min_rmse = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    time_begin = time.time()
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Score the candidate on a copy, so the shared `params` dict is not left
    # holding whichever pair happened to be tried last.
    trial_params = dict(params, max_depth=max_depth, min_child_weight=min_child_weight)
    # Run CV
    cv_results = xgb.cv(
        trial_params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best RMSE. Result rows are indexed from 0, so the index of the
    # best row plus one is the number of boosting rounds that produced it.
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].idxmin() + 1
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (max_depth, min_child_weight)
    print("Training Time: %s seconds" % (str(time.time() - time_begin)))
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))


In [None]:
# Lock the winning (max_depth, min_child_weight) pair into the shared params,
# then show the resulting dict as the cell output.
best_max_depth, best_min_child_weight = best_params
params.update(max_depth=best_max_depth, min_child_weight=best_min_child_weight)
params

## subsample & colsample

In [None]:
# Sampling fractions to try for both row and column subsampling:
# 0.7, 0.8, 0.9 and 1.0 (np.linspace(0.5, 1, 5) was an earlier alternative).
sampling_fractions = [tenths / 10. for tenths in range(7, 11)]

# Every (subsample, colsample) pair, with subsample varying slowest.
gridsearch_params = [
    (row_fraction, col_fraction)
    for row_fraction in sampling_fractions
    for col_fraction in sampling_fractions
]

In [None]:
# Cross-validate every (subsample, colsample) pair and remember the pair with
# the lowest mean test RMSE.
min_rmse = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    time_begin = time.time()
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # Score the candidate on a copy, so the shared `params` dict is not left
    # holding whichever pair happened to be tried last.
    trial_params = dict(params, subsample=subsample, colsample_bytree=colsample)
    # Run CV
    cv_results = xgb.cv(
        trial_params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best score. idxmin() matches the other search loops; result rows
    # are indexed from 0, so the best row index plus one is the round count.
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].idxmin() + 1
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (subsample, colsample)
    print("Training Time: %s seconds" % (str(time.time() - time_begin)))
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))

In [None]:
# Lock in the winning pair from the subsample/colsample search. At this point
# best_params is (subsample, colsample), so it must be written to the sampling
# keys -- writing it to max_depth / min_child_weight would overwrite the tree
# settings chosen by the previous search with fractional values.
params['subsample'] = best_params[0]
params['colsample_bytree'] = best_params[1]
params

## eta

In [None]:
# Cross-validate each candidate learning rate and remember the one with the
# lowest mean test RMSE. Note that best_params is a single eta value here,
# not a tuple as in the earlier searches.
min_rmse = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    time_begin = time.time()
    print("CV with eta={}".format(eta))
    # Score the candidate on a copy, so the shared `params` dict is not left
    # holding whichever eta happened to be tried last.
    trial_params = dict(params, eta=eta)
    # Run and time CV
    cv_results = xgb.cv(
            trial_params,
            dtrain,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            metrics=['rmse'],
            early_stopping_rounds=10)

    # Update best score. Result rows are indexed from 0, so the index of the
    # best row plus one is the number of boosting rounds that produced it.
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].idxmin() + 1
    print("\tRMSE {} for {} rounds\n".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = eta
    print("Training Time: %s seconds" % (str(time.time() - time_begin)))
print("Best params: {}, RMSE: {}".format(best_params, min_rmse))

In [None]:
# Lock in the learning rate chosen by the eta search; best_params holds a
# single eta value at this point.
params['eta'] = best_params # presumably .1 in an earlier run of the author's -- TODO confirm

## final model

In [None]:
# Fit with the tuned params, monitoring the "Test" set and stopping once it
# has gone 10 rounds without improving. The following cell reads
# model.best_iteration from this fit.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

In [None]:
# Retrain for exactly the number of rounds the early-stopped fit found best
# (best_iteration + 1), this time without early stopping.
# The count is kept under its own name so the shared `num_boost_round` cap
# is not overwritten; otherwise re-running any earlier training or CV cell
# would silently use the truncated value.
best_num_boost_round = model.best_iteration + 1
best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=best_num_boost_round,
    evals=[(dtest, "Test")]
)

In [None]:
# Put the reference labels and the final model's predictions into one frame
# (one row each) for a side-by-side look.
# NOTE(review): `validy` and `dvalid` are not defined in any visible cell --
# presumably a separate validation split and its DMatrix; confirm where they
# come from, or this cell fails on a fresh kernel.
pd.DataFrame([validy,best_model.predict(dvalid)])

In [None]:
# Persist the final booster to two files in the working directory:
# save_model writes the model itself, dump_model writes a dump of it
# (presumably a human-readable text dump -- confirm against the XGBoost docs).
best_model.save_model('trial1a.model')
best_model.dump_model('trial1b.model')