In [1]:
from hyperopt import STATUS_OK, fmin, tpe, Trials, hp
import xgboost as xgb
import logging
from timeit import default_timer as timer
import os
from functools import partial
import pandas as pd
import numpy as np
import time
import csv
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers
from sklearn.model_selection import KFold
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from keras.optimizers import SGD, Adam, RMSprop
import openpyxl

Using TensorFlow backend.


<font face='黑体' color=#0099ff size=5>以“河水污染”为例</font>

In [2]:
# Number of hyperopt evaluations (search iterations).
MAX_EVALS = 2
# Number of folds for k-fold cross-validation.
NFOLDS = 5
# User-defined folds; takes priority over NFOLDS.
FOLDS = None
# NOTE: '__file__' is a quoted string here, not the module attribute, so this
# is the parent of "<cwd>/__file__", i.e. the current working directory.
BASE_DIR = os.path.split(os.path.abspath('__file__'))[0]

In [3]:
# Hyperparameter search space handed to hyperopt's fmin (see
# main_tuning_with_bo). Plain values are fixed for every trial; `hp.*`
# expressions are sampled per trial.
XGB_SPACE = {
    # --- fixed settings (not searched) ---
    'booster': 'gbtree',
    'random_state': 2019,
    'eval_metric': 'rmse',
    'n_jobs': -1,
    # Learning rate used during the search; post_hyperopt overrides it with
    # 0.01 for the final cross-validation.
    'learning_rate': 0.05,
    # --- searched settings ---
    'subsample': hp.uniform('subsample', 0.1, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 1.0),
    # Sampled with step 1 between 5 and 10; objective_base casts it to int.
    'max_depth': hp.quniform('max_depth', 5, 10, 1),
    'gamma': hp.uniform('gamma', 0.0, 2.0),
    'min_child_weight': hp.uniform('min_child_weight', 0.0, 5.0),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 3.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 3.0)
}

In [4]:
#定义优化的目标函数：

def objective_base(params,
                   train_set,
                   folds=None,
                   nfold=5,
                   writetoFile=True):
    """
    Objective function for Gradient Boosting Machine Hyperparameter Optimization.

    Runs ``xgb.cv`` with ``params`` and reports the lowest mean test RMSE
    together with the boosting round at which it was reached.

    Args:
        params: Hyperparameter dict passed to ``xgb.cv``. Entries that must be
            integers (e.g. ``max_depth``) are cast to ``int`` in place.
        train_set: Training data passed straight to ``xgb.cv``.
        folds: This argument has highest priority over other data split arguments.
        nfold: Number of folds passed to ``xgb.cv``.
        writetoFile: When true, append this evaluation as one row to
            ``<BASE_DIR>/hyperopt_output/trials.csv``.
    Return:
        dict with the keys ``loss``, ``params``, ``iteration``,
        ``estimators``, ``train_time`` and ``status`` (always ``STATUS_OK``).
    """
    # Keep track of evals; the counter must be initialised by the caller.
    global _ITERATION
    _ITERATION += 1
    # Make sure parameters that need to be integers are integers
    for parameter_name in (
            'num_leaves', 'max_depth', 'bagging_freq', 'min_data_in_leaf',
            'min_samples_split', 'min_samples_leaf'
    ):
        if parameter_name in params:
            params[parameter_name] = int(params[parameter_name])
    start = timer()
    logging.info(f"{_ITERATION} ITERATION")
    logging.info(f"params:\n{params}")
    cv_dict = xgb.cv(params,
                     train_set,
                     num_boost_round=5000,
                     nfold=nfold,
                     stratified=False,
                     folds=folds,
                     early_stopping_rounds=100,
                     as_pandas=False,
                     verbose_eval=10,
                     seed=0,
                     shuffle=False)
    test_rmse = cv_dict['test-rmse-mean']
    # Extract the min rmse, Loss must be minimized
    loss = np.min(test_rmse)
    # Boosting rounds that returned the lowest cv rmse (1-based)
    n_estimators = int(np.argmin(test_rmse) + 1)
    run_time = timer() - start
    if writetoFile:
        hyper_base_path = os.path.join(BASE_DIR, 'hyperopt_output')
        trial_file = os.path.join(hyper_base_path, 'trials.csv')
        if not os.path.exists(hyper_base_path):
            os.makedirs(hyper_base_path)
            print(
                "No trial file directory <hyperopt_output> exists, will be created..."
            )
        # First evaluation of a run: archive the previous run's file under a
        # timestamped name. os.rename raises OSError if the move fails.
        if _ITERATION == 1 and os.path.exists(trial_file):
            print("Trial file exists, will be renamed...")
            trial_file_rename = os.path.join(
                hyper_base_path, 'trials_%s.csv' % int(time.time()))
            os.rename(trial_file, trial_file_rename)
        # Write the header whenever the file is being created, not only when
        # an older file had to be archived first.
        write_header = not os.path.exists(trial_file)
        # newline='' is what the csv module expects (avoids blank rows on
        # Windows); the context manager guarantees the handle is closed.
        with open(trial_file, 'a', newline='') as of_connection:
            writer = csv.writer(of_connection)
            if write_header:
                writer.writerow(
                    ['loss', 'params', 'iteration', 'estimators', 'train_time'])
            writer.writerow([loss, params, _ITERATION, n_estimators, run_time])
    # Dictionary with information for evaluation
    return {
        'loss': loss,
        'params': params,
        'iteration': _ITERATION,
        'estimators': n_estimators,
        'train_time': run_time,
        'status': STATUS_OK
    }

In [5]:
#定义前处理和后处理模块：

def build_train_set(X_train, y_train):
    """Wrap features and target in an ``xgb.DMatrix``.

    Accepts either a (DataFrame, Series) pair, whose underlying ``.values``
    arrays are used, or a pair of non-pandas objects that are passed to
    ``xgb.DMatrix`` unchanged.

    Raises:
        TypeError: if ``y_train`` is a DataFrame, or if exactly one of
            ``X_train`` / ``y_train`` is a pandas DataFrame / Series.
    """
    if isinstance(y_train, pd.DataFrame):
        raise TypeError(
            f"y_train is df, with the shape {y_train.shape}, which is not supportable now."
        )

    features_are_frame = isinstance(X_train, pd.DataFrame)
    target_is_series = isinstance(y_train, pd.Series)
    # Both inputs must be pandas objects, or neither.
    if features_are_frame != target_is_series:
        raise TypeError("X_train and y_train have different types!")

    if not features_are_frame:
        return xgb.DMatrix(X_train, y_train)
    return xgb.DMatrix(X_train.values, y_train.values)
 
def post_hyperopt(bayes_trials, train_set, folds=None, nfold=5):
    """Re-run cross-validation for the best trial at a lower learning rate.

    Picks the trial with the smallest ``loss``, overrides its
    ``learning_rate`` with 0.01, runs ``xgb.cv`` again and stores the best
    boosting round as ``n_estimators``.

    Args:
        bayes_trials: Object whose ``results`` attribute is a list of result
            dicts containing at least ``'loss'`` and ``'params'``.
        train_set: Training data passed straight to ``xgb.cv``.
        folds: Passed to ``xgb.cv`` as ``folds``.
        nfold: Passed to ``xgb.cv`` as ``nfold``.

    Returns:
        Tuple ``(best_params, loss)``: the tuned parameter dict (including
        ``learning_rate`` and ``n_estimators``) and the minimum mean test RMSE
        of the final cross-validation.
    """
    # get best params
    bayes_results = (pd.DataFrame(bayes_trials.results)
                     .sort_values(by='loss')
                     .reset_index(drop=True))
    # Copy: the DataFrame cell holds the very dict stored in
    # bayes_trials.results, and the overrides below must not rewrite the
    # recorded trial.
    best_params = dict(bayes_results.loc[0, 'params'])
    # get best loss and trees
    best_params['learning_rate'] = 0.01
    # Perform n_folds cross validation
    cv_dict = xgb.cv(best_params,
                     train_set,
                     num_boost_round=5000,
                     folds=folds,
                     nfold=nfold,
                     stratified=False,
                     shuffle=False,
                     early_stopping_rounds=100,
                     as_pandas=False,
                     verbose_eval=10,
                     seed=2019)
    test_rmse = cv_dict['test-rmse-mean']
    # Extract the min rmse, Loss must be minimized
    loss = np.min(test_rmse)
    # Boosting rounds that returned the lowest cv rmse (1-based)
    n_estimators = int(np.argmin(test_rmse) + 1)
    best_params['n_estimators'] = n_estimators
    logging.info(f"best loss: {loss}, best n_estimators: {n_estimators}")
    logging.info(f"best params: {best_params}")
    return best_params, loss

In [6]:
#定义主函数：

  
def main_tuning_with_bo(X_train,
                        y_train,
                        max_evals=MAX_EVALS,
                        folds=FOLDS,
                        nfold=NFOLDS):
    """Tune XGBoost over ``XGB_SPACE`` with hyperopt's TPE algorithm.

    Builds the training matrix, runs ``fmin`` for ``max_evals`` evaluations of
    ``objective_base`` (logging each trial to file), then hands the collected
    trials to ``post_hyperopt``.

    Args:
        X_train: Feature data accepted by ``build_train_set``.
        y_train: Target data accepted by ``build_train_set``.
        max_evals: Number of evaluations passed to ``fmin``.
        folds: Forwarded to the objective and to ``post_hyperopt``.
        nfold: Forwarded to the objective and to ``post_hyperopt``.

    Returns:
        The ``(best_params, loss)`` tuple produced by ``post_hyperopt``.
    """
    # Reset the module-level evaluation counter incremented by objective_base.
    global _ITERATION
    _ITERATION = 0

    dtrain = build_train_set(X_train, y_train)
    # Collects the result dict of every evaluation.
    trial_history = Trials()
    cv_split = dict(folds=folds, nfold=nfold)

    # Run optimization; fmin's own return value is not needed because the
    # best trial is read back from trial_history.
    fmin(fn=partial(objective_base,
                    train_set=dtrain,
                    writetoFile=True,
                    **cv_split),
         space=XGB_SPACE,
         algo=tpe.suggest,
         max_evals=max_evals,
         trials=trial_history,
         rstate=np.random.RandomState(2019))

    return post_hyperopt(trial_history, train_set=dtrain, **cv_split)

In [7]:
# Load worksheet 'Sheet1' of the Excel workbook into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that exact file exists.
data1 = pd.read_excel(r"E:\水环境\论文修改\数据\河水污染对压力关键词.xlsx",sheet_name = 'Sheet1')
# Target: the column at position 1 (the column at position 0 is not used).
y = data1.iloc[:,1]
# Features: every column from position 2 onward.
X = data1.iloc[:,2:]

In [8]:
# Run the search on the loaded data; the returned (best_params, loss) tuple is
# displayed as the cell output.
main_tuning_with_bo(X, y)

[0]	train-rmse:0.06096+0.00504	test-rmse:0.05607+0.02445                                                               

[10]	train-rmse:0.06096+0.00504	test-rmse:0.05607+0.02445                                                              

[20]	train-rmse:0.06096+0.00504	test-rmse:0.05607+0.02445                                                              

[30]	train-rmse:0.06096+0.00504	test-rmse:0.05607+0.02445                                                              

[40]	train-rmse:0.06096+0.00504	test-rmse:0.05607+0.02445                                                              

[50]	train-rmse:0.06096+0.00504	test-rmse:0.05607+0.02445                                                              

[60]	train-rmse:0.06096+0.00504	test-rmse:0.05607+0.02445                                                              

[70]	train-rmse:0.06096+0.00504	test-rmse:0.05607+0.02445                                                              

[80]	train-rmse:0.06096+0.00504	

({'booster': 'gbtree',
  'colsample_bytree': 0.998952348197383,
  'eval_metric': 'rmse',
  'gamma': 0.39980749795422654,
  'learning_rate': 0.01,
  'max_depth': 5,
  'min_child_weight': 1.240085707035738,
  'n_jobs': -1,
  'random_state': 2019,
  'reg_alpha': 2.512606228077046,
  'reg_lambda': 2.8615987166118226,
  'subsample': 0.655378325608154,
  'n_estimators': 4957},
 0.0458002)