In [1]:
# 导入基本包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 显示所有过程结果
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

plt.rcParams['font.sans-serif']=['SimHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False  # 用来正常显示负号

In [3]:
######### 模块导入区 #########
from hyperopt import fmin, tpe, hp, Trials  # 贝叶斯调参框架

from sklearn.datasets import load_digits  # 手写字体数据集

from sklearn.neighbors import KNeighborsClassifier  # K近邻
from sklearn.tree import DecisionTreeClassifier  # 决策树
from sklearn.ensemble import RandomForestClassifier  # 随机森林
from lightgbm.sklearn import LGBMClassifier  # lgb
import xgboost as xgb # xgb

from sklearn.svm import SVC  # 支撑向量机
from sklearn.linear_model import LogisticRegression  # 逻辑回归

from sklearn.model_selection import train_test_split  # 数据切分
from sklearn.model_selection import cross_val_score

  from pandas import MultiIndex, Int64Index


In [4]:
######### 数据生成区 #########
x = load_digits().data
y = load_digits().target

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=666)  # 切分数据



In [5]:
######### 定义训练函数 #########
def hyperopt_train(x, y, params, scoring=None):
    '''
    x: 特征
    y: 标签
    params: 特征字典
    '''
    params_copy = params.copy()
    train_method = params_copy['method']
    del(params_copy['method'])
    if train_method == 'knn':
        clf = KNeighborsClassifier(**params_copy)
    if train_method == 'dtree':
        clf = DecisionTreeClassifier(**params_copy)
    if train_method == 'rf':
        clf = RandomForestClassifier(**params_copy)
    if train_method == 'lgb':
        clf = LGBMClassifier(**params_copy)
    if train_method == 'xgb':
        clf = xgb.XGBClassifier(**params_copy)
    if train_method == 'lr':
        clf = LogisticRegression(**params_copy)
    if train_method == 'svc':
        if scoring == 'roc_auc_ovr':
            params_copy['probability'] = True  # 使用auc做评分的时候，需要设置这个参数
        clf = SVC(**params_copy)
    else:
        return 0, None
    
    score = cross_val_score(clf, x, y, cv=5, n_jobs =-1, scoring=scoring).mean()  # 取交叉验证的均值
    
    return score, scoring  


In [6]:
######### 定义搜索空间 #########
search_space = hp.choice('train_method', [
    {
        'method': 'knn'
        ,'n_neighbors': hp.choice('knn_n_neighbors', range(1,51))
    }
    ,{
        'method': 'dtree'
        ,'max_depth': hp.choice('dtree_max_depth', range(1,51))
        ,'max_features': hp.choice('dtree_max_features', range(1,65))
        ,'criterion': hp.choice('dtree_criterion', ['gini', 'entropy'])
    }
    ,{
        'method': 'rf'
        ,'n_estimators': hp.choice('rf_n_estimators', range(1,51))
        ,'max_depth': hp.choice('rf_max_depth', range(1,51))
        ,'max_features': hp.choice('rf_max_features', range(1,65))
        ,'criterion': hp.choice('rf_criterion', ['gini', 'entropy'])
    }
    ,{
        'method': 'lgb'
        ,'n_estimators': hp.choice('lgb_n_estimators', range(1,51))
        ,'learning_rate': hp.uniform('lgb_learning_rate', 0.01, 0.3)
        ,'max_depth': hp.choice('lgb_max_depth', range(1,51))
        ,'lambda_l1': hp.uniform('lgb_lambda_l1', 0.01, 0.2)
        ,'lambda_l2': hp.uniform('lgb_lambda_l2', 0.01, 0.2)
        ,'subsample': hp.uniform('lgb_subsample', 0.5, 1.0)
        ,'colsample_bytree': hp.uniform('lgb_colsample_bytree', 0.6, 1.0)
        ,'reg_alpha': hp.uniform('lgb_reg_alpha', 0, 1.0)
        ,'objective': 'multiclass'
        ,'num_class': 10
    }
    ,{
        'method': 'xgb'
        ,'n_estimators': hp.choice('xgb_n_estimators', range(1,51))
        ,'learning_rate': hp.uniform('xgb_learning_rate', 0.01, 0.3)
        ,'max_depth': hp.choice('xgb_max_depth', range(1,51))
        ,'subsample': hp.uniform('xgb_subsample', 0.5, 1.0)
        ,'colsample_bytree': hp.uniform('xgb_colsample_bytree', 0.6, 1.0)
        ,'reg_alpha': hp.uniform('xgb_reg_alpha', 0, 1.0)
        ,'objective': 'multi:softmax'
        ,'num_class': 10
    }
    ,{
        'method': 'lr'
        ,'penalty': hp.choice('lr_penalty', ['l1', 'l2'])
        ,'C': hp.uniform('lr_c', 0.1, 1.0)
    }
    ,{
        'method': 'svc'
        ,'C': hp.uniform('svc_C',0, 10.0)
        ,'kernel': hp.choice('svc_kernel', ['linear', 'rbf'])
        ,'gamma': hp.uniform('svc_gamma', 0, 20.0)
    }
])


In [7]:
######### 定义搜索时所参考的目标函数 #########
count = 0
best_score = 0
model = None
def target_f(params):
    global best_score, count, model
    count += 1
    score, scoring = hyperopt_train(x_train, y_train, params, scoring='roc_auc_ovr')
    if score > best_score:
        print('更优的{}为：{}；使用模型为：{}'.format(scoring, score, params['method']))
        print('迭代次数为：{}；{}为：{}；参数组合为：{}'.format(count, scoring, score, params))
        best_score = score
        model = params['method']
        print('-'*120)
        
    # 如果没有搜索到更优的组合，每迭代50次打印一次
    if count % 50 == 0:
        print('迭代次数为：{}；{}为：{}；参数组合为：{}'.format(count, scoring, score, params))
        print('='*120)
    return -score  # 由于要找到最大的acc，所以要转换成最小化问题，-acc



In [8]:
######### 搜索开始 #########
trials = Trials()  # 设置查看黑盒函数fn中的搜索情况（每次选择参数等）
max_evals = 1500
best_params = fmin(fn=target_f, space=search_space, algo=tpe.suggest, max_evals=max_evals,trials=trials)
print('最优模型为：{}, 最优分数为：{}'.format(model, best_score))

更优的roc_auc_ovr为：0.9994477131876767；使用模型为：svc                                                                         
迭代次数为：4；roc_auc_ovr为：0.9994477131876767；参数组合为：{'C': 9.846308185531292, 'gamma': 0.8897514620851665, 'kernel': 'linear', 'method': 'svc'}
------------------------------------------------------------------------------------------------------------------------
更优的roc_auc_ovr为：0.9994749504169542；使用模型为：svc                                                                         
迭代次数为：23；roc_auc_ovr为：0.9994749504169542；参数组合为：{'C': 0.14470611410214818, 'gamma': 19.090776815920606, 'kernel': 'linear', 'method': 'svc'}
------------------------------------------------------------------------------------------------------------------------
迭代次数为：50；None为：0；参数组合为：{'criterion': 'entropy', 'max_depth': 4, 'max_features': 63, 'method': 'dtree'}               
更优的roc_auc_ovr为：0.9994774030952793；使用模型为：svc                                                                         
迭代次数为：75；

迭代次数为：1050；roc_auc_ovr为：0.5；参数组合为：{'C': 4.376286205158519, 'gamma': 10.950522438110998, 'kernel': 'rbf', 'method': 'svc'}
迭代次数为：1100；None为：0；参数组合为：{'method': 'knn', 'n_neighbors': 21}                                                        
迭代次数为：1150；roc_auc_ovr为：0.9994047549384069；参数组合为：{'C': 6.112165105143904, 'gamma': 0.017866021072412478, 'kernel': 'linear', 'method': 'svc'}
迭代次数为：1200；roc_auc_ovr为：0.48040748120123905；参数组合为：{'C': 6.846029776515956, 'gamma': 0.3445250322685499, 'kernel': 'rbf', 'method': 'svc'}
迭代次数为：1250；None为：0；参数组合为：{'colsample_bytree': 0.972843187734425, 'learning_rate': 0.01980982336320411, 'max_depth': 33, 'method': 'xgb', 'n_estimators': 38, 'num_class': 10, 'objective': 'multi:softmax', 'reg_alpha': 0.965666587636153, 'subsample': 0.7662201918288185}
迭代次数为：1300；roc_auc_ovr为：0.9850635558172796；参数组合为：{'C': 5.94372385758136, 'gamma': 0.018873217169233174, 'kernel': 'rbf', 'method': 'svc'}
迭代次数为：1350；None为：0；参数组合为：{'method': 'knn', 'n_neighbors': 45}            