In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from xgboost import XGBClassifier
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

In [3]:
data = pd.read_csv('/kaggle/input/diabetes/diabetes.csv') 
X = data.drop('Outcome', axis=1)
y = data['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scoring = make_scorer(f1_score)

In [9]:
param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 2, 5]
}

xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

start = time.time()
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=20,
    scoring='f1',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)
random_search.fit(X_train, y_train)
end = time.time()

print("RandomizedSearchCV лучшие параметры:", random_search.best_params_)
print("RandomizedSearchCV лучший F1-score:", random_search.best_score_)
print("Время подбора (сек):", end - start)

# Оценка на тесте
best_xgb = random_search.best_estimator_
y_pred = best_xgb.predict(X_test)
print("F1-score на тесте:", f1_score(y_test, y_pred))

Fitting 3 folds for each of 20 candidates, totalling 60 fits
RandomizedSearchCV лучшие параметры: {'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 1, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.05, 'colsample_bytree': 0.7}
RandomizedSearchCV лучший F1-score: 0.6826917656079109
Время подбора (сек): 1.9866306781768799
F1-score на тесте: 0.6434782608695652


In [11]:
def objective(params):
    model = XGBClassifier(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        learning_rate=float(params['learning_rate']),
        subsample=float(params['subsample']),
        colsample_bytree=float(params['colsample_bytree']),
        reg_alpha=float(params['reg_alpha']),
        reg_lambda=float(params['reg_lambda']),
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = f1_score(y_test, y_pred)
    return {'loss': -score, 'status': STATUS_OK}

space = {
    'n_estimators': hp.quniform('n_estimators', 50, 200, 1),
    'max_depth': hp.quniform('max_depth', 3, 7, 1),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'subsample': hp.uniform('subsample', 0.7, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.7, 1.0),
    'reg_alpha': hp.uniform('reg_alpha', 0, 1),
    'reg_lambda': hp.uniform('reg_lambda', 1, 5)
}

trials = Trials()
start = time.time()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials
)
end = time.time()

print("Hyperopt лучшие параметры:", best)
print("Время подбора (сек):", end - start)

best_model = XGBClassifier(
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    learning_rate=float(best['learning_rate']),
    subsample=float(best['subsample']),
    colsample_bytree=float(best['colsample_bytree']),
    reg_alpha=float(best['reg_alpha']),
    reg_lambda=float(best['reg_lambda']),
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
print("F1-score на тесте:", f1_score(y_test, y_pred))

100%|██████████| 20/20 [00:02<00:00,  9.67trial/s, best loss: -0.6725663716814159]
Hyperopt лучшие параметры: {'colsample_bytree': 0.764009337174574, 'learning_rate': 0.09959740081038493, 'max_depth': 4.0, 'n_estimators': 69.0, 'reg_alpha': 0.4092151773480539, 'reg_lambda': 4.942844105344144, 'subsample': 0.707071552201895}
Время подбора (сек): 2.074958086013794
F1-score на тесте: 0.6725663716814159
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=5, n_estimators=200, reg_alpha=1, reg_lambda=2, subsample=1.0; total time=   0.2s
[CV] END colsample_bytree=1.0, learning_rate=0.05, max_depth=6, n_estimators=100, reg_alpha=0.1, reg_lambda=5, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=1.0, learning_rate=0.05, max_depth=6, n_estimators=100, reg_alpha=0.1, reg_lambda=5, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=1.0, learning_rate=0.05, max_depth=6, n_estimators=50, reg_alpha=0, reg_lambda=5, subsample=0.8; total time=   0.1s
[CV] END colsample_byt

Подбор гиперпараметров с помощью Hyperopt (TPE) позволил получить немного более высокий F1-score по сравнению с RandomizedSearchCV. 
однако времени понадобилось немного больше чем в 1 случае хотя и разница оказалась незначительной