# HYPEROPT 

Hyperopt code for major machine learning models (classifiers)

# XGBoost

In [None]:
%%time
# hp: define the hyperparameter space
# fmin: optimization function
# Trials: to evaluate the different searched hyperparameters
from sklearn.model_selection import cross_val_score
from hyperopt import hp, fmin, Trials

# the search algorithms
from hyperopt import rand, anneal, tpe

# for the search
from hyperopt import STATUS_OK, STATUS_FAIL
from xgboost import XGBClassifier

# Search space for the XGBoost classifier. Every entry is a hyperopt prior whose
# label equals its dictionary key, so sampled values can be looked up by name.
param_grid = {
    # hp.quniform yields round(uniform(low, high) / q) * q, so a lower bound
    # smaller than q / 2 can round down to 0 estimators. Starting the range at
    # q keeps the smallest sampled value at 25.
    'n_estimators': hp.quniform('n_estimators', 25, 2500, 25),
    'max_depth': hp.quniform('max_depth', 1, 7, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.001), np.log(1)),
    # hp.choice: the search result stores the index of the selected option.
    'booster': hp.choice('booster', ['gbtree', 'dart', 'gblinear']),
    'gamma': hp.loguniform('gamma', np.log(0.01), np.log(10)),
    'subsample': hp.uniform('subsample', 0.50, 0.90),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.50, 0.99),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.50, 0.99),
    'colsample_bynode': hp.uniform('colsample_bynode', 0.50, 0.99),
    'reg_lambda': hp.loguniform('reg_lambda', np.log(1), np.log(20)),
    'min_child_weight': hp.quniform('min_child_weight', 1, 20, 1),
    'base_score': hp.uniform('base_score', 0.3, 0.65),
    'max_delta_step': hp.loguniform('max_delta_step', np.log(0.01), np.log(10)),
    'reg_alpha': hp.uniform('reg_alpha', 0.0001, 2.0)
}

#Defining Objective Function

def objective(params):
    """Score one sampled XGBoost configuration with stratified 10-fold CV.

    Parameters
    ----------
    params : dict
        One sample drawn from ``param_grid``, keyed by hyperparameter name.

    Returns
    -------
    dict
        On success: ``loss`` (negative mean ROC AUC over the folds),
        ``loss_variance`` (standard deviation of the fold scores) and
        ``status`` set to ``STATUS_OK``.
        If building or cross-validating the model raises: ``exception``
        (the error message) and ``status`` set to ``STATUS_FAIL``.
    """
    # Map the sampled values onto the estimator's keyword arguments.
    params_dict = {
        # quniform samples floats; the estimator needs integers here
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'learning_rate': params['learning_rate'],
        'booster': params['booster'],
        'gamma': params['gamma'],
        'subsample': params['subsample'],
        'colsample_bytree': params['colsample_bytree'],
        'colsample_bylevel': params['colsample_bylevel'],
        'colsample_bynode': params['colsample_bynode'],
        'random_state': seed,
        # NOTE(review): int() truncates the sampled float toward zero.
        'reg_lambda': int(params['reg_lambda']),
        'min_child_weight': int(params['min_child_weight']),
        'base_score': params['base_score'],
        # NOTE(review): int() truncates, so samples below 1 become 0.
        'max_delta_step': int(params['max_delta_step']),
        # The search space samples reg_alpha; forward it instead of listing
        # reg_lambda a second time.
        'reg_alpha': params['reg_alpha']
    }

    # Model construction and cross-validation are the steps that can fail, so
    # they must sit inside the try for STATUS_FAIL to ever be reported.
    try:
        # ** unpacks the dictionary into keyword arguments
        model = XGBClassifier(**params_dict)

        cross_val_data = cross_val_score(
            model,
            X_train_pars,
            y_train,
            scoring='roc_auc',
            cv=StratifiedKFold(n_splits=10, random_state=seed, shuffle=True),
            n_jobs=-1,
            # surface fit errors instead of silently scoring the fold as NaN
            error_score='raise',
        )

        # hyperopt minimises, so negate the AUC. The 'loss_variance' key holds
        # the standard deviation of the fold scores.
        return {
            'loss': -cross_val_data.mean(),
            'loss_variance': cross_val_data.std(),
            'status': STATUS_OK,
            }
    except Exception as e:
        return {
            'exception': str(e),
            'status': STATUS_FAIL,
            }


# Trial store: collects the sampled values and the objective's result per evaluation.
trials_tpe = Trials()

# Run 120 evaluations of the objective using the TPE suggestion algorithm.
# NOTE(review): some hyperopt releases expect a numpy Generator for `rstate`
# rather than a RandomState - confirm against the installed version.
tpe_search = fmin(
    objective,
    param_grid,
    algo=tpe.suggest,
    max_evals=120,
    trials=trials_tpe,
    rstate=np.random.RandomState(seed),
)

print()
print(f'Best Params: {tpe_search}')
print()

#Plots
# One row per trial: sampled values next to the objective's returned fields,
# ordered from the highest loss to the lowest.
results = pd.concat([
    pd.DataFrame(trials_tpe.vals),
    pd.DataFrame(trials_tpe.results)],
    axis=1,
).sort_values(by='loss', ascending=False).reset_index(drop=True)

# After the sort the index is the trial's rank by loss, not its run order.
results['index'] = results.index

ax = sns.lineplot(x='index', y='loss', data=results)
# Shade one fold-score standard deviation on either side of the loss.
ax.fill_between(
    results["index"],
    y1=results["loss"] - results["loss_variance"],
    y2=results["loss"] + results["loss_variance"],
    alpha=.5,
)
# The x axis is a sorted rank and the search above uses tpe.suggest, so label
# the figure accordingly.
ax.set_xlabel('trial rank (sorted by decreasing loss)')
ax.set_title('TPE Search')

In [None]:
#To pass as dictionary

def create_param_grid(search, booster):
    """Build XGBClassifier keyword arguments from a hyperopt search result.

    Parameters
    ----------
    search : dict
        Mapping from hyperparameter name to the value reported by the search.
    booster : str
        The categorical parameter, passed as the booster name itself; it is
        copied into the result unchanged.

    Returns
    -------
    dict
        Keyword arguments for the classifier, including ``random_state`` taken
        from the module-level ``seed``.
    """
    best_hp_dict = {
        # important int, as it takes integers only
        'n_estimators': int(search['n_estimators']),
        'max_depth': int(search['max_depth']),
        'learning_rate': search['learning_rate'],
        'booster': booster,
        'gamma': search['gamma'],
        'subsample': search['subsample'],
        'colsample_bytree': search['colsample_bytree'],
        'colsample_bylevel': search['colsample_bylevel'],
        'colsample_bynode': search['colsample_bynode'],
        'random_state': seed,
        # NOTE(review): int() truncates the searched float toward zero.
        'reg_lambda': int(search['reg_lambda']),
        'min_child_weight': int(search['min_child_weight']),
        'base_score': search['base_score'],
        # NOTE(review): int() truncates, so values below 1 become 0.
        'max_delta_step': int(search['max_delta_step']),
        # reg_alpha is part of the search space; report it instead of listing
        # reg_lambda a second time.
        'reg_alpha': search['reg_alpha']
    }
    return best_hp_dict

# fmin reports an hp.choice parameter as the index of the selected option, so
# translate that index back into the booster name. Keep this list in the same
# order as the 'booster' options in the search space.
booster_options = ['gbtree', 'dart', 'gblinear']
chosen_param = create_param_grid(tpe_search, booster_options[tpe_search['booster']])
print(chosen_param)

# CatBoost

In [None]:
%%time
# hp: define the hyperparameter space
# fmin: optimization function
# Trials: to evaluate the different searched hyperparameters
from sklearn.model_selection import cross_val_score
from hyperopt import hp, fmin, Trials

# the search algorithms
from hyperopt import rand, anneal, tpe

# for the search
from hyperopt import STATUS_OK, STATUS_FAIL
from catboost import CatBoostClassifier

# Search space for the CatBoost classifier; each hyperopt label matches its key.
param_grid = dict(
    iterations=hp.quniform('iterations', 200, 1000, 25),
    depth=hp.quniform('depth', 1, 7, 1),
    learning_rate=hp.loguniform('learning_rate', np.log(0.001), np.log(1)),
    # categorical: the search result stores the index of the selected option
    boosting_type=hp.choice('boosting_type', ['Ordered', 'Plain']),
    l2_leaf_reg=hp.uniform('l2_leaf_reg', 1.0, 100.0),
    border_count=hp.loguniform('border_count', np.log(5), np.log(200)),
)

#Defining Objective Function

def objective(params):
    """Score one sampled CatBoost configuration with stratified 10-fold CV.

    Parameters
    ----------
    params : dict
        One sample drawn from ``param_grid``, keyed by hyperparameter name.

    Returns
    -------
    dict
        On success: ``loss`` (negative mean ROC AUC over the folds),
        ``loss_variance`` (standard deviation of the fold scores) and
        ``status`` set to ``STATUS_OK``.
        If building or cross-validating the model raises: ``exception``
        (the error message) and ``status`` set to ``STATUS_FAIL``.
    """
    # Map the sampled values onto the estimator's parameters; the sampled
    # floats are cast to int where the entry below requires it.
    params_dict = {
        'iterations': int(params['iterations']),
        'depth': int(params['depth']),
        'learning_rate': params['learning_rate'],
        'boosting_type': params['boosting_type'],
        'l2_leaf_reg': params['l2_leaf_reg'],
        'border_count': int(params['border_count'])
    }

    # Model construction and cross-validation are the steps that can fail, so
    # they must sit inside the try for STATUS_FAIL to ever be reported.
    try:
        cat = CatBoostClassifier(verbose = False, loss_function='CrossEntropy', eval_metric='TotalF1')

        # ** unpacks the dictionary into keyword arguments for set_params
        model = cat.set_params(**params_dict)

        cross_val_data = cross_val_score(
            model,
            X_train_pars,
            y_train,
            scoring='roc_auc',
            cv=StratifiedKFold(n_splits=10, random_state=seed, shuffle=True),
            n_jobs=-1,
            # surface fit errors instead of silently scoring the fold as NaN
            error_score='raise',
        )

        # hyperopt minimises, so negate the AUC. The 'loss_variance' key holds
        # the standard deviation of the fold scores.
        return {
            'loss': -cross_val_data.mean(),
            'loss_variance': cross_val_data.std(),
            'status': STATUS_OK,
            }
    except Exception as e:
        return {
            'exception': str(e),
            'status': STATUS_FAIL,
            }


# Trial store: collects the sampled values and the objective's result per evaluation.
trials_tpe = Trials()

# Run 120 evaluations of the objective using the TPE suggestion algorithm.
# NOTE(review): some hyperopt releases expect a numpy Generator for `rstate`
# rather than a RandomState - confirm against the installed version.
tpe_search = fmin(
    objective,
    param_grid,
    algo=tpe.suggest,
    max_evals=120,
    trials=trials_tpe,
    rstate=np.random.RandomState(seed),
)

print()
print(f'Best Params: {tpe_search}')
print()

#Plots
# One row per trial: sampled values next to the objective's returned fields,
# ordered from the highest loss to the lowest.
results = pd.concat([
    pd.DataFrame(trials_tpe.vals),
    pd.DataFrame(trials_tpe.results)],
    axis=1,
).sort_values(by='loss', ascending=False).reset_index(drop=True)

# After the sort the index is the trial's rank by loss, not its run order.
results['index'] = results.index

ax = sns.lineplot(x='index', y='loss', data=results)
# Shade one fold-score standard deviation on either side of the loss.
ax.fill_between(
    results["index"],
    y1=results["loss"] - results["loss_variance"],
    y2=results["loss"] + results["loss_variance"],
    alpha=.5,
)
# The x axis is a sorted rank and the search above uses tpe.suggest, so label
# the figure accordingly.
ax.set_xlabel('trial rank (sorted by decreasing loss)')
ax.set_title('TPE Search')

In [None]:
def create_param_grid(search, boosting_type):
    """Build CatBoost parameters from a hyperopt search result.

    ``search`` maps hyperparameter names to the values reported by the search;
    ``boosting_type`` is copied into the result unchanged. ``iterations``,
    ``depth`` and ``border_count`` are converted with ``int``.
    """
    return dict(
        iterations=int(search['iterations']),
        depth=int(search['depth']),
        learning_rate=search['learning_rate'],
        boosting_type=boosting_type,
        l2_leaf_reg=search['l2_leaf_reg'],
        border_count=int(search['border_count']),
    )

# fmin reports an hp.choice parameter as the index of the selected option, so
# translate that index back into the boosting type name. Keep this list in the
# same order as the 'boosting_type' options in the search space.
boosting_type_options = ['Ordered', 'Plain']
chosen_param = create_param_grid(
    tpe_search,
    boosting_type_options[tpe_search['boosting_type']],
)
print(chosen_param)

# Bagging Classifier

In [None]:
%%time
# hp: define the hyperparameter space
# fmin: optimization function
# Trials: to evaluate the different searched hyperparameters
from sklearn.model_selection import cross_val_score
from hyperopt import hp, fmin, Trials

# the search algorithms
from hyperopt import rand, anneal, tpe

# for the search
from hyperopt import STATUS_OK, STATUS_FAIL
from sklearn.ensemble import BaggingClassifier

# Search space for the bagging classifier; each hyperopt label matches its key.
param_grid = {
    # hp.quniform yields round(uniform(low, high) / q) * q, so a lower bound
    # smaller than q / 2 can round down to 0 estimators, which the ensemble
    # cannot be built with. Starting the range at q keeps the minimum at 25.
    'n_estimators': hp.quniform('n_estimators', 25, 2500, 25),
    'max_samples': hp.uniform('max_samples', 0.50, 0.999),
    'max_features': hp.uniform('max_features', 0.30, 0.999),
    # hp.choice: the search result stores the index of the selected option
    # (0 -> True, 1 -> False for the lists below).
    'bootstrap': hp.choice('bootstrap', [True, False]),
    'bootstrap_features': hp.choice('bootstrap_features', [True, False]),
    'warm_start': hp.choice('warm_start', [True, False])
}

#Defining Objective Function

def objective(params):
    """Score one sampled BaggingClassifier configuration with stratified 10-fold CV.

    Parameters
    ----------
    params : dict
        One sample drawn from ``param_grid``, keyed by hyperparameter name.

    Returns
    -------
    dict
        On success: ``loss`` (negative mean ROC AUC over the folds),
        ``loss_variance`` (standard deviation of the fold scores) and
        ``status`` set to ``STATUS_OK``.
        If building or cross-validating the model raises: ``exception``
        (the error message) and ``status`` set to ``STATUS_FAIL``.
    """
    # Map the sampled values onto the estimator's keyword arguments.
    params_dict = {
        # quniform samples floats; the estimator needs an integer here
        'n_estimators': int(params['n_estimators']),
        'max_samples': params['max_samples'],
        'max_features': params['max_features'],
        'bootstrap': params['bootstrap'],
        'bootstrap_features': params['bootstrap_features'],
        'warm_start': params['warm_start']
    }

    # Model construction and cross-validation are the steps that can fail, so
    # they must sit inside the try for STATUS_FAIL to ever be reported.
    try:
        # ** unpacks the dictionary into keyword arguments
        model = BaggingClassifier(n_jobs = -1, **params_dict)

        cross_val_data = cross_val_score(
            model,
            X_train_pars,
            y_train,
            scoring='roc_auc',
            cv=StratifiedKFold(n_splits=10, random_state=seed, shuffle=True),
            n_jobs=-1,
            # surface fit errors instead of silently scoring the fold as NaN
            error_score='raise',
        )

        # hyperopt minimises, so negate the AUC. The 'loss_variance' key holds
        # the standard deviation of the fold scores.
        return {
            'loss': -cross_val_data.mean(),
            'loss_variance': cross_val_data.std(),
            'status': STATUS_OK,
            }
    except Exception as e:
        return {
            'exception': str(e),
            'status': STATUS_FAIL,
            }


# Trial store: collects the sampled values and the objective's result per evaluation.
trials_tpe = Trials()

# Run 120 evaluations of the objective using the TPE suggestion algorithm.
# NOTE(review): some hyperopt releases expect a numpy Generator for `rstate`
# rather than a RandomState - confirm against the installed version.
tpe_search = fmin(
    objective,
    param_grid,
    algo=tpe.suggest,
    max_evals=120,
    trials=trials_tpe,
    rstate=np.random.RandomState(seed),
)

print()
print(f'Best Params: {tpe_search}')
print()

#Plots
# One row per trial: sampled values next to the objective's returned fields,
# ordered from the highest loss to the lowest.
results = pd.concat([
    pd.DataFrame(trials_tpe.vals),
    pd.DataFrame(trials_tpe.results)],
    axis=1,
).sort_values(by='loss', ascending=False).reset_index(drop=True)

# After the sort the index is the trial's rank by loss, not its run order.
results['index'] = results.index

ax = sns.lineplot(x='index', y='loss', data=results)
# Shade one fold-score standard deviation on either side of the loss.
ax.fill_between(
    results["index"],
    y1=results["loss"] - results["loss_variance"],
    y2=results["loss"] + results["loss_variance"],
    alpha=.5,
)
# The x axis is a sorted rank and the search above uses tpe.suggest, so label
# the figure accordingly.
ax.set_xlabel('trial rank (sorted by decreasing loss)')
ax.set_title('TPE Search')

In [None]:
def create_param_grid(search, bootstrap, bootstrap_features, warm_start):
    """Build BaggingClassifier parameters from a hyperopt search result.

    ``search`` maps hyperparameter names to the values reported by the search.
    The three flag arguments are copied into the result unchanged;
    ``n_estimators`` is converted with ``int``.
    """
    numeric_part = {
        'n_estimators': int(search['n_estimators']),
        'max_samples': search['max_samples'],
        'max_features': search['max_features'],
    }
    flag_part = {
        'bootstrap': bootstrap,
        'bootstrap_features': bootstrap_features,
        'warm_start': warm_start,
    }
    # Merge in this order so the keys keep their original sequence.
    return {**numeric_part, **flag_part}

# fmin reports an hp.choice parameter as the index of the selected option, so
# translate each index back into the boolean it stands for. Keep this list in
# the same order as the [True, False] options in the search space.
flag_options = [True, False]
chosen_param = create_param_grid(
    tpe_search,
    flag_options[tpe_search['bootstrap']],
    flag_options[tpe_search['bootstrap_features']],
    flag_options[tpe_search['warm_start']],
)
print(chosen_param)