### Overview

Regular training local with hyper parameter tuning

In [5]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import os
for dirname, _, filenames in os.walk('datasets/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

datasets/Wholesale customers data.csv


In [9]:
df = pd.read_csv(f'datasets/{filename}')
df

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185
...,...,...,...,...,...,...,...,...
435,1,3,29703,12051,16027,13135,182,2204
436,1,3,39228,1431,764,4510,93,2346
437,2,3,14531,15488,30243,437,14841,1867
438,1,3,10290,1981,2232,1038,168,2125


In [13]:
# Drops

X = df.drop('Channel', axis=1)
y = df['Channel']

In [14]:
# Convert labels into binary values

y[y == 2] = 0
y[y == 1] = 1

In [15]:
# Split data for training

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [55]:
space={
    'max_depth': hp.quniform("max_depth", 3, 18, 1),
    'gamma': hp.uniform ('gamma', 1,9),
    'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
    'reg_lambda' : hp.uniform('reg_lambda', 0,1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
    'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
    'n_estimators': 180,
    'seed': 0
    }

In [61]:
def objective(space):
    clf=xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], 
                    max_depth = int(space['max_depth']), 
                    gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),
                    min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]

    xgb_params = {}
    xgb_params['eval_metric']='auc'
    xgb_params['early_stopping_rounds']=10 

    clf.set_params(**xgb_params)
    
    clf.fit(
        X_train, 
        y_train,
        eval_set=evaluation, 
        verbose=False)
    

    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [57]:

xgb_params = {}
xgb_params['eval_metric']='auc'
xgb_params['early_stopping_rounds']=10 
xgb_params

{'eval_metric': 'auc', 'early_stopping_rounds': 10}

In [58]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

SCORE:                                                 
0.3484848484848485                                     
SCORE:                                                                            
0.3484848484848485                                                                
SCORE:                                                                            
0.3484848484848485                                                                
SCORE:                                                                            
0.8712121212121212                                                                
SCORE:                                                                            
0.8863636363636364                                                                
SCORE:                                                                            
0.3484848484848485                                                                
SCORE:                                                    

In [59]:
print(best_hyperparams)

{'colsample_bytree': 0.8015834851378087, 'gamma': 4.160392182471469, 'max_depth': 9.0, 'min_child_weight': 7.0, 'reg_alpha': 74.0, 'reg_lambda': 0.8708226877745266}


In [64]:
colsample_bytree=best_hyperparams["colsample_bytree"]
max_depth=best_hyperparams["max_depth"]
gamma=best_hyperparams["gamma"]
reg_alpha=best_hyperparams["reg_alpha"]
min_child_weight=best_hyperparams["min_child_weight"]
colsample_bytree=best_hyperparams["colsample_bytree"]

clf=xgb.XGBClassifier(
    n_estimators =180, 
    max_depth = int(max_depth), 
    gamma = gamma,
    reg_alpha = reg_alpha,
    min_child_weight=min_child_weight,
    colsample_bytree=colsample_bytree)

evaluation = [( X_train, y_train), ( X_test, y_test)]

clf.fit(
    X_train, 
    y_train,
    eval_set=evaluation, 
    verbose=False)

pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, pred>0.5)
print ("SCORE:", accuracy)

SCORE: 0.8939393939393939


In [66]:
clf=xgb.XGBClassifier()

evaluation = [( X_train, y_train), ( X_test, y_test)]

clf.fit(
    X_train, 
    y_train,
    eval_set=evaluation, 
    verbose=False)

pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, pred>0.5)
print ("SCORE:", accuracy)

SCORE: 0.8939393939393939


In [68]:
df.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,0,3,12669,9656,7561,214,2674,1338
1,0,3,7057,9810,9568,1762,3293,1776
2,0,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,0,3,22615,5410,7198,3915,1777,5185


In [88]:
# Prediction Single Row

x=pd.DataFrame(X_test.iloc[1,:]).T
pred=clf.predict(x)
pred

array([0])

## Using Vertex Vizier

In [92]:
import datetime

PROJECT_ID = "jchavezar-demo"
REGION = "us-central1"
STUDY_DISPLAY_NAME = "{}_study_{}".format(
    PROJECT_ID.replace("-", ""), datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
ENDPOINT = REGION + "-aiplatform.googleapis.com"
PARENT = "projects/{}/locations/{}".format(PROJECT_ID, REGION)

In [93]:
print("ENDPOINT: {}".format(ENDPOINT))
print("REGION: {}".format(REGION))
print("PARENT: {}".format(PARENT))

ENDPOINT: us-central1-aiplatform.googleapis.com
REGION: us-central1
PARENT: projects/jchavezar-demo/locations/us-central1


In [113]:
# Parameter Configuration

max_depth = {"parameter_id": "max_depth", "integer_value_spec": {"min_value": 3, "max_value": 18}}
gamma = {"parameter_id": "gamma", "integer_value_spec": {"min_value": 1, "max_value": 9}}
reg_alpha = {"parameter_id": "reg_alpha", "integer_value_spec": {"min_value": 1, "max_value": 9}}
reg_lambda = {"parameter_id": "reg_lambda", "integer_value_spec": {"min_value": 0, "max_value": 1}}
colsample_bytree = {"parameter_id": "colsample_bytree", "double_value_spec": {"min_value": 0.5, "max_value": 1.0}}
min_child_weight = {"parameter_id": "min_child_weight", "integer_value_spec": {"min_value": 0, "max_value": 10.0}}

metrics = {"metric_id": "accuracy", "goal": "MAXIMIZE"}

study = {
    "display_name": STUDY_DISPLAY_NAME,
    "study_spec": {
        "algorithm": "RANDOM_SEARCH",
        "parameters": [
            max_depth,
            gamma,
            reg_alpha,
            reg_lambda,
            colsample_bytree,
            min_child_weight
        ],
        "metrics": [metrics],
    },
}

In [114]:
import json

print(json.dumps(study, indent=2, sort_keys=True))

{
  "display_name": "jchavezardemo_study_20220803_095219",
  "study_spec": {
    "algorithm": "RANDOM_SEARCH",
    "metrics": [
      {
        "goal": "MAXIMIZE",
        "metric_id": "accuracy"
      }
    ],
    "parameters": [
      {
        "integer_value_spec": {
          "max_value": 18,
          "min_value": 3
        },
        "parameter_id": "max_depth"
      },
      {
        "integer_value_spec": {
          "max_value": 9,
          "min_value": 1
        },
        "parameter_id": "gamma"
      },
      {
        "integer_value_spec": {
          "max_value": 9,
          "min_value": 1
        },
        "parameter_id": "reg_alpha"
      },
      {
        "integer_value_spec": {
          "max_value": 1,
          "min_value": 0
        },
        "parameter_id": "reg_lambda"
      },
      {
        "double_value_spec": {
          "max_value": 1.0,
          "min_value": 0.5
        },
        "parameter_id": "colsample_bytree"
      },
      {
        "integer_v

In [115]:
from google.cloud import aiplatform

vizier_client = aiplatform.gapic.VizierServiceClient(
    client_options=dict(api_endpoint=ENDPOINT)
)
study = vizier_client.create_study(parent=PARENT, study=study)
STUDY_ID = study.name
print("STUDY_ID: {}".format(STUDY_ID))

STUDY_ID: projects/569083142710/locations/us-central1/studies/1176465079566


In [120]:
def objective(trial_id, max_depth, gamma, reg_alpha, reg_lambda, colsample_bytree, min_child_weight):
    print(("=========== Start Trial: [{}] =============").format(trial_id))
    clf=xgb.XGBClassifier(
                    n_estimators =180, 
                    max_depth = max_depth, 
                    gamma = gamma,
                    reg_alpha = reg_alpha,
                    min_child_weight= min_child_weight,
                    colsample_bytree= colsample_bytree)
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]

    xgb_params = {}
    xgb_params['eval_metric']='auc'
    xgb_params['early_stopping_rounds']=10 

    clf.set_params(**xgb_params)
    
    clf.fit(
        X_train, 
        y_train,
        eval_set=evaluation, 
        verbose=False)
    

    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)
    metric = {"metric_id": "accuracy", "value": accuracy}
    print ("SCORE:", accuracy)
    return [metric]

In [121]:
client_id = "client1"  # @param {type: 'string'}
suggestion_count_per_request = 5  # @param {type: 'integer'}
max_trial_id_to_stop = 4  # @param {type: 'integer'}

print("client_id: {}".format(client_id))
print("suggestion_count_per_request: {}".format(suggestion_count_per_request))
print("max_trial_id_to_stop: {}".format(max_trial_id_to_stop))

client_id: client1
suggestion_count_per_request: 5
max_trial_id_to_stop: 4


In [122]:
trial_id = 0
while int(trial_id) < max_trial_id_to_stop:
    suggest_response = vizier_client.suggest_trials(
        {
            "parent": STUDY_ID,
            "suggestion_count": suggestion_count_per_request,
            "client_id": client_id,
        }
    )

    for suggested_trial in suggest_response.result().trials:
        trial_id = suggested_trial.name.split("/")[-1]
        trial = vizier_client.get_trial({"name": suggested_trial.name})

        if trial.state in ["COMPLETED", "INFEASIBLE"]:
            continue

        for param in trial.parameters:
            if param.parameter_id == "max_depth":
                max_depth = int(param.value)
            elif param.parameter_id == "gamma":
                gamma = int(param.value)
            elif param.parameter_id == "reg_alpha":
                reg_alpha = int(param.value)
            elif param.parameter_id == "reg_lambda":
                reg_lambda = int(param.value)
            elif param.parameter_id == "colsample_bytree":
                colsample_bytree = param.value
            elif param.parameter_id == "min_child_weight":
                min_child_weight = int(param.value)        
        print("""
        Trial : max_depth is {}, 
        gamma is {}, 
        reg_alpha is {}, 
        reg_lambda is {}, 
        colsample_bytree is {},
        min_child_weight is {}
        """.format(max_depth, gamma, reg_alpha, reg_lambda, colsample_bytree, min_child_weight))

        vizier_client.add_trial_measurement(
            {
                "trial_name": suggested_trial.name,
                "measurement": {
                    "metrics": objective(suggested_trial.name, max_depth, gamma, reg_alpha, reg_lambda, colsample_bytree, min_child_weight)
                },
            }
        )

        response = vizier_client.complete_trial(
            {"name": suggested_trial.name, "trial_infeasible": False}
        )


        Trial : max_depth is 11, 
        gamma is 8, 
        reg_alpha is 8, 
        reg_lambda is 0, 
        colsample_bytree is 0.5585021481239816,
        min_child_weight is 10
        
SCORE: 0.8712121212121212

        Trial : max_depth is 7, 
        gamma is 7, 
        reg_alpha is 2, 
        reg_lambda is 1, 
        colsample_bytree is 0.7158379855927917,
        min_child_weight is 4
        
SCORE: 0.8939393939393939

        Trial : max_depth is 16, 
        gamma is 5, 
        reg_alpha is 9, 
        reg_lambda is 0, 
        colsample_bytree is 0.938904473734993,
        min_child_weight is 0
        
SCORE: 0.9015151515151515

        Trial : max_depth is 11, 
        gamma is 6, 
        reg_alpha is 5, 
        reg_lambda is 1, 
        colsample_bytree is 0.5426125461837765,
        min_child_weight is 3
        
SCORE: 0.8712121212121212

        Trial : max_depth is 7, 
        gamma is 6, 
        reg_alpha is 4, 
        reg_lambda is 1, 
        colsamp

In [112]:
# List and Clean

optimal_trials = vizier_client.list_optimal_trials({"parent": STUDY_ID})

print("optimal_trials: {}".format(optimal_trials))

vizier_client.delete_study({"name": STUDY_ID})

optimal_trials: 
