<a href="https://colab.research.google.com/github/harnalashok/h2o/blob/master/h2o_wine_bayesOptimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Last amended: 27th Dec, 2020
My folder: C:/Users/Administrator/OneDrive/Documents/wine
Data Source: https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009

Objectives:
        i)  Experiments in neural network and Deeplearning
        ii) Quick Bayes optimization
        iii)Using Optuna library


Ref:
Machine Learning with python and H2O
   https://www.h2o.ai/wp-content/uploads/2018/01/Python-BOOKLET.pdf
H2o deeplearning (latest) booklet
   http://docs.h2o.ai/h2o/latest-stable/h2o-docs/booklets/DeepLearningBooklet.pdf

"""

In [None]:
# 1.0 Install java run-time
! apt-get install default-jre
!java -version

In [None]:
# https://medium.com/@naeemasvat.na/how-to-use-h2o-in-google-colab-b69ba539ab1a
# 2.0 Install h2o
! pip install h2o

In [None]:
# 3.0 Mount Google Drive at '/content/drive'
#      so that data files stored on the Drive
#      can be read from this notebook
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# 4.0 Install optuna
! pip install optuna

In [None]:
# 5.0 Call libraries
import pandas as pd
import h2o
import os
import optuna
# 5.1
from h2o.estimators.deeplearning import H2ODeepLearningEstimator


In [None]:
# 5.2 Display the output of every top-level expression in a cell,
#      not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# 5.3 Start h2o, requesting a maximum memory size of 2G
h2o.init(max_mem_size = "2G")

In [None]:
# 6. Change working folder (local, non-Colab run) and read the wine-quality data
# # os.chdir("C:\Users\Administrator\OneDrive\Documents\wine")

In [None]:
# 6.1 Read the red-wine quality CSV from the mounted
#      Google Drive into h2o (colab code)
data =h2o.import_file("/content/drive/MyDrive/Colab_data_files/winequality-red.csv")

In [None]:
# 6.2 Dimensions of the frame and its first five rows
data.shape
data.head(5)     # data.head().as_data_frame() gives a local data frame instead

In [None]:
# 6.3 Inspect the distinct values of the target column.
#      The conversion to a factor column is left commented out:
#      the models below are built with distribution="gaussian",
#      i.e. 'quality' is treated as a numeric target.
data['quality'].unique() 
#data['quality'] = data['quality'].asfactor()


In [None]:
# 6.4 Which are predictors and which one is target column
#      Name the target first, then take every other column as a
#      predictor. This does not depend on the target being the
#      last column, and it reads the column names directly
#      instead of slicing the frame just to get them.
y = 'quality'
train_cols = [col for col in data.columns if col != y]
train_cols

In [None]:
# 7.0 Split the dataset into train/test with a 0.7 ratio.
#      No seed is passed to split_frame, so the shapes noted
#      below are from one particular run and may differ.

train,test = data.split_frame(ratios= [0.7])
train.shape   # e.g. (1108,12)
test.shape    # e.g. (491, 12)

In [None]:
# 8.0 Define an objective function
def objective(trial):
    """Train one H2O deep-learning model and return its validation RMSE.

    Called by the optuna study once per trial. Hyperparameters are
    sampled from ``trial``; the model is trained on the global ``train``
    frame (predictors ``train_cols``, target ``y``) and scored on the
    global ``test`` frame, which is passed as the validation frame.

    Args:
        trial: optuna trial object used to sample the hyperparameters.

    Returns:
        RMSE of the trained model on the validation frame.
    """
    # 8.1 Parameters
    # Dropout applied to the input layer
    input_dropout_ratio = trial.suggest_float('input_dropout_ratio', 0.1, 0.3, log=False)

    # L1/L2 regularization strengths, sampled on a log scale
    l1 = trial.suggest_float('l1', 1e-6, 1e-3, log=True)
    l2 = trial.suggest_float('l2', 1e-6, 1e-3, log=True)

    # Hidden dropout ratio for each of the three hidden layers
    h_ratio1 = trial.suggest_float('h_ratio1', 0.3, 0.8, log=False)
    h_ratio2 = trial.suggest_float('h_ratio2', 0.3, 0.8, log=False)
    h_ratio3 = trial.suggest_float('h_ratio3', 0.3, 0.8, log=False)

    # Number of neurons in each of the three hidden layers
    nn_Ist_layer = trial.suggest_int('nn_Ist_layer', 32, 200)
    nn_IInd_layer = trial.suggest_int('nn_IInd_layer', 32, 200)
    nn_IIIrd_layer = trial.suggest_int('nn_IIIrd_layer', 32, 200)

    # Only the '...WithDropout' activations are offered, since hidden
    # dropout ratios are always supplied below
    activation = trial.suggest_categorical('activation', ['TanhWithDropout', 'RectifierWithDropout'])

    # 8.2 Model instantiation
    dl = H2ODeepLearningEstimator(
        distribution="gaussian",
        activation=activation,
        l1=l1,
        l2=l2,
        input_dropout_ratio=input_dropout_ratio,
        hidden=[nn_Ist_layer, nn_IInd_layer, nn_IIIrd_layer],
        hidden_dropout_ratios=[h_ratio1, h_ratio2, h_ratio3],
        # Author's observation: even though epochs are 100, iterations
        # stop very early; the progress bar suddenly jumps to 100
        epochs=100,
        score_each_iteration=True,
    )

    # 8.3 Model training
    dl.train(
        x=train_cols,            # Predictor columns
        y=y,                     # Target
        training_frame=train,    # Training data
        validation_frame=test,   # Data the trial is scored on
    )

    # 8.4 Score the trial on the validation frame. Calling rmse() with
    #     no arguments reports the training metric, which would make
    #     the study select hyperparameters by training fit alone.
    return dl.rmse(valid=True)


In [None]:
# 8.5 Instantiate a study object; direction is 'minimize'
#      because objective() returns an RMSE
study = optuna.create_study(direction='minimize')

In [None]:
# 8.6 Begin optimization process
study.optimize(
                objective,      # Given past results, this function
                                #  is called by 'study' with a 'trial' object.
                                #   The 'trial' object supplies 'objective' with
                                #    the parameter values to try next
                n_trials=100    # Number of trials to run
               )



In [None]:
# opt 8.7 Even though 'objective()' returns only
#          a performance score to the 'study'
#          object, 'study' keeps full information
#          about the parameters and values set
#          by the 'trial' object in each trial.
#          See StackOverflow question: https://stackoverflow.com/q/65057819/3282777
study.get_trials()

In [None]:
# opt 8.8 Same as above but in
#        dataframe format (first five rows)
study.trials_dataframe().head(5)

In [None]:
# 9.0 So which is the best parameter combination?
trial = study.best_trial
# 9.1 Objective value of the best trial, followed by its parameters
print('RMSE: {}'.format(trial.value))
trial.params

In [None]:
# 10.0 Build a model with these parameters
#       ('trial' is the best trial selected in step 9.0).
#       The validation frame is given once, to train(), in the
#       same way as inside objective().
dl = H2ODeepLearningEstimator(
    distribution="gaussian",
    activation=trial.params['activation'],
    l1=trial.params['l1'],
    l2=trial.params['l2'],
    input_dropout_ratio=trial.params['input_dropout_ratio'],
    hidden=[
        trial.params['nn_Ist_layer'],
        trial.params['nn_IInd_layer'],
        trial.params['nn_IIIrd_layer'],
    ],
    hidden_dropout_ratios=[
        trial.params['h_ratio1'],
        trial.params['h_ratio2'],
        trial.params['h_ratio3'],
    ],
    # Author's observation: even though epochs are 100, iterations
    # stop very early; the progress bar suddenly jumps to 100
    epochs=100,
    score_each_iteration=True,
)
dl.train(
    x=train_cols,            # Predictor columns
    y=y,                     # Target
    training_frame=train,    # Training data
    validation_frame=test,   # Validation data
)
    

In [None]:
# 11.0
# Score the test frame with the trained model and
# convert the predictions to a local data frame
out = dl.predict(test).as_data_frame()

In [None]:
# opt 11.1 First rows of the model's scoring history
dl.scoring_history().head()

In [None]:
# opt11.2 Plot the scoring-history
#        Author's note: model is quite generalized
import matplotlib.pyplot as plt
table_bayes = dl.scoring_history()
# Create the figure and its axes once and hand the axes to pandas.
# Calling plt.figure() and then DataFrame.plot() without ax= leaves an
# extra, empty figure behind because pandas opens a figure of its own.
fig, ax = plt.subplots()
_ = table_bayes[['training_rmse','validation_rmse']].plot(ax=ax)

In [None]:
#################