# Sherpa Example For Model Training

In [None]:
import os
import pickle as pkl
import sherpa
import sherpa.schedulers
import sherpa.algorithms


# Directory where files will be saved.
OVERWRITE  = True  # When True, an existing output directory is deleted before the run.
output_dir = '' # All files (logs, results, etc.) written to here.
if os.path.isdir(output_dir):
    if not OVERWRITE:
        # Refuse to touch an existing directory unless overwriting was requested.
        print('Warning: Directory exists. Skipping {}'.format(output_dir))
        raise FileExistsError(
            'Output directory {!r} already exists and OVERWRITE is False'.format(output_dir))
    print('Warning: Overwriting directory {}'.format(output_dir))
    import shutil  # Only needed on the overwrite path.
    shutil.rmtree(output_dir)
else:
    print('Writing to directory {}'.format(output_dir))

# # Hyperparameter dict
# # Simple version for quickly checking a few hp configurations
# hp_space = {
#     'batch_size':      [128, 512],
#     'act':             ['relu', 'tanh'],
#     'opt':             ['sgd', 'adam'],
#     'h_units':         [64, 256],
#     'epochs':          [50],
#     'early_stopping':  [3], # patience
#     'checkpoint_path': [output_dir],
# }
# parameters = sherpa.Parameter.grid(hp_space)

# Search space written as explicit Sherpa parameter objects; this is more
# flexible (and generally preferable) than the grid dict shown above.
# Parameter reference: https://parameter-sherpa.readthedocs.io/en/latest/gettingstarted/guide.html#parameters
parameters = [
    sherpa.Ordinal(name='size', range=[1]),
    sherpa.Ordinal(name='cwave', range=[True]),
]

# env is the .profile file that executes all the set-up each job requires.
env = ''  # insert the path to your own profile file
# SLURM submit options, passed to the scheduler as a single string.
opt = ''

# Search algorithm.
# For other available algorithms: https://parameter-sherpa.readthedocs.io/en/latest/algorithms/algorithms.html
algorithm = sherpa.algorithms.GridSearch(repeat=6)

# For details on SLURM and the HPC cluster: https://www.hawaii.edu/its/ci/hpc-tutor/
scheduler = sherpa.schedulers.SLURMScheduler(
    environment=env,
    submit_options=opt,
    output_dir=output_dir,
)

# Runner file
filename = 'combinedNoVal.py' # Its main method takes in the trial parameters and sends results to MongoDB.

# Run the search: trials execute `filename` through `scheduler`, and the
# objective is minimized (lower_is_better=True).
results = sherpa.optimize(parameters=parameters,
                          algorithm=algorithm,
                          lower_is_better=True,
                          filename=filename,
                          output_dir=output_dir,
                          scheduler=scheduler,
                          max_concurrent=22,
                          verbose=1,
                          db_port=8887,
                          mongodb_args={'bind_ip_all':''}
                         )
print(results)

# Persist the results next to the other run outputs. os.path.join avoids
# resolving to the filesystem root ('/results.pkl') when output_dir is empty,
# and the context manager guarantees the file is flushed and closed.
with open(os.path.join(output_dir, 'results.pkl'), 'wb') as results_file:
    pkl.dump(results, results_file)

# Example of loading and predicting using ensemble

In [None]:
import h5py
from keras.models import load_model
import numpy as np
from generator import gen_data

# HDF5 file storing data
data_path = ''
# Location of the saved ensemble members (ensemble_1.h5, ensemble_2.h5, ...).
# The file name is appended by plain string concatenation, so a non-empty
# value must end with a path separator.
model_dir = ''

N_MODELS = 6       # Number of ensemble members whose predictions are averaged.
BATCH_SIZE = 1024  # Batch size handed to gen_data; also used to derive the step count.

dataset = h5py.File(data_path, 'r')  # Left open: later cells read from it again.
X = dataset['X_test']
y_true = np.array(dataset['y_test'])
n_samples = y_true.shape[0]
y_pred = np.zeros((n_samples, 1))

# One step more than the number of full batches, so a trailing partial batch is
# covered; any surplus predictions are trimmed by the slice below.
# NOTE(review): presumably gen_data keeps yielding past the end of the data — confirm.
steps = X.shape[0] // BATCH_SIZE + 1
for i in range(N_MODELS):
    model = load_model(model_dir + 'ensemble_{}.h5'.format(i + 1))
    temp = model.predict_generator(gen_data(data_path, 'test', BATCH_SIZE), steps=steps, verbose=1)
    y_pred += temp[:n_samples]
y_pred /= N_MODELS  # Ensemble prediction = mean over the members.
# Flatten to 1-D whatever the test-set size (previously hard-coded to 19980).
y_pred = y_pred.reshape(-1)

# Saving Predictions

In [None]:
import pandas as pd
# Rebind X to the "X2_test" dataset; columns 0 and 7 of it are exported below
# under the names "Time of Day" and "Incidence Angle".
X = dataset["X2_test"]
prediction_columns = {
    "Labels": y_true,
    "Predictions": y_pred,
    "Time of Day": X[:, 0],
    "Incidence Angle": X[:, 7],
}
data = pd.DataFrame(prediction_columns)

In [None]:
# Write the prediction table to disk, including the header row.
predictions_csv = "predictions_buoy.csv"
data.to_csv(predictions_csv, header=True)

In [None]:
import pandas as pd
# Load the table stored in predictions_features_labels.csv; this rebinds
# `data`, replacing any DataFrame built in an earlier cell.
features_csv = "predictions_features_labels.csv"
data = pd.read_csv(features_csv)

In [None]:
import numpy as np
# Attach the array stored in times.npy as a new "timeSAR" column.
sar_times = np.load("times.npy")
data["timeSAR"] = sar_times

In [None]:
# Save the table, now including the timeSAR column, to a new CSV file.
# NOTE(review): to_csv also writes the DataFrame index by default — confirm that is wanted.
timed_csv = "predictions_with_times.csv"
data.to_csv(timed_csv)

# Visualization Of Errors 

In [None]:
import matplotlib.pyplot as plt


def _rmse(errors):
    """Return the root-mean-square of an array of residuals."""
    return np.sqrt(np.mean(errors ** 2))


# Overall test-set error.
residuals = y_true.reshape(-1) - y_pred.reshape(-1)
print(_rmse(residuals))

plt.scatter(y_true, residuals)
plt.title('Residuals of Test Set vs. True Label')
plt.ylabel('Residuals (True - Prediction)')
plt.xlabel('True Label')
plt.show()
plt.scatter(y_true, y_pred)
plt.title('Prediction vs. True Label')
plt.ylabel('Prediction')
plt.xlabel('True Label')
plt.show()

# RMSE and sample count per label range. The ranges are half-open
# ([low, high)) so that labels exactly equal to 1, 3 or 8 fall into a bin
# instead of being dropped from every bin. An empty bin prints nan.

# Labels >= 8
hs_large = np.where(y_true >= 8)
large_true = y_true[hs_large]
large_pred = y_pred[hs_large]
res = large_true.reshape(-1) - large_pred.reshape(-1)
print(_rmse(res), len(large_true))

# Labels < 1
hs_small = np.where(y_true < 1)
small_true = y_true[hs_small]
small_pred = y_pred[hs_small]
small_res = small_true.reshape(-1) - small_pred.reshape(-1)
print(_rmse(small_res), len(small_true))

# 1 <= labels < 3
hs_meds = np.where(np.logical_and(y_true >= 1, y_true < 3))
meds_true = y_true[hs_meds]
meds_pred = y_pred[hs_meds]
res = meds_true.reshape(-1) - meds_pred.reshape(-1)
print(_rmse(res), len(meds_true))

# 3 <= labels < 8
hs_medl = np.where(np.logical_and(y_true >= 3, y_true < 8))
medl_true = y_true[hs_medl]
medl_pred = y_pred[hs_medl]
res = medl_true.reshape(-1) - medl_pred.reshape(-1)
print(_rmse(res), len(medl_true))

In [None]:
# Draw two scatter plots against the true labels, one figure each:
# first the residuals, then the raw predictions.
for y_values, plot_title, y_label in (
    (residuals, 'Residuals of Test Set vs. True Label', 'Residuals (True - Prediction)'),
    (y_pred, 'Prediction vs. True Label', 'Prediction'),
):
    plt.scatter(y_true, y_values)
    plt.title(plot_title)
    plt.ylabel(y_label)
    plt.xlabel('True Label')
    plt.show()