In [16]:
import DataPipeline
from tensorflow import data, nn, summary
from tensorflow import summary
from tensorflow.keras import layers, optimizers, metrics, models, regularizers
from tensorboard.plugins.hparams import api as hp


### Hyperparameter Tuning
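This notebook uses the TensorBoard HParams API to grid-search the model's hyperparameters and find the best-performing combination, scored by the area under the precision-recall curve.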

In [17]:
# Dataset sizing: the sub-sample currently equals the full sample count, and
# 90% of the sub-sample is handed to the pipeline as the training size.
TOTAL_SAMPLES = 5_997_886
SUB_SAMPLE_SIZE = 5_997_886
TRAINING_SIZE = 0.9 * SUB_SAMPLE_SIZE  # float, not rounded

DEVIATION = 0.05

# NOTE(review): PATIENCE and EPOCHS are not referenced by any cell shown in
# this notebook — confirm whether they are still needed.
PATIENCE = 10
EPOCHS = 10000

# NOTE(review): this rebinds `data`, which the import cell also binds to
# tensorflow's `data` module; from here on `data` is the pipeline object.
# The positional 1024 is presumably the pipeline's batch size — TODO confirm
# against DataPipeline's signature.
data = DataPipeline.DataPipeline(
  TOTAL_SAMPLES,
  SUB_SAMPLE_SIZE,
  TRAINING_SIZE,
  1024,
  DEVIATION,
  ragged=False,
)

4753046 df_training examples
264059 validation examples
264058 df_test examples

create train ds
create val ds
create test ds
done


In [20]:
# Search space: every HParam couples the name shown in TensorBoard with the
# domain of values the grid search draws from.
HP_BATCH_SIZE = hp.HParam(
  'batch_size',
  # Powers of two from 2**8 (256) through 2**16 (65536).
  hp.Discrete([2 ** exponent for exponent in range(8, 17)]),
)
HP_DROPOUT = hp.HParam('dropout', hp.RealInterval(0.1, 0.5))
HP_LREG = hp.HParam('l2_reg', hp.Discrete([1e-5, 1e-4, 1e-2]))
HP_NUM_UNITS_L1 = hp.HParam(
  'num_neurons1',
  hp.Discrete([16, 32, 64, 128, 256]),
)
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['adam', 'sgd']))

# Tag under which each trial's score is logged (area under the
# precision-recall curve).
METRIC_PRC = 'prc'

# Write the experiment-level summary declaring the hyperparameters and the
# metric for the runs stored below logs/hparam_tuning.
with summary.create_file_writer('logs/hparam_tuning').as_default():
  hp.hparams_config(
    hparams=[
      HP_NUM_UNITS_L1,
      HP_LREG,
      HP_BATCH_SIZE,
      HP_DROPOUT,
      HP_OPTIMIZER,
    ],
    metrics=[hp.Metric(METRIC_PRC, display_name='PRC')],
  )

In [21]:
def train_test_model(hparams, num_units):
  """Build, train and evaluate one binary classifier for a single trial.

  The network is Dense -> Dropout -> Dense -> Dropout -> Dense(1, sigmoid),
  compiled with binary cross-entropy and a PR-curve AUC metric named 'prc'.

  Args:
    hparams: dict keyed by the HP_* HParam objects (HP_NUM_UNITS_L1, HP_LREG,
      HP_BATCH_SIZE, HP_DROPOUT, HP_OPTIMIZER) holding this trial's values.
    num_units: integer from which the second hidden layer's width is derived
      (half of it); the grid-search cell passes the same value it stores
      under HP_NUM_UNITS_L1.

  Returns:
    The 'prc' metric value (area under the precision-recall curve) returned
    by `model.evaluate`.
  """
  # Look the trial values up in `hparams`; the HP_* objects themselves are
  # only descriptors of the search space, not numbers.
  l2_strength = hparams[HP_LREG]
  dropout_rate = hparams[HP_DROPOUT]

  model = models.Sequential([
    layers.Dense(
      hparams[HP_NUM_UNITS_L1],
      activation=nn.relu,
      kernel_regularizer=regularizers.l2(l2_strength)),
    layers.Dropout(dropout_rate),
    # Integer division keeps the layer width an int (half of num_units).
    layers.Dense(
      num_units // 2,
      activation=nn.relu,
      kernel_regularizer=regularizers.l2(l2_strength)),
    layers.Dropout(dropout_rate),
    layers.Dense(1, activation=nn.sigmoid),
  ])
  model.compile(
      optimizer=hparams[HP_OPTIMIZER],
      loss='binary_crossentropy',
      metrics=[metrics.AUC(name='prc', curve='PR')],
  )

  # NOTE(review): Keras documents that `batch_size` must not be specified
  # when the input already yields batches (e.g. a tf.data.Dataset). If
  # `data.train_ds` is such a dataset (not visible here), re-batch it to the
  # trial's batch size instead of passing `batch_size` — TODO confirm.
  model.fit(
    data.train_ds,
    epochs=50,  # lower this for a quick demo run
    batch_size=hparams[HP_BATCH_SIZE])

  print(model.metrics_names)
  # NOTE(review): trials are scored on the test split; selecting
  # hyperparameters on it biases the final test estimate — consider scoring
  # on the validation split instead.
  _, prc = model.evaluate(data.test, data.test_labels)

  return prc

def run(run_dir, hparams, num_units):
  """Run one trial and log its hyperparameters and PRC score to `run_dir`.

  Args:
    run_dir: directory the summary file writer for this trial writes to.
    hparams: dict of HParam -> value for this trial.
    num_units: forwarded unchanged to `train_test_model`.
  """
  trial_writer = summary.create_file_writer(run_dir)
  with trial_writer.as_default():
    # Record this trial's hyperparameter values before training starts.
    hp.hparams(hparams)
    score = train_test_model(hparams, num_units)
    summary.scalar(METRIC_PRC, score, step=1)

In [None]:
# Exhaustive grid search: one trial per combination of every discrete value,
# with dropout tried only at the two endpoints of its interval.
dropout_endpoints = (HP_DROPOUT.domain.min_value, HP_DROPOUT.domain.max_value)
trial_grid = [
  (num_units, batch_size, lreg, dropout_rate, optimizer)
  for num_units in HP_NUM_UNITS_L1.domain.values
  for batch_size in HP_BATCH_SIZE.domain.values
  for lreg in HP_LREG.domain.values
  for dropout_rate in dropout_endpoints
  for optimizer in HP_OPTIMIZER.domain.values
]

session_num = 0
for num_units, batch_size, lreg, dropout_rate, optimizer in trial_grid:
  hparams = dict(zip(
    (HP_NUM_UNITS_L1, HP_LREG, HP_BATCH_SIZE, HP_DROPOUT, HP_OPTIMIZER),
    (num_units, lreg, batch_size, dropout_rate, optimizer),
  ))
  run_name = "run-%d" % session_num
  print('--- Starting trial: %s' % run_name)
  print({param.name: value for param, value in hparams.items()})
  # Each trial logs into its own sub-directory of the tuning log dir.
  run('logs/hparam_tuning/' + run_name, hparams, num_units)
  session_num += 1

In [23]:
# Count how many rows carry each value of the 'NEO' column.
neo_label_counts = data.df['NEO'].value_counts()
neo_label_counts

0.0    5693697
1.0     304189
Name: NEO, dtype: int64

In [None]:
%tensorboard --logdir logs/hparam_tuning