In [13]:
import tensorflow as tf
from tensorflow import keras
from tensorboard.plugins.hparams import api as hp
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
import json
import datetime

plt.rcParams["figure.figsize"] = (20, 5)

physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [38]:
log_dir = "logs/fit/"
version_dir = "version/"
static_dir = "static/"

os.makedirs(log_dir)
os.makedirs(version_dir)
os.makedirs(static_dir)

In [15]:
dataset_name = "NU"

In [16]:
dataset = pd.read_csv("data/{}_train_set.csv".format(dataset_name))
test_dataset = pd.read_csv("data/{}_test_set.csv".format(dataset_name))
dataset.head()

Unnamed: 0,t,t+1,delta,tokenized_data
0,104289271808,104282980352,-6291456,-6291456
1,104282980352,104282984448,4096,4096
2,104282984448,104282988544,4096,4096
3,104282988544,104282992640,4096,4096
4,104282992640,104282996736,4096,4096


In [51]:
# Static Parameters 
static_params = dict()
static_params["PAST_HISTORY"] = 16
static_params["FUTURE_TARGET"] = 8
static_params["BUFFER_SIZE"] = 200000
static_params["ACTIVATION"] = 'softmax'
static_params["LOSS_FUNCTION"] = 'categorical_crossentropy'
static_params["VAL_SPLIT"] = 0.2
static_params["METRIC_ACCURACY"] = 'accuracy'
static_params["OPTIMIZER"] = 'adam'

# Hyper Parameters
HP_BATCH_SIZE = hp.HParam('batch_size', hp.Discrete([256, 512, 1024]))
HP_EPOCHS = hp.HParam('epochs', hp.Discrete([250, 500, 750, 1000]))
HP_LAYER_1_UNITS = hp.HParam('layer_1_units', hp.Discrete([16, 32, 64, 128]))
HP_LAYER_2_UNITS = hp.HParam('layer_2_units', hp.Discrete([16, 32, 64, 128]))
HP_LAYER_1_DROPOUT = hp.HParam('layer_1_dropout', hp.RealInterval(0.1, 0.3))
HP_LAYER_2_DROPOUT = hp.HParam('layer_2_dropout', hp.RealInterval(0.1, 0.3))

with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
  hp.hparams_config(
    hparams=[HP_BATCH_SIZE, HP_EPOCHS, HP_LAYER_1_UNITS, HP_LAYER_2_UNITS, HP_LAYER_1_DROPOUT, HP_LAYER_2_DROPOUT],
    metrics=[hp.Metric(static_params["METRIC_ACCURACY"], display_name='Accuracy')],
  )

with open("static/static_params.json", "w") as j :
  json.dump(static_params, j, indent=4)

In [41]:
def generate_timeseries(dataset, start_index, end_index, history_size, target_size, n_features):
    data = []
    labels = []

    start_index = start_index + history_size
    if end_index is None:
        end_index = len(dataset) - target_size

    for i in range(start_index, end_index):
        indices = range(i-history_size, i)
        # Reshape data from (history_size,) to (history_size, n_feature)
        data.append(np.reshape(dataset[indices], (history_size, n_features)))
        labels.append(dataset[i:i+target_size])
    return np.array(data), np.array(labels)

In [42]:
from sklearn.preprocessing import OneHotEncoder
import joblib

encoder = OneHotEncoder(dtype=np.float32)
joblib.dump(encoder, "{}/encoder.pkl".format(static_dir))

encoded_data = encoder.fit_transform(dataset["tokenized_data"].values.reshape(-1, 1))
encoded_test_data = encoder.transform(test_dataset["tokenized_data"].values.reshape(-1, 1))
encoded_data[0], encoded_test_data[0], encoder.categories_

(<1x5 sparse matrix of type '<class 'numpy.float32'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 <1x5 sparse matrix of type '<class 'numpy.float32'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 [array([-6291456,       -1,        0,     4096,  2097152], dtype=int64)])

In [43]:
x_train, y_train = generate_timeseries(encoded_data.toarray(), 0, None, static_params["PAST_HISTORY"], static_params["FUTURE_TARGET"], len(encoder.categories_[0]))
x_test, y_test = generate_timeseries(encoded_test_data.toarray(), 0, None, static_params["PAST_HISTORY"], static_params["FUTURE_TARGET"], len(encoder.categories_[0]))
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((47165, 16, 5), (47165, 8, 5), (10478, 16, 5), (10478, 8, 5))

In [44]:
def tensorboard_callback(log_dir, hist_freq=1):
    return keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=hist_freq)

In [45]:
def create_model(hparams, timestamp):
    model = keras.models.Sequential([
        keras.layers.Bidirectional(keras.layers.LSTM(hparams[HP_LAYER_1_UNITS])),
        keras.layers.Dropout(hparams[HP_LAYER_1_DROPOUT]),
        keras.layers.RepeatVector(static_params["FUTURE_TARGET"]),
        keras.layers.Bidirectional(keras.layers.LSTM(hparams[HP_LAYER_2_UNITS], return_sequences=True)),
        keras.layers.Dropout(hparams[HP_LAYER_2_DROPOUT]),
        keras.layers.TimeDistributed(keras.layers.Dense(len(encoder.categories_[0]), activation=static_params["ACTIVATION"]))
    ])
    model.compile(
        optimizer=static_params["OPTIMIZER"],
        loss=static_params["LOSS_FUNCTION"],
        metrics=static_params["METRIC_ACCURACY"]
    )
    history = model.fit(x_train, y_train, batch_size=hparams[HP_BATCH_SIZE], validation_split=static_params["VAL_SPLIT"], epochs=hparams[HP_EPOCHS], callbacks=[tensorboard_callback(log_dir + timestamp)])
    loss, accuracy = model.evaluate(x_test, y_test)
    return history, accuracy, loss

In [46]:
def run(run_dir, timestamp, hparams):
    with tf.summary.create_file_writer(run_dir).as_default():
        hp.hparams(hparams)
        history, accuracy, loss = create_model(hparams, timestamp)
        tf.summary.scalar(static_params["METRIC_ACCURACY"], accuracy, step=1)

In [52]:
for batch_size in HP_BATCH_SIZE.domain.values:
    for epochs in HP_EPOCHS.domain.values:
        for layer_1_units in HP_LAYER_1_UNITS.domain.values:
            for layer_1_dropout in tf.linspace(HP_LAYER_1_DROPOUT.domain.min_value,HP_LAYER_1_DROPOUT.domain.max_value,3):
                for layer_2_units in HP_LAYER_2_UNITS.domain.values:
                    for layer_2_dropout in tf.linspace(HP_LAYER_2_DROPOUT.domain.min_value,HP_LAYER_2_DROPOUT.domain.max_value,3):
                        for optimizer in HP_OPTIMIZER.domain.values:
                            hparams = {
                                HP_LAYER_1_UNITS: layer_1_units,
                                HP_LAYER_1_DROPOUT: layer_1_dropout,
                                HP_LAYER_2_UNITS: layer_2_units,
                                HP_LAYER_2_DROPOUT: layer_2_dropout,
                                HP_BATCH_SIZE: batch_size,
                                HP_EPOCHS: epochs
                            }

                            timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
                            run_name = "Session : {}".format(timestamp)
                            print('--- Starting trial: {}'.format(run_name))
                            print({h.name: hparams[h] for h in hparams})

                            run('logs/hparam_tuning/', timestamp, hparams)

3456
