In [174]:
import tensorflow as tf
from tensorflow import keras
from tensorboard.plugins.hparams import api as hp
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
import json
import datetime

# Wide default canvas for every matplotlib figure drawn in this notebook.
plt.rcParams["figure.figsize"] = (20, 5)

# Turn on memory growth for the first visible GPU. The guard keeps this cell
# runnable on CPU-only machines, where the device list is empty and indexing
# element 0 would raise IndexError.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [175]:
# One wall-clock stamp names both the TensorBoard log directory and the
# versioned output directory of this notebook run.
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

log_dir = "logs/fit/" + timestamp
version_dir = "version/" + timestamp
os.makedirs(log_dir)
os.makedirs(version_dir)

# Callback pointing at this run's log directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
timestamp

'20200822-234051'

In [176]:
# Dataset identifier; it is substituted into the train/test CSV file names read below.
dataset_name = "StreamBench_1G1P"

In [177]:
# Training split for the selected dataset.
train_csv_path = "data/{}_train_set.csv".format(dataset_name)
dataset = pd.read_csv(train_csv_path)
dataset.head()

Unnamed: 0,t,t+1,delta,tokenized_data
0,104291368960,104291373056,4096,4096
1,104291373056,104291377152,4096,4096
2,104291377152,104291381248,4096,4096
3,104291381248,104291385344,4096,4096
4,104291385344,104291389440,4096,4096


In [178]:
# Static parameters: fixed settings shared by every trial.
static_params = {
    "PAST_HISTORY": 16,     # passed as history_size when windows are generated
    "FUTURE_TARGET": 8,     # passed as target_size and as the RepeatVector length
    "BUFFER_SIZE": 200000,
    "ACTIVATION": 'softmax',
    "LOSS_FUNCTION": 'categorical_crossentropy',
    "VAL_SPLIT": 0.2,
    "METRIC_ACCURACY": 'accuracy',
}

# Hyperparameters: search space declared for the TensorBoard HParams plugin.
HP_BATCH_SIZE = hp.HParam('batch_size', hp.Discrete([256, 512, 1024]))
# Other epoch counts noted by the author: 250, 500, 750, 1000
HP_EPOCHS = hp.HParam('epochs', hp.Discrete([5, 10]))
# Other layer-1 widths noted by the author: 16, 32, 64, 128
HP_LAYER_1_UNITS = hp.HParam('layer_1_units', hp.Discrete([4, 8]))
HP_LAYER_2_UNITS = hp.HParam('layer_2_units', hp.Discrete([4, 8, 16, 32, 64, 128]))
HP_LAYER_1_DROPOUT = hp.HParam('layer_1_dropout', hp.RealInterval(0.1, 0.2))
HP_LAYER_2_DROPOUT = hp.HParam('layer_2_dropout', hp.RealInterval(0.1, 0.2))
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['adam', 'sgd']))

# Record the search space and the reported metric in the shared tuning log directory.
with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
    hp.hparams_config(
        hparams=[
            HP_BATCH_SIZE,
            HP_EPOCHS,
            HP_LAYER_1_UNITS,
            HP_LAYER_2_UNITS,
            HP_LAYER_1_DROPOUT,
            HP_LAYER_2_DROPOUT,
            HP_OPTIMIZER,
        ],
        metrics=[
            hp.Metric(static_params["METRIC_ACCURACY"], display_name='Accuracy'),
        ],
    )

In [179]:
def generate_timeseries(dataset, start_index, end_index, history_size, target_size, n_features):
    """Cut a sequence into (history window, future target) training pairs.

    For every position ``pos`` in ``range(start_index + history_size, end_index)``
    one pair is produced:

    * the ``history_size`` rows immediately before ``pos``, reshaped to
      ``(history_size, n_features)``;
    * the rows ``dataset[pos:pos + target_size]``.

    Args:
        dataset: NumPy array indexed along its first axis.
        start_index: Index of the first row that may appear in a history window.
        end_index: Exclusive upper bound for ``pos``. When ``None`` it defaults to
            ``len(dataset) - target_size``.
        history_size: Number of rows in each history window.
        target_size: Number of rows in each target slice.
        n_features: Size of the second axis of each reshaped history window.

    Returns:
        Tuple ``(data, labels)`` of NumPy arrays built from the collected
        windows and targets respectively.
    """
    first_pos = start_index + history_size
    stop_pos = len(dataset) - target_size if end_index is None else end_index

    # History rows are gathered by index and forced to (history_size, n_features).
    windows = [
        np.reshape(dataset[range(pos - history_size, pos)], (history_size, n_features))
        for pos in range(first_pos, stop_pos)
    ]
    targets = [dataset[pos:pos + target_size] for pos in range(first_pos, stop_pos)]
    return np.array(windows), np.array(targets)

In [180]:
from sklearn.preprocessing import OneHotEncoder

# Fit the one-hot encoder on the training tokens; the fitted encoder is reused
# later for the test split.
encoder = OneHotEncoder(dtype=np.float32)
train_token_column = dataset["tokenized_data"].values.reshape(-1, 1)
encoded_data = encoder.fit_transform(train_token_column)
encoded_data[0], encoder.categories_

(<1x5 sparse matrix of type '<class 'numpy.float32'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 [array([-6291456,       -1,        0,     4096,  2097152], dtype=int64)])

In [181]:
# Window the densified one-hot training rows into model inputs and targets.
x_train, y_train = generate_timeseries(
    dataset=encoded_data.toarray(),
    start_index=0,
    end_index=None,
    history_size=static_params["PAST_HISTORY"],
    target_size=static_params["FUTURE_TARGET"],
    n_features=len(encoder.categories_[0]),
)
x_train.shape, y_train.shape

((33634, 16, 5), (33634, 8, 5))

In [182]:
# Held-out split for the selected dataset.
test_csv_path = "data/{}_test_set.csv".format(dataset_name)
test_dataset = pd.read_csv(test_csv_path)
test_dataset.head()

Unnamed: 0,t,t+1,delta,tokenized_data
0,103653834752,103655931904,2097152,2097152
1,103655931904,103655931904,0,0
2,103655931904,103649640448,-6291456,-6291456
3,103649640448,103649640448,0,0
4,103649640448,103651737600,2097152,2097152


In [183]:
# Encode the test tokens with the encoder fitted on the training split.
test_token_column = test_dataset["tokenized_data"].values.reshape(-1, 1)
encoded_test_data = encoder.transform(test_token_column)
encoded_test_data[0]

<1x5 sparse matrix of type '<class 'numpy.float32'>'
	with 1 stored elements in Compressed Sparse Row format>

In [184]:
# Window the densified one-hot test rows with the same settings as the training split.
x_test, y_test = generate_timeseries(
    dataset=encoded_test_data.toarray(),
    start_index=0,
    end_index=None,
    history_size=static_params["PAST_HISTORY"],
    target_size=static_params["FUTURE_TARGET"],
    n_features=len(encoder.categories_[0]),
)
x_test.shape

(11196, 16, 5)

In [186]:
def create_model(hparams):
    """Build, train and evaluate one model for the given hyperparameters.

    The network is a bidirectional LSTM, dropout, a RepeatVector of length
    ``static_params["FUTURE_TARGET"]``, a second bidirectional LSTM that returns
    sequences, dropout, and a time-distributed dense layer with one output per
    encoder category.

    Reads the module-level ``x_train``, ``y_train``, ``x_test``, ``y_test``,
    ``encoder`` and ``static_params``. The trained model itself is local to
    this function and is not returned.

    Args:
        hparams: Mapping keyed by the ``HP_*`` objects; every ``HP_*`` key used
            below must be present.

    Returns:
        Tuple ``(history, accuracy)``: the object returned by ``model.fit`` and
        the second value returned by ``model.evaluate(x_test, y_test)``.
    """
    print(hparams[HP_LAYER_1_UNITS])
    print(hparams[HP_OPTIMIZER])

    n_classes = len(encoder.categories_[0])
    model = keras.models.Sequential([
        keras.layers.Bidirectional(keras.layers.LSTM(hparams[HP_LAYER_1_UNITS])),
        keras.layers.Dropout(hparams[HP_LAYER_1_DROPOUT]),
        # Repeat the summary vector once per step to be predicted.
        keras.layers.RepeatVector(static_params["FUTURE_TARGET"]),
        keras.layers.Bidirectional(keras.layers.LSTM(hparams[HP_LAYER_2_UNITS], return_sequences=True)),
        keras.layers.Dropout(hparams[HP_LAYER_2_DROPOUT]),
        keras.layers.TimeDistributed(keras.layers.Dense(n_classes, activation=static_params["ACTIVATION"]))
    ])
    model.compile(
        optimizer=hparams[HP_OPTIMIZER],
        loss=static_params["LOSS_FUNCTION"],
        # `metrics` is documented as a list; a bare string is outside that contract.
        metrics=[static_params["METRIC_ACCURACY"]],
    )
    history = model.fit(
        x_train,
        y_train,
        batch_size=hparams[HP_BATCH_SIZE],
        validation_split=static_params["VAL_SPLIT"],
        epochs=hparams[HP_EPOCHS],
    )
    _, accuracy = model.evaluate(x_test, y_test)
    return history, accuracy

In [187]:
def run(run_dir, hparams):
    """Run one trial and log its hyperparameters and accuracy under ``run_dir``.

    Args:
        run_dir: Directory handed to ``tf.summary.create_file_writer``.
        hparams: Mapping keyed by the ``HP_*`` objects, forwarded to
            ``hp.hparams`` and ``create_model``.
    """
    trial_writer = tf.summary.create_file_writer(run_dir)
    with trial_writer.as_default():
        hp.hparams(hparams)
        # Training happens inside the writer context, exactly like the logging calls.
        _, trial_accuracy = create_model(hparams)
        tf.summary.scalar(static_params["METRIC_ACCURACY"], trial_accuracy, step=1)

In [188]:
session_num = 0

# Only the layer-1 width, the two endpoints of the layer-1 dropout interval and
# the optimizer are varied; every other hyperparameter is pinned in the dict below.
layer_1_dropout_endpoints = (
    HP_LAYER_1_DROPOUT.domain.min_value,
    HP_LAYER_1_DROPOUT.domain.max_value,
)
trial_grid = [
    (units, dropout, optimizer_name)
    for units in HP_LAYER_1_UNITS.domain.values
    for dropout in layer_1_dropout_endpoints
    for optimizer_name in HP_OPTIMIZER.domain.values
]

for layer_1_units, layer_1_dropout, optimizer in trial_grid:
    hparams = {
        HP_LAYER_1_UNITS: layer_1_units,
        HP_LAYER_1_DROPOUT: layer_1_dropout,
        HP_LAYER_2_UNITS: 8,
        HP_LAYER_2_DROPOUT: 0.1,
        HP_BATCH_SIZE: 256,
        HP_EPOCHS: 5,
        HP_OPTIMIZER: optimizer,
    }
    run_name = "run-{}".format(session_num)
    print('--- Starting trial: {}'.format(run_name))
    print({param.name: value for param, value in hparams.items()})

    # Each trial logs into its own sub-directory of the tuning log directory.
    run('logs/hparam_tuning/' + run_name, hparams)
    session_num += 1

--- Starting trial: run-0
{'layer_1_units': 4, 'layer_1_dropout': 0.1, 'layer_2_units': 8, 'layer_2_dropout': 0.1, 'batch_size': 256, 'epochs': 5, 'optimizer': 'adam'}
4
adam
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- Starting trial: run-1
{'layer_1_units': 4, 'layer_1_dropout': 0.1, 'layer_2_units': 8, 'layer_2_dropout': 0.1, 'batch_size': 256, 'epochs': 5, 'optimizer': 'sgd'}
4
sgd
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- Starting trial: run-2
{'layer_1_units': 4, 'layer_1_dropout': 0.2, 'layer_2_units': 8, 'layer_2_dropout': 0.1, 'batch_size': 256, 'epochs': 5, 'optimizer': 'adam'}
4
adam
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- Starting trial: run-3
{'layer_1_units': 4, 'layer_1_dropout': 0.2, 'layer_2_units': 8, 'layer_2_dropout': 0.1, 'batch_size': 256, 'epochs': 5, 'optimizer': 'sgd'}
4
sgd
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- Starting trial: run-4
{'layer_1_units': 8, 'layer_1_dropout': 0.1, 'layer_2_units': 8, 'layer_2_drop

# NOTE(review): `model` is not assigned in any visible cell (create_model keeps
# its model local and returns only history/accuracy) — confirm where it comes
# from before relying on this section after a kernel restart.

# Class index of every true target step, flattened sample by sample.
y_true = np.argmax(y_test, axis=-1).ravel().tolist()

# One batched predict call instead of one call per test window; the input shape
# is taken from x_test rather than a hard-coded (1, 16, 5).
y_pred = np.argmax(model.predict(x_test), axis=-1).ravel().tolist()

# Raw class probabilities for the first test window (batch of one).
model.predict(x_test[:1])

# Predicted class index per future step for the first test window.
np.argmax(model.predict(x_test[:1])[0], axis=1)

from sklearn.metrics import accuracy_score

# Step-level accuracy over the flattened true/predicted class indices.
accuracy = accuracy_score(y_true, y_pred)

# float(...) accepts both a NumPy scalar and a plain Python float, whereas
# `.tolist()` exists only on the former; the written text is the same.
with open("version/{}/accuracy.txt".format(timestamp), "w") as accuracy_file:
    accuracy_file.write(str(float(accuracy)))

accuracy

from sklearn.metrics import classification_report

# Per-class metrics as a table (one row per report entry), saved next to the
# accuracy file of this run.
report_by_label = classification_report(y_true, y_pred, output_dict=True)
report = pd.DataFrame(report_by_label).transpose()
report.to_csv("version/{}/report.csv".format(timestamp))
print(report)