In [214]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
import json
import datetime

# Default figure size for matplotlib plots.
plt.rcParams["figure.figsize"] = (20, 5)

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up
# front. Iterating over the device list (rather than indexing [0]) keeps this
# cell working on machines without a GPU, where the list is empty.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

In [215]:
# A single timestamp names everything this run writes to disk.
run_started_at = datetime.datetime.now()
timestamp = run_started_at.strftime("%Y%m%d-%H%M%S")

# Per-run directory for saved artifacts; os.makedirs raises if it already exists.
version_dir = "version/" + timestamp
os.makedirs(version_dir)

# TensorBoard callback writing to a per-run log directory, with weight
# histograms recorded every epoch.
log_dir = "logs/fit/" + timestamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

timestamp

'20200817-092343'

In [216]:
# Base name of the dataset; the train and test CSV file names are derived from it.
dataset_name = "StreamBench_1G1P"

In [217]:
# Load the training split of the selected dataset and preview its first rows.
train_csv_path = "data/{}_train_set.csv".format(dataset_name)
dataset = pd.read_csv(train_csv_path)
dataset.head()

Unnamed: 0,t,t+1,delta,tokenized_data
0,104291368960,104291373056,4096,4096
1,104291373056,104291377152,4096,4096
2,104291377152,104291381248,4096,4096
3,104291381248,104291385344,4096,4096
4,104291385344,104291389440,4096,4096


In [218]:
# Hyperparameters of this run, collected in one dict so they can be saved
# next to the model.
param_list = {
    "PAST_HISTORY": 16,      # rows in each input window
    "FUTURE_TARGET": 8,      # rows in each label window
    "BATCH_SIZE": 1024,
    "EPOCHS": 1000,
    "BUFFER_SIZE": 200000,   # not referenced by the cells visible in this notebook
    "NUM_1_NEURONS": 128,    # units of the first LSTM layer
    "NUM_2_NEURONS": 64,     # units of the second LSTM layer
}

# Persist the parameters in this run's version directory.
params_path = "version/{}/params.json".format(timestamp)
with open(params_path, "w") as params_file:
    json.dump(param_list, params_file, indent=4)

In [219]:
def generate_timeseries(dataset, start_index, end_index, history_size, target_size, n_features):
    """Cut a sequence into sliding (history, target) window pairs.

    For every position ``i`` in ``range(start_index + history_size, end_index)``
    the ``history_size`` rows before ``i`` form one input window and the
    ``target_size`` rows starting at ``i`` form the matching label window.

    Args:
        dataset: Sequence of rows, indexed along its first axis (converted
            with ``np.asarray``).
        start_index: Index of the first row that may appear in a history
            window. Must not be negative.
        end_index: Exclusive upper bound for ``i``. ``None`` selects
            ``len(dataset) - target_size``. An explicit value larger than
            ``len(dataset) - target_size + 1`` is clamped to that bound, the
            largest one for which every label window is complete.
        history_size: Number of rows in each input window.
        target_size: Number of rows in each label window.
        n_features: Number of values per row; each input window is reshaped
            to ``(history_size, n_features)``.

    Returns:
        Tuple ``(data, labels)`` of NumPy arrays with one entry per window.

    Raises:
        ValueError: If ``start_index`` is negative.
    """
    if start_index < 0:
        # Negative positions would wrap around and silently mix the end of
        # the sequence into the first windows.
        raise ValueError("start_index must be non-negative, got {}".format(start_index))

    dataset = np.asarray(dataset)
    n_rows = len(dataset)

    if end_index is None:
        # NOTE: this default stops one window short of the last complete one
        # (i == n_rows - target_size would still have a full label window).
        # Kept unchanged so existing sample counts stay the same.
        end_index = n_rows - target_size
    else:
        # Beyond this bound the label slice would be truncated or the
        # history window would run past the data.
        end_index = min(end_index, n_rows - target_size + 1)

    first_target = start_index + history_size
    data = []
    labels = []
    for i in range(first_target, end_index):
        # Reshape the window from (history_size, ...) to (history_size, n_features).
        data.append(np.reshape(dataset[i - history_size:i], (history_size, n_features)))
        labels.append(dataset[i:i + target_size])
    return np.array(data), np.array(labels)

In [220]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the tokenized column. The values are reshaped to a single
# column (n_rows, 1) before being passed to the encoder.
train_token_column = dataset["tokenized_data"].values.reshape(-1, 1)
encoder = OneHotEncoder(dtype=np.float32)
encoded_data = encoder.fit_transform(train_token_column)
encoded_data[0], encoder.categories_

(<1x5 sparse matrix of type '<class 'numpy.float32'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 [array([-6291456,       -1,        0,     4096,  2097152], dtype=int64)])

In [221]:
# Build the training windows from the dense one-hot matrix; the feature
# count equals the number of categories the encoder found.
n_train_features = len(encoder.categories_[0])
x_train, y_train = generate_timeseries(
    encoded_data.toarray(),
    0,
    None,
    param_list["PAST_HISTORY"],
    param_list["FUTURE_TARGET"],
    n_train_features,
)
x_train.shape, y_train.shape

((33634, 16, 5), (33634, 8, 5))

In [222]:
# Encoder-decoder style sequence model: a bidirectional LSTM encodes the
# history window, RepeatVector repeats the encoding once per target step,
# a second bidirectional LSTM produces one output per step, and a
# time-distributed softmax layer yields one class distribution per step.
model = tf.keras.models.Sequential([
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(param_list["NUM_1_NEURONS"])
    ),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.RepeatVector(param_list["FUTURE_TARGET"]),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(param_list["NUM_2_NEURONS"], return_sequences=True)
    ),
    tf.keras.layers.Dropout(0.1),
    keras.layers.TimeDistributed(
        tf.keras.layers.Dense(len(encoder.categories_[0]), activation="softmax")
    ),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [223]:
# Train with 20% of the samples held out for validation. The TensorBoard
# callback created earlier is passed in here; without it nothing is ever
# written to the run's log directory.
model_history = model.fit(
    x_train,
    y_train,
    batch_size=param_list["BATCH_SIZE"],
    validation_split=0.2,
    epochs=param_list["EPOCHS"],
    callbacks=[tensorboard_callback],
)

# Save the trained model next to this run's other artifacts.
model.save("version/{}/model.h5".format(timestamp))

000
Epoch 864/1000
Epoch 865/1000
Epoch 866/1000
Epoch 867/1000
Epoch 868/1000
Epoch 869/1000
Epoch 870/1000
Epoch 871/1000
Epoch 872/1000
Epoch 873/1000
Epoch 874/1000
Epoch 875/1000
Epoch 876/1000
Epoch 877/1000
Epoch 878/1000
Epoch 879/1000
Epoch 880/1000
Epoch 881/1000
Epoch 882/1000
Epoch 883/1000
Epoch 884/1000
Epoch 885/1000
Epoch 886/1000
Epoch 887/1000
Epoch 888/1000
Epoch 889/1000
Epoch 890/1000
Epoch 891/1000
Epoch 892/1000
Epoch 893/1000
Epoch 894/1000
Epoch 895/1000
Epoch 896/1000
Epoch 897/1000
Epoch 898/1000
Epoch 899/1000
Epoch 900/1000
Epoch 901/1000
Epoch 902/1000
Epoch 903/1000
Epoch 904/1000
Epoch 905/1000
Epoch 906/1000
Epoch 907/1000
Epoch 908/1000
Epoch 909/1000
Epoch 910/1000
Epoch 911/1000
Epoch 912/1000
Epoch 913/1000
Epoch 914/1000
Epoch 915/1000
Epoch 916/1000
Epoch 917/1000
Epoch 918/1000
Epoch 919/1000
Epoch 920/1000
Epoch 921/1000
Epoch 922/1000
Epoch 923/1000
Epoch 924/1000
Epoch 925/1000
Epoch 926/1000
Epoch 927/1000
Epoch 928/1000
Epoch 929/1000
Epoch 

In [224]:
# Load the held-out test split of the same dataset and preview its first rows.
test_csv_path = "data/{}_test_set.csv".format(dataset_name)
test_dataset = pd.read_csv(test_csv_path)
test_dataset.head()

Unnamed: 0,t,t+1,delta,tokenized_data
0,103653834752,103655931904,2097152,2097152
1,103655931904,103655931904,0,0
2,103655931904,103649640448,-6291456,-6291456
3,103649640448,103649640448,0,0
4,103649640448,103651737600,2097152,2097152


In [225]:
# Encode the test data with the encoder that was fitted on the training
# data. Fitting a second encoder on the test set would derive the
# column-to-category mapping from the test values, which only matches the
# model's output columns if both splits happen to contain exactly the same
# categories. With the encoder's default settings, a test value that was
# never seen during training raises an error instead of being mis-encoded.
test_encoder = encoder
encoded_test_data = test_encoder.transform(test_dataset["tokenized_data"].values.reshape(-1, 1))
encoded_test_data[0], test_encoder.categories_

(<1x5 sparse matrix of type '<class 'numpy.float32'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 [array([-6291456,       -1,        0,     4096,  2097152], dtype=int64)])

In [226]:
# Build the test windows with the same window sizes as the training data;
# reading them from param_list keeps both splits in sync if they change.
x_test, y_test = generate_timeseries(
    encoded_test_data.toarray(),
    0,
    None,
    param_list["PAST_HISTORY"],
    param_list["FUTURE_TARGET"],
    len(test_encoder.categories_[0]),
)

In [227]:
# Flatten the one-hot label windows into one list of class indices:
# for each window, take the argmax over the class axis of every step.
y_true = [
    class_index
    for label_window in y_test
    for class_index in np.argmax(label_window, axis=1).tolist()
]

In [228]:
# Predict every test window in one batched call instead of invoking
# model.predict once per sample, and without hard-coding the window shape.
predicted_probabilities = model.predict(x_test, batch_size=param_list["BATCH_SIZE"])

# Argmax over the class axis gives (n_windows, n_steps); flattening in row
# order yields the same window-by-window, step-by-step order as y_true.
y_pred = np.argmax(predicted_probabilities, axis=-1).reshape(-1).tolist()

In [229]:
# Class probabilities for the first test window. Slicing with [:1] keeps the
# batch axis, so no hard-coded (1, 16, 5) reshape is needed.
model.predict(x_test[:1])

array([[[1.6143678e-05, 1.1879786e-05, 1.1160756e-04, 9.9985051e-01,
         9.8830214e-06],
        [5.2995347e-06, 4.7371395e-06, 1.0321221e-05, 9.9993849e-01,
         4.1070514e-05],
        [1.5543282e-06, 2.1834480e-06, 1.1994189e-05, 9.9996161e-01,
         2.2663262e-05],
        [5.7104393e-07, 6.1036695e-07, 3.0910587e-06, 9.9999094e-01,
         4.7377498e-06],
        [6.4900462e-07, 6.7179059e-07, 3.2362925e-06, 9.9998915e-01,
         6.3334351e-06],
        [5.4870782e-07, 5.1940106e-07, 2.2633951e-06, 9.9999356e-01,
         3.0409103e-06],
        [5.7147599e-07, 4.2658905e-07, 3.0220210e-06, 9.9999380e-01,
         2.2642050e-06],
        [7.1840248e-07, 5.1157451e-07, 3.3709866e-06, 9.9999285e-01,
         2.5908096e-06]]], dtype=float32)

In [230]:
# Predicted class index for each step of the first test window. Slicing with
# [:1] keeps the batch axis, so no hard-coded (1, 16, 5) reshape is needed.
np.argmax(model.predict(x_test[:1])[0], axis=1)

array([3, 3, 3, 3, 3, 3, 3, 3], dtype=int64)

In [231]:
from sklearn.metrics import accuracy_score

# Overall step-level accuracy on the test windows, stored with this run's
# other artifacts.
accuracy = accuracy_score(y_true, y_pred)
with open("version/{}/accuracy.txt".format(timestamp), "w") as t:
    # float() accepts both a NumPy scalar and a built-in float, whereas
    # .tolist() only exists on the former; the written text is the same.
    t.write(str(float(accuracy)))

accuracy

0.9576634512325831

In [232]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 as a table (one row per class or aggregate),
# saved with this run's other artifacts and printed.
report_dict = classification_report(y_true, y_pred, output_dict=True)
report = pd.DataFrame(report_dict).transpose()

report_path = "version/{}/report.csv".format(timestamp)
report.to_csv(report_path)
print(report)

precision    recall  f1-score       support
0              0.136571  0.703125  0.228717    128.000000
1              0.000000  0.000000  0.000000     80.000000
2              0.177722  0.169048  0.173276   1680.000000
3              0.983073  0.974338  0.978686  87560.000000
4              0.189765  0.741667  0.302207    120.000000
accuracy       0.957663  0.957663  0.957663      0.957663
macro avg      0.297426  0.517635  0.336577  89568.000000
weighted avg   0.964816  0.957663  0.960727  89568.000000
