In [20]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
import json
import datetime

# Use a wide default figure size for every plot in this notebook.
plt.rcParams["figure.figsize"] = (20, 5)

# Enable on-demand GPU memory growth for every visible GPU. Iterating over the
# list (instead of indexing [0]) keeps this cell working on CPU-only machines,
# where the list is empty, and keeps the setting uniform across several GPUs.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

In [21]:
# A single timestamp identifies this run: it names the TensorBoard log folder
# and the folder holding this run's saved artifacts.
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

log_dir = "logs/fit/{}".format(timestamp)
version_dir = "version/{}".format(timestamp)

tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

# os.makedirs raises if the directory already exists.
os.makedirs(version_dir)
timestamp

'20200817-215220'

In [22]:
# Base name of the dataset; the train and test CSV paths under data/ are built from it.
dataset_name = "StreamBench_2G1P"

In [23]:
# Load the training split for the selected dataset and preview its first rows.
dataset = pd.read_csv("data/" + dataset_name + "_train_set.csv")
dataset.head()

Unnamed: 0,t,t+1,delta,tokenized_data
0,106749231104,106749231104,0,0
1,106749231104,106751328256,2097152,2097152
2,106751328256,106751328256,0,0
3,106751328256,106745036800,-6291456,-6291456
4,106745036800,106745036800,0,0


In [24]:
# Hyperparameters for this run, persisted next to the other run artifacts.
param_list = {
    "PAST_HISTORY": 16,      # input window length (time steps)
    "FUTURE_TARGET": 8,      # number of future steps to predict
    "BATCH_SIZE": 1024,
    "EPOCHS": 500,
    "BUFFER_SIZE": 200000,
    "NUM_1_NEURONS": 64,     # units of the first LSTM
    "NUM_2_NEURONS": 64,     # units of the second LSTM
}

with open("version/{}/params.json".format(timestamp), "w") as params_file:
    json.dump(param_list, params_file, indent=4)

In [25]:
def generate_timeseries(dataset, start_index, end_index, history_size, target_size, n_features):
    """Slice a sequence into sliding (history, target) window pairs.

    For every position ``i`` in ``range(start_index + history_size, end_index)``
    the history window is ``dataset[i - history_size:i]`` reshaped to
    ``(history_size, n_features)`` and the target is
    ``dataset[i:i + target_size]``.

    Args:
        dataset: NumPy array indexed along its first axis.
        start_index: Index of the first element allowed in a history window.
        end_index: Exclusive upper bound for ``i``; ``None`` means
            ``len(dataset) - target_size``.
        history_size: Number of past steps in each input window.
        target_size: Number of future steps in each target window.
        n_features: Size of the feature axis used when reshaping each window.

    Returns:
        Tuple ``(data, labels)`` of NumPy arrays built from the windows.
    """
    first_position = start_index + history_size
    last_position = len(dataset) - target_size if end_index is None else end_index

    windows, targets = [], []
    for position in range(first_position, last_position):
        past_steps = range(position - history_size, position)
        # Force an explicit feature axis: (history_size,) -> (history_size, n_features).
        window = np.reshape(dataset[past_steps], (history_size, n_features))
        windows.append(window)
        targets.append(dataset[position:position + target_size])

    return np.array(windows), np.array(targets)

In [26]:
from sklearn.preprocessing import OneHotEncoder

# Fit a one-hot encoder on the training tokens; the column is reshaped to
# (n_samples, 1) because the encoder expects 2-D input.
encoder = OneHotEncoder(dtype=np.float32)
encoded_data = encoder.fit_transform(
    dataset["tokenized_data"].to_numpy().reshape(-1, 1)
)
# Show the first encoded row together with the learned categories.
encoded_data[0], encoder.categories_

(<1x5 sparse matrix of type '<class 'numpy.float32'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 [array([-6291456,       -1,        0,     4096,  2097152], dtype=int64)])

In [27]:
# Turn the dense one-hot matrix into (input window, target window) pairs.
x_train, y_train = generate_timeseries(
    dataset=encoded_data.toarray(),
    start_index=0,
    end_index=None,
    history_size=param_list["PAST_HISTORY"],
    target_size=param_list["FUTURE_TARGET"],
    n_features=len(encoder.categories_[0]),
)
x_train.shape, y_train.shape

((112878, 16, 5), (112878, 8, 5))

In [28]:
# Sequence-to-sequence classifier: a bidirectional LSTM summarises the input
# window, the summary is repeated once per future step, a second bidirectional
# LSTM produces one vector per step, and a per-step softmax picks the category.
model = tf.keras.models.Sequential([
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(param_list["NUM_1_NEURONS"])
    ),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.RepeatVector(param_list["FUTURE_TARGET"]),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(param_list["NUM_2_NEURONS"], return_sequences=True)
    ),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(len(encoder.categories_[0]), activation="softmax")
    ),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [29]:
# Train with 20% of the samples held out for validation. The TensorBoard
# callback defined together with log_dir is passed in here; without it the
# callback is never invoked and nothing is written to log_dir.
model_history = model.fit(
    x_train,
    y_train,
    batch_size=param_list["BATCH_SIZE"],
    epochs=param_list["EPOCHS"],
    validation_split=0.2,
    callbacks=[tensorboard_callback],
)
# Persist the trained model alongside the other artifacts of this run.
model.save("version/{}/model.h5".format(timestamp))

Epoch 363/500
Epoch 364/500
Epoch 365/500
Epoch 366/500
Epoch 367/500
Epoch 368/500
Epoch 369/500
Epoch 370/500
Epoch 371/500
Epoch 372/500
Epoch 373/500
Epoch 374/500
Epoch 375/500
Epoch 376/500
Epoch 377/500
Epoch 378/500
Epoch 379/500
Epoch 380/500
Epoch 381/500
Epoch 382/500
Epoch 383/500
Epoch 384/500
Epoch 385/500
Epoch 386/500
Epoch 387/500
Epoch 388/500
Epoch 389/500
Epoch 390/500
Epoch 391/500
Epoch 392/500
Epoch 393/500
Epoch 394/500
Epoch 395/500
Epoch 396/500
Epoch 397/500
Epoch 398/500
Epoch 399/500
Epoch 400/500
Epoch 401/500
Epoch 402/500
Epoch 403/500
Epoch 404/500
Epoch 405/500
Epoch 406/500
Epoch 407/500
Epoch 408/500
Epoch 409/500
Epoch 410/500
Epoch 411/500
Epoch 412/500
Epoch 413/500
Epoch 414/500
Epoch 415/500
Epoch 416/500
Epoch 417/500
Epoch 418/500
Epoch 419/500
Epoch 420/500
Epoch 421/500
Epoch 422/500
Epoch 423/500
Epoch 424/500
Epoch 425/500
Epoch 426/500
Epoch 427/500
Epoch 428/500
Epoch 429/500
Epoch 430/500
Epoch 431/500
Epoch 432/500
Epoch 433/500
Epoch 

In [30]:
# Load the held-out test split of the same dataset and preview its first rows.
test_dataset = pd.read_csv("data/" + dataset_name + "_test_set.csv")
test_dataset.head()

Unnamed: 0,t,t+1,delta,tokenized_data
0,102762393600,102762393600,0,0
1,102762393600,102762397696,4096,4096
2,102762397696,102762397696,0,0
3,102762397696,102762401792,4096,4096
4,102762401792,102762401792,0,0


In [31]:
# Reuse the encoder fitted on the training data instead of fitting a new one.
# A separately fitted encoder derives its columns from the test tokens alone,
# so the column order/count could silently differ from what the model was
# trained on. With the shared encoder, a token unseen during training raises
# an error instead of being mapped to a mismatched column.
test_encoder = encoder
encoded_test_data = test_encoder.transform(test_dataset["tokenized_data"].values.reshape(-1, 1))
encoded_test_data[0], test_encoder.categories_

(<1x5 sparse matrix of type '<class 'numpy.float32'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 [array([-6291456,       -1,        0,     4096,  2097152], dtype=int64)])

In [32]:
# Window the test data with the same parameters used for training (taken from
# param_list rather than repeated as literals) and with the feature count of
# the training encoder, which is what the model's output layer was sized with.
x_test, y_test = generate_timeseries(
    encoded_test_data.toarray(),
    0,
    None,
    param_list["PAST_HISTORY"],
    param_list["FUTURE_TARGET"],
    len(encoder.categories_[0]),
)

In [33]:
# Flatten the one-hot targets into one list of class indices,
# ordered window by window and step by step.
y_true = [
    class_index
    for target_window in y_test
    for class_index in np.argmax(target_window, axis=1).tolist()
]

In [34]:
# Predict every test window in batched calls rather than one model.predict
# call per window, and without a hard-coded input shape.
predicted_probabilities = model.predict(x_test, batch_size=param_list["BATCH_SIZE"])
# argmax over the class axis gives (n_windows, n_steps); flattening in row
# order yields the same window-by-window, step-by-step ordering as y_true.
y_pred = np.argmax(predicted_probabilities, axis=-1).reshape(-1).tolist()

In [35]:
# Class probabilities for the first test window. Slicing with [:1] keeps the
# batch axis, so no window length or class count has to be hard-coded.
model.predict(x_test[:1])

array([[[1.0693275e-09, 2.1834114e-07, 9.9999499e-01, 4.7102244e-06,
         3.3979104e-11],
        [2.3365993e-04, 7.2977581e-04, 1.9987993e-04, 9.9882382e-01,
         1.2825866e-05],
        [1.6047409e-09, 1.0699457e-07, 9.9985671e-01, 1.4315771e-04,
         2.0312559e-11],
        [2.5425880e-04, 7.4768625e-04, 3.5318063e-04, 9.9835891e-01,
         2.8583620e-04],
        [9.2985941e-10, 5.9050532e-08, 9.9969339e-01, 3.0654480e-04,
         1.0667801e-10],
        [5.1631802e-04, 7.2454673e-04, 4.7993491e-04, 9.9799734e-01,
         2.8180063e-04],
        [4.3889949e-09, 3.1290847e-07, 9.9958223e-01, 4.1735955e-04,
         1.2785907e-10],
        [4.7631745e-04, 6.7538983e-04, 5.6909717e-04, 9.9774998e-01,
         5.2916544e-04]]], dtype=float32)

In [36]:
# Most likely class per predicted step for the first test window. Slicing with
# [:1] keeps the batch axis without hard-coding the input shape.
np.argmax(model.predict(x_test[:1])[0], axis=1)

array([2, 3, 2, 3, 2, 3, 2, 3], dtype=int64)

In [37]:
from sklearn.metrics import accuracy_score

# Overall per-step accuracy on the test set, saved with the run artifacts.
accuracy = accuracy_score(y_true, y_pred)
with open("version/{}/accuracy.txt".format(timestamp), "w") as t:
    # float() accepts both a NumPy scalar and a built-in float; a built-in
    # float has no .tolist(), so calling that would fail in that case.
    t.write(str(float(accuracy)))

accuracy

0.9481288221217761

In [38]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 as a table (one row per class or average),
# saved with the run artifacts and printed.
report = pd.DataFrame(
    classification_report(y_true, y_pred, output_dict=True)
).T
report.to_csv("version/{}/report.csv".format(timestamp))
print(report)

precision    recall  f1-score        support
0              0.918061  0.910908  0.914470    2952.000000
1              0.180929  0.113497  0.139491     652.000000
2              0.994090  0.915866  0.953376  163264.000000
3              0.903605  0.994154  0.946719  131036.000000
4              0.916836  0.911290  0.914055    2976.000000
accuracy       0.948129  0.948129  0.948129       0.948129
macro avg      0.782704  0.769143  0.773622  300880.000000
weighted avg   0.951411  0.948129  0.947943  300880.000000
