In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
import json
import datetime

plt.rcParams["figure.figsize"] = (20, 5)

physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

In [2]:
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
'''
log_dir = "logs/fit/" + timestamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
version_dir = "version/" + timestamp 

os.makedirs(version_dir)
'''
timestamp

'20200825-111751'

In [3]:
dataset_name = "StreamBench_2G1P"

In [4]:
dataset = pd.read_csv("data/{}_train_set.csv".format(dataset_name))
dataset.head()

Unnamed: 0,t,t+1,delta,tokenized_data
0,106749231104,106749231104,0,0
1,106749231104,106751328256,2097152,2097152
2,106751328256,106751328256,0,0
3,106751328256,106745036800,-6291456,-6291456
4,106745036800,106745036800,0,0


In [54]:
param_list = dict()

param_list["PAST_HISTORY"] = 16
param_list["FUTURE_TARGET"] = 8
param_list["BATCH_SIZE"] = 1024
param_list["EPOCHS"] = 5
param_list["BUFFER_SIZE"] = 200000
param_list["NUM_1_NEURONS"] = 64
param_list["NUM_2_NEURONS"] = 64

In [11]:
def generate_timeseries(dataset, start_index, end_index, history_size, target_size, n_features):
    data = []
    labels = []

    start_index = start_index + history_size
    if end_index is None:
        end_index = len(dataset) - target_size

    for i in range(start_index, end_index):
        indices = range(i-history_size, i)
        # Reshape data from (history_size,) to (history_size, n_feature)
        data.append(np.reshape(dataset[indices], (history_size, n_features)))
        labels.append(dataset[i:i+target_size])
    return np.array(data), np.array(labels)

In [28]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(dtype=np.float32)
encoded_data = encoder.fit_transform(dataset["tokenized_data"].values.reshape(-1, 1))
encoded_data[0], encoder.categories_

(<1x5 sparse matrix of type '<class 'numpy.float32'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 [array([-6291456,       -1,        0,     4096,  2097152], dtype=int64)])

In [29]:
x_train, y_train = generate_timeseries(encoded_data.toarray(), 0, None, param_list["PAST_HISTORY"], param_list["FUTURE_TARGET"], len(encoder.categories_[0]))
x_train.shape, y_train.shape

((112878, 16, 5), (112878, 8, 5))

In [32]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(param_list["NUM_1_NEURONS"], return_sequences=True)))
'''
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.RepeatVector(param_list["FUTURE_TARGET"]))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(param_list["NUM_2_NEURONS"], return_sequences=True)))
model.add(tf.keras.layers.Dropout(0.1))
model.add(keras.layers.TimeDistributed(tf.keras.layers.Dense(len(encoder.categories_[0]), activation="softmax")))
'''
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [31]:
model_history = model.fit(x_train, y_train, batch_size=param_list["BATCH_SIZE"], validation_split=0.2, epochs=param_list["EPOCHS"])
#model.save("version/{}/model.h5".format(timestamp))

Epoch 1/5


ValueError: in user code:

    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\training.py:531 train_step  **
        y_pred = self(x, training=True)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:927 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\sequential.py:291 call
        outputs = layer(inputs, **kwargs)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\base_layer.py:886 __call__
        self.name)
    C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\input_spec.py:180 assert_input_compatibility
        str(x.shape.as_list()))

    ValueError: Input 0 of layer repeat_vector is incompatible with the layer: expected ndim=2, found ndim=3. Full shape received: [None, 16, 128]


In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional multiple                  35840     
_________________________________________________________________
dropout (Dropout)            multiple                  0         
_________________________________________________________________
time_distributed (TimeDistri multiple                  645       
Total params: 36,485
Trainable params: 36,485
Non-trainable params: 0
_________________________________________________________________


In [18]:
test_dataset = pd.read_csv("data/{}_test_set.csv".format(dataset_name))
test_dataset.head()

Unnamed: 0,t,t+1,delta,tokenized_data
0,102762393600,102762393600,0,0
1,102762393600,102762397696,4096,4096
2,102762397696,102762397696,0,0
3,102762397696,102762401792,4096,4096
4,102762401792,102762401792,0,0


In [19]:
test_encoder = OneHotEncoder(dtype=np.float32)
encoded_test_data = test_encoder.fit_transform(test_dataset["tokenized_data"].values.reshape(-1, 1))
encoded_test_data[0], test_encoder.categories_

(<1x5 sparse matrix of type '<class 'numpy.float32'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 [array([-6291456,       -1,        0,     4096,  2097152], dtype=int64)])

In [20]:
x_test, y_test = generate_timeseries(encoded_test_data.toarray(), 0, None, param_list["PAST_HISTORY"], param_list["FUTURE_TARGET"], len(test_encoder.categories_[0]))

## Model Architecture Experiment

In [49]:
x_test[0].reshape(1, 16, 5)

array([[[0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.]]], dtype=float32)

In [47]:
model_1 = tf.keras.models.Sequential()
model_1.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(param_list["NUM_1_NEURONS"])))
'''
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.RepeatVector(param_list["FUTURE_TARGET"]))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(param_list["NUM_2_NEURONS"], return_sequences=True)))
model.add(tf.keras.layers.Dropout(0.1))
model.add(keras.layers.TimeDistributed(tf.keras.layers.Dense(len(encoder.categories_[0]), activation="softmax")))
'''
model_1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [48]:
result_1 = model_1.predict(x_test[0].reshape(1, 16, 5))
result_1, result_1.shape

(array([[-0.0614571 ,  0.04286239,  0.09877291,  0.00537634,  0.07741823,
         -0.07921369, -0.1213546 ,  0.06493104,  0.05055659,  0.08410605,
          0.00962282, -0.01917225, -0.00697964,  0.03229579,  0.12848236,
         -0.03197474,  0.00627654, -0.10345893, -0.00950697,  0.10500506,
         -0.0647539 ,  0.07930844,  0.0372098 , -0.0008913 ,  0.1151884 ,
          0.00179977,  0.11090717,  0.03821872,  0.01609948, -0.01904089,
          0.00039658,  0.01559336, -0.06325941,  0.02163648,  0.0958841 ,
          0.08016168,  0.01608705,  0.04657632, -0.10204333, -0.00924805,
         -0.05625045,  0.04965116, -0.0600381 , -0.02039548,  0.07882448,
          0.04489338,  0.02653247, -0.05445531, -0.08821502,  0.11298406,
         -0.04620604,  0.01812863,  0.01889674,  0.06423473,  0.06163333,
         -0.03357677, -0.07878125,  0.04681071, -0.08930261,  0.04560812,
          0.04589961,  0.04168834, -0.00599837,  0.07009937,  0.03474264,
          0.02559287, -0.00345814,  0.

In [41]:
model_2 = tf.keras.models.Sequential()
model_2.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(param_list["NUM_1_NEURONS"])))
#model.add(tf.keras.layers.Dropout(0.1))
model_2.add(tf.keras.layers.RepeatVector(param_list["FUTURE_TARGET"]))
'''
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(param_list["NUM_2_NEURONS"], return_sequences=True)))
model.add(tf.keras.layers.Dropout(0.1))
model.add(keras.layers.TimeDistributed(tf.keras.layers.Dense(len(encoder.categories_[0]), activation="softmax")))
'''
model_2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [42]:
result_2 = model_2.predict(x_test[0].reshape(1, 16, 5))
result_2, result_2.shape

(array([[[ 0.08522998,  0.00066322,  0.00604119, ..., -0.06457821,
          -0.03525865,  0.05164128],
         [ 0.08522998,  0.00066322,  0.00604119, ..., -0.06457821,
          -0.03525865,  0.05164128],
         [ 0.08522998,  0.00066322,  0.00604119, ..., -0.06457821,
          -0.03525865,  0.05164128],
         ...,
         [ 0.08522998,  0.00066322,  0.00604119, ..., -0.06457821,
          -0.03525865,  0.05164128],
         [ 0.08522998,  0.00066322,  0.00604119, ..., -0.06457821,
          -0.03525865,  0.05164128],
         [ 0.08522998,  0.00066322,  0.00604119, ..., -0.06457821,
          -0.03525865,  0.05164128]]], dtype=float32),
 (1, 8, 128))

In [50]:
model_3 = tf.keras.models.Sequential()
model_3.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(param_list["NUM_1_NEURONS"])))
#model.add(tf.keras.layers.Dropout(0.1))
model_3.add(tf.keras.layers.RepeatVector(param_list["FUTURE_TARGET"]))
model_3.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(param_list["NUM_2_NEURONS"], return_sequences=True)))
#model.add(tf.keras.layers.Dropout(0.1))
#model.add(keras.layers.TimeDistributed(tf.keras.layers.Dense(len(encoder.categories_[0]), activation="softmax")))
model_3.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [51]:
result_3 = model_3.predict(x_test[0].reshape(1, 16, 5))
result_3, result_3.shape

(array([[[ 0.00244632, -0.00456948,  0.01194109, ...,  0.01630096,
           0.03530882, -0.03044449],
         [ 0.00603336, -0.0065718 ,  0.02030258, ...,  0.01648008,
           0.03366842, -0.02884687],
         [ 0.01008894, -0.00725893,  0.02606528, ...,  0.0166139 ,
           0.03155206, -0.02692162],
         ...,
         [ 0.02121945, -0.00741401,  0.0342004 , ...,  0.01504393,
           0.02093054, -0.01823865],
         [ 0.02403621, -0.00761928,  0.03521886, ...,  0.01261888,
           0.01536341, -0.01377663],
         [ 0.02636102, -0.00798908,  0.0358013 , ...,  0.00803431,
           0.00844396, -0.00793399]]], dtype=float32),
 (1, 8, 128))

In [45]:
model_4 = tf.keras.models.Sequential()
model_4.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(param_list["NUM_1_NEURONS"])))
#model.add(tf.keras.layers.Dropout(0.1))
model_4.add(tf.keras.layers.RepeatVector(param_list["FUTURE_TARGET"]))
model_4.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(param_list["NUM_2_NEURONS"], return_sequences=True)))
#model.add(tf.keras.layers.Dropout(0.1))
model_4.add(keras.layers.TimeDistributed(tf.keras.layers.Dense(len(encoder.categories_[0]), activation="softmax")))
model_4.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [46]:
result_4 = model_4.predict(x_test[0].reshape(1, 16, 5))
result_4, result_4.shape

(array([[[0.19121872, 0.20027323, 0.21393429, 0.20777583, 0.18679796],
         [0.19202779, 0.20250033, 0.21510379, 0.20329042, 0.18707766],
         [0.1924712 , 0.20397352, 0.21620464, 0.19970706, 0.18764362],
         [0.19282226, 0.20494585, 0.2170467 , 0.19666614, 0.18851909],
         [0.19328287, 0.20562245, 0.21746068, 0.19388086, 0.18975314],
         [0.19403245, 0.2061691 , 0.21726114, 0.19109827, 0.19143906],
         [0.19526736, 0.20671973, 0.21620859, 0.18806872, 0.19373558],
         [0.19723672, 0.2073801 , 0.21396603, 0.18452017, 0.19689704]]],
       dtype=float32),
 (1, 8, 5))