In [11]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
import json
import datetime

# Wide default canvas for any figure drawn in this notebook.
plt.rcParams["figure.figsize"] = (20, 5)

# Ask TensorFlow to grow GPU memory on demand. Looping over the device list
# (instead of indexing [0]) keeps this cell runnable on a CPU-only machine,
# where the list is empty, and applies the same setting to every GPU.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

In [12]:
# One timestamp identifies this run everywhere: the TensorBoard log folder and
# the "version/<timestamp>" folder that later cells write params, the model
# and the metrics into.
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + timestamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
version_dir = "version/" + timestamp

# exist_ok keeps the cell from failing if the folder is already present.
os.makedirs(version_dir, exist_ok=True)
timestamp

'20200818-002519'

In [13]:
# Dataset identifier; substituted into the "data/{}_train_set.csv" and
# "data/{}_test_set.csv" paths read by later cells.
dataset_name = "SEG"

In [14]:
# Load the training split for the selected dataset and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved index; only "tokenized_data" is used by the following cells.
train_csv_path = "data/{}_train_set.csv".format(dataset_name)
dataset = pd.read_csv(train_csv_path)
dataset.head()

Unnamed: 0,t,t+1,delta,tokenized_data
0,93220738952,93220738952,0,0
1,93220738952,93220738952,0,0
2,93220738952,93220738952,0,0
3,93220738952,93220738952,0,0
4,93220738952,93220738952,0,0


In [15]:
# Hyper-parameters for this run, collected in one place so they can be
# persisted next to the trained model.
param_list = {
    "PAST_HISTORY": 16,      # time steps fed to the model per window
    "FUTURE_TARGET": 8,      # time steps the model predicts per window
    "BATCH_SIZE": 1024,
    "EPOCHS": 500,
    "BUFFER_SIZE": 200000,
    "NUM_1_NEURONS": 128,    # units of the first (encoder) LSTM
    "NUM_2_NEURONS": 64,     # units of the second (decoder) LSTM
}

# Snapshot the parameters into this run's version folder.
with open("version/{}/params.json".format(timestamp), "w") as params_file:
    json.dump(param_list, params_file, indent=4)

In [16]:
def generate_timeseries(dataset, start_index, end_index, history_size, target_size, n_features):
    """Cut a sequence into (history window, future window) training pairs.

    For every position ``i`` in ``[start_index + history_size, end_index)`` the
    ``history_size`` rows before ``i`` become one input sample and the
    ``target_size`` rows starting at ``i`` become its label.

    Args:
        dataset: Sliceable sequence (e.g. a NumPy array) of time steps along
            axis 0.
        start_index: Index of the first row that may appear in a history
            window. Expected to be non-negative.
        end_index: Exclusive upper bound for ``i``. ``None`` means
            ``len(dataset) - target_size``. Larger values are clamped to that
            bound so every label window is complete.
        history_size: Number of past time steps per input sample.
        target_size: Number of future time steps per label.
        n_features: Feature width each history window is reshaped to.

    Returns:
        Tuple ``(data, labels)`` of NumPy arrays. ``data`` has shape
        ``(n_samples, history_size, n_features)``; ``labels`` stacks the
        ``target_size``-long slices of ``dataset``.
    """
    # Past this bound dataset[i:i+target_size] would be shorter than
    # target_size, producing ragged labels.
    last_valid_end = len(dataset) - target_size
    if end_index is None:
        end_index = last_valid_end
    else:
        end_index = min(end_index, last_valid_end)

    first_target = start_index + history_size

    data = []
    labels = []
    for i in range(first_target, end_index):
        history_window = dataset[i - history_size:i]
        # Reshape from (history_size,) or (history_size, n_features) to a
        # uniform (history_size, n_features).
        data.append(np.reshape(history_window, (history_size, n_features)))
        labels.append(dataset[i:i + target_size])
    return np.array(data), np.array(labels)

In [17]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the token column. The encoder learns its categories from the
# training data here and is reused later for the test data.
train_tokens = dataset["tokenized_data"].values.reshape(-1, 1)
encoder = OneHotEncoder(dtype=np.float32)
encoded_data = encoder.fit_transform(train_tokens)
encoded_data[0], encoder.categories_

(<1x87 sparse matrix of type '<class 'numpy.float32'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 [array([-909517620, -515913384,    -192512,     -61440,     -32768,
             -20480,     -16384,     -12288,      -8304,      -8192,
              -5888,      -4096,      -4040,      -2884,      -2744,
               -432,       -240,        -48,        -24,        -16,
                -12,         -8,         -4,         -1,          0,
                  2,          4,          6,          8,         16,
                 24,         28,         32,         36,         44,
                 48,         56,         64,         76,        240,
                256,        432,       2416,       3072,       3520,
               3856,       3904,       3936,       4080,       4092,
               4096,       4104,       4112,       4128,       5280,
               6144,       8192,      11776,      12288,      16384,
              17664,      20480,      24576,      28672,  

In [18]:
# Slice the dense one-hot matrix into (history window, future window) pairs.
x_train, y_train = generate_timeseries(
    dataset=encoded_data.toarray(),
    start_index=0,
    end_index=None,
    history_size=param_list["PAST_HISTORY"],
    target_size=param_list["FUTURE_TARGET"],
    n_features=len(encoder.categories_[0]),
)
x_train.shape, y_train.shape

((176109, 16, 87), (176109, 8, 87))

In [19]:
# Sequence-to-sequence classifier: a bidirectional LSTM summarises the history
# window, RepeatVector repeats that summary once per future step, a second
# bidirectional LSTM decodes it, and a per-step softmax picks the token class.
model = tf.keras.models.Sequential([
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(param_list["NUM_1_NEURONS"])),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.RepeatVector(param_list["FUTURE_TARGET"]),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(param_list["NUM_2_NEURONS"], return_sequences=True)
    ),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(len(encoder.categories_[0]), activation="softmax")
    ),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [20]:
# Train with a 20% validation split. The TensorBoard callback created
# alongside log_dir is passed in so the run is actually logged; without it
# nothing is ever written under log_dir.
model_history = model.fit(
    x_train,
    y_train,
    batch_size=param_list["BATCH_SIZE"],
    validation_split=0.2,
    epochs=param_list["EPOCHS"],
    callbacks=[tensorboard_callback],
)
# Persist the trained model next to this run's parameters.
model.save("version/{}/model.h5".format(timestamp))

och 364/500
Epoch 365/500
Epoch 366/500
Epoch 367/500
Epoch 368/500
Epoch 369/500
Epoch 370/500
Epoch 371/500
Epoch 372/500
Epoch 373/500
Epoch 374/500
Epoch 375/500
Epoch 376/500
Epoch 377/500
Epoch 378/500
Epoch 379/500
Epoch 380/500
Epoch 381/500
Epoch 382/500
Epoch 383/500
Epoch 384/500
Epoch 385/500
Epoch 386/500
Epoch 387/500
Epoch 388/500
Epoch 389/500
Epoch 390/500
Epoch 391/500
Epoch 392/500
Epoch 393/500
Epoch 394/500
Epoch 395/500
Epoch 396/500
Epoch 397/500
Epoch 398/500
Epoch 399/500
Epoch 400/500
Epoch 401/500
Epoch 402/500
Epoch 403/500
Epoch 404/500
Epoch 405/500
Epoch 406/500
Epoch 407/500
Epoch 408/500
Epoch 409/500
Epoch 410/500
Epoch 411/500
Epoch 412/500
Epoch 413/500
Epoch 414/500
Epoch 415/500
Epoch 416/500
Epoch 417/500
Epoch 418/500
Epoch 419/500
Epoch 420/500
Epoch 421/500
Epoch 422/500
Epoch 423/500
Epoch 424/500
Epoch 425/500
Epoch 426/500
Epoch 427/500
Epoch 428/500
Epoch 429/500
Epoch 430/500
Epoch 431/500
Epoch 432/500
Epoch 433/500
Epoch 434/500
Epoch 43

In [21]:
# Load the held-out test split of the same dataset and preview the first rows.
test_csv_path = "data/{}_test_set.csv".format(dataset_name)
test_dataset = pd.read_csv(test_csv_path)
test_dataset.head()

Unnamed: 0,t,t+1,delta,tokenized_data
0,92892336072,92805776480,-86559592,-1
1,92805776480,92805776656,176,-1
2,92805776656,92806873264,1096608,-1
3,92806873264,92806868361,-4903,-1
4,92806868361,92806801536,-66825,-1


In [37]:
# Encode the test tokens with the encoder fitted on the training data, so each
# token maps to the same one-hot column the model was trained on. A separate,
# unfitted encoder would not share that mapping and is not needed.
encoded_test_data = encoder.transform(test_dataset["tokenized_data"].values.reshape(-1, 1))
encoded_test_data[0], encoder.categories_

(<1x87 sparse matrix of type '<class 'numpy.float32'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 [array([-909517620, -515913384,    -192512,     -61440,     -32768,
             -20480,     -16384,     -12288,      -8304,      -8192,
              -5888,      -4096,      -4040,      -2884,      -2744,
               -432,       -240,        -48,        -24,        -16,
                -12,         -8,         -4,         -1,          0,
                  2,          4,          6,          8,         16,
                 24,         28,         32,         36,         44,
                 48,         56,         64,         76,        240,
                256,        432,       2416,       3072,       3520,
               3856,       3904,       3936,       4080,       4092,
               4096,       4104,       4112,       4128,       5280,
               6144,       8192,      11776,      12288,      16384,
              17664,      20480,      24576,      28672,  

In [39]:
# Window the test data with the same history/target lengths used for training
# (read from param_list rather than repeated as literals, so the two cannot
# drift apart).
x_test, y_test = generate_timeseries(
    encoded_test_data.toarray(),
    0,
    None,
    param_list["PAST_HISTORY"],
    param_list["FUTURE_TARGET"],
    len(encoder.categories_[0]),
)

In [40]:
# Sanity check: shape of one test window, (history steps, one-hot width).
x_test[0].shape

(16, 87)

In [41]:
# Eyeball the first test window; each row is one one-hot encoded time step.
x_test[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [42]:
# Flatten the one-hot labels into a single list of class indices: window by
# window, and within each window one index per future time step.
y_true = [
    class_index
    for label_window in y_test
    for class_index in np.argmax(label_window, axis=1).tolist()
]

In [43]:
# Predict every test window in batched forward passes. Calling model.predict
# once per window repeats the per-call overhead for each of the test samples
# and also hard-coded the window length.
predicted_probs = model.predict(x_test, batch_size=param_list["BATCH_SIZE"])
# (windows, future steps, classes) -> flat list of class indices, window by
# window and step by step, i.e. the same order in which y_true is built.
y_pred = np.argmax(predicted_probs, axis=-1).ravel().tolist()

In [44]:
from sklearn.metrics import accuracy_score

# Flat per-time-step accuracy over every predicted step of every test window.
accuracy = accuracy_score(y_true, y_pred)
# float() accepts both a NumPy scalar and a plain Python float, whereas
# `.tolist()` exists only on NumPy scalars; the written text is the same.
with open("version/{}/accuracy.txt".format(timestamp), "w") as t:
    t.write(str(float(accuracy)))

accuracy

0.9018202497997853

In [45]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 as a table with one row per class (plus
# the aggregate rows), saved alongside this run's other artifacts.
report_dict = classification_report(y_true, y_pred, output_dict=True)
report = pd.DataFrame(report_dict).T
report.to_csv("version/{}/report.csv".format(timestamp))
print(report)

precision    recall  f1-score       support
0              0.000000  0.000000  0.000000       0.00000
2              0.964912  0.859375  0.909091     128.00000
3              0.969697  0.857143  0.909953     112.00000
4              0.817460  0.715278  0.762963     144.00000
5              1.000000  0.750000  0.857143     128.00000
...                 ...       ...       ...           ...
83             0.750000  0.573529  0.650000     136.00000
84             0.880597  0.460938  0.605128     128.00000
accuracy       0.901820  0.901820  0.901820       0.90182
macro avg      0.539198  0.362106  0.416084  469496.00000
weighted avg   0.877072  0.901820  0.884304  469496.00000

[87 rows x 4 columns]
