In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
import json
import datetime

# Default size (width, height in inches) for every matplotlib figure.
plt.rcParams["figure.figsize"] = (20, 5)

# Let TensorFlow allocate GPU memory on demand instead of reserving it all.
# Looping over the device list (rather than indexing [0]) keeps this cell
# runnable on CPU-only machines, where the list is empty, and applies the
# same setting to every GPU when more than one is present.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

In [2]:
# A single timestamp identifies this run: it names both the TensorBoard log
# directory and the directory that receives this run's saved artifacts.
run_started_at = datetime.datetime.now()
timestamp = run_started_at.strftime("%Y%m%d-%H%M%S")

# Per-run artifact directory.
version_dir = "version/" + timestamp
os.makedirs(version_dir)

# TensorBoard callback writing under logs/fit/<timestamp>.
log_dir = "logs/fit/" + timestamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

timestamp

'20200815-165215'

In [3]:
# Load the training CSV and preview its first rows.
train_csv_path = "data/NU_train_set.csv"
dataset = pd.read_csv(train_csv_path)
dataset.head()

Unnamed: 0,t,t+1,delta,tokenized_data
0,104289271808,104282980352,-6291456,-6291456
1,104282980352,104282984448,4096,4096
2,104282984448,104282988544,4096,4096
3,104282988544,104282992640,4096,4096
4,104282992640,104282996736,4096,4096


In [4]:
# Hyper-parameters for this run, written as JSON next to the run's other
# artifacts so the configuration can be looked up later.
param_list = {
    "PAST_HISTORY": 16,     # steps in each input window
    "FUTURE_TARGET": 8,     # steps in each label window
    "BATCH_SIZE": 1024,
    "EPOCHS": 250,
    "BUFFER_SIZE": 200000,  # only referenced by commented-out shuffle code in this notebook
}

params_path = "version/{}/params.json".format(timestamp)
with open(params_path, "w") as params_file:
    json.dump(param_list, params_file, indent=4)

In [5]:
def generate_timeseries(dataset, start_index, end_index, history_size, target_size):
    """Cut a sequence into sliding (history, target) window pairs.

    For every position ``i`` in ``[start_index + history_size, end_index)``
    one sample is produced: the ``history_size`` rows immediately before
    ``i`` as input and the ``target_size`` rows starting at ``i`` as label.

    Args:
        dataset: Array-like of shape ``(n_steps,)`` or
            ``(n_steps, n_features)``.
        start_index: Index of the first row allowed inside a history window.
        end_index: Exclusive upper bound for ``i``. ``None`` selects
            ``len(dataset) - target_size``, the largest bound for which every
            label window is complete. If an explicit value is passed, the
            caller is responsible for keeping label windows inside the data.
        history_size: Number of rows in each input window.
        target_size: Number of rows in each label window.

    Returns:
        Tuple ``(data, labels)`` of NumPy arrays. ``data`` has shape
        ``(n_samples, history_size, n_features)``; ``labels`` stacks the
        label windows sliced from ``dataset``.
    """
    dataset = np.asarray(dataset)
    # Infer the feature count from the data instead of hard-coding it;
    # a 1-D series is treated as having a single feature.
    n_features = int(np.prod(dataset.shape[1:]))

    data = []
    labels = []

    # The first usable position needs a full history window before it.
    start_index = start_index + history_size
    if end_index is None:
        end_index = len(dataset) - target_size

    for i in range(start_index, end_index):
        window = dataset[i - history_size:i]
        data.append(np.reshape(window, (history_size, n_features)))
        labels.append(dataset[i:i + target_size])
    return np.array(data), np.array(labels)

In [6]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the tokenized column: each distinct token value becomes one
# column of the (sparse) encoded matrix.
token_column = dataset["tokenized_data"].values.reshape(-1, 1)
encoder = OneHotEncoder(dtype=np.float32)   #dtype=np.int64
encoded_data = encoder.fit_transform(token_column)
encoded_data[0], encoder.categories_

(<1x5 sparse matrix of type '<class 'numpy.float32'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 [array([-6291456,       -1,        0,     4096,  2097152], dtype=int64)])

import joblib

# Persist the fitted encoder to disk.
encoder_path = "data/encoder.pkl"
joblib.dump(encoder, encoder_path)

In [7]:
# Turn the dense one-hot matrix into sliding windows:
# PAST_HISTORY steps as input, the following FUTURE_TARGET steps as label.
x_train, y_train = generate_timeseries(
    dataset=encoded_data.toarray(),
    start_index=0,
    end_index=None,
    history_size=param_list["PAST_HISTORY"],
    target_size=param_list["FUTURE_TARGET"],
)

#train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
#train_data = train_data.cache().shuffle(param_list["BUFFER_SIZE"]).batch(param_list["BATCH_SIZE"])

In [8]:
# Sanity check: input windows are (n_samples, history_size, n_classes).
x_train.shape

(47165, 16, 5)

In [9]:
# Sanity check: label windows are (n_samples, target_size, n_classes).
y_train.shape

(47165, 8, 5)

# Baseline variant: a single LSTM encoder whose output vector is repeated
# once per forecast step and classified per step with a softmax layer.
# NOTE(review): `model` is reassigned by the following cell (In [10]) before
# model.fit is called, so this variant is not the one trained below.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.LSTM(5))  #x_train.shape[-2:] , input_shape=[16, 5] , return_sequences=True
#model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.RepeatVector(8))
model.add(keras.layers.TimeDistributed(tf.keras.layers.Dense(5, activation="softmax")))
#model.add(tf.keras.layers.Flatten())
#model.add(tf.keras.layers.Dense(5, activation="softmax"))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [10]:
# Encoder-decoder sequence classifier:
#   BiLSTM encoder -> repeat the encoded vector once per forecast step ->
#   BiLSTM decoder (return_sequences=True) -> per-step softmax over classes.
# The number of forecast steps and the number of classes are taken from the
# run parameters and the label array instead of being hard-coded, so the
# output shape always matches y_train when those settings change.
n_forecast_steps = param_list["FUTURE_TARGET"]
n_classes = y_train.shape[-1]

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(5)))  #x_train.shape[-2:] , input_shape=[16, 5] , return_sequences=True
model.add(tf.keras.layers.RepeatVector(n_forecast_steps))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(5, return_sequences=True)))
model.add(keras.layers.TimeDistributed(tf.keras.layers.Dense(n_classes, activation="softmax")))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [11]:
# Train with 20% of the samples held out for validation, then save the model
# into this run's artifact directory.
# NOTE(review): tensorboard_callback is created earlier in the notebook but is
# not passed to fit() here — confirm whether logging was intended.
model_history = model.fit(
    x_train,
    y_train,
    batch_size=param_list["BATCH_SIZE"],
    epochs=param_list["EPOCHS"],
    validation_split=0.2,
)
model_path = "version/{}/model.h5".format(timestamp)
model.save(model_path)

Epoch 113/250
Epoch 114/250
Epoch 115/250
Epoch 116/250
Epoch 117/250
Epoch 118/250
Epoch 119/250
Epoch 120/250
Epoch 121/250
Epoch 122/250
Epoch 123/250
Epoch 124/250
Epoch 125/250
Epoch 126/250
Epoch 127/250
Epoch 128/250
Epoch 129/250
Epoch 130/250
Epoch 131/250
Epoch 132/250
Epoch 133/250
Epoch 134/250
Epoch 135/250
Epoch 136/250
Epoch 137/250
Epoch 138/250
Epoch 139/250
Epoch 140/250
Epoch 141/250
Epoch 142/250
Epoch 143/250
Epoch 144/250
Epoch 145/250
Epoch 146/250
Epoch 147/250
Epoch 148/250
Epoch 149/250
Epoch 150/250
Epoch 151/250
Epoch 152/250
Epoch 153/250
Epoch 154/250
Epoch 155/250
Epoch 156/250
Epoch 157/250
Epoch 158/250
Epoch 159/250
Epoch 160/250
Epoch 161/250
Epoch 162/250
Epoch 163/250
Epoch 164/250
Epoch 165/250
Epoch 166/250
Epoch 167/250
Epoch 168/250
Epoch 169/250
Epoch 170/250
Epoch 171/250
Epoch 172/250
Epoch 173/250
Epoch 174/250
Epoch 175/250
Epoch 176/250
Epoch 177/250
Epoch 178/250
Epoch 179/250
Epoch 180/250
Epoch 181/250
Epoch 182/250
Epoch 183/250
Epoch 

In [12]:
# Load the held-out test CSV and preview its first rows.
test_csv_path = "data/NU_test_set.csv"
test_dataset = pd.read_csv(test_csv_path)
test_dataset.head()

Unnamed: 0,t,t+1,delta,tokenized_data
0,103591821312,103591825408,4096,4096
1,103591825408,103591829504,4096,4096
2,103591829504,103591833600,4096,4096
3,103591833600,103591837696,4096,4096
4,103591837696,103591841792,4096,4096


In [13]:
# Encode the test tokens with the encoder that was fitted on the TRAINING
# data. Fitting a second encoder on the test set would derive the class
# columns from whatever tokens happen to occur there, so column indices could
# silently disagree with the ones the model was trained on. With the shared
# encoder, a token unseen during training raises an error instead.
# `test_encoder` is kept as a name because later cells reference it.
test_encoder = encoder
test_token_column = test_dataset["tokenized_data"].values.reshape(-1, 1)
encoded_test_data = test_encoder.transform(test_token_column)
encoded_test_data[0], test_encoder.categories_

(<1x5 sparse matrix of type '<class 'numpy.float32'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 [array([-6291456,       -1,        0,     4096,  2097152], dtype=int64)])

In [68]:
# Window the test data with the same history/target sizes as the training
# data (read from param_list rather than repeating the literals 16 and 8).
x_test, y_test = generate_timeseries(
    encoded_test_data.toarray(),
    0,
    None,
    param_list["PAST_HISTORY"],
    param_list["FUTURE_TARGET"],
)

In [58]:
# Flatten the one-hot label windows into one list of class indices,
# window by window and step by step.
y_true = [
    class_index
    for label_window in y_test
    for class_index in np.argmax(label_window, axis=1).tolist()
]

In [60]:
# Predict all test windows in one batched call instead of calling
# model.predict once per sample in a Python loop, which pays the per-call
# overhead thousands of times.
# The result has shape (n_samples, n_steps, n_classes); argmax over the last
# axis followed by ravel() yields the same window-by-window, step-by-step
# ordering that the per-sample loop produced.
predicted_probabilities = model.predict(x_test, batch_size=param_list["BATCH_SIZE"])
y_pred = np.argmax(predicted_probabilities, axis=-1).ravel().tolist()

In [72]:
# Class probabilities for the first test window; the reshape adds the leading
# batch axis that predict() expects.
first_window = x_test[0].reshape(1, 16, 5)
model.predict(first_window)

array([[[2.9187178e-04, 1.0927758e-03, 3.3078957e-04, 9.9801874e-01,
         2.6592493e-04],
        [2.5373511e-04, 8.9197798e-04, 2.7462136e-04, 9.9834847e-01,
         2.3117388e-04],
        [3.6898401e-04, 9.1667345e-04, 4.0495221e-04, 9.9796343e-01,
         3.4592798e-04],
        [5.6339521e-04, 9.4378972e-04, 6.2860060e-04, 9.9732000e-01,
         5.4424530e-04],
        [8.0418430e-04, 9.3528023e-04, 9.0130442e-04, 9.9656588e-01,
         7.9331436e-04],
        [1.0053831e-03, 8.7055092e-04, 1.1066241e-03, 9.9602121e-01,
         9.9636358e-04],
        [1.0566743e-03, 7.8139035e-04, 1.0919169e-03, 9.9604708e-01,
         1.0229854e-03],
        [1.2105748e-03, 1.2207702e-03, 1.0832353e-03, 9.9538875e-01,
         1.0966696e-03]]], dtype=float32)

In [74]:
# Most likely class index at each forecast step of the first test window.
first_window_probabilities = model.predict(x_test[0].reshape(1, 16, 5))
np.argmax(first_window_probabilities[0], axis=1)

array([3, 3, 3, 3, 3, 3, 3, 3], dtype=int64)

In [81]:
# Map the predicted per-step probabilities of the first test window back to
# the original token values.
first_window_step_probabilities = model.predict(x_test[0].reshape(1, 16, 5))[0]
test_encoder.inverse_transform(first_window_step_probabilities)

array([[4096],
       [4096],
       [4096],
       [4096],
       [4096],
       [4096],
       [4096],
       [4096]], dtype=int64)

In [73]:
# Column dtypes, non-null counts and memory footprint of the test frame.
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10502 entries, 0 to 10501
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   t               10502 non-null  int64
 1   t+1             10502 non-null  int64
 2   delta           10502 non-null  int64
 3   tokenized_data  10502 non-null  int64
dtypes: int64(4)
memory usage: 328.3 KB


In [61]:
from sklearn.metrics import accuracy_score

# Step-level accuracy over all forecast steps of all test windows, stored
# with this run's other artifacts.
accuracy = accuracy_score(y_true, y_pred)

# float() works whether accuracy_score returns a plain Python float or a
# NumPy scalar; .tolist() exists only on the latter and would raise
# AttributeError on a plain float.
with open("version/{}/accuracy.txt".format(timestamp), "w") as accuracy_file:
    accuracy_file.write(str(float(accuracy)))

accuracy

0.9850520137430807

In [62]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 as a text table; rows are class indices.
report = classification_report(y_true=y_true, y_pred=y_pred)
report

'              precision    recall  f1-score   support\n\n           0       0.15      0.25      0.19        24\n           1       0.00      0.00      0.00        80\n           2       0.40      0.02      0.03      1080\n           3       0.99      1.00      0.99     82616\n           4       0.07      0.17      0.10        24\n\n    accuracy                           0.99     83824\n   macro avg       0.32      0.29      0.26     83824\nweighted avg       0.98      0.99      0.98     83824\n'

In [63]:
# print() renders the embedded newlines so the report reads as a table.
print(report)

precision    recall  f1-score   support

           0       0.15      0.25      0.19        24
           1       0.00      0.00      0.00        80
           2       0.40      0.02      0.03      1080
           3       0.99      1.00      0.99     82616
           4       0.07      0.17      0.10        24

    accuracy                           0.99     83824
   macro avg       0.32      0.29      0.26     83824
weighted avg       0.98      0.99      0.98     83824



In [64]:
# Re-run inference to confirm the predictions are reproducible.
# One batched predict call replaces the per-sample Python loop; argmax over
# the class axis followed by ravel() keeps the window-by-window,
# step-by-step ordering used for y_true.
recheck_probabilities = model.predict(x_test, batch_size=param_list["BATCH_SIZE"])
y_pred_recheck = np.argmax(recheck_probabilities, axis=-1).ravel().tolist()

In [65]:
# Accuracy of the second prediction pass, for comparison with `accuracy`.
accuracy_recheck = accuracy_score(y_true, y_pred_recheck)
accuracy_recheck

0.9850520137430807

In [67]:
# Per-class report for the second prediction pass, for comparison with `report`.
report_recheck = classification_report(y_true, y_pred_recheck)
print(report_recheck)

precision    recall  f1-score   support

           0       0.15      0.25      0.19        24
           1       0.00      0.00      0.00        80
           2       0.40      0.02      0.03      1080
           3       0.99      1.00      0.99     82616
           4       0.07      0.17      0.10        24

    accuracy                           0.99     83824
   macro avg       0.32      0.29      0.26     83824
weighted avg       0.98      0.99      0.98     83824

