In [2]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
import json
import datetime

# Default matplotlib figure size as (width, height) in inches.
plt.rcParams["figure.figsize"] = (20, 5)

# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all
# at start-up. Iterating over the device list (rather than indexing [0]) keeps
# this cell runnable on CPU-only machines, where the list is empty, and applies
# the same setting to every visible GPU.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

In [3]:
# A single run identifier (second resolution) shared by the TensorBoard log
# directory and the directory the trained model is saved into.
run_started_at = datetime.datetime.now()
timestamp = run_started_at.strftime("%Y%m%d-%H%M%S")

# Per-run output directory; created right away.
version_dir = "version/" + timestamp
os.makedirs(version_dir)

# TensorBoard callback logging under logs/fit/<timestamp>, with weight
# histograms computed every epoch (histogram_freq=1).
log_dir = "logs/fit/" + timestamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

In [4]:
# Load the training table; only its "tokenized_data" column is used by the
# visible cells further down.
train_csv_path = "data/NU_train_set.csv"
dataset = pd.read_csv(train_csv_path)
dataset.head()

Unnamed: 0,t,t+1,delta,tokenized_data
0,105950216192,105943924736,-6291456,-6291456
1,105943924736,105946021888,2097152,2097152
2,105946021888,105939873792,-6148096,-1
3,105939873792,105941845296,1971504,-1
4,105941845296,105935536128,-6309168,-1


In [5]:
# Hyper-parameters for windowing and training, collected in one place.
param_list = {
    # Number of past time steps fed to the model per sample (history_size).
    "PAST_HISTORY": 16,
    # Number of future time steps predicted per sample (target_size).
    "FUTURE_TARGET": 8,
    # End index of the training range passed to generate_timeseries.
    "TRAIN_SPLIT": 40000,
    # Samples per batch for the tf.data pipeline.
    "BATCH_SIZE": 256,
    # Not referenced by the visible cells (fit uses a literal epoch count).
    "EPOCHS": 100,
    # Shuffle buffer size for the tf.data pipeline.
    "BUFFER_SIZE": 200000,
    # Not referenced by the visible cells.
    "EVALUATION_INTERVAL": 300,
    # Not referenced by the visible cells.
    "VAL_STEPS": 50,
}

In [86]:
def generate_timeseries(dataset, start_index, end_index, history_size, target_size):
    """Cut a sequence into sliding (history, target) windows.

    For every anchor position ``i`` in ``[start_index + history_size, end_index)``
    one sample is produced: the ``history_size`` steps immediately before ``i``
    form the input, and the ``target_size`` steps starting at ``i`` form the label.

    Args:
        dataset: Dense array-like of shape ``(n_steps,)`` or
            ``(n_steps, n_features)``. The feature count is taken from the
            input rather than being fixed.
        start_index: Index of the first step that may appear in a history window.
        end_index: Exclusive upper bound for the anchor position, or ``None``
            to use every anchor that still has a full target window. An explicit
            value is clamped to ``len(dataset) - target_size`` so that all label
            windows have the same length.
        history_size: Number of steps in each input window.
        target_size: Number of steps in each label window.

    Returns:
        Tuple ``(data, labels)`` of NumPy arrays. ``data`` has shape
        ``(n_samples, history_size, n_features)`` (``n_features`` is 1 for a
        1-D input). ``labels`` has shape ``(n_samples, target_size)`` followed
        by the trailing dimensions of ``dataset``. When no anchor fits, both
        arrays have ``n_samples == 0`` but keep their remaining dimensions.

    Raises:
        ValueError: If ``dataset`` cannot be viewed as an array with at least
            one dimension (for example a scalar or a SciPy sparse matrix).
    """
    values = np.asarray(dataset)
    if values.ndim == 0:
        raise ValueError("dataset must be a dense array-like with at least one dimension")

    # Scalars per time step: 1 for a 1-D series, otherwise the product of the
    # trailing dimensions.
    n_features = int(np.prod(values.shape[1:]))

    first_anchor = start_index + history_size
    # Last anchor (exclusive) for which a complete target window still exists.
    last_anchor = len(values) - target_size
    if end_index is None:
        stop_anchor = last_anchor
    else:
        stop_anchor = min(end_index, last_anchor)
    # max() keeps the range empty (instead of invalid) when nothing fits.
    anchors = np.arange(first_anchor, max(first_anchor, stop_anchor))

    # Row k of each index matrix holds the step indices of sample k's window.
    history_idx = anchors[:, np.newaxis] + np.arange(-history_size, 0)
    target_idx = anchors[:, np.newaxis] + np.arange(target_size)

    data = values[history_idx].reshape((len(anchors), history_size, n_features))
    labels = values[target_idx]
    return data, labels

In [108]:
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder expects a 2-D (n_samples, n_features) input, so the token
# column is first turned into a single-feature column vector.
token_column = dataset["tokenized_data"].values.reshape(-1, 1)

# Fit on the full token column; the result is a sparse matrix with one
# column per distinct token value (see encoder.categories_).
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(token_column)
encoded_data[0], encoder.categories_

(<1x5 sparse matrix of type '<class 'numpy.float64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 [array([-6291456,       -1,        0,     4096,  2097152], dtype=int64)])

In [48]:
import joblib

# Persist the fitted encoder next to the data so the same token-to-column
# mapping can be reloaded later.
encoder_path = "data/encoder.pkl"
joblib.dump(encoder, encoder_path)

['data/encoder.pkl']

In [109]:
# Build the sliding windows from the dense one-hot matrix (not from the raw
# token column), so that both inputs and labels are one-hot rows as the model's
# softmax output and categorical cross-entropy loss require.
x_train, y_train = generate_timeseries(
    encoded_data.toarray(),
    0,
    param_list["TRAIN_SPLIT"],
    param_list["PAST_HISTORY"],
    param_list["FUTURE_TARGET"],
)

# Cache the windows in memory, shuffle them, then group them into batches.
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_data = train_data.cache().shuffle(param_list["BUFFER_SIZE"]).batch(param_list["BATCH_SIZE"])

In [88]:
x_train.shape

(39984, 16, 5)

In [89]:
y_train.shape

(39984, 8, 5)

In [154]:
x_train[0], x_train[0].shape, y_train[0]

(array([[1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.]]),
 (16, 5),
 array([[1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1.]]))

In [169]:
# Baseline sequence-to-sequence classifier. No input shape is declared, so
# Keras builds the weights on the first call.
model = tf.keras.models.Sequential([
    # Summarise the whole input window into a single 5-unit state vector.
    tf.keras.layers.LSTM(5),
    # Repeat that vector 8 times, once per output time step.
    tf.keras.layers.RepeatVector(8),
    # Apply the same 5-way softmax classifier at every output time step.
    keras.layers.TimeDistributed(tf.keras.layers.Dense(5, activation="softmax")),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [171]:
# Encoder-decoder variant: same as the baseline, plus a decoder LSTM between
# the repeated state and the per-step classifier. No input shape is declared,
# so Keras builds the weights on the first call.
encoder_lstm = tf.keras.layers.LSTM(5)
repeat_state = tf.keras.layers.RepeatVector(8)
# return_sequences=True keeps one output per time step for the classifier.
decoder_lstm = tf.keras.layers.LSTM(5, return_sequences=True)
step_classifier = keras.layers.TimeDistributed(
    tf.keras.layers.Dense(5, activation="softmax")
)

model = tf.keras.models.Sequential(
    [encoder_lstm, repeat_state, decoder_lstm, step_classifier]
)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [172]:
# Run the first training window through the model as a batch of one
# (batch, time steps, features) = (1, 16, 5).
first_window = x_train[0].reshape(1, 16, 5)
model.predict(first_window)

array([[[0.22238833, 0.19106907, 0.19172554, 0.19223823, 0.20257884],
        [0.24101113, 0.18313375, 0.18547711, 0.18602203, 0.20435604],
        [0.25601158, 0.17647327, 0.18081243, 0.18121079, 0.20549195],
        [0.2678933 , 0.1710138 , 0.17734246, 0.17753445, 0.20621604],
        [0.27723068, 0.16658232, 0.17475441, 0.17473131, 0.20670128],
        [0.28454748, 0.16299984, 0.17281125, 0.17258793, 0.20705351],
        [0.2902807 , 0.16010882, 0.1713391 , 0.17094089, 0.2073305 ],
        [0.2947797 , 0.15777797, 0.17021255, 0.16966821, 0.2075616 ]]],
      dtype=float32)

In [194]:
# Train on the shuffled, batched windows. The TensorBoard callback created
# alongside log_dir is passed in so that the run is actually written to
# logs/fit/<timestamp>; without it that directory is never populated.
# NOTE(review): the epoch count is a literal 1000 while param_list["EPOCHS"]
# is 100 — kept at 1000 to match the recorded run; confirm which is intended.
model_history = model.fit(
    train_data,
    epochs=1000,
    callbacks=[tensorboard_callback],
)

===] - 1s 6ms/step - loss: 0.0340 - accuracy: 0.9926
Epoch 810/1000
Epoch 811/1000
Epoch 812/1000
Epoch 813/1000
Epoch 814/1000
Epoch 815/1000
Epoch 816/1000
Epoch 817/1000
Epoch 818/1000
Epoch 819/1000
Epoch 820/1000
Epoch 821/1000
Epoch 822/1000
Epoch 823/1000
Epoch 824/1000
Epoch 825/1000
Epoch 826/1000
Epoch 827/1000
Epoch 828/1000
Epoch 829/1000
Epoch 830/1000
Epoch 831/1000
Epoch 832/1000
Epoch 833/1000
Epoch 834/1000
Epoch 835/1000
Epoch 836/1000
Epoch 837/1000
Epoch 838/1000
Epoch 839/1000
Epoch 840/1000
Epoch 841/1000
Epoch 842/1000
Epoch 843/1000
Epoch 844/1000
Epoch 845/1000
Epoch 846/1000
Epoch 847/1000
Epoch 848/1000
Epoch 849/1000
Epoch 850/1000
Epoch 851/1000
Epoch 852/1000
Epoch 853/1000
Epoch 854/1000
Epoch 855/1000
Epoch 856/1000
Epoch 857/1000
Epoch 858/1000
Epoch 859/1000
Epoch 860/1000
Epoch 861/1000
Epoch 862/1000
Epoch 863/1000
Epoch 864/1000
Epoch 865/1000
Epoch 866/1000
Epoch 867/1000
Epoch 868/1000
Epoch 869/1000
Epoch 870/1000
Epoch 871/1000
Epoch 872/1000
Ep

In [195]:
# Save the trained model in HDF5 format inside this run's version directory.
model_path = "version/{}/model.h5".format(timestamp)
model.save(model_path)

In [177]:
# Predict the 8 future steps for the first training window, passed as a
# batch of one with shape (1, 16, 5).
first_window = x_train[0].reshape(1, 16, 5)
result = model.predict(first_window)
result

array([[[9.4397795e-01, 1.0718737e-03, 3.8029414e-03, 3.7143909e-04,
         5.0775904e-02],
        [4.4536553e-02, 4.5961610e-04, 2.4975899e-03, 7.9860707e-04,
         9.5170760e-01],
        [9.5983911e-01, 9.6130832e-05, 5.5822893e-04, 2.7524511e-05,
         3.9479043e-02],
        [2.8245650e-02, 2.2623039e-04, 1.1987765e-03, 5.6314887e-04,
         9.6976626e-01],
        [9.7081548e-01, 6.9745853e-05, 3.7502675e-04, 1.9664711e-05,
         2.8720098e-02],
        [2.7528344e-02, 2.1486718e-04, 1.1259078e-03, 5.5788038e-04,
         9.7057301e-01],
        [9.7169387e-01, 6.7843052e-05, 3.6082236e-04, 1.9129735e-05,
         2.7858259e-02],
        [2.7486285e-02, 2.1387670e-04, 1.1190403e-03, 5.5771106e-04,
         9.7062308e-01]]], dtype=float32)

In [178]:
# Label window of the first training sample (one row per future time step),
# shown for comparison with the model's prediction for the same sample.
y_train[0]

array([[1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.]])

In [192]:
# Index of the most probable class at each of the 8 predicted time steps.
predicted_classes = [np.argmax(result[0][step]) for step in range(8)]
for class_index in predicted_classes:
    print(class_index)

0
4
0
4
0
4
0
4


In [193]:
# Index of the class marked in each of the 8 label rows of the first sample,
# printed one per line.
true_classes = (np.argmax(y_train[0][step]) for step in range(8))
print("\n".join(str(class_index) for class_index in true_classes))

0
4
0
4
0
4
0
4
