In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import tqdm     # loop progress bar

In [2]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up front.
# Every visible GPU is configured (TensorFlow requires memory growth to be the
# same across GPUs), and the loop is simply skipped on CPU-only machines, where
# indexing physical_devices[0] would raise IndexError.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, enable=True)

In [3]:
# Base name of the dataset; substituted into the "data/{}_train_set.csv" and
# "data/{}_test_set.csv" paths when the raw sequences are read.
dataset_name = "SEG_wavenet"

In [4]:
# Not Scalar Input
# NOTE(review): presumably means the model input is one-hot encoded rather than a scalar value — confirm.
# Number of consecutive tokens in each input window (X); also the offset of the first target (Y).
receptive_field = 257

# Width of the one-hot dimension: token ids are used as column indices of sparse tensors with this many columns.
vocab_size = 16293

## Train Set

In [5]:
# Read the raw training sequence as int64 values, using newline as the delimiter.
train_csv_path = "data/{}_train_set.csv".format(dataset_name)
train_set_original = np.genfromtxt(train_csv_path, delimiter="\n", dtype=np.int64)

### Y

In [6]:
# Targets: every token from position `receptive_field` onwards, sliced out of a
# private copy so the raw array is never aliased.
train_y_set_original = train_set_original.copy()[receptive_field:]

In [7]:
# [row, token_id] coordinates of the single non-zero entry in each one-hot label row.
train_y_set_indices = [
    [row, token_id]
    for row, token_id in enumerate(train_y_set_original)
]
train_y_set_indices[:10]

[[0, 1],
 [1, 1],
 [2, 1],
 [3, 1],
 [4, 1],
 [5, 1],
 [6, 1],
 [7, 1],
 [8, 1],
 [9, 1]]

In [8]:
# Sparse one-hot label matrix: one row per target, with a 1 at the token's column.
train_y_count = len(train_y_set_indices)
train_y_sparse = tf.SparseTensor(
    indices=train_y_set_indices,
    values=[1] * train_y_count,
    dense_shape=[train_y_count, vocab_size],
)
# Slice along the first axis so each dataset element is one label row.
train_y = tf.data.Dataset.from_tensor_slices(train_y_sparse)

In [9]:
# Size of dataset: number of elements (label rows) in train_y
train_y.cardinality()

<tf.Tensor: shape=(), dtype=int64, numpy=161034>

In [10]:
# Example: densify the first ten label rows to inspect the one-hot encoding.
ex = train_y_set_indices[:10]
ex_sparse = tf.SparseTensor(
    indices=ex,
    values=[1] * len(ex),
    dense_shape=[len(ex), vocab_size],
)
tf.sparse.to_dense(ex_sparse)

<tf.Tensor: shape=(10, 16293), dtype=int32, numpy=
array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])>

### X as Onehot

In [11]:
# Inputs: every token except the last one, taken from a private copy. Dropping
# the final token makes the number of length-`receptive_field` windows equal to
# the number of targets.
train_x_set_original = train_set_original.copy()[:-1]

In [12]:
# Build one sparse one-hot window of shape [1, receptive_field, vocab_size] per
# start offset; the resulting list is concatenated along axis 0 afterwards.
train_x_set = []

# Loop-invariant pieces, created once instead of once per window:
# the row coordinates 0..receptive_field-1 and the float32 ones used as values
# (created as float32 directly, so no per-window cast is needed).
window_rows = np.arange(receptive_field, dtype=np.int64)
window_values = tf.ones([receptive_field], dtype=tf.float32)

for start in tqdm.trange(len(train_x_set_original) - receptive_field + 1):
    seq = train_x_set_original[start:start+receptive_field]
    # [row, token_id] coordinates of the non-zero entries, stacked with NumPy
    # instead of a per-token Python list comprehension (both columns are int64).
    indices = np.column_stack((window_rows, seq))
    sparse = tf.SparseTensor(indices=indices, values=window_values, dense_shape=[receptive_field, vocab_size])
    # Add a leading axis of size 1 so the windows can be concatenated along axis 0.
    sparse = tf.sparse.expand_dims(sparse, axis=0)
    train_x_set.append(sparse)

100%|██████████| 161034/161034 [05:01<00:00, 534.27it/s]


In [13]:
# Concatenate the per-window sparse tensors along axis 0 into a single sparse tensor.
# NOTE: this rebinds train_x_set from a list to a SparseTensor, so the cell is not
# safe to re-run without rebuilding the list first.
train_x_set = tf.sparse.concat(0, train_x_set)

In [14]:
# Slice the concatenated tensor along its first axis: one dataset element per input window.
train_x = tf.data.Dataset.from_tensor_slices(train_x_set)

In [15]:
# Pair each input window with its label row as an (x, y) element.
train_set = tf.data.Dataset.zip((train_x, train_y))

In [16]:
# Persist the zipped (x, y) training dataset under data/dataset.
tf.data.experimental.save(train_set, "data/dataset")

In [17]:
# Read the saved dataset back, passing the element spec of the in-memory dataset
# to describe the structure of the stored elements.
load_data = tf.data.experimental.load("data/dataset", train_set.element_spec)

In [18]:
# Type, shape and dtype of each (x, y) element of the training dataset.
train_set.element_spec

(SparseTensorSpec(TensorShape([257, 16293]), tf.float32),
 SparseTensorSpec(TensorShape([16293]), tf.int32))

In [19]:
# Sanity check: densify and print the first (x, y) element of the reloaded dataset.
for x_window, y_row in load_data.take(1):
    print(tf.sparse.to_dense(x_window))
    print(tf.sparse.to_dense(y_row))

tf.Tensor(
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]], shape=(257, 16293), dtype=float32)
tf.Tensor([0 1 0 ... 0 0 0], shape=(16293,), dtype=int32)


In [27]:
# Check that a hand-written element spec equals the one inferred from train_set.
expected_x_spec = tf.SparseTensorSpec(tf.TensorShape([257, 16293]), tf.dtypes.float32)
expected_y_spec = tf.SparseTensorSpec(tf.TensorShape([16293]), tf.dtypes.int32)
train_set.element_spec == (expected_x_spec, expected_y_spec)

True

In [28]:
# Reload the saved dataset using an explicitly written element spec
# (no dependency on the in-memory train_set object).
saved_element_spec = (
    tf.SparseTensorSpec(tf.TensorShape([257, 16293]), tf.dtypes.float32),
    tf.SparseTensorSpec(tf.TensorShape([16293]), tf.dtypes.int32),
)
t = tf.data.experimental.load("data/dataset", saved_element_spec)

In [29]:
# Densify and print the first (x, y) element loaded with the explicit spec.
for x_window, y_row in t.take(1):
    print(tf.sparse.to_dense(x_window))
    print(tf.sparse.to_dense(y_row))

tf.Tensor(
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]], shape=(257, 16293), dtype=float32)
tf.Tensor([0 1 0 ... 0 0 0], shape=(16293,), dtype=int32)


In [30]:
# Print the first (x, y) element in its raw sparse form (indices / values / dense_shape).
for x_window, y_row in t.take(1):
    print(x_window)
    print(y_row)

SparseTensor(indices=tf.Tensor(
[[    0     0]
 [    1     0]
 [    2     0]
 [    3     0]
 [    4     0]
 [    5     0]
 [    6     0]
 [    7     0]
 [    8     0]
 [    9     0]
 [   10     0]
 [   11     0]
 [   12     0]
 [   13     0]
 [   14     0]
 [   15     0]
 [   16     0]
 [   17     0]
 [   18     0]
 [   19     0]
 [   20     0]
 [   21   926]
 [   22     0]
 [   23     0]
 [   24     0]
 [   25     0]
 [   26     0]
 [   27     0]
 [   28    49]
 [   29     0]
 [   30     0]
 [   31     0]
 [   32     0]
 [   33     0]
 [   34 14061]
 [   35     0]
 [   36     0]
 [   37     0]
 [   38     0]
 [   39     0]
 [   40     0]
 [   41     0]
 [   42     0]
 [   43     0]
 [   44     0]
 [   45     2]
 [   46     2]
 [   47     2]
 [   48     0]
 [   49     0]
 [   50     0]
 [   51     0]
 [   52     0]
 [   53     0]
 [   54     0]
 [   55     0]
 [   56     2]
 [   57     2]
 [   58     2]
 [   59     2]
 [   60     2]
 [   61     2]
 [   62     2]
 [   63     2]
 [   64 

## Test Set

### Y

In [5]:
# Read the raw test sequence as int64 values, using newline as the delimiter.
test_csv_path = "data/{}_test_set.csv".format(dataset_name)
test_set_original = np.genfromtxt(test_csv_path, delimiter="\n", dtype=np.int64)

In [6]:
# Targets: every token from position `receptive_field` onwards, sliced out of a
# private copy so the raw array is never aliased.
test_y_set_original = test_set_original.copy()[receptive_field:]

In [7]:
# [row, token_id] coordinates of the single non-zero entry in each one-hot label row.
test_y_set_indices = [
    [row, token_id]
    for row, token_id in enumerate(test_y_set_original)
]
test_y_set_indices[:10]

[[0, 0],
 [1, 0],
 [2, 0],
 [3, 0],
 [4, 0],
 [5, 250],
 [6, 127],
 [7, 0],
 [8, 338],
 [9, 185]]

In [8]:
# Sparse one-hot label matrix: one row per target, with a 1 at the token's column.
test_y_count = len(test_y_set_indices)
test_y_sparse = tf.SparseTensor(
    indices=test_y_set_indices,
    values=[1] * test_y_count,
    dense_shape=[test_y_count, vocab_size],
)
# Slice along the first axis so each dataset element is one label row.
test_y = tf.data.Dataset.from_tensor_slices(test_y_sparse)

In [9]:
# Size of dataset: number of elements (label rows) in test_y
test_y.cardinality()

<tf.Tensor: shape=(), dtype=int64, numpy=35322>

### X as Onehot

In [10]:
# Inputs: every token except the last one, taken from a private copy. Dropping
# the final token makes the number of length-`receptive_field` windows equal to
# the number of targets.
test_x_set_original = test_set_original.copy()[:-1]

In [11]:
# Build one sparse one-hot window of shape [1, receptive_field, vocab_size] per
# start offset; the resulting list is concatenated along axis 0 afterwards.
test_x_set = []

# Loop-invariant pieces, created once instead of once per window:
# the row coordinates 0..receptive_field-1 and the float32 ones used as values
# (created as float32 directly, so no per-window cast is needed).
window_rows = np.arange(receptive_field, dtype=np.int64)
window_values = tf.ones([receptive_field], dtype=tf.float32)

for start in tqdm.trange(len(test_x_set_original) - receptive_field + 1):
    seq = test_x_set_original[start:start+receptive_field]
    # [row, token_id] coordinates of the non-zero entries, stacked with NumPy
    # instead of a per-token Python list comprehension (both columns are int64).
    indices = np.column_stack((window_rows, seq))
    sparse = tf.SparseTensor(indices=indices, values=window_values, dense_shape=[receptive_field, vocab_size])
    # Add a leading axis of size 1 so the windows can be concatenated along axis 0.
    sparse = tf.sparse.expand_dims(sparse, axis=0)
    test_x_set.append(sparse)

100%|██████████| 35322/35322 [01:12<00:00, 488.42it/s]


In [12]:
# Concatenate the per-window sparse tensors along axis 0 into a single sparse tensor.
# NOTE: this rebinds test_x_set from a list to a SparseTensor, so the cell is not
# safe to re-run without rebuilding the list first.
test_x_set = tf.sparse.concat(0, test_x_set)

In [13]:
# Slice the concatenated tensor along its first axis: one dataset element per input window.
test_x = tf.data.Dataset.from_tensor_slices(test_x_set)

# Pair each input window with its label row as an (x, y) element.
# NOTE(review): test_set is not saved in the cells visible here (test_x and test_y are
# saved separately instead) — confirm whether the zipped dataset is still needed.
test_set = tf.data.Dataset.zip((test_x, test_y))

In [16]:
# Persist the test inputs on their own under data/test_dataset_x.
tf.data.experimental.save(test_x, "data/test_dataset_x")

In [17]:
# Persist the test labels on their own under data/test_dataset_y.
tf.data.experimental.save(test_y, "data/test_dataset_y")