In [1]:
import tensorflow as tf
import numpy as np
import os

# Dataset 

Experiments with the `tf.data.Dataset` API. The official guide is [here](https://www.tensorflow.org/guide/datasets).

In [2]:
# Build the random-tensor op first, then evaluate it in a short-lived session.
random_matrix = tf.random_uniform([4, 10])
with tf.Session() as sess:
    print(sess.run(random_matrix))

[[0.68676233 0.0086329  0.8658116  0.45277607 0.94584095 0.47857428
  0.4139799  0.24188924 0.5272958  0.47962856]
 [0.9060365  0.0873791  0.9680308  0.14595628 0.59796417 0.12068391
  0.04482651 0.8628911  0.8021622  0.4293126 ]
 [0.1597904  0.52541065 0.40359116 0.23508751 0.48473155 0.0376991
  0.04705763 0.86750424 0.71314085 0.25864637]
 [0.06989145 0.29347718 0.30275798 0.2497195  0.10967302 0.18269348
  0.94954455 0.7470466  0.6513107  0.47883558]]


In [3]:
# Slicing a [4, 10] tensor along its first axis yields elements of shape (10,).
dataset1 = tf.data.Dataset.from_tensor_slices(tf.random_uniform([4, 10]))
for dataset_property in (dataset1.output_types, dataset1.output_shapes):
    print(dataset_property)

<dtype: 'float32'>
(10,)


In [4]:
# Preview the two kinds of tensors used to build `dataset2` in the next cell:
# a float vector and an integer matrix with values in [0, 100).
uniform_vector = tf.random_uniform([4])
uniform_int_matrix = tf.random_uniform([4, 100], maxval=100, dtype=tf.int32)
with tf.Session() as sess:
    print(sess.run(uniform_vector), end='\n\n')
    print(sess.run(uniform_int_matrix))

[0.01913965 0.8863044  0.08992255 0.70749855]

[[34 71 79 24 61 94 52 20 65 51 59 66 95 47 14 46 40 44 30  7 97 94  7 20
  13 13 88 64 90 48 42 60 59 27 76 25 87 95  1 89 49 33 29 13 23 60 44 84
  78 27 73 42  0 29 47 76 15 20 69 83 91 23 77 28 60 47 61 99 66 56  9 80
  92 60 77 78 56 78 50 45  1 92 25 18 91 50 49 22 75 81 97 47 58 59 96 39
   6 80 71 89]
 [74 75  8 56 76 25 68  6 81 44 31 50 32 40 77 25 92 50 87 93 27 85 25 57
  98 89 96 68 42  8 52 26 89 58 54 76 50 69 96 10 77 73 11 60 16  9 13 80
  32  9  1 21 45 51 85  2 33 80 34 60 48 80 76 77 22 63 58 30 93 69 90 73
   0 45 22 95 87 88 10 96 99  8 16 30 40 12 81 57 21 16 67 62 27 65 56 23
  68 67 45 61]
 [58 98 61 87 38 63 74 49 98  3 30 88 77 17 93 20 41 32 54 84 36 87  2 96
  57 92 60 31 23 49 33 99 11 42 35 77  1 39 71 77 70 53 43 66 64 96 42 60
  83 40 77 91 96 81 53 67 16  8 72 73 48 71 24  4 64 21 40 44 38 49  2 76
  98 17 54 22 51 12 31 92 92 53  1 64 67 94 77 57 89  2 67 25 68 20 80 73
  94 93 26 71]
 [13 64 20 15 89 90 

In [5]:
# A tuple of tensors is sliced component-wise along the shared first axis (4),
# so every element is a (scalar float32, int32 vector of length 100) pair.
scalar_component = tf.random_uniform([4])
vector_component = tf.random_uniform([4, 100], maxval=100, dtype=tf.int32)
dataset2 = tf.data.Dataset.from_tensor_slices((scalar_component, vector_component))
print(dataset2.output_types, dataset2.output_shapes, sep='\n')

(tf.float32, tf.int32)
(TensorShape([]), TensorShape([Dimension(100)]))


In [6]:
# Zipping keeps the structure of each input, so the result is nested:
# (dataset1 element, (dataset2 scalar, dataset2 vector)).
zipped_inputs = (dataset1, dataset2)
dataset3 = tf.data.Dataset.zip(zipped_inputs)
print(dataset3.output_types, dataset3.output_shapes, sep='\n')

(tf.float32, (tf.float32, tf.int32))
(TensorShape([Dimension(10)]), (TensorShape([]), TensorShape([Dimension(100)])))


In [7]:
# A dataset can be zipped with itself; the element is then a flat pair.
self_pair = (dataset1, dataset1)
dataset4 = tf.data.Dataset.zip(self_pair)
print(dataset4.output_types, dataset4.output_shapes, sep='\n')

(tf.float32, tf.float32)
(TensorShape([Dimension(10)]), TensorShape([Dimension(10)]))


In [8]:
# Components can be named by passing a dict; output_types / output_shapes
# are then dicts with the same keys.
named_components = {
    "a": tf.random_uniform([4]),
    "b": tf.random_uniform([4, 100], maxval=100, dtype=tf.int32),
}
dataset5 = tf.data.Dataset.from_tensor_slices(named_components)
print(dataset5.output_types, dataset5.output_shapes, sep='\n')

{'a': tf.float32, 'b': tf.int32}
{'a': TensorShape([]), 'b': TensorShape([Dimension(100)])}


---

# Map, Flat_Map, Filter

In [None]:
# Each transformation receives the dataset element unpacked into positional
# arguments, one per top-level component of the element.

# map: produces exactly one output element per input element.
dataset1 = dataset1.map(lambda x: x * 2.0)

# flat_map: the function must return a Dataset; the elements of the returned
# datasets are flattened into the result.
dataset2 = dataset2.flat_map(
    lambda x, y: tf.data.Dataset.from_tensors((x, y)))

# filter: the predicate must return a scalar boolean tensor.
# Python 3 removed tuple parameters (`lambda x, (y, z): ...` is a SyntaxError),
# so the nested (y, z) component arrives as a single argument and is indexed.
dataset3 = dataset3.filter(lambda x, yz: yz[0] < 0.5)

---

# Iterators 

### One-shot
The `one_shot` iterator is the simplest of all: it allows a single pass through all of the elements of the dataset.

In [11]:
size = 20
dataset6 = tf.data.Dataset.range(size)
iterator6 = dataset6.make_one_shot_iterator()

# Create the get_next() op once, outside the loop: every call to get_next()
# adds a new op to the graph, so calling it per iteration grows the graph.
next6 = iterator6.get_next()

with tf.Session() as sess:
    values = []
    for i in range(size):
        # Each evaluation of the same tensor advances the iterator by one.
        value = sess.run(next6)
        values.append(value)
        assert i == value
    print(values)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


In [19]:
# Can a list comprehension drive the iterator? Yes, but every step calls
# iterator6.get_next() again, building one more op per element
# (TensorFlow may emit a warning about this).
size = 20
with tf.Session() as sess:
    drained = [sess.run(iterator6.get_next()) for _ in range(size)]
    print(drained)



[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


### Initializable
The `initializable` iterator allows the dataset definition to be parametrized, e.g. with a `tf.placeholder` that is fed when the iterator's `initializer` is run. (When using placeholders, shape dimensions can be left as `None`.)

In [23]:
# Scalar placeholder used as a parameter in the definition of the dataset.
max_value = tf.placeholder(tf.int64, shape=[])
dataset7 = tf.data.Dataset.range(max_value)
iterator7 = dataset7.make_initializable_iterator()

# Create the get_next() op once; calling get_next() inside the loop would add
# a new op to the graph on every iteration.
next7 = iterator7.get_next()

with tf.Session() as sess:
    # Initialize the same iterator first over 10 elements, then over 20.
    for size in (10, 20):
        feed = {max_value: size}
        # The iterator must be (re-)initialized before elements can be read.
        sess.run(iterator7.initializer, feed_dict=feed)

        values = []
        for i in range(size):
            value = sess.run(next7)
            values.append(value)
            assert i == value
        print(values)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


### Reinitializable 

The `reinitializable` iterator can be initialized from different `Dataset` objects, so you can use two different datasets with the same structure (shapes/types), e.g. train & dev sets.

In [62]:
# Dataset sizes, named once so the read loops below cannot drift out of sync
# with the dataset definitions.
train_size = 100
validation_size = 50

# Training elements get random integer noise added; validation elements are
# left untouched. Both datasets share the same structure (scalar int64).
training_ds = tf.data.Dataset.range(train_size).map(
                lambda x: x + tf.random_uniform([], -10, 10, tf.int64))
validation_ds = tf.data.Dataset.range(validation_size)

# The iterator is defined by structure only, not by a particular dataset.
reiniterator = tf.data.Iterator.from_structure(training_ds.output_types,
                                               training_ds.output_shapes)
nxt = reiniterator.get_next()

# One initializer op per dataset; running one points the iterator at that
# dataset, starting from its first element.
tr_init_op = reiniterator.make_initializer(training_ds)
val_init_op = reiniterator.make_initializer(validation_ds)

epochs = 2
with tf.Session() as sess:
    # Use a real name for the epoch counter: `_` was previously reused by the
    # inner loops, which silently overwrote it.
    for epoch in range(epochs):
        print('training | epoch {}'.format(epoch))
        sess.run(tr_init_op)
        values = [sess.run(nxt) for _ in range(train_size)]
        print(values, end='\n\n')

        print('validating')
        sess.run(val_init_op)
        values = [sess.run(nxt) for _ in range(validation_size)]
        print(values, end='\n\n')
        print('-'*30)
        print('-'*30)

training | epoch 0
[-7, 10, 7, -5, 8, 2, 1, 13, 7, 6, 5, 4, 20, 12, 4, 15, 11, 19, 21, 17, 26, 19, 14, 13, 14, 24, 24, 24, 25, 37, 26, 37, 33, 38, 31, 25, 28, 36, 31, 31, 41, 34, 46, 36, 38, 38, 42, 53, 49, 39, 46, 52, 60, 56, 53, 50, 53, 64, 61, 59, 62, 65, 71, 57, 71, 55, 66, 58, 61, 61, 76, 71, 63, 72, 67, 80, 85, 80, 73, 71, 88, 76, 74, 83, 75, 84, 76, 92, 96, 90, 86, 97, 86, 95, 100, 91, 93, 87, 102, 102]

validating
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]

------------------------------
training | epoch 1
[-1, -6, -6, 2, -5, -2, 1, -1, 8, 12, 18, 1, 6, 21, 21, 24, 13, 22, 15, 20, 26, 16, 14, 14, 24, 31, 19, 17, 27, 19, 23, 40, 33, 23, 33, 40, 37, 27, 42, 33, 43, 44, 34, 51, 45, 53, 54, 55, 56, 45, 55, 47, 56, 50, 50, 55, 59, 63, 67, 60, 63, 66, 59, 59, 71, 71, 73, 67, 75, 74, 61, 61, 70, 64, 82, 72, 67, 73, 81, 75, 75, 77, 72, 79, 

### Feedable
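The `feedable` iterator can be used together with `tf.placeholder` to select which `Iterator` is used in each `sess.run` call. It is similar to the reinitializable one, but switching between iterators does not require re-initializing them from the start of the dataset.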

---

Using [repeat](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#repeat). If no number is specified, the dataset repeats indefinitely.

In [63]:
# repeat() without a count cycles through 0..9 indefinitely, so reading
# 50 elements never exhausts the iterator.
repeat_example = tf.data.Dataset.range(10).repeat()
repeat_one_shot = repeat_example.make_one_shot_iterator()
repeat_nxt = repeat_one_shot.get_next()
with tf.Session() as sess:
    values = [sess.run(repeat_nxt) for _ in range(50)]
    print(values)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [69]:
# Training data: squares of 0..9, repeated indefinitely.
# Validation data: 0..4 repeated 10 times (50 elements in total).
training_ds = tf.data.Dataset.range(10).map(lambda x: x**2).repeat()
validation_ds = tf.data.Dataset.range(5).repeat(10)

# The feedable iterator is selected at run time through a string handle.
handle = tf.placeholder(tf.string, shape=[])
it_feedable = tf.data.Iterator.from_string_handle(
    handle, training_ds.output_types, training_ds.output_shapes)
it_feed_nxt = it_feedable.get_next()

train_it_one_shot = training_ds.make_one_shot_iterator()
val_it_initializable = validation_ds.make_initializable_iterator()

with tf.Session() as sess:
    # string_handle() returns a tensor; evaluating it gives the value that is
    # fed to the `handle` placeholder to pick the underlying iterator.
    train_handle = sess.run(train_it_one_shot.string_handle())
    val_handle = sess.run(val_it_initializable.string_handle())
    train_feed = {handle: train_handle}
    val_feed = {handle: val_handle}

    for i in range(2):
        # training steps
        print('training, epoch {}'.format(i))
        values = [sess.run(it_feed_nxt, feed_dict=train_feed)
                  for _ in range(100)]
        print(values, end='\n\n')

        # validation steps: restart the validation iterator on every epoch
        sess.run(val_it_initializable.initializer)
        print('validating')
        values = [sess.run(it_feed_nxt, feed_dict=val_feed)
                  for _ in range(50)]
        print(values, end='\n\n')
        print('*'*30)

training, epoch 0
[0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

validating
[0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4]

******************************
training, epoch 1
[0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

validating
[0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1

In [163]:
outrange_ds = tf.data.Dataset.range(5)
outr_it = outrange_ds.make_initializable_iterator()
outr_nxt = outr_it.get_next()

# Both operands are the same get_next() tensor, so evaluating `thing`
# consumes a single element and squares it.
thing = tf.multiply(outr_nxt, outr_nxt)

with tf.Session() as sess:
    sess.run(outr_it.initializer)
    # Ask for more elements (10) than the dataset holds (5); the iterator
    # signals exhaustion by raising OutOfRangeError, which ends the loop.
    try:
        for _ in range(10):
            print(sess.run(thing))
    except tf.errors.OutOfRangeError:
        print('the end, my only friend!')

0
1
4
9
16
the end, my only friend!


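As a reminder, all tensors returned by a single `get_next()` call come from the same op, so each time you evaluate any one of them (even if you only retrieve one tensor among many) the iterator advances one step for all components. Hence it is best to include everything you need from an element in one expression / one `sess.run` call.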

In [126]:
# Build a nested dataset: (vector of 50, (scalar, vector of 100)).
retrieve_ds1 = tf.data.Dataset.from_tensor_slices(tf.random_uniform([5, 50]))
nested_components = (tf.random_uniform([5]), tf.random_uniform([5, 100]))
retrieve_ds2 = tf.data.Dataset.from_tensor_slices(nested_components)
retrieve_ds3 = tf.data.Dataset.zip((retrieve_ds1, retrieve_ds2))
retrieve_it = retrieve_ds3.make_initializable_iterator()

with tf.Session() as sess:
    sess.run(retrieve_it.initializer)
    # get_next() mirrors the nesting of the dataset, so it can be unpacked
    # with a matching nested tuple pattern.
    nxt1, (nxt2, nxt3) = retrieve_it.get_next()
    print(*(component.shape for component in (nxt1, nxt2, nxt3)))

(50,) () (100,)


### Saving an iterator state
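It is possible, and works much like saving variables: wrap the iterator in a saveable object and add it to the `SAVEABLE_OBJECTS` collection so that `tf.train.Saver` saves and restores it.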

In [9]:
save_ds = tf.data.Dataset.range(100)
save_it = save_ds.make_one_shot_iterator()
save_nxt = save_it.get_next()

# Register the iterator as a saveable object so the Saver created below
# includes its position in the checkpoint.
saveable = tf.data.experimental.make_saveable_from_iterator(save_it)
tf.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS, saveable)
saver = tf.train.Saver()
path = os.path.join(os.path.abspath('.'), 'saveable.ckpt')

# save: consume the first 30 elements, then checkpoint the iterator state
with tf.Session() as sess:
    values = [sess.run(save_nxt) for _ in range(30)]
    print('first pass, values:\n', values, end='\n\n')
    saver.save(sess, path)

# restore: a fresh session resumes from element 30 rather than from 0
with tf.Session() as sess:
    values = []
    saver.restore(sess, path)
    try:
        # Drain the iterator; exhaustion is signalled by OutOfRangeError.
        while True:
            values.append(sess.run(save_nxt))
    except tf.errors.OutOfRangeError:
        print(values)

trainable_variables
first pass, values:
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]

INFO:tensorflow:Restoring parameters from /media/default/linux-data/learning2/TensorFlow/bits/saveable.ckpt
[30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


---

## Group by window

Documentation [here](https://www.tensorflow.org/api_docs/python/tf/data/experimental/group_by_window).  
Stack discussion [here](https://stackoverflow.com/questions/45292517/how-do-i-use-the-group-by-window-function-in-tensorflow).

In [89]:
# The integers 0..99 as int64, used as input for the grouping example.
components = np.arange(100, dtype=np.int64)
print(components)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]


In [116]:
def _parity_key(x):
    """Grouping key: 0 for even elements, 1 for odd elements."""
    return x % 2


def _batch_window(key, window):
    """Reduce one window (a Dataset of same-key elements) into batches of 50."""
    return window.batch(50)


# Group the 100 integers by parity, then batch each group.
# NOTE(review): the recorded output below shows scalar values, which does not
# match `batch(50)`; it looks like it came from an earlier revision of this
# cell. Re-run to confirm.
grouping = tf.data.experimental.group_by_window(
    key_func=_parity_key,
    reduce_func=_batch_window,
    window_size=100)
components_ds = tf.data.Dataset.from_tensor_slices(components).apply(grouping)
components_it = components_ds.make_one_shot_iterator()
features = components_it.get_next()

with tf.Session() as sess:
    # Read at most 10 elements, stopping early once the iterator is exhausted.
    try:
        for _ in range(10):
            values = sess.run(features)
            print(values, '| size:', values.shape)
    except tf.errors.OutOfRangeError:
        print('over and out!')

54 | size: ()
88 | size: ()
64 | size: ()
94 | size: ()
4 | size: ()
74 | size: ()
6 | size: ()
18 | size: ()
58 | size: ()
68 | size: ()
