In [1]:
import tensorflow as tf
import random
import numpy as np
import os
import cv2

## Feed_dict system의 문제점

우리는 이번 tutorial을 통해서 Data pipelining을 효율적으로 만드는 것에 중점적으로 확인하려고 한다.

TensorFlow를 통해서 학습을 위한 코드를 작성할 때 크게 세 부분으로 나뉘는데,

첫 째 input data를 로드하는 부분, 

둘 째 model을 디자인하고 input data를 받아 prediction을 출력하는 부분, 

셋 째 prediction을 정답과 비교하고 model의 파라미터를 갱신하는 부분으로 나눌 수 있다.

우리는 코드를 통해 학습 연산을 구현할 때 우리의 연산 자원인 GPU의 효율성을 최대로 발휘하도록 해야 한다. 이때 많은 경우에 걸쳐 Bottle-neck으로 작용하는 것이 바로 첫 번째 input loading part이다. GPU를 효율적으로 사용하여 학습 속도를 가속하기 위해서는 지속적으로 쉬지 않고 data를 전달해주어야 한다. 만약 program이 data를 가져와서 model에 전달하는 부분과 data를 통해 연산하는 부분이 순차적으로 수행되게 된다면 GPU는 data를 가져와 넣어줄 때까지 놀게 될 것이다.

TensorFlow의 feed_dict는 이점에서 문제가 있다. feed_dict는 python data를 session에게 복사하여 넘겨준다. 만약 single threading을 하는 program이라면 GPU는 data를 대기하며 idle이 발생할 것이다.

In [1]:
import time
import tensorflow as tf

# We simulate some raw input data 
# (think about it as fetching some data from the file system)
# let's say: batches of 128 samples, each containing 1024 data points
x_inputs_data = tf.random_normal([128, 1024], mean=0, stddev=1)
# We will try to predict this law:
# predict 1 if the sum of the elements is positive and 0 otherwise
y_inputs_data = tf.cast(tf.reduce_sum(x_inputs_data, axis=1, keep_dims=True) > 0, tf.int32)

# We build our small model: a basic two layers neural net with ReLU
with tf.variable_scope("placeholder"):
    input = tf.placeholder(tf.float32, shape=[None, 1024])
    y_true = tf.placeholder(tf.int32, shape=[None, 1])
with tf.variable_scope('FullyConnected'):
    w = tf.get_variable('w', shape=[1024, 1024], initializer=tf.random_normal_initializer(stddev=1e-1))
    b = tf.get_variable('b', shape=[1024], initializer=tf.constant_initializer(0.1))
    z = tf.matmul(input, w) + b
    y = tf.nn.relu(z)

    w2 = tf.get_variable('w2', shape=[1024, 1], initializer=tf.random_normal_initializer(stddev=1e-1))
    b2 = tf.get_variable('b2', shape=[1], initializer=tf.constant_initializer(0.1))
    z = tf.matmul(y, w2) + b2
with tf.variable_scope('Loss'):
    losses = tf.nn.sigmoid_cross_entropy_with_logits(None, tf.cast(y_true, tf.float32), z)
    loss_op = tf.reduce_mean(losses)
with tf.variable_scope('Accuracy'):
    y_pred = tf.cast(z > 0, tf.int32)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(y_pred, y_true), tf.float32))
    accuracy = tf.Print(accuracy, data=[accuracy], message="accuracy:")

# We add the training operation, ...
adam = tf.train.AdamOptimizer(1e-2)
train_op = adam.minimize(loss_op, name="train_op")

startTime = time.time()
with tf.Session() as sess:
    # ... init our variables, ...
    sess.run(tf.global_variables_initializer())

    # ... check the accuracy before training, ...
    x_input, y_input = sess.run([x_inputs_data, y_inputs_data])
    sess.run(accuracy, feed_dict={
        input: x_input,
        y_true: y_input
    })

    # ... train ...
    for i in range(5000):
        #  ... by sampling some input data (fetching) ...
        x_input, y_input = sess.run([x_inputs_data, y_inputs_data])
        # ... and feeding it to our model
        _, loss = sess.run([train_op, loss_op], feed_dict={
            input: x_input,
            y_true: y_input
        })

        # We regularly check the loss
        if i % 500 == 0:
            print('iter:%d - loss:%f' % (i, loss))

    # Finally, we check our final accuracy
    x_input, y_input = sess.run([x_inputs_data, y_inputs_data])
    sess.run(accuracy, feed_dict={
        input: x_input,
        y_true: y_input
    })

print("Time taken: %f" % (time.time() - startTime))

Instructions for updating:
keep_dims is deprecated, use keepdims instead
iter:0 - loss:2.912185
iter:500 - loss:0.369858
iter:1000 - loss:0.851633
iter:1500 - loss:1.058526
iter:2000 - loss:0.988433
iter:2500 - loss:0.315225
iter:3000 - loss:0.246361
iter:3500 - loss:0.849347
iter:4000 - loss:0.897075
iter:4500 - loss:2.264365
Time taken: 7.456434


# Solution

위에서 언급한 바대로 '데이터 입력' 부분과 '연산' 부분을 비동기화하여 GPU가 쉬지 않고 일할 수 잇다면, 훨씬 효율적으로 연산 자원을 활용할 수 있을 것이다. 우리는 multi-threading을 사용하여 데이터 입력 부분을 효율적으로 만들어줄 수 있다. TensorFlow에서는 이를 위해 Queue와 Queue runner를 API로 제공하고 있다.

reference : https://blog.metaflow.fr/tensorflow-how-to-optimise-your-input-pipeline-with-queues-and-multi-threading-e7c3874157e0

## Queue_runner 사용 방법

In [None]:

import tensorflow as tf

# This time, let's start with 6 samples of 1 data point
x_input_data = tf.random_normal([6], mean=-1, stddev=4)

# Note that the FIFO queue has still a capacity of 3
q = tf.FIFOQueue(capacity=3, dtypes=tf.float32)

# To check what is happening in this case:
# we will print a message each time "x_input_data" is actually computed
# to be used in the "enqueue_many" operation
x_input_data = tf.Print(x_input_data, data=[x_input_data], message="Raw inputs data generated:", summarize=6)
enqueue_op = q.enqueue_many(x_input_data)

# To leverage multi-threading we create a "QueueRunner"
# that will handle the "enqueue_op" outside of the main thread
# We don't need much parallelism here, so we will use only 1 thread
numberOfThreads = 1 
qr = tf.train.QueueRunner(q, [enqueue_op] * numberOfThreads)
# Don't forget to add your "QueueRunner" to the QUEUE_RUNNERS collection
tf.train.add_queue_runner(qr) 

input = q.dequeue() 
input = tf.Print(input, data=[q.size(), input], message="Nb elements left, input:")

# fake graph: START
y = input + 1
# fake graph: END 

# We start the session as usual ...
with tf.Session() as sess:
    # But now we build our coordinator to coordinate our child threads with
    # the main thread
    coord = tf.train.Coordinator()
    # Beware, if you don't start all your queues before runnig anything
    # The main threads will wait for them to start and you will hang again
    # This helper start all queues in tf.GraphKeys.QUEUE_RUNNERS
    threads = tf.train.start_queue_runners(coord=coord)

    # The QueueRunner will automatically call the enqueue operation
    # asynchronously in its own thread ensuring that the queue is always full
    # No more hanging for the main process, no more waiting for the GPU
    sess.run(y)
    sess.run(y) 
    sess.run(y)
    sess.run(y)
    sess.run(y)
    sess.run(y)
    sess.run(y)
    sess.run(y)
    sess.run(y)
    sess.run(y)

    # We request our child threads to stop ...
    coord.request_stop()
    # ... and we wait for them to do so before releasing the main thread
    coord.join(threads)

## 개선된 문제점

In [3]:
import time
import tensorflow as tf

# We simulate some raw input data 
# (think about it as fetching some data from the file system)
# let's say: batches of 128 samples, each containing 1024 data points
x_input_data = tf.random_normal([128, 1024], mean=0, stddev=1)

# We build our small model: a basic two layers neural net with ReLU

with tf.variable_scope("While_Queue_runner"):

    with tf.variable_scope("queue"):
        q = tf.FIFOQueue(capacity=5, dtypes=tf.float32) # enqueue 5 batches
        # We use the "enqueue" operation so 1 element of the queue is the full batch
        enqueue_op = q.enqueue(x_input_data)
        numberOfThreads = 1
        qr = tf.train.QueueRunner(q, [enqueue_op] * numberOfThreads)
        tf.train.add_queue_runner(qr)
        input = q.dequeue() # It replaces our input placeholder
        # We can also compute y_true right into the graph now
        y_true = tf.cast(tf.reduce_sum(input, axis=1, keep_dims=True) > 0, tf.int32)

    with tf.variable_scope('FullyConnected'):
        w = tf.get_variable('w', shape=[1024, 1024], initializer=tf.random_normal_initializer(stddev=1e-1))
        b = tf.get_variable('b', shape=[1024], initializer=tf.constant_initializer(0.1))
        z = tf.matmul(input, w) + b
        y = tf.nn.relu(z)

        w2 = tf.get_variable('w2', shape=[1024, 1], initializer=tf.random_normal_initializer(stddev=1e-1))
        b2 = tf.get_variable('b2', shape=[1], initializer=tf.constant_initializer(0.1))
        z = tf.matmul(y, w2) + b2

    with tf.variable_scope('Loss'):
        losses = tf.nn.sigmoid_cross_entropy_with_logits(None, tf.cast(y_true, tf.float32), z)
        loss_op = tf.reduce_mean(losses)

    with tf.variable_scope('Accuracy'):
        y_pred = tf.cast(z > 0, tf.int32)
        accuracy = tf.reduce_mean(tf.cast(tf.equal(y_pred, y_true), tf.float32))
        accuracy = tf.Print(accuracy, data=[accuracy], message="accuracy:")

    # We add the training op ...
    adam = tf.train.AdamOptimizer(1e-2)
    train_op = adam.minimize(loss_op, name="train_op")

startTime = time.time()
with tf.Session() as sess:
    # ... init our variables, ...
    sess.run(tf.global_variables_initializer())

    # ... add the coordinator, ...
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    # ... check the accuracy before training (without feed_dict!), ...
    sess.run(accuracy)

    # ... train ...
    for i in range(5000):
        #  ... without sampling from Python and without a feed_dict !
        _, loss = sess.run([train_op, loss_op])

        # We regularly check the loss
        if i % 500 == 0:
            print('iter:%d - loss:%f' % (i, loss))

    # Finally, we check our final accuracy
    sess.run(accuracy)

    coord.request_stop()
    coord.join(threads)

print("Time taken: %f" % (time.time() - startTime))

iter:0 - loss:2.171846
iter:500 - loss:0.469907
iter:1000 - loss:0.633990
iter:1500 - loss:1.030779
iter:2000 - loss:0.545669
iter:2500 - loss:0.323410
iter:3000 - loss:0.787239
iter:3500 - loss:0.652673
iter:4000 - loss:0.695005
iter:4500 - loss:1.328690
Time taken: 4.120565


# reference

https://www.tensorflow.org/api_docs/python/tf/data/Dataset

https://www.tensorflow.org/guide/datasets

# importing data with Tensorflow tf.data API

tf.data는 간단한 수준에서 복잡한 수준까지의 Input pipeline을 구성할 수 있도록 하는 API이다. 구체적으로 제공하는 기능은 다음과 같다.

# 1. tf.data.Dataset

tf.data.Dataset은 연속된 element 집합으로, 각각의 element가 Tensor object로 구성되어있다. 이때 각각의 element는 Training을 위한 data와 label의 pair로 볼 수 있을 것이다.

크게 다음과 같은 구성을 가진다.

- Creating source : 다수의 tf.Tensor object 또는 file로부터 dataset을 구성한다.
  
  다양한 source로부터의 입력
   1. local file system
   2. distributed file system
   3. On-memory data
   4. real-time data generator
  
  e.g : Dataset.from_tensor_slices() , on-memory data
  
  그 외에도 from_generator, list_files, interleave, tfrecordReader


- Applying a transformation : 하나 또는 여러 개의 dataset object들로부터 새로운 dataset을 구성한다.

  e.g : Dataset.batch()
  
  그 외에도 concatenate, filter, reduce, map, flat_map, padded_batch, shard, zip, shuffle

### Definition of data source
1. tf.data.Dataset.from_tensors() or tf.data.Dataset.from_tensor_slices() :
   memory 상의 tensor들로 dataset을 만드는 경우
2. tf.data.TFRecordDataset :
   Disk 상의 file들로 dataset을 만드는 경우
3. tf.data.from_generator  :
   python iterator로부터 dataset을 만드는 경우

## Create from_tensor_slice dataset and Show information
output_shapes : dataset의 각 element의 shape 정보
output_types : dataset의 각 element의 type

In [2]:
dataset1 = tf.data.Dataset.from_tensor_slices( tf.random_uniform([4, 10], dtype = tf.float32) )
print("dataset1 : ")
print(dataset1.output_types)
print(dataset1.output_shapes)

dataset2 = tf.data.Dataset.from_tensor_slices( 
    (tf.random_uniform( [4, 1] ),
     tf.random_uniform( [4, 10], maxval=100, dtype = tf.int32)))

print("dataset2 : ")
print(dataset2.output_types)
print(dataset2.output_shapes)
    
dataset3 = tf.data.Dataset.zip( (dataset1, dataset2) )
print("dataset3 : ")
print(dataset3.output_types)
print(dataset3.output_shapes)
    

dataset1 : 
<dtype: 'float32'>
(10,)
dataset2 : 
(tf.float32, tf.int32)
(TensorShape([Dimension(1)]), TensorShape([Dimension(10)]))
dataset3 : 
(tf.float32, (tf.float32, tf.int32))
(TensorShape([Dimension(10)]), (TensorShape([Dimension(1)]), TensorShape([Dimension(10)])))


## Create dataset and Show information

In [35]:
file_list_dataset = tf.data.Dataset.list_files(
    '/home/dan/datasets/flower_photos/daisy/*.jpg',
    shuffle=None,
    seed=None
)
iterator = file_list_dataset.make_one_shot_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    for i in range(10):
        file = sess.run(next_element)
        print(file)

b'/home/dan/datasets/flower_photos/daisy/3506866918_61dd5fc53b_n.jpg'
b'/home/dan/datasets/flower_photos/daisy/450128527_fd35742d44.jpg'
b'/home/dan/datasets/flower_photos/daisy/6910811638_aa6f17df23.jpg'
b'/home/dan/datasets/flower_photos/daisy/2349640101_212c275aa7.jpg'
b'/home/dan/datasets/flower_photos/daisy/8120563761_ed5620664f_m.jpg'
b'/home/dan/datasets/flower_photos/daisy/14613443462_d4ed356201.jpg'
b'/home/dan/datasets/flower_photos/daisy/8887005939_b19e8305ee.jpg'
b'/home/dan/datasets/flower_photos/daisy/4511693548_20f9bd2b9c_m.jpg'
b'/home/dan/datasets/flower_photos/daisy/3703643767_dee82cdef9_n.jpg'
b'/home/dan/datasets/flower_photos/daisy/10437770546_8bb6f7bdd3_m.jpg'


# 2. Creating iterator

우리의 데이터를 표현하는 데이터셋을 만든 뒤에, 데이터셋의 element에 접근하는 iterator를 만들어야 한다. tf.data API는 다음과 같은 iterator를 제공하고 있다. 

- one-shot,
- initializable,
- reinitializable, and
- feedable.

## 2.1 one_shot_iterator

one shot iterator는 가장 간단한 형태의 iterator이다. 이 iterator는 단 한 번의 iterating을 수행하는 것을 지원한다. 어떠한 초기값도 필요하지 않고, 대부분의 queue-base input pipeline에서 사용할 수 있다.

In [8]:
dataset = tf.data.Dataset.range(100)
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    for i in range(10):
      value = sess.run(next_element)
      print(value)
      assert i == value

0
1
2
3
4
5
6
7
8
9


In [14]:
dataset = tf.data.Dataset.range(100)
dataset = dataset.batch(11)
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    for i in range(10):
      value = sess.run(next_element)
      print(value)

[ 0  1  2  3  4  5  6  7  8  9 10]
[11 12 13 14 15 16 17 18 19 20 21]
[22 23 24 25 26 27 28 29 30 31 32]
[33 34 35 36 37 38 39 40 41 42 43]
[44 45 46 47 48 49 50 51 52 53 54]
[55 56 57 58 59 60 61 62 63 64 65]
[66 67 68 69 70 71 72 73 74 75 76]
[77 78 79 80 81 82 83 84 85 86 87]
[88 89 90 91 92 93 94 95 96 97 98]
[99]


# OutOfRangeError when index == 100

In [42]:
dataset = tf.data.Dataset.range(100)
iterator = dataset.make_one_shot_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    for i in range(101):
      value = sess.run(next_element)
      assert i == value

OutOfRangeError: End of sequence
	 [[node IteratorGetNext_19 (defined at <ipython-input-42-29d07066bdb3>:3)  = IteratorGetNext[output_shapes=[[]], output_types=[DT_INT64], _device="/job:localhost/replica:0/task:0/device:CPU:0"](OneShotIterator_15)]]

Caused by op 'IteratorGetNext_19', defined at:
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 505, in start
    self.io_loop.start()
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 422, in run_forever
    self._run_once()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 1434, in _run_once
    handle._run()
  File "/usr/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/tornado/ioloop.py", line 758, in _run_callback
    ret = callback()
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/tornado/gen.py", line 1233, in inner
    self.run()
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/tornado/gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2819, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2845, in _run_cell
    return runner(coro)
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/IPython/core/async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3020, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3185, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-42-29d07066bdb3>", line 3, in <module>
    next_element = iterator.get_next()
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/tensorflow/python/data/ops/iterator_ops.py", line 421, in get_next
    name=name)), self._output_types,
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/tensorflow/python/ops/gen_dataset_ops.py", line 2069, in iterator_get_next
    output_shapes=output_shapes, name=name)
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
    op_def=op_def)
  File "/home/dan/.virtualenvs/tf/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
    self._traceback = tf_stack.extract_stack()

OutOfRangeError (see above for traceback): End of sequence
	 [[node IteratorGetNext_19 (defined at <ipython-input-42-29d07066bdb3>:3)  = IteratorGetNext[output_shapes=[[]], output_types=[DT_INT64], _device="/job:localhost/replica:0/task:0/device:CPU:0"](OneShotIterator_15)]]


## 2.2 initializable iterator

경우에 따라 항상 일정한 dataset을 사용하는 것이 아니라 적절한 parameter를 주어서 dataset을 제어해야하는 경우가 있다.
이 경우 initializable iterator는 session을 통해 한 번 초기화가 필요한 대신 dataset의 정의를 제어할 수 있다.

다음의 예시는 tf.placeholder를 통해 dataset의 parameter를 initializable iterator에 fed하는 방식을 소개한다.
위에서와 마찬가지로 Dataset.range 함수를 사용하나 max_value를 parameter로 제어하는 예시이다.

In [3]:
max_value = tf.placeholder(tf.int64, shape=[])
dataset = tf.data.Dataset.range(max_value)
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    # Initialize an iterator over a dataset with 10 elements.
    sess.run(iterator.initializer, feed_dict={max_value: 10})
    for i in range(10):
      value = sess.run(next_element)
      assert i == value

    # Initialize the same iterator over a dataset with 100 elements.
    sess.run(iterator.initializer, feed_dict={max_value: 100})
    for i in range(100):
      value = sess.run(next_element)
      assert i == value

## 2.3 reinitializable iterator

reinitializable iterator는 두 개 이상의 dataset을 다른 방식으로 제어하고 싶을 때 사용한다.
예를 들면 아래의 예시는 training dataset과 validation dataset 두 가지를 사용할 때, 우리는 경우에 따라서 training dataset에 generalization을 위해 perturbation을 추가하는 경우가 많다. 반면에 validation dataset에는 그러한 처리가 들어가지 않기를 바랄 수 있다. 그런 경우 reinitializable iterator를 사용하면 편리하다.

In [39]:
# Define training and validation datasets with the same structure.
training_dataset = tf.data.Dataset.range(100).map(
    lambda x: x + tf.random_uniform([], -10, 10, tf.int64))
validation_dataset = tf.data.Dataset.range(50)

# A reinitializable iterator is defined by its structure. We could use the
# `output_types` and `output_shapes` properties of either `training_dataset`
# or `validation_dataset` here, because they are compatible.
iterator = tf.data.Iterator.from_structure(training_dataset.output_types,
                                           training_dataset.output_shapes)
next_element = iterator.get_next()

training_init_op = iterator.make_initializer(training_dataset)
validation_init_op = iterator.make_initializer(validation_dataset)

# Run 20 epochs in which the training dataset is traversed, followed by the
# validation dataset.

with tf.Session() as sess:
    for _ in range(20):
      # Initialize an iterator over the training dataset.
      sess.run(training_init_op)
      for _ in range(100):
        sess.run(next_element)

      # Initialize an iterator over the validation dataset.
      sess.run(validation_init_op)
      for _ in range(50):
        sess.run(next_element)

## 2.4 feedable iterator

feedable iterator는 기본적으로 reinitializable iterator와 동일한 mechanism을 갖는다. 그러나 좀 더 iterator를 유연하게 스위칭할 수 있게 해준다. reinitialzable iterator는 iterator 스위칭을 위해 매번 초기화를 다시 해줘야 하지만 feedable iterator는 우리에게 익숙한 feed_dict와 placeholder를 사용하여 좀 더 유연하게 iterator를 변경하며 사용할 수 있게 해준다.

In [6]:
# Define training and validation datasets with the same structure.
training_dataset = tf.data.Dataset.range(100).map(
    lambda x: x + tf.random_uniform([], -10, 10, tf.int64)).repeat()
validation_dataset = tf.data.Dataset.range(50)

# A feedable iterator is defined by a handle placeholder and its structure. We
# could use the `output_types` and `output_shapes` properties of either
# `training_dataset` or `validation_dataset` here, because they have
# identical structure.
handle = tf.placeholder(tf.string, shape=[])
iterator = tf.data.Iterator.from_string_handle(
    handle, training_dataset.output_types, training_dataset.output_shapes)
next_element = iterator.get_next()

# You can use feedable iterators with a variety of different kinds of iterator
# (such as one-shot and initializable iterators).
training_iterator = training_dataset.make_one_shot_iterator()
validation_iterator = validation_dataset.make_initializable_iterator()


sess = tf.Session()

# The `Iterator.string_handle()` method returns a tensor that can be evaluated
# and used to feed the `handle` placeholder.
training_handle = sess.run(training_iterator.string_handle())
validation_handle = sess.run(validation_iterator.string_handle())

# Loop forever, alternating between training and validation.
for i in range(10):
  # Run 200 steps using the training dataset. Note that the training dataset is
  # infinite(training_dataset has repeat() method at the end), 
  # and we resume from where we left off in the previous for loop iteration.
  for _ in range(200):
    sess.run(next_element, feed_dict={handle: training_handle})

  # Run one pass over the validation dataset.
  sess.run(validation_iterator.initializer)
  for _ in range(50):
    sess.run(next_element, feed_dict={handle: validation_handle})
    
sess.close()