# tf.data로 input pipeline 구축하기

다음 colab을 따라한 것임
* https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/guide/data.ipynb

In [1]:
import tensorflow as tf

import pathlib
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

np.set_printoptions(precision=4)

## Basic mechanism

* `from_tensor_slices`로 Dataset 생성
* for in 이나 next로 개별 data 접근하기
* lambda 함수 적용 예시 (reduce)

In [2]:
dataset = tf.data.Dataset.from_tensor_slices([1,2,3])
dataset

<TensorSliceDataset shapes: (), types: tf.int32>

In [3]:
for elem in dataset:
    print(elem.numpy())

1
2
3


In [4]:
it = iter(dataset)
it

<tensorflow.python.data.ops.iterator_ops.OwnedIterator at 0x7f6252fde7b8>

In [5]:
print(next(it).numpy())

1


## Dataset의 element_spec (tf.TypeSpec) 확인하기

In [6]:
dataset1 = tf.data.Dataset.from_tensor_slices(tf.random.uniform([4, 10]))
dataset1.element_spec

TensorSpec(shape=(10,), dtype=tf.float32, name=None)

In [7]:
dataset2 = tf.data.Dataset.from_tensor_slices(
    (tf.random.uniform([4]), tf.random.uniform([4,100], maxval=100, dtype=tf.int32)))
dataset2.element_spec

(TensorSpec(shape=(), dtype=tf.float32, name=None),
 TensorSpec(shape=(100,), dtype=tf.int32, name=None))

In [8]:
dataset3 = tf.data.Dataset.zip((dataset1, dataset2))
dataset3.element_spec

(TensorSpec(shape=(10,), dtype=tf.float32, name=None),
 (TensorSpec(shape=(), dtype=tf.float32, name=None),
  TensorSpec(shape=(100,), dtype=tf.int32, name=None)))

In [9]:
# with sparse tensor
dataset4 = tf.data.Dataset.from_tensors(
    tf.SparseTensor(indices=[[0,0], [1,2]], values=[1,2], dense_shape=[3,4]))
print(dataset4.element_spec)
print(dataset4.element_spec.value_type)

SparseTensorSpec(TensorShape([3, 4]), tf.int32)
<class 'tensorflow.python.framework.sparse_tensor.SparseTensor'>


## Numpy array 로 dataset 생성하기

In [10]:
# Load Fashion-MNIST through Keras and split off the training pair.
train, test = tf.keras.datasets.fashion_mnist.load_data()

images, labels = train
# True division by 255 rescales the pixel values and yields a float64 array.
images = images / 255

print(type(images))

# Slice along the first axis: each dataset element is one (image, label) pair.
dataset = tf.data.Dataset.from_tensor_slices((images, labels))
dataset

<class 'numpy.ndarray'>


<TensorSliceDataset shapes: ((28, 28), ()), types: (tf.float64, tf.uint8)>

## Python generator로 Dataset 생성하기 

In [11]:
# generator 
def pyfunc_count(stop):
    """Yield 0, 1, 2, ... for as long as the counter is strictly below `stop`."""
    current = 0
    while True:
        # Same `<` test as a plain `while current < stop` loop.
        if not current < stop:
            return
        yield current
        current += 1
        
for n in pyfunc_count(3):
    print(n)        

0
1
2


In [12]:
# Wrap the Python generator `pyfunc_count` as a tf.data.Dataset.
# `args=[3]` is handed to the generator as its `stop` argument, and every
# yielded value is declared as a scalar (shape ()) tf.int32 element.
ds_counter = tf.data.Dataset.from_generator(
    pyfunc_count, 
    args=[3], 
    output_types=tf.int32,
    output_shapes=(), )

In [13]:
for count in ds_counter.take(10):   # single element generation
    print(count.numpy())

0
1
2


In [14]:
for count_batch in ds_counter.repeat().batch(5).take(3): # mini-batch generation
    print(count_batch.numpy())

[0 1 2 0 1]
[2 0 1 2 0]
[1 2 0 1 2]


## 가변 series를 generate 하고, padded batch 만들기

In [15]:
def gen_series():
    """Yield `(index, series)` pairs forever.

    `index` counts up from 0. `series` is a 1-D NumPy array holding between
    0 and 4 samples drawn from a standard normal distribution, so its length
    varies from one element to the next.
    """
    idx = 0
    while True:
        length = np.random.randint(0, 5)  # upper bound is exclusive
        series = np.random.normal(size=(length,))
        yield idx, series
        idx += 1
        
for i, series in gen_series():
    print(i, ":", str(series))
    if i > 5:
        break        

0 : [-0.2034  1.9789  0.1841  0.8326]
1 : [0.0763 0.0736 0.3123 0.1224]
2 : [-1.2074 -0.4304]
3 : [ 1.8337 -0.0877  0.395  -0.2962]
4 : [ 0.8646  0.7961 -1.0588 -0.5157]
5 : [-0.125  -1.2264 -0.381   0.1834]
6 : [-0.0046  0.9751 -1.1306]


In [16]:
# Wrap the endless `gen_series` generator as a Dataset of (index, series) pairs:
# a scalar tf.int32 index and a 1-D tf.float32 series.
ds_series = tf.data.Dataset.from_generator(
    gen_series,
    output_types=(tf.int32, tf.float32),
    output_shapes=((), (None, )))    # None marks the variable-length dimension
ds_series

<FlatMapDataset shapes: ((), (None,)), types: (tf.int32, tf.float32)>

In [17]:
# Group 5 variable-length series per batch; shorter series are padded so that
# every row of a batch has the same length.
ds_series_batch = ds_series.padded_batch(5)

# Pull the first batch only.
ids, seq_batch = next(iter(ds_series_batch))

seq_batch.numpy()   # inspect the zero-padded, fixed-length batch

array([[-0.301 ,  0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    ,  0.    ],
       [-0.4074,  0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    ,  0.    ],
       [-1.3198,  1.3002,  0.8058,  0.5828]], dtype=float32)

## Image Generator로부터 dataset 만들기

In [18]:
flowers = tf.keras.utils.get_file(
    'flower_photos',
    'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
    untar=True)

In [19]:
img_gen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255, rotation_range=20)

images, labels = next(img_gen.flow_from_directory(flowers)) # tf.keras의 generator

print(images.dtype, images.shape)
print(labels.dtype, labels.shape)

Found 3670 images belonging to 5 classes.
float32 (32, 256, 256, 3)
float32 (32, 5)


In [20]:
# Wrap the Keras directory iterator as a Dataset. The lambda is the callable
# that from_generator invokes to obtain the iterator.
# Each element is one whole generator batch: a (32, 256, 256, 3) image tensor
# and a (32, 5) label tensor, both float32.
# NOTE(review): the shapes hard-code a batch of 32; a smaller final batch from
# the generator would not match them -- confirm before iterating a full epoch.
ds = tf.data.Dataset.from_generator(
    lambda: img_gen.flow_from_directory(flowers),
    output_types = (tf.float32, tf.float32),
    output_shapes = ([32,256,256,3], [32,5])
)
ds.element_spec

(TensorSpec(shape=(32, 256, 256, 3), dtype=tf.float32, name=None),
 TensorSpec(shape=(32, 5), dtype=tf.float32, name=None))

In [21]:
images, labels = next(iter(ds))   # pull one batch by iterating the tf.data Dataset

# The values are now tensors, so dtype prints as a tf dtype object.
print(images.dtype, images.shape)
print(labels.dtype, labels.shape)

Found 3670 images belonging to 5 classes.
<dtype: 'float32'> (32, 256, 256, 3)
<dtype: 'float32'> (32, 5)


## TFRecord 로부터 TFRecordDataset 만들기

In [24]:
fsns_test_file = tf.keras.utils.get_file("fsns.tfrec", "https://storage.googleapis.com/download.tensorflow.org/data/fsns-20160927/testdata/fsns-00000-of-00001")

In [27]:
dataset = tf.data.TFRecordDataset(filenames=[fsns_test_file])
dataset

<TFRecordDatasetV2 shapes: (), types: tf.string>

In [32]:
raw_example = next(iter(dataset))
parsed = tf.train.Example.FromString(raw_example.numpy())
parsed.features.feature['image/text']

bytes_list {
  value: "Rue Perreyon"
}

## Text data로부터 TextLineDataset 만들기

In [33]:
# 텍스트 파일들 가져오기
directory_url = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
file_names = ['cowper.txt', 'derby.txt', 'butler.txt']

file_paths = [
    tf.keras.utils.get_file(file_name, directory_url + file_name)
    for file_name in file_names
]

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/cowper.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/derby.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/butler.txt


In [34]:
dataset = tf.data.TextLineDataset(file_paths)
dataset

<TextLineDatasetV2 shapes: (), types: tf.string>

In [36]:
line = next(iter(dataset))
print(line.numpy())

b"\xef\xbb\xbfAchilles sing, O Goddess! Peleus' son;"


#### interleave를 사용해서 여러 개의 file로부터 온 line들을 섞기

In [40]:
# One dataset element per file path.
files_ds = tf.data.Dataset.from_tensor_slices(file_paths)
# Open each path as a TextLineDataset and take lines from the three files in
# turn (cycle_length=3).
lines_ds = files_ds.interleave(tf.data.TextLineDataset, cycle_length=3)

# Print the first 9 lines, with a blank line before each group of three.
for i, line in enumerate(lines_ds.take(9)):
    if i % 3 == 0:
        print()
    print(line.numpy())


b"\xef\xbb\xbfAchilles sing, O Goddess! Peleus' son;"
b"\xef\xbb\xbfOf Peleus' son, Achilles, sing, O Muse,"
b'\xef\xbb\xbfSing, O goddess, the anger of Achilles son of Peleus, that brought'

b'His wrath pernicious, who ten thousand woes'
b'The vengeance, deep and deadly; whence to Greece'
b'countless ills upon the Achaeans. Many a brave soul did it send'

b"Caused to Achaia's host, sent many a soul"
b'Unnumbered ills arose; which many a soul'
b'hurrying down to Hades, and many a hero did it yield a prey to dogs and'


#### 불필요한 line skip 및 line filter 조건 

In [42]:
titanic_file = tf.keras.utils.get_file("train.csv", "https://storage.googleapis.com/tf-datasets/titanic/train.csv")
titanic_lines = tf.data.TextLineDataset(titanic_file)

for line in titanic_lines.take(10):
    print(line.numpy())

b'survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone'
b'0,male,22.0,1,0,7.25,Third,unknown,Southampton,n'
b'1,female,38.0,1,0,71.2833,First,C,Cherbourg,n'
b'1,female,26.0,0,0,7.925,Third,unknown,Southampton,y'
b'1,female,35.0,1,0,53.1,First,C,Southampton,n'
b'0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y'
b'0,male,2.0,3,1,21.075,Third,unknown,Southampton,n'
b'1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n'
b'1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n'
b'1,female,4.0,1,1,16.7,Third,G,Southampton,n'


In [44]:
def survived(line):
    """Return a boolean tensor that is True unless `line` starts with "0".

    Intended as a `Dataset.filter` predicate over the Titanic CSV lines, whose
    first column is the `survived` flag.
    """
    first_char = tf.strings.substr(line, pos=0, len=1)
    return tf.not_equal(first_char, "0")
    
survivors = titanic_lines.skip(1).filter(survived)

for line in survivors.take(10):
    print(line.numpy())

b'1,female,38.0,1,0,71.2833,First,C,Cherbourg,n'
b'1,female,26.0,0,0,7.925,Third,unknown,Southampton,y'
b'1,female,35.0,1,0,53.1,First,C,Southampton,n'
b'1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n'
b'1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n'
b'1,female,4.0,1,1,16.7,Third,G,Southampton,n'
b'1,male,28.0,0,0,13.0,Second,unknown,Southampton,y'
b'1,female,28.0,0,0,7.225,Third,unknown,Cherbourg,y'
b'1,male,28.0,0,0,35.5,First,A,Southampton,y'
b'1,female,38.0,1,5,31.3875,Third,unknown,Southampton,n'


## CSV -> Pandas -> Dataset 만들기

In [46]:
titanic_file = tf.keras.utils.get_file("train.csv", "https://storage.googleapis.com/tf-datasets/titanic/train.csv")
df = pd.read_csv(titanic_file)

titanic_ds = tf.data.Dataset.from_tensor_slices(dict(df))

feature_batch = next(iter(titanic_ds))
for key, value in feature_batch.items():
    print("  {!r:20s}: {}".format(key, value))

  'survived'          : 0
  'sex'               : b'male'
  'age'               : 22.0
  'n_siblings_spouses': 1
  'parch'             : 0
  'fare'              : 7.25
  'class'             : b'Third'
  'deck'              : b'unknown'
  'embark_town'       : b'Southampton'
  'alone'             : b'n'


## CSV 로부터 한번에 Dataset 만들기 (from disk directly)

In [51]:
# Read the CSV straight from disk in batches of 4 rows, keeping only the
# selected columns and splitting off "survived" as the label.
titanic_batches = tf.data.experimental.make_csv_dataset(
    titanic_file, batch_size=4,
    select_columns=['class', 'fare', 'survived'],
    label_name="survived"
)
titanic_batches

<PrefetchDataset shapes: (OrderedDict([(fare, (4,)), (class, (4,))]), (4,)), types: (OrderedDict([(fare, tf.float32), (class, tf.string)]), tf.int32)>

In [53]:
feature_batch, label_batch = next(iter(titanic_batches))
print("'survived': {}".format(label_batch))
for key, value in feature_batch.items():
    print("  {!r:20s}: {}".format(key, value))

'survived': [1 0 1 0]
  'fare'              : [ 26.2875   7.0542 146.5208  26.25  ]
  'class'             : [b'First' b'Third' b'First' b'Second']


## CSV 로부터 한번에 CsvDataset 만들기

In [54]:
# One dtype per CSV column, in file order (the file has 10 columns).
titanic_types  = [tf.int32, tf.string, tf.float32, tf.int32, tf.int32, tf.float32, tf.string, tf.string, tf.string, tf.string] 
# header=True skips the header row, so only data rows are parsed.
dataset = tf.data.experimental.CsvDataset(titanic_file, titanic_types, header=True)
dataset

<CsvDatasetV2 shapes: ((), (), (), (), (), (), (), (), (), ()), types: (tf.int32, tf.string, tf.float32, tf.int32, tf.int32, tf.float32, tf.string, tf.string, tf.string, tf.string)>

In [56]:
line = next(iter(dataset))
print( [item.numpy() for item in line] )

[0, b'male', 22.0, 1, 0, 7.25, b'Third', b'unknown', b'Southampton', b'n']


## 복수 파일로부터 dataset 만들기

In [59]:
flowers_root = tf.keras.utils.get_file(
    'flower_photos',
    'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
    untar=True)
flowers_root = pathlib.Path(flowers_root)

for item in flowers_root.glob("*"):
    print(item.name)   # subdirectory로 개별 클래스 인스턴스 파일들을 가지고 있다.

daisy
tulips
roses
dandelion
sunflowers
LICENSE.txt


In [62]:
list_ds = tf.data.Dataset.list_files(str(flowers_root/'*/*'))

for f in list_ds.take(5):
    print(f.numpy())

b'/home/hoondori/.keras/datasets/flower_photos/tulips/7094415739_6b29e5215c_m.jpg'
b'/home/hoondori/.keras/datasets/flower_photos/roses/6687138903_ff6ae12758_n.jpg'
b'/home/hoondori/.keras/datasets/flower_photos/daisy/18023717391_e2c9089e10.jpg'
b'/home/hoondori/.keras/datasets/flower_photos/sunflowers/4933823922_911ac40b0d.jpg'
b'/home/hoondori/.keras/datasets/flower_photos/dandelion/4226758402_a1b75ce3ac_n.jpg'


In [63]:
# path에서 label을 추출해서 label을 가진 ds 만들기

def process_path(file_path):
    """Return `(raw file contents, label)` for a single file path.

    The label is the second-to-last path component, i.e. the name of the
    directory that directly contains the file.
    """
    path_parts = tf.strings.split(file_path, os.sep)
    label = path_parts[-2]
    contents = tf.io.read_file(file_path)
    return contents, label

labeled_ds = list_ds.map(process_path)

image_raw, label_text = next(iter(labeled_ds))
print(repr(image_raw.numpy())[:100])
print()
print(label_text.numpy())




b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xdb\x00C\x00\x03\x02\x02\

b'dandelion'


## BatchDataset 만들기

In [74]:
# Integers 0..99.
dataset = tf.data.Dataset.range(100)
# Map each x to a vector of length x filled with the value x (x = 0 gives an
# empty vector), producing elements of varying length.
dataset = dataset.map(lambda x: tf.fill([tf.cast(x,tf.int32)],x))

for elem in dataset.take(5):
    print(elem.numpy())

[]
[1]
[2 2]
[3 3 3]
[4 4 4 4]


In [77]:
# Build padded batches: each mini-batch is padded to the length of the longest
# sequence inside that mini-batch.

pad_dataset = dataset.padded_batch(batch_size=3, padded_shapes=(None,))

for elem in pad_dataset.take(5):
    print(elem.numpy())

[[0 0]
 [1 0]
 [2 2]]
[[3 3 3 0 0]
 [4 4 4 4 0]
 [5 5 5 5 5]]
[[6 6 6 6 6 6 0 0]
 [7 7 7 7 7 7 7 0]
 [8 8 8 8 8 8 8 8]]
[[ 9  9  9  9  9  9  9  9  9  0  0]
 [10 10 10 10 10 10 10 10 10 10  0]
 [11 11 11 11 11 11 11 11 11 11 11]]
[[12 12 12 12 12 12 12 12 12 12 12 12  0  0]
 [13 13 13 13 13 13 13 13 13 13 13 13 13  0]
 [14 14 14 14 14 14 14 14 14 14 14 14 14 14]]
