In [1]:
# # This code lets a cell display the value of more than one expression without using a print statement.
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

# # Default
# # InteractiveShell.ast_node_interactivity = "last_expr"

In [2]:
# Version Check
import sys
import tensorflow as tf
import tensorflow_datasets as tfds
import PIL
import pandas as pd
import numpy as np
import scipy
# Report the interpreter and library versions this notebook was run with.
versions = (
    ("python", sys.version),
    ("tensorflow", tf.__version__),
    ("tensorflow-datasets", tfds.__version__),
    ("Pillow", PIL.__version__),
    ("pandas", pd.__version__),
    ("numpy", np.__version__),
    ("scipy", scipy.__version__),
)
for package_name, package_version in versions:
    print(package_name, package_version)

# Blank line separating the version list from the GPU/CUDA report.
print()

gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available:", len(gpu_devices))
print("Built with CUDA:", tf.test.is_built_with_cuda())
print("Built with GPU support:", tf.test.is_built_with_gpu_support())

python 3.8.0 (default, Nov  6 2019, 16:00:02) [MSC v.1916 64 bit (AMD64)]
tensorflow 2.6.2
tensorflow-datasets 4.4.0
Pillow 8.3.2
pandas 1.3.3
numpy 1.19.5
scipy 1.7.1

Num GPUs Available: 1
Built with CUDA: True
Built with GPU support: True


## 1. as_numpy_iterator (Dataset -> iterator of NumPy values)

In [3]:
# Build a dataset of the integers 0..9, then materialize its elements
# through the NumPy iterator so the cell displays them as a plain list.
dataset = tf.data.Dataset.range(10)
numpy_elements = dataset.as_numpy_iterator()
list(numpy_elements)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

## 2. apply

In [4]:
dataset = tf.data.Dataset.range(10)


def filter_five(x):
    """Return dataset `x` restricted to the elements that are less than 5."""
    def below_five(value):
        return value < 5

    return x.filter(below_five)


# `apply` hands the whole dataset to the transformation function.
filtered = dataset.apply(filter_five)
list(filtered.as_numpy_iterator())

[0, 1, 2, 3, 4]

## 3. from_tensor_slices (list/numpy array -> Dataset class)

In [5]:
# Turn a Python list into a dataset, show the resulting dataset class,
# then display the elements it yields.
ds = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5])
dataset_type = type(ds)
print(dataset_type)
list(ds.as_numpy_iterator())

<class 'tensorflow.python.data.ops.dataset_ops.TensorSliceDataset'>


[1, 2, 3, 4, 5]

## 4. iteration

In [6]:
ds = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5])

# A dataset can be looped over directly; each element prints as a tf.Tensor.
for element in ds:
    print(element)

tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)


## 5. range (same arguments as Python's built-in range)

In [7]:
# Arguments are (start, stop, step), so this yields 1, 3, 5, 7, 9.
ds = tf.data.Dataset.range(1, 10, 2)

for value in ds:
    print(value)

tf.Tensor(1, shape=(), dtype=int64)
tf.Tensor(3, shape=(), dtype=int64)
tf.Tensor(5, shape=(), dtype=int64)
tf.Tensor(7, shape=(), dtype=int64)
tf.Tensor(9, shape=(), dtype=int64)


## 6. batch

In [8]:
ds = tf.data.Dataset.range(8)

# Group consecutive elements three at a time; the last batch holds the leftovers.
batches = ds.batch(3)
for batch in batches:
    print(batch)

tf.Tensor([0 1 2], shape=(3,), dtype=int64)
tf.Tensor([3 4 5], shape=(3,), dtype=int64)
tf.Tensor([6 7], shape=(2,), dtype=int64)


## 7. drop_remainder (useful for NLP, time series)

In [9]:
ds = tf.data.Dataset.range(8)

# drop_remainder=True discards the final batch because it has fewer than 3 elements.
full_batches = ds.batch(3, drop_remainder=True)
for batch in full_batches:
    print(batch)

tf.Tensor([0 1 2], shape=(3,), dtype=int64)
tf.Tensor([3 4 5], shape=(3,), dtype=int64)


## 8. window (useful for time series)

In [10]:
ds = tf.data.Dataset.range(10)
# Sliding windows of up to 5 elements, advancing by 1; with
# drop_remainder=False the shorter windows at the end are kept.
ds = ds.window(5, shift=1, drop_remainder=False)

# Each window is itself a dataset, so materialize it before printing.
for window in ds:
    window_values = list(window.as_numpy_iterator())
    print(window_values)

[0, 1, 2, 3, 4]
[1, 2, 3, 4, 5]
[2, 3, 4, 5, 6]
[3, 4, 5, 6, 7]
[4, 5, 6, 7, 8]
[5, 6, 7, 8, 9]
[6, 7, 8, 9]
[7, 8, 9]
[8, 9]
[9]


In [11]:
ds = tf.data.Dataset.range(10)
# drop_remainder=True keeps only the windows that contain all 5 elements.
ds = ds.window(5, shift=1, drop_remainder=True)

# Each window is itself a dataset, so materialize it before printing.
for window in ds:
    window_values = list(window.as_numpy_iterator())
    print(window_values)

[0, 1, 2, 3, 4]
[1, 2, 3, 4, 5]
[2, 3, 4, 5, 6]
[3, 4, 5, 6, 7]
[4, 5, 6, 7, 8]
[5, 6, 7, 8, 9]


In [12]:
ds = tf.data.Dataset.range(10)
# shift=2 moves the start of each window forward by two elements.
ds = ds.window(5, shift=2, drop_remainder=True)

# Each window is itself a dataset, so materialize it before printing.
for window in ds:
    window_values = list(window.as_numpy_iterator())
    print(window_values)

[0, 1, 2, 3, 4]
[2, 3, 4, 5, 6]
[4, 5, 6, 7, 8]


## 9. flat_map

In [13]:
ds = tf.data.Dataset.range(10)
ds = ds.window(5, shift=1, drop_remainder=True)

# Printing a window directly shows a nested dataset object, not its values.
for window in ds:
    print(window)

<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>


In [14]:
# Batch every 5-element window into a single tensor and flatten the
# per-window results into one dataset of tensors.
ds = (
    tf.data.Dataset.range(10)
    .window(5, shift=1, drop_remainder=True)
    .flat_map(lambda window: window.batch(5))
)

for window_tensor in ds:
    print(window_tensor)

tf.Tensor([0 1 2 3 4], shape=(5,), dtype=int64)
tf.Tensor([1 2 3 4 5], shape=(5,), dtype=int64)
tf.Tensor([2 3 4 5 6], shape=(5,), dtype=int64)
tf.Tensor([3 4 5 6 7], shape=(5,), dtype=int64)
tf.Tensor([4 5 6 7 8], shape=(5,), dtype=int64)
tf.Tensor([5 6 7 8 9], shape=(5,), dtype=int64)


## 10. shuffle

In [15]:
# Baseline for the shuffle example: elements come out in their original order.
source_values = np.arange(10)
ds = tf.data.Dataset.from_tensor_slices(source_values)

for element in ds:
    print(element)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [16]:
ds = tf.data.Dataset.from_tensor_slices(np.arange(10))
# Shuffle with a 5-element buffer; the printed order varies between runs.
ds = ds.shuffle(buffer_size=5)

for element in ds:
    print(element)

tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


## 11. map

In [17]:
window_size = 5

# Sliding windows of `window_size` consecutive integers, each batched into
# one tensor, then shuffled.
ds = (
    tf.data.Dataset.range(10)
    .window(window_size, shift=1, drop_remainder=True)
    .flat_map(lambda window: window.batch(window_size))
    .shuffle(10)
)

# Split every window into (all but the last element, the last element).
ds = ds.map(lambda sequence: (sequence[:-1], sequence[-1:]))

separator = '==='*10
for features, label in ds:
    print('train set: {}'.format(features))
    print('label set: {}'.format(label))
    print(separator)

train set: [5 6 7 8]
label set: [9]
train set: [3 4 5 6]
label set: [7]
train set: [0 1 2 3]
label set: [4]
train set: [1 2 3 4]
label set: [5]
train set: [2 3 4 5]
label set: [6]
train set: [4 5 6 7]
label set: [8]
