<a href="https://colab.research.google.com/github/iypc-team/CoLab/blob/master/data_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Placeholder cell: prints an empty line.
print()

In [None]:
# Import each third-party dependency, installing it through the notebook's
# %pip magic and retrying the import when it is missing from the kernel.
try: import tensorflow as tf
except ModuleNotFoundError:
    %pip install tensorflow
    import tensorflow as tf

try: import matplotlib.pyplot as plt
except ModuleNotFoundError:
    %pip install matplotlib
    import matplotlib.pyplot as plt

try: import pandas as pd
except ModuleNotFoundError:
    %pip install pandas
    import pandas as pd

# NumPy additionally gets its print precision set to 4 digits on both paths.
try: 
    import numpy as np
    np.set_printoptions(precision=4)
except ModuleNotFoundError:
    %pip install numpy
    import numpy as np
    np.set_printoptions(precision=4)

import pathlib
import os
from os.path import *
# import matplotlib.pyplot as plt
# import pandas as pd
# import numpy as np
import shutil

# Working directory at the time this cell runs; compared with '/content'
# below to decide where the image folder is expected.
contentPth = os.getcwd()

if contentPth == '/content':
    # Image folder is looked up under the Drive mount point used further down.
    gdrivePth = join(contentPth, 'gdrive')
    myDrivePth = join(gdrivePth, 'My Drive')
    tfImagesPth = join(myDrivePth, 'TensorflowImages')
else: tfImagesPth = join(contentPth, 'TensorflowImages')

# Delete the sample_data folder when it exists.
if os.path.exists('/content/sample_data'):
    shutil.rmtree('/content/sample_data')

try:
    from google.colab import drive
    drive.flush_and_unmount()
    drive.mount('/content/gdrive', force_remount=True)
    # NOTE(review): myDrivePth is only assigned when cwd == '/content' above;
    # re-running this cell from another directory would raise NameError here.
    os.chdir(myDrivePth)
    # Presumably these helper modules live in the Drive root and are found
    # through the current directory -- confirm.
    from BashColors import C
    from TarfileFunctions import tff
    os.chdir(contentPth)
except ModuleNotFoundError:
    # NOTE(review): this handler also runs when BashColors/TarfileFunctions
    # are missing on Drive, not only when google.colab is unavailable.
    os.chdir(contentPth)
    from BashColors import C
    from TarfileFunctions import tff
    tfImagesPth = join(contentPth, 'TensorflowImages')

# Report the directory the notebook ended up in.
print(f'cwd: {C.IBlue}{pathlib.Path.cwd()}')

In [None]:
# flowers = tf.keras.utils.get_file(
    # 'flower_photos',
    # 'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
    # untar=True)

Create the `image.ImageDataGenerator`

In [None]:
# Generator that multiplies every pixel value by 1/255 when loading images.
img_gen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

In [None]:
# Pull a single (images, labels) batch from the image folder.
images, labels = next(img_gen.flow_from_directory(tfImagesPth))

In [None]:
# Show dtype and shape of the batch fetched in the previous cell.
print(images.dtype, images.shape)
print(labels.dtype, labels.shape)

In [None]:
# Wrap the Keras directory iterator in a tf.data.Dataset.
# The batch dimension is declared as None instead of a hard-coded count:
# the number of images per batch depends on the folder contents and on the
# iterator's batch size, and a fixed leading dimension makes from_generator
# reject any batch of a different size.
ds = tf.data.Dataset.from_generator(
    lambda: img_gen.flow_from_directory(tfImagesPth),
    output_types=(tf.float32, tf.float32),
    output_shapes=([None, 256, 256, 3], [None, 3])
)

ds.element_spec

In [None]:
# Inspect the first ten batches produced by the dataset.
# The loop target is named `labels` so the printed shape belongs to the
# current batch rather than to a stale variable from an earlier cell.
for images, labels in ds.take(10):
    print(images.__class__)
    print('images.shape: ', images.shape)
    print('labels.shape: ', labels.shape)
    print()

In [None]:
# Dataset of image file paths, one level below the per-class directories
# (<tfImagesPth>/<class>/<file>). Later cells read it as `list_ds`, and
# parse_image takes the label from the parent directory name.
list_ds = tf.data.Dataset.list_files(os.path.join(tfImagesPth, '*', '*'))

Write a function that manipulates the dataset elements.

In [None]:
# Reads an image from a file, decodes it into a dense tensor, and resizes it
# to a fixed shape.
def parse_image(filename):
  """Load one JPEG file as a float32 256x256 image plus its label.

  Args:
    filename: string tensor holding the path of the image file.

  Returns:
    A tuple `(image, label)`: the decoded image converted to float32 and
    resized to 256x256, and the second-to-last path component (the name
    of the directory containing the file).
  """
  # The parent directory name serves as the label.
  label = tf.strings.split(filename, os.sep)[-2]

  raw_bytes = tf.io.read_file(filename)
  decoded = tf.image.decode_jpeg(raw_bytes)
  as_float = tf.image.convert_image_dtype(decoded, tf.float32)
  resized = tf.image.resize(as_float, [256, 256])
  return resized, label

Test that it works.

In [None]:
# Take one path from the file dataset and run it through parse_image.
file_path = next(iter(list_ds))
image, label = parse_image(file_path)

def show(image, label):
  """Draw `image` in a new figure, titled with the UTF-8 decoded label.

  Args:
    image: image data accepted by `plt.imshow`.
    label: tensor whose `.numpy()` value is a bytes object.
  """
  plt.figure()
  plt.imshow(image)
  title_text = label.numpy().decode('utf-8')
  plt.title(title_text)
  plt.axis('off')  # hide the axes around the picture

# Display the image parsed above.
show(image, label)

Map it over the dataset.

In [None]:
# Apply parse_image to every file path in the dataset.
images_ds = list_ds.map(parse_image)

# Preview the first two parsed images.
for image, label in images_ds.take(2):
  show(image, label)

### Applying arbitrary Python logic

For performance reasons, use TensorFlow operations for
preprocessing your data whenever possible. However, it is sometimes useful to
call external Python libraries when parsing your input data. You can use the `tf.py_function()` operation in a `Dataset.map()` transformation.

For example, if you want to apply a random rotation, the `tf.image` module only has `tf.image.rot90`, which is not very useful for image augmentation. 

Note: `tensorflow_addons` has a TensorFlow compatible `rotate` in `tensorflow_addons.image.rotate`.

To demonstrate `tf.py_function`, try using the `scipy.ndimage.rotate` function instead:

In [None]:
import scipy.ndimage as ndimage

def random_rotate_image(image):
  """Rotate `image` by a random angle drawn uniformly from [-30, 30).

  The angle is passed to `scipy.ndimage.rotate` with `reshape=False`, so
  the result has the same shape as the input.
  """
  angle = np.random.uniform(-30, 30)
  return ndimage.rotate(image, angle, reshape=False)

In [None]:
# Rotate the first parsed image eagerly and display the result.
image, label = next(iter(images_ds))
image = random_rotate_image(image)
show(image, label)

To use this function with `Dataset.map`, the same caveats apply as with `Dataset.from_generator`: you need to describe the return shapes and types when you apply the function:

In [None]:
def tf_random_rotate_image(image, label):
  """Apply `random_rotate_image` through `tf.py_function`.

  Args:
    image: image tensor to rotate.
    label: passed through unchanged.

  Returns:
    `(rotated_image, label)`; the rotated tensor is float32 and is given
    the static shape of the input.
  """
  static_shape = image.shape
  (rotated,) = tf.py_function(random_rotate_image, [image], [tf.float32])
  # Re-attach the input's static shape to the py_function output.
  rotated.set_shape(static_shape)
  return rotated, label

In [None]:
# Map the py_function-based rotation over the parsed image dataset.
rot_ds = images_ds.map(tf_random_rotate_image)

# Preview two rotated images.
for image, label in rot_ds.take(2):
  show(image, label)

### Parsing `tf.Example` protocol buffer messages

Many input pipelines extract `tf.train.Example` protocol buffer messages from a
TFRecord format. Each `tf.train.Example` record contains one or more "features",
and the input pipeline typically converts these features into tensors.

In [None]:
# Download (or reuse the cached copy of) the FSNS test shard, then open it
# as a TFRecord dataset.
fsns_test_file = tf.keras.utils.get_file(
    fname="fsns.tfrec",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/fsns-20160927/testdata/fsns-00000-of-00001",
)
dataset = tf.data.TFRecordDataset([fsns_test_file])
dataset

You can work with `tf.train.Example` protos outside of a `tf.data.Dataset` to understand the data:

In [None]:
# Parse the first serialized record into a tf.train.Example proto.
raw_example = next(iter(dataset))
parsed = tf.train.Example.FromString(raw_example.numpy())

# Decode the PNG stored under 'image/encoded' and display it.
feature = parsed.features.feature
raw_img = feature['image/encoded'].bytes_list.value[0]
img = tf.image.decode_png(raw_img)
plt.imshow(img)
plt.axis('off')
# bytes_list values are raw bytes; decode them so the title shows the text
# itself instead of a b'...' representation (same handling as `show`).
_ = plt.title(feature["image/text"].bytes_list.value[0].decode('utf-8'))

In [None]:
# Fetch the first serialized record again for the parsing cells below.
raw_example = next(iter(dataset))

In [None]:
def tf_parse(eg):
  """Extract the encoded image and its text from one serialized example.

  Args:
    eg: scalar string tensor holding a serialized `tf.train.Example`.

  Returns:
    `(encoded_image, text)`, both scalar string tensors.
  """
  feature_spec = {
      'image/encoded': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
      'image/text': tf.io.FixedLenFeature(shape=(), dtype=tf.string),
  }
  # A leading axis is added so the single record is parsed as a batch of
  # one; index 0 removes that axis again.
  batch_of_one = tf.io.parse_example(eg[tf.newaxis], feature_spec)
  return batch_of_one['image/encoded'][0], batch_of_one['image/text'][0]

In [None]:
# Parse the sample record; print its text and the first 20 bytes of the
# encoded image.
img, txt = tf_parse(raw_example)
print(txt.numpy())
print(repr(img.numpy()[:20]), "...")

In [None]:
# Apply tf_parse to every record of the TFRecord dataset.
decoded = dataset.map(tf_parse)
decoded

In [None]:
# Take one batch of ten parsed records and show the image batch's shape.
image_batch, text_batch = next(iter(decoded.batch(10)))
image_batch.shape

## Iterator Checkpointing

TensorFlow supports [taking checkpoints](https://www.tensorflow.org/guide/checkpoint) so that when your training process restarts it can restore the latest checkpoint to recover most of its progress. In addition to checkpointing the model variables, you can also checkpoint the progress of the dataset iterator. This could be useful if you have a large dataset and don't want to start the dataset from the beginning on each restart. Note, however, that iterator checkpoints may be large, since transformations such as `shuffle` and `prefetch` require buffering elements within the iterator.

To include your iterator in a checkpoint, pass the iterator to the `tf.train.Checkpoint` constructor.

In [None]:
range_ds = tf.data.Dataset.range(20)

iterator = iter(range_ds)
# Track the iterator next to a step variable so its position is part of
# the checkpoint.
ckpt = tf.train.Checkpoint(step=tf.Variable(0), iterator=iterator)
manager = tf.train.CheckpointManager(ckpt, '/tmp/my_ckpt', max_to_keep=3)

# Consume five elements before saving.
print([next(iterator).numpy() for _ in range(5)])

# Save a checkpoint with the iterator at its current position.
save_path = manager.save()

# Advance five more elements past the saved position.
print([next(iterator).numpy() for _ in range(5)])

# Restore the latest checkpoint; the final print is expected to repeat the
# five elements that followed the save.
ckpt.restore(manager.latest_checkpoint)

print([next(iterator).numpy() for _ in range(5)])

Note: It is not possible to checkpoint an iterator which relies on external state such as a `tf.py_function`. Attempting to do so will raise an exception complaining about the external state.

## Using tf.data with tf.keras

The `tf.keras` API simplifies many aspects of creating and executing machine
learning models. Its `.fit()`, `.evaluate()`, and `.predict()` APIs support datasets as inputs. Here is a quick dataset and model setup:

In [None]:
# Load Fashion-MNIST as (train, test) pairs of arrays.
train, test = tf.keras.datasets.fashion_mnist.load_data()

images, labels = train
# Divide pixel values by 255 and store the labels as int32.
images = images/255.0
labels = labels.astype(np.int32)

In [None]:
# Slice the in-memory arrays into (image, label) pairs, then shuffle with a
# 5000-element buffer and group into batches of 32.
fmnist_train_ds = (
    tf.data.Dataset.from_tensor_slices((images, labels))
    .shuffle(5000)
    .batch(32)
)

# Minimal classifier: flatten each image and emit 10 logits.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(10))

# The loss is configured with from_logits=True because the Dense layer has
# no activation.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

Passing a dataset of `(feature, label)` pairs is all that's needed for `Model.fit` and `Model.evaluate`:

In [None]:
# Train for two epochs over the batched dataset.
model.fit(fmnist_train_ds, epochs=2)

If you pass an infinite dataset, for example by calling `Dataset.repeat()`, you just need to also pass the `steps_per_epoch` argument:

In [None]:
# The repeated dataset never ends, so each epoch is capped at 20 steps.
model.fit(fmnist_train_ds.repeat(), epochs=2, steps_per_epoch=20)

For evaluation, you can pass the dataset directly; a finite dataset is evaluated in full:

In [None]:
# Evaluate on the dataset; no step limit is passed.
loss, accuracy = model.evaluate(fmnist_train_ds)
print("Loss :", loss)
print("Accuracy :", accuracy)

For long datasets, set the number of steps to evaluate:

In [None]:
# Evaluate on the repeated dataset, limited to 10 steps.
loss, accuracy = model.evaluate(fmnist_train_ds.repeat(), steps=10)
print("Loss :", loss)
print("Accuracy :", accuracy)

The labels are not required when calling `Model.predict`.

In [None]:
# Dataset of images only (no labels), in batches of 32.
predict_ds = tf.data.Dataset.from_tensor_slices(images).batch(32)
# Predict on 10 batches and print the shape of the stacked outputs.
result = model.predict(predict_ds, steps = 10)
print(result.shape)

But the labels are ignored if you do pass a dataset containing them:

In [None]:
# Predict on 10 batches of the (image, label) dataset and print the shape.
result = model.predict(fmnist_train_ds, steps = 10)
print(result.shape)