### Intro: how to use tf.data and how to compare it to other methods

In [12]:
#Tensorflow
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.datasets import cifar100
from tensorflow.data import AUTOTUNE
import tensorflow as tf
#others
from imutils import paths
import numpy as np
import time
import os

#### 1. Create a function to benchmark time

In [3]:
def benchmark(datasetGen, numSteps):
    """Measure how long it takes to draw `numSteps` batches from an iterator.

    Args:
        datasetGen: iterator advanced with the built-in `next`; every item
            must unpack into an (images, labels) pair.
        numSteps: number of batches to draw.

    Returns:
        Elapsed time in seconds, as a float.
    """
    # perf_counter() is monotonic and high-resolution, so the measured interval
    # cannot be distorted by system clock adjustments the way time.time() can.
    start = time.perf_counter()

    for _ in range(numSteps):
        # Unpack each batch as a consumer would; the values are discarded.
        (images, labels) = next(datasetGen)

    end = time.perf_counter()
    return end - start

#### 2. Get the data

In [4]:
# Batch size and number of batches drawn in each benchmark run.
BS = 64
NUM_STEPS = 5000
# Load the CIFAR-100 train/test arrays through the Keras dataset helper.
((trainX, trainY), (testX, testY)) = cifar100.load_data()

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz


#### 3. Program ImageDataGenerator

In [5]:
# Baseline: a Keras ImageDataGenerator with no augmentation configured.
imageGen = ImageDataGenerator()
# Use the shared BS constant rather than a literal 64, so that changing BS keeps
# this generator and the tf.data pipeline on the same batch size.
dataGen = imageGen.flow(x=trainX, y=trainY, batch_size=BS, shuffle=True)

#### 4. Program tf.data

In [6]:
dataset = tf.data.Dataset.from_tensor_slices((trainX, trainY))
# Order matters: cache() replays exactly the elements it saw on the first pass,
# so shuffle() must come *after* it or every epoch repeats one frozen order.
dataset = (dataset.cache().shuffle(1024).repeat().batch(BS).prefetch(AUTOTUNE))
datasetGen = iter(dataset) #create a dataset iterator to pass to the benchmark function

#.cache(): Caches the elements on the first pass. This makes subsequent data reads/accesses faster.
#.shuffle: Fills a buffer of 1024 elements and draws randomly from it (reshuffled on every repeat).
#.repeat(): Restarts the dataset once it is exhausted, so the iterator never runs out.
#.batch(): Groups BS consecutive elements into one batch.
#.prefetch(): Prepares upcoming batches in the background. Improves throughput.

#### 5. Evaluate the methods

In [7]:
# Time the ImageDataGenerator iterator; the label names the method actually
# measured here (it previously said "tf method", copied from the tf.data cell).
totalTime_idg = benchmark(dataGen, NUM_STEPS)
print("total time ImageDataGenerator method:", BS*NUM_STEPS, totalTime_idg)

total time tf method: 320000 10.27061939239502


In [8]:
# Time the tf.data iterator over the same number of batches.
totalTime_tf = benchmark(datasetGen=datasetGen, numSteps=NUM_STEPS)
print("total time tf method:", BS * NUM_STEPS, totalTime_tf)

total time tf method: 320000 1.52409029006958


###### So, tf.data takes 1.52409029006958 seconds to generate the 5000 batches (320000 images), versus 10.27061939239502 seconds for ImageDataGenerator. This is about 6.7 times faster.

#### But how do we use tf.data when the data is on disk?

In [17]:
# Preview the entries of the fruits folder, sorted by name.
np.array(sorted(os.listdir("G:/pyimage_univ/CNN_tf/tf.data/fruits")))

array(['apple', 'broccoli', 'grape', 'lemon', 'mango', 'orange',
       'strawberry'], dtype='<U10')

In [21]:
#get the information
fruitsDir = "G:/pyimage_univ/CNN_tf/tf.data/fruits"
#goes through all folders and looks for images
imagePaths = list(paths.list_images(fruitsDir))
#get all folder names; plain files in the root (e.g. Thumbs.db) are skipped so
#they cannot become bogus classes and shift the label indices
classNames = np.array(sorted(
    entry for entry in os.listdir(fruitsDir)
    if os.path.isdir(os.path.join(fruitsDir, entry))))

In [11]:
#create a function to load the data and pre-process it using tf
def load_images(imagePath):
    """Load one image file and derive its integer class label from its path.

    Args:
        imagePath: path of the image file. The second-to-last component of the
            path (split on os.path.sep) is used as the class name.

    Returns:
        Tuple (image, encodedLabel): the image decoded with 3 channels, resized
        to 96x96 and divided by 255.0, and the argmax of the element-wise
        comparison between the class name and the global `classNames`.
    """
    #--images
    decoded = tf.image.decode_png(tf.io.read_file(imagePath), channels=3)
    resized = tf.image.resize(decoded, (96, 96))
    image = resized / 255.0 #rescale the pixels
    #--labels
    pathParts = tf.strings.split(imagePath, os.path.sep)
    folderName = pathParts[-2] #the name of the folder
    #boolean vector marking which entry of classNames equals the folder name
    matches = folderName == classNames
    encodedLabel = tf.argmax(matches) #index of the matching class

    return (image, encodedLabel)

#### Build the pipeline

In [22]:
dataset = tf.data.Dataset.from_tensor_slices(imagePaths)
print(dataset)
#map works like the normal map function of python. Every image path will go through the load_images function
#cache() replays exactly the elements of its first pass, so shuffle() is placed
#after it; shuffling before the cache would freeze one order for every epoch.
#Note: the shuffle buffer now holds 1024 decoded images instead of 1024 paths.
dataset = (dataset
           .map(load_images, num_parallel_calls=AUTOTUNE)
           .cache()
           .shuffle(1024)
           .repeat()
           .batch(BS)
           .prefetch(AUTOTUNE))
print(dataset)

<TensorSliceDataset shapes: (), types: tf.string>
<PrefetchDataset shapes: ((None, 96, 96, 3), (None,)), types: (tf.float32, tf.int64)>


In [23]:
### Compare to ImageDataGenerator
print("[INFO] creating a ImageDataGenerator object...")
# Generator that multiplies pixel values by 1/255 as batches are produced.
imageGen = ImageDataGenerator(rescale=1.0 / 255)
# Stream batches of BS RGB images, resized to 96x96, from the fruits directory.
dataGen = imageGen.flow_from_directory(
    directory="G:/pyimage_univ/CNN_tf/tf.data/fruits",
    color_mode="rgb",
    target_size=(96, 96),
    class_mode="categorical",
    batch_size=BS)

[INFO] creating a ImageDataGenerator object...
Found 6688 images belonging to 7 classes.


In [28]:
# Time the directory-backed ImageDataGenerator over NUM_STEPS batches.
totalTime = benchmark(datasetGen=dataGen, numSteps=NUM_STEPS)
print("idg", BS * NUM_STEPS, totalTime)

In [25]:
# Fresh iterator over the on-disk tf.data pipeline, timed over NUM_STEPS batches.
datasetGen = iter(dataset)
totalTime = benchmark(datasetGen=datasetGen, numSteps=NUM_STEPS)
print("tf.data()", BS * NUM_STEPS, totalTime)

[INFO] tf.data generated 320000 images in 35.85 seconds...
