In [1]:
import matplotlib.pyplot as plt
import numpy as np
import PIL
import tensorflow as tf

from tensorflow import keras
from keras import layers
from keras.models import Sequential

In [2]:
import pathlib

# The flower photos ship as a .tgz archive: tar bundles many files into a
# single archive file and gzip compresses it.
dataset_url = "https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz"

# Download the archive and unpack it (extract=True), then strip the archive
# suffix from the returned path to point at the extracted image folder.
data_dir = pathlib.Path(
    tf.keras.utils.get_file('flower_photos.tar', origin=dataset_url, extract=True)
).with_suffix('')
print(data_dir)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
/Users/andrewhowe/.keras/datasets/flower_photos


In [3]:
# Count every .jpg that sits one level below data_dir (one sub-folder per class).
image_count = sum(1 for _ in data_dir.glob('*/*.jpg'))
print(image_count)

3670


In [4]:
# Collect the paths of all rose images and show one of them as a sanity check.
roses = [*data_dir.glob('roses/*.jpg')]
print(roses[1])
# To preview an image inline, uncomment the next line:
# PIL.Image.open(str(roses[2]))

/Users/andrewhowe/.keras/datasets/flower_photos/roses/5777669976_a205f61e5b.jpg


In [5]:
# Loader settings: batch size and the size every image is resized to.
batch_size = 32
img_height = img_width = 180

# Build the training and validation datasets in a single call.
# subset="both" returns the pair (training, validation); 10% of the files are
# held out for validation, and the fixed seed keeps the split reproducible.
training_dataset, validating_dataset = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    batch_size=batch_size,
    image_size=(img_height, img_width),
    validation_split=0.1,
    subset="both",
    seed=123,
)

# Keep the class names (taken from the sub-folder names) for later use.
class_names = training_dataset.class_names
print(class_names)

Found 3670 files belonging to 5 classes.
Using 3303 files for training.
Using 367 files for validation.
['daisy', 'dandelion', 'roses', 'sunflowers', 'tulips']


In [16]:
# Peek at a single batch to confirm the tensor layout.
# The images in a batch are a tensor of shape (32, 180, 180, 3): 32 images,
# each 180x180 with 3 color channels (R, G, B).
# The labels are a tensor of shape (32,), one label per image in the batch.
for images, labels in training_dataset.take(1):
    print(images.shape)
    print(labels.shape)

# Buffered prefetching - prevents I/O from blocking training.
#
# Dataset.cache keeps the images in memory after they are loaded off disk
# during the first epoch, so they are not re-read on every later epoch.
# Dataset.prefetch overlaps data preprocessing with model execution during
# training.
# tf.data.AUTOTUNE lets tf.data pick the prefetch buffer size dynamically at
# runtime.
#
# Note: buffer_size is an argument of prefetch(), not cache(); passing it to
# cache() raises a TypeError.
training_dataset = training_dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)


(32, 180, 180, 3)
(32,)
