# Chapter 3: Keras and Data Retrieval in TensorFlow 2

<table align="left">
    <td>
        <a target="_blank" href="https://colab.research.google.com/github/thushv89/manning_tf2_in_action/blob/master/Ch03-Keras-and-Data-Retrieval/3.2.Creating_Input_Pipelines.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
    </td>
</table>

## Importing necessary libraries and some setup

In [5]:
import os
import random
import numpy as np
import requests
from zipfile import ZipFile
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, Conv2D, Flatten
from tensorflow.keras.models import Sequential
import pandas as pd
import tensorflow_datasets as tfds

def fix_random_seed(seed):
    """Seed NumPy, TensorFlow and the built-in `random` module with `seed`.

    Each library is seeded independently. If the name of a library is not
    defined in the notebook namespace (i.e. it was never imported), the
    resulting NameError is caught and a warning is printed instead, so the
    remaining libraries are still seeded.

    Args:
        seed: The integer seed handed to every random number generator.
    """
    # The lambdas defer the name lookup (np / tf / random) until they are called,
    # so a missing import surfaces as a NameError inside the try block below.
    seeders = (
        (
            lambda: np.random.seed(seed),
            "Warning: Numpy is not imported. Setting the seed for Numpy failed.",
        ),
        (
            lambda: tf.random.set_seed(seed),
            "Warning: TensorFlow is not imported. Setting the seed for TensorFlow failed.",
        ),
        (
            lambda: random.seed(seed),
            "Warning: random module is not imported. Setting the seed for random failed.",
        ),
    )

    for apply_seed, warning in seeders:
        try:
            apply_seed()
        except NameError:
            print(warning)

# Fixing the random seed so that results are reproducible across runs
fix_random_seed(4321)
print("TensorFlow version: {}".format(tf.__version__))

# Folder (relative to the notebook) that holds the downloaded datasets
data_dir = 'data'

# exist_ok=True creates the folder only when it is missing, without the
# race-prone "check if it exists, then create" sequence
os.makedirs(data_dir, exist_ok=True)


TensorFlow version: 2.9.0


## Using the `tf.data` API to retrieve data

Here we will be using the `tf.data` API to feed a dataset containing images of flowers. The dataset has a folder containing the images and a CSV file listing filenames and their corresponding label as an integer. We will write a TensorFlow data pipeline that does the following.

* Extract filenames and classes from the CSV
* Read in the images from the extracted filenames and resize them to 64x64
* Convert the class labels to one-hot encoded vectors
* Combine the processed images and one-hot encoded vectors into a single dataset
* Finally, shuffle the data and output as batches

### Downloading the data
The dataset is available at https://www.kaggle.com/olgabelitskaya/flower-color-images/data . 

You need to download the zip file available at this URL and place it in the `data` folder inside the `Ch03-Keras-and-Data-Retrieval` folder. You **do not** need to extract it, as the following code will do it for you.

In [None]:
# Section 3.2

import os
from zipfile import ZipFile

# Extracting the flowers image data to a directory
# Collect every .zip archive that sits directly inside the `data` folder
zip_filepath = [os.path.join('data',f) for f in os.listdir('data') if f.endswith('.zip')]

if len(zip_filepath)==0:
    # Nothing to extract: report it and stop here instead of failing with an
    # IndexError when indexing the empty list
    print("Did you download the dataset as a zip file and place it in the Ch03-Keras-and-Data-Retrieval/data folder?")
else:
    if len(zip_filepath)>1:
        # Warn, but still extract the first archive found (os.listdir order)
        print("There's too many .zip files. There should be only 1")

    # The context manager closes the archive even if extraction fails
    with ZipFile(zip_filepath[0]) as zfile:
        zfile.extractall('data')

## Creating a tf.data.Dataset 

Here we are creating the `tf.data` pipeline that executes the above steps.

In [29]:
# Section 3.2
# Code listing 3.5

import os
import tensorflow as tf
import tensorflow.keras.backend as K

# Start from a clean slate: drop whatever state Keras kept from earlier cells
K.clear_session()

# Folder holding the images and the labels CSV.
# Joining with a trailing '' leaves a path separator at the end, which
# get_image relies on when it concatenates the folder and the file name.
data_dir = os.path.join('data', 'flower_images', 'flower_images', '')
assert os.path.exists(data_dir)

# Read the CSV file with TensorFlow.
# The defaults fix the column types: a string file name and an integer label
# ("" / -1 are used when a field is empty). The header row is skipped.
labels_csv_path = os.path.join(data_dir, 'flower_labels.csv')
record_defaults = ("", -1)
csv_ds = tf.data.experimental.CsvDataset(labels_csv_path, record_defaults, header=True)

# Split the (file name, label) records into two parallel datasets
fname_ds = csv_ds.map(lambda fname, label: fname)
label_ds = csv_ds.map(lambda fname, label: label)

def get_image(file_path):
    """Load one image file and return it as a 64x64 float32 tensor.

    Args:
        file_path: File name of the image, relative to the global `data_dir`.

    Returns:
        The decoded 3-channel image, converted to float32 and resized to 64x64.
    """
    # `data_dir` is expected to end with a path separator, so plain string
    # concatenation is enough to build the full path
    encoded_bytes = tf.io.read_file(data_dir + file_path)
    # Decode the compressed PNG bytes into a 3-channel uint8 tensor
    pixels_uint8 = tf.image.decode_png(encoded_bytes, channels=3)
    # `convert_image_dtype` turns the uint8 values into floats in the [0,1] range
    pixels_float = tf.image.convert_image_dtype(pixels_uint8, tf.float32)
    # Bring every image to the same spatial size
    target_size = [64, 64]
    return tf.image.resize(pixels_float, target_size)

# Turn every file name into a decoded, resized image
image_ds = fname_ds.map(get_image)
print("The image dataset contains: {}".format(image_ds))

# Show the integer label dataset before it is encoded
print(label_ds)
# Keep a handle on the raw integer labels; label_ds is rebound to the one-hot version next
labels_ds = label_ds
# One-hot encode the integer labels into vectors of length 10
label_ds = label_ds.map(lambda label: tf.one_hot(label, depth=10))

# Pair each image with its label, shuffle (buffer of 20 elements) so that every
# batch gets a mix of labels, and finally group the pairs into batches of 5
data_ds = (
    tf.data.Dataset.zip((image_ds, label_ds))
    .shuffle(buffer_size=20)
    .batch(5)
)


The image dataset contains: <MapDataset element_spec=TensorSpec(shape=(64, 64, 3), dtype=tf.float32, name=None)>
<MapDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>


In [32]:
# Experiment: drop the records whose label is -1 from the raw integer labels.
# A first attempt with map does not work, because map cannot return None:
#label_ds = labels_ds.map(lambda x: x if x != -1 else None) #cannot work because map cannot take in None
# filter keeps only the elements for which the predicate is True.
# NOTE(review): this rebinds label_ds to the filtered *integer* labels (not one-hot);
# a data_ds built before this cell runs is not rebuilt by it.
label_ds = labels_ds.filter(lambda x: x != -1)

In [None]:
# Look at the first batch only, to see what the pipeline produces
for item in data_ds.take(1):
    print(item)

### Defining and training a model

Here we define a simple Convolutional Neural Network (CNN) model and train it on the image data we just retrieved. You don't have to worry about the technical details of CNNs right now. We will discuss them in detail in the next chapter.

In [None]:
# Section 3.2

from tensorflow.keras.layers import Dense, Conv2D, Flatten
from tensorflow.keras.models import Sequential

# A small convolutional neural network for the flowers data, built layer by layer
# (convolutional neural networks are discussed in more detail later)
model = Sequential()
# 64 filters of size 5x5 over the 64x64 RGB input images
model.add(Conv2D(64, (5,5), activation='relu', input_shape=(64,64,3)))
# Flatten the feature maps into a vector for the dense layer
model.add(Flatten())
# One softmax output per class
model.add(Dense(10, activation='softmax'))

# Categorical cross-entropy matches the one-hot encoded labels in data_ds
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

# Train for 10 epochs, feeding the model from the tf.data pipeline
model.fit(data_ds, epochs=10)

## Using Keras data generators to retrieve data

Instead of the `tf.data` API, let us use the Keras `ImageDataGenerator` to retrieve the data. As you can see, the `ImageDataGenerator` involves much less code than using the `tf.data` API.

In [None]:
# Section 3.2
# Code listing 3.6

import os
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Folder that holds both the images and the labels CSV
data_dir = os.path.join('data','flower_images', 'flower_images')

# The CSV lists every image file name together with its label
labels_csv_path = os.path.join(data_dir, 'flower_labels.csv')
labels_df = pd.read_csv(labels_csv_path, header=0)

# A Keras image data generator with its default settings
img_gen = ImageDataGenerator()

# Let the generator read the images named in the dataframe:
# file names come from the 'file' column, targets from the 'label' column,
# class_mode='raw' passes the label values through unchanged
gen_iter = img_gen.flow_from_dataframe(
    dataframe=labels_df,
    directory=data_dir,
    x_col='file',
    y_col='label',
    class_mode='raw',
    batch_size=5,
    target_size=(64,64),
)

# Show the first batch produced by the generator
print(next(iter(gen_iter)))

## Using the `tensorflow-datasets` library

Here we will use the `tensorflow-datasets` package. It is a curated list of popular datasets available for machine learning projects. With this package you can download a dataset in a single line. This means you don't have to worry about downloading/extracting/formatting data manually. All of that will be already done when you import data using the `tensorflow-datasets` library.

### Listing the available datasets

In [None]:
# Section 3.2

import tensorflow_datasets as tfds
import tensorflow as tf

# Ask tensorflow-datasets for all of its registered datasets;
# leaving the result as the last expression makes the notebook display it
registered_builders = tfds.list_builders()
registered_builders

### Download the Cifar10 dataset and view information

In [None]:
# Section 3.2

import os
import tensorflow_datasets as tfds
import tensorflow.keras.backend as K

K.clear_session() # Making sure we are clearing out the TensorFlow graph

# Keep downloaded datasets in a `tensorflow_datasets` folder inside the current
# working directory, resolved at run time rather than hard-coding one user's
# absolute path, so the notebook runs on any machine
tfds_data_dir = os.path.abspath('tensorflow_datasets')

# Load a given dataset by name, along with the DatasetInfo
data, info = tfds.load("cifar10", with_info=True, data_dir=tfds_data_dir)
print(info)

### Exploring the data 

Here we will print the `data` and see what it provides. Then we will need to batch the data as data is provided as individual samples when you import it from `tensorflow-datasets`.

In [None]:
# Show what tfds.load returned; the cells below index it by split name (e.g. data["train"])
print(data)

In [None]:
# Group the individual training samples into batches of 16
train_ds = data["train"].batch(16)

# Print the first batch only
for item in train_ds.take(1):
    print(item)

In [None]:
# Section 3.2

import tensorflow as tf

def format_data(x):
    """Convert one dataset element into an (image, one-hot label) tuple.

    Args:
        x: A dataset element with "image" and "label" entries.

    Returns:
        A tuple of the untouched image(s) and the label(s) one-hot encoded
        into vectors of length 10.
    """
    images = x["image"]
    one_hot_labels = tf.one_hot(x["label"], depth=10)
    return images, one_hot_labels

# Batch the training split (batch size 16), then reformat every batch
train_ds = data["train"].batch(16).map(format_data)

# Print the first formatted batch only
for item in train_ds.take(1):
    print(item)

### Training a simple CNN on the Cifar10 data

In [None]:
# Section 3.2

from tensorflow.keras.layers import Dense, Conv2D, Flatten
from tensorflow.keras.models import Sequential

# A simple convolutional neural network for the CIFAR data, built layer by layer
model = Sequential()
# 64 filters of size 5x5 over the 32x32 RGB input images
model.add(Conv2D(64, (5,5), activation='relu', input_shape=(32,32,3)))
# Flatten the feature maps into a vector for the dense layer
model.add(Flatten())
# One softmax output per class
model.add(Dense(10, activation='softmax'))

# Categorical cross-entropy matches the one-hot encoded labels in train_ds
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

# Fit the model on the data for 25 epochs
model.fit(train_ds, epochs=25)

In [4]:
#Exercise3
import os
import tensorflow_datasets as tfds
import tensorflow.keras.backend as K

K.clear_session() # Making sure we are clearing out the TensorFlow graph

# Keep downloaded datasets in a `tensorflow_datasets` folder inside the current
# working directory, resolved at run time rather than hard-coding one user's
# absolute path, so the notebook runs on any machine
tfds_data_dir = os.path.abspath('tensorflow_datasets')

# Load a given dataset by name, along with the DatasetInfo
data, info = tfds.load("caltech101", with_info=True, data_dir=tfds_data_dir)
print(info)

tfds.core.DatasetInfo(
    name='caltech101',
    full_name='caltech101/3.0.2',
    description="""
    Caltech-101 consists of pictures of objects belonging to 101 classes, plus one
    `background clutter` class. Each image is labelled with a single object. Each
    class contains roughly 40 to 800 images, totalling around 9k images. Images are
    of variable sizes, with typical edge lengths of 200-300 pixels. This version
    contains image-level labels only. The original dataset also contains bounding
    boxes.
    """,
    homepage='https://doi.org/10.22002/D1.20086',
    data_dir='C:\\Users\\kinda\\Documents\\GitHub\\manning_tf2_in_action\\Ch03-Keras-and-Data-Retrieval\\tensorflow_datasets\\caltech101\\3.0.2',
    file_format=tfrecord,
    download_size=131.05 MiB,
    dataset_size=132.86 MiB,
    features=FeaturesDict({
        'image': Image(shape=(None, None, 3), dtype=uint8),
        'image/file_name': Text(shape=(), dtype=string),
        'label': ClassLabel(shape=(), dtyp