#### This notebook (03) loads the images and labels and saves them as TensorFlow datasets that will be used in notebook 04 for binary classification. Some functions called in this and other notebooks are defined in the 'mymodules.py' file.

In [None]:
#Import libraries
import tensorflow as tf
import cv2
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import os
print("TF version:", tf.__version__)


# Check whether there's a GPU available: list_physical_devices returns an
# empty (falsy) list when TensorFlow cannot see any GPU.
if tf.config.list_physical_devices("GPU"):
  print("GPU", "available")
else:
  print("GPU", "not available")



TF version: 2.9.2
GPU not available


Mount Google Drive, then retrieve the image file names and their corresponding labels.





In [None]:
# Mount Google Drive at /content/drive so the project files referenced below
# (under ./drive/MyDrive/...) are reachable from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sys

# Folder on the mounted Drive that contains mymodules.py.
# NOTE(review): the path is relative, so it only resolves while the working
# directory is the parent of the 'drive' mount point (/content) - confirm.
MODULES_DIR = './drive/MyDrive/dog_breeds_classification/notebooks'

# Only register the folder once: without this guard every re-run of the cell
# appends another duplicate entry to sys.path.
if MODULES_DIR not in sys.path:
  sys.path.append(MODULES_DIR)
import mymodules

# 3 Load and process data

In [None]:
# Root folder of the image data; each sub-folder name is used as the label of
# the files it contains (see retrieve_images_and_labels_binary).
# The path is relative, so it resolves against the current working directory.
DATA_PATH = "./drive/MyDrive/dog_breeds_classification/Images_Poodles"
# Not referenced in the visible cells of this notebook - presumably the target
# image side length in pixels; TODO confirm where it is consumed.
IMAGE_SIZE = 224
# Number of examples per batch when the datasets are batched before saving.
BATCH_SIZE = 16

## 3.1 Load images

In [None]:
def retrieve_images_and_labels_binary(label_to_separate, data_path=None):
  """Collect image file paths and binary (one-vs-rest) labels from a folder tree.

  The directory `data_path` is expected to contain one sub-folder per class;
  every entry inside a sub-folder is treated as an image of that class.

  Args:
    label_to_separate: Name of the sub-folder whose files get label 1.
      Files from every other sub-folder get label 0.
    data_path: Root directory to scan. When None (the default), the
      module-level DATA_PATH is looked up at call time, so re-running the
      configuration cell takes effect without re-defining this function.

  Returns:
    A tuple `(image_names, int_labels)`:
      image_names: NumPy array of paths of the form
        '<data_path>/<folder>/<file>'.
      int_labels: NumPy int8 array of the same length, 1 where the file's
        folder equals `label_to_separate`, 0 otherwise.
  """
  if data_path is None:
    data_path = DATA_PATH

  image_names = []
  labels = []

  # os.listdir returns entries in arbitrary order, so sort both levels:
  # otherwise seeded train/test splits built on this output are not
  # reproducible across runs or filesystems.
  for folder in sorted(os.listdir(data_path)):
    folder_path = f'{data_path}/{folder}'
    # Skip stray files sitting next to the class folders; calling
    # os.listdir on them would raise NotADirectoryError.
    if not os.path.isdir(folder_path):
      continue
    for file in sorted(os.listdir(folder_path)):
      image_names.append(f'{folder_path}/{file}')
      labels.append(folder)
  image_names = np.array(image_names)

  # One-vs-rest encoding: 1 for the separated class, 0 for everything else
  int_labels = np.array([label == label_to_separate for label in labels],
                        dtype=np.int8)

  return image_names, int_labels

In [None]:
# Gather every image path together with its 0/1 label (1 = miniature_poodle)
image_names_arr, labels_arr = retrieve_images_and_labels_binary(
    label_to_separate='miniature_poodle',
    data_path=DATA_PATH,
)

# Quick sanity report: overall size and size of the positive class
print("The total number of images is", image_names_arr.shape[0])
print("The total number of miniature_poodle images is", (labels_arr == 1).sum())

The total number of images is 465
The total number of miniature_poodle images is 155


## 3.2 Split data, scale, resize, shuffle, batch, prefetch

In [None]:
# Split into training, validation and test sets:
#   1. hold out 10% of all images as the test set;
#   2. hold out 12% of the remainder as the validation set.
# Both splits are stratified on the binary label so that the small
# validation/test sets keep the same positive/negative ratio as the full
# data, and both are seeded (random_state=42) for reproducibility.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    image_names_arr, labels_arr,
    test_size=0.10, random_state=42, stratify=labels_arr)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval, y_trainval,
    test_size=0.12, random_state=42, stratify=y_trainval)

In [None]:
# Build one dataset per split from its (file paths, labels) pair.
# NOTE(review): get_dataset lives in mymodules.py; presumably it loads, scales
# and resizes the images (per the section title) - confirm there.
train_ds, valid_ds, test_ds = (
    mymodules.get_dataset(paths, targets)
    for paths, targets in (
        (X_train, y_train),
        (X_valid, y_valid),
        (X_test, y_test),
    )
)

In [None]:
# Report the number of elements in each split (training, validation, test)
print('training set: {} \nvalidation set: {} \ntest set: {}'.format(
    *(len(split) for split in (train_ds, valid_ds, test_ds))
))

training set: 367 
validation set: 51 
test set: 47


In [None]:
# Shuffle only the training split, then batch and prefetch every split.
# The shuffle is seeded so that the training order produced here is
# reproducible, in line with the seeded (random_state=42) splits above.
# NOTE(review): these pipelines are written to disk further down; the saved
# training set therefore presumably contains one fixed shuffled order and
# fixed batches rather than a fresh shuffle per epoch - confirm that the
# consuming notebook does not rely on per-epoch reshuffling.
train = (
    train_ds
    .shuffle(buffer_size=100, seed=42)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
valid = valid_ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test  = test_ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

## 3.3 Save the TensorFlow datasets for the training, validation, and test sets.

In [None]:
# Destination folder on the mounted Drive; the trailing slash matters because
# the per-split directory names are appended by plain string concatenation.
OUTPUT_PATH = "./drive/MyDrive/dog_breeds_classification/saved_tf_datasets/"

# Write each batched split to its own directory under OUTPUT_PATH
tf.data.experimental.save(
    dataset=train, path=OUTPUT_PATH + 'binary_train_ds_poodles')
tf.data.experimental.save(
    dataset=valid, path=OUTPUT_PATH + 'binary_valid_ds_poodles')
tf.data.experimental.save(
    dataset=test, path=OUTPUT_PATH + 'binary_test_ds_poodles')