# Demo
Demo of how to load images from the dataset into a Jupyter notebook for training, etc.

In [1]:
import os

import numpy as np
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator

#Relative paths to the train / validation / test folders of the split dataset.
#These are passed to flow_from_directory (and os.walk) in the cells below.
train_dir = "split_dataset/train/"
val_dir = "split_dataset/val/"
test_dir = "split_dataset/test/"

This is the easiest way to load the dataset as an iterator (as was done in the Transfer Learning notebooks). Note that `datagen.flow_from_directory` is used to get the data (instead of `datagen.flow`, which was used in the Transfer Learning notebooks). The iterators can then be used to fit/evaluate the model.

Note that, as in the Transfer Learning notebooks, I've set the batch size for val and test equal to the number of images in those sets.

In [2]:
def _count_files(root_dir):
    """Return the total number of files found anywhere beneath root_dir."""
    total = 0
    for _dirpath, _dirnames, filenames in os.walk(root_dir):
        total += len(filenames)
    return total


#Number of files in the val and test folders (used below as their batch sizes)
num_val_images = _count_files(val_dir)
num_test_images = _count_files(test_dir)

#One image data generator per split - fill in the arguments as required
train_datagen = ImageDataGenerator(
    #Preprocessing and data augmentation options go here
)
val_datagen = ImageDataGenerator(
    #Preprocessing options go here
)
test_datagen = ImageDataGenerator(
    #Preprocessing options go here
)

#Directory iterators: val and test are each served as one single batch
train_iterator = train_datagen.flow_from_directory(
    train_dir,
    batch_size=30,
    class_mode='sparse',
)
val_iterator = val_datagen.flow_from_directory(
    val_dir,
    batch_size=num_val_images,
    class_mode='sparse',
)
test_iterator = test_datagen.flow_from_directory(
    test_dir,
    batch_size=num_test_images,
    class_mode='sparse',
)

print("Datasets loaded")

Found 122 images belonging to 10 classes.
Found 42 images belonging to 10 classes.
Found 46 images belonging to 10 classes.
Datasets loaded


If you want to get X_train, y_train, X_test, y_test etc. as plain arrays, as was done in Week 10 in the CNN digits notebook (e.g. where train_images, train_labels were used for fitting), then the code below will work. You need OpenCV (Open Source Computer Vision Library, `cv2`) to read image files, but not necessarily if your data is already stored as NumPy arrays.

In [3]:
#This whole cell is deliberately disabled: the code is wrapped in a triple-quoted
#string, so nothing in it runs and the notebook only echoes the string back.
#Remove the surrounding triple quotes to use it.
#
#What the disabled code does: load_data(split_folder) walks one split folder,
#reads every file in each class sub-folder with cv2.imread, converts BGR -> RGB,
#and returns (images as uint8 array, integer labels as int8 array).
#
#NOTE(review) - things to check before enabling:
#  * labels_map is built from os.listdir(split_folder) separately for each split,
#    and non-directory entries are enumerated too. os.listdir order is arbitrary,
#    so the same flower may get a different integer in train/val/test - consider
#    sorting the directory names (and filtering to directories) first.
#  * np.array(X, dtype=np.uint8) presumably needs every image to have the same
#    height/width - TODO confirm the dataset images are uniformly sized.
#  * cv2 is provided by the "opencv-python" package on PyPI.
"""
import os
import numpy as np
import cv2  #For image processing - pip install cv2

#Paths to split dataset
dataset_folder = "split_dataset/"
splits = ["train", "val", "test"]

#Function to load images and labels from a specific split
def load_data(split_folder):
    X = []
    y = []
    flower_names = os.listdir(split_folder)  #Get flower types
    labels_map = {flower: idx for idx, flower in enumerate(flower_names)}  #Map flower names to integers

    for flower in flower_names:
        flower_path = os.path.join(split_folder, flower)
        if not os.path.isdir(flower_path):
            continue
        for img_name in os.listdir(flower_path):
            img_path = os.path.join(flower_path, img_name)
            try:
                img = cv2.imread(img_path)  #Read the image
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  #Convert BGR (OpenCV default) to RGB
                X.append(img)
                y.append(labels_map[flower])
            except Exception as e:
                print(f"Error processing {img_path}: {e}")
    
    return np.array(X, dtype=np.uint8), np.array(y, dtype=np.int8)

#Load train, validation, and test datasets
X_train, y_train = load_data(os.path.join(dataset_folder, "train"))
X_val, y_val = load_data(os.path.join(dataset_folder, "val"))
X_test, y_test = load_data(os.path.join(dataset_folder, "test"))

#Outputs
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
"""


'\nimport os\nimport numpy as np\nimport cv2  #For image processing - pip install cv2\n\n#Paths to split dataset\ndataset_folder = "split_dataset/"\nsplits = ["train", "val", "test"]\n\n#Function to load images and labels from a specific split\ndef load_data(split_folder):\n    X = []\n    y = []\n    flower_names = os.listdir(split_folder)  #Get flower types\n    labels_map = {flower: idx for idx, flower in enumerate(flower_names)}  #Map flower names to integers\n\n    for flower in flower_names:\n        flower_path = os.path.join(split_folder, flower)\n        if not os.path.isdir(flower_path):\n            continue\n        for img_name in os.listdir(flower_path):\n            img_path = os.path.join(flower_path, img_name)\n            try:\n                img = cv2.imread(img_path)  #Read the image\n                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  #Convert BGR (OpenCV default) to RGB\n                X.append(img)\n                y.append(labels_map[flower])\n        

Probably easier to use the iterator method. Ok, let's test with a quick bit of training

In [4]:
import tensorflow as tf
from tensorflow import keras
from efficientnet import tfkeras as efficientnet  #EfficientNet from efficientnet.tfkeras
from tensorflow.keras.applications import EfficientNetB4
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras import layers
import matplotlib.pyplot as plt

#Image size produced by the iterators. flow_from_directory resizes to 256x256
#unless told otherwise, so this is set explicitly to match the
#input_shape=(128, 128, 3) that the EfficientNet base model is built with.
IMG_SIZE = (128, 128)

#Count the files in val/test so that each of those sets is served as one batch
num_val_images = sum(len(files) for _, _, files in os.walk(val_dir))
num_test_images = sum(len(files) for _, _, files in os.walk(test_dir))

#Training generator: light augmentation plus EfficientNet preprocessing
train_datagen = keras.preprocessing.image.ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    preprocessing_function=efficientnet.preprocess_input
)

#Validation/test generators: preprocessing only, no augmentation
val_datagen = keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=efficientnet.preprocess_input
)

test_datagen = keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=efficientnet.preprocess_input
)

#Load datasets (val/test are not shuffled so their order stays fixed between runs)
train_iterator = train_datagen.flow_from_directory(
    train_dir, target_size=IMG_SIZE, batch_size=32, class_mode='sparse'
)
val_iterator = val_datagen.flow_from_directory(
    val_dir, target_size=IMG_SIZE, batch_size=num_val_images, shuffle=False, class_mode='sparse'
)
test_iterator = test_datagen.flow_from_directory(
    test_dir, target_size=IMG_SIZE, batch_size=num_test_images, shuffle=False, class_mode='sparse'
)

print("Datasets loaded")


  "class": algorithms.Blowfish,


Found 122 images belonging to 10 classes.
Found 42 images belonging to 10 classes.
Found 46 images belonging to 10 classes.
Datasets loaded


## Model Setup - same as before

In [5]:
#Pretrained EfficientNetB4 without its classification head; pooling='avg' is
#requested so the base model feeds straight into the Dense layer below
base_model = efficientnet.EfficientNetB4(
    input_shape=(128, 128, 3),
    include_top=False,
    pooling='avg',
    weights='noisy-student',
)

#Freeze the base model so that only the new top layers are trained
for base_layer in base_model.layers:
    base_layer.trainable = False

#Stack the new custom top layers on the frozen base (this can be played with)
model = keras.models.Sequential()
model.add(base_model)                                   #Frozen feature extractor
model.add(layers.Dropout(0.2))                          #Dropout before the classifier
#model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))       #One output per class

#Sparse (integer) labels with softmax outputs, hence from_logits=False
adam = keras.optimizers.Adam(learning_rate=0.001)
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
model.compile(optimizer=adam, loss=sparse_ce, metrics=['accuracy'])

## Training the Model
Train for 40 epochs and use the validation iterator for monitoring.

In [6]:
#Number of full passes over the training iterator
num_epochs = 40

#Fit on the training iterator; the validation iterator is evaluated every epoch
history = model.fit(x=train_iterator, validation_data=val_iterator, epochs=num_epochs)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


Looking good - just had a play around if you had npz files instead:

Won't be able to use flow_from_directory as that deals with images. Will need to do something else instead.

If your data is organised like the sorted_dataset that I created (instead of the split_dataset), you might be able to use something like the pseudocode below.

In [None]:
#Flower names
fNames = [
    'phlox', 'rose', 'calendula', 'iris', 'leucanthemum maximum',
    'bellflower', 'viola', 'rudbeckia laciniata', 'peony', 'aquilegia'
]

#Path to the dataset directory
data_dir = "..."  #Replace with actual path

#Map flower names to integer labels
class_to_label = {name: idx for idx, name in enumerate(fNames)}


def load_npz_dataset(root_dir, label_map, key='image'):
    """Load every .npz file under root_dir/<class name>/ into arrays.

    Args:
        root_dir: Directory containing one sub-directory per class.
        label_map: Dict mapping class (sub-directory) name -> integer label.
            Classes are visited in the dict's insertion order.
        key: Name of the array stored inside each .npz file.

    Returns:
        (X, Y): NumPy arrays of the loaded images and their integer labels,
        in matching order. Both are empty if no .npz files were found.

    Raises:
        KeyError: If an .npz file does not contain `key`.
    """
    images = []
    labels = []

    for class_name, label in label_map.items():
        cls_dir = os.path.join(root_dir, class_name)

        #Missing class folders are reported and skipped rather than aborting
        if not os.path.isdir(cls_dir):
            print(f"Directory not found: {cls_dir}, skipping...")
            continue

        #os.listdir order is arbitrary; sort so the sample order (and therefore
        #the seeded train/test split) is the same on every run
        for file_name in sorted(os.listdir(cls_dir)):
            if not file_name.endswith('.npz'):
                continue
            file_path = os.path.join(cls_dir, file_name)

            #The with-block closes the archive even if the key lookup fails
            with np.load(file_path) as data:
                images.append(data[key])
                labels.append(label)  #Label comes from the folder name - might need some checking

    return np.array(images), np.array(labels)


#Load everything into NumPy arrays
X, Y = load_npz_dataset(data_dir, class_to_label)

if len(X) == 0:
    #Nothing to split - most likely data_dir has not been set yet
    print(f"No .npz files found under {data_dir!r} - set data_dir before splitting")
else:
    #Imported here so the loading code above works without scikit-learn installed
    from sklearn.model_selection import train_test_split

    #Split into train and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Verify
    print("X_train shape:", X_train.shape)
    print("Y_train shape:", Y_train.shape)
    print("X_test shape:", X_test.shape)
    print("Y_test shape:", Y_test.shape)