Workspace Preparation:

In [None]:
#import necessary libraries/packages here

import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import sys
from sklearn.metrics import accuracy_score
import tensorflow as tf
import tensorflow_datasets as tfds
from deepforest import CascadeForestClassifier
import random

In [None]:
#List the physical GPU devices that TensorFlow can see (empty list when no GPU is available)
gpu_devices = tf.config.list_physical_devices('GPU')

print(gpu_devices)

#Enable memory growth on every GPU.
#NOTE(review): this call does not reset any memory statistics; if a reset of the
#peak-usage counter is wanted, tf.config.experimental.reset_memory_stats looks like
#the relevant API - confirm against the TensorFlow documentation
for gpu in gpu_devices:
  tf.config.experimental.set_memory_growth(gpu, True)

#Print the memory info reported for the first GPU (skipped when no GPU was found)
if gpu_devices:
  print(tf.config.experimental.get_memory_info('GPU:0'))

Reading in the data:

To begin, let's read in the dataset using the 'tensorflow-datasets' package. This package reads in the dataset and then converts it into TFRecords which contain the serialised image and the image label, as well as the data split the image belongs to: 

In [None]:
#read in cassava dataset
#with_info=True makes tfds.load return a second value (bound to `info`) alongside the dataset;
#`dataset` is indexed by split name ('train', 'validation', 'test') in the cells below
dataset, info = tfds.load('cassava', with_info=True)

Check that the number of training, validation and test images imported is correct:

In [None]:
#number of examples held by each split of the loaded dataset
train_count = len(dataset['train'])
validation_count = len(dataset['validation'])
test_count = len(dataset['test'])

print('Training images: {}'.format(train_count))
print('Validation images: {}'.format(validation_count))
print('Test images: {}'.format(test_count))

#all good

Image pre-processing:

To begin this section, let's define a function that performs pixel normalisation and image resizing:

In [None]:
#function to perform image resizing and pixel value normalisation
def image_preprocessing(obs, img_size=224):
  """Normalise and resize one dataset example.

  Args:
    obs: mapping with an 'image' entry (the raw image tensor) and a 'label' entry.
         The mapping itself is left untouched.
    img_size: side length, in pixels, of the square output image. Defaults to 224.
              (The Abayomi-Alli paper showed a plateau of improvement in accuracy
              when image resolution was >128 pixels for this dataset.)

  Returns:
    An (image, label) tuple: the image cast to float32, divided by 255 and
    resized to (img_size, img_size), together with the unchanged label.
  """

  #normalise pixel values (work on a local variable so that the input mapping is not mutated)
  image = tf.cast(obs['image'], tf.float32)
  image = image / 255

  #resize image to img_size x img_size
  image = tf.image.resize(image, (img_size, img_size))

  return image, obs['label']

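Apply the above function to the different data splits. Note that the cell below only maps the pre-processing function over each split; no shuffling step is applied at this point, so if randomisation is required to remove structural bias it has to be added explicitly: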

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

#map the image pre-processing function over each data split, letting tf.data choose the parallelism
#(only the mapping is applied here - the order of the examples is not shuffled)
training_data, validation_data, test_data = (
    dataset[split_key].map(image_preprocessing, num_parallel_calls=AUTOTUNE)
    for split_key in ('train', 'validation', 'test')
)

To check that image pre-processing went as expected, let's print an example image:

In [None]:
#print an example image
#after the pre-processing map each element is an (image, label) tuple rather than a dict,
#so the first element is unpacked instead of being indexed with 'image'
batch = training_data.as_numpy_iterator()
example_image, example_label = next(batch)
plt.imshow(example_image)

#looks like things went to plan

Finally, let's define a function that converts images and associated labels to NumPy format for a given data split since this format is needed for the gcForests algorithm:

In [None]:
def convert_to_np_arrays(dataset, split_name, img_size):
  """Copy every (image, label) pair of a data split into two NumPy arrays.

  Args:
    dataset: sized iterable of (image, label) pairs; each image must be
             assignable to an array slot of shape (img_size, img_size, 3).
    split_name: name of the split, used only in the printed/raised messages.
    img_size: side length, in pixels, of the (square) images.

  Returns:
    (x_cassava, y_cassava): a float32 array of shape
    (num_images, img_size, img_size, 3) and an int64 array of shape (num_images,).

  Raises:
    ValueError: if fewer pairs were produced than len(dataset) reported, since
                the remaining rows of the np.empty arrays would hold garbage.
  """

  #get the number of images in dataset
  num_images = len(dataset)

  #create empty (uninitialised) arrays to populate
  x_cassava = np.empty([num_images, img_size, img_size, 3], dtype='float32')
  y_cassava = np.empty(num_images, dtype='int64')

  #populate the above arrays
  counter = 0
  for image, label in dataset:
    x_cassava[counter] = image
    y_cassava[counter] = label
    counter += 1

  #np.empty does not initialise memory, so a partially filled array must never be returned silently
  if counter != num_images:
    raise ValueError('Only {} of {} {} images were converted to NumPy arrays'.format(counter, num_images, split_name))

  print('All {} images and labels converted to NumPy arrays'.format(split_name))

  return x_cassava, y_cassava

Apply the above function to each of the data splits and save the resulting arrays:

In [None]:
#training set
x_train, y_train = convert_to_np_arrays(dataset = training_data,
                                        split_name = 'training',
                                        img_size = 224)

#check that the shape of the output arrays are correct
#(printed explicitly: a bare expression is only displayed when it is the last statement of a cell)
print(x_train.shape, y_train.shape)

#save the result
np.save('/content/drive/MyDrive/x_train.npy', x_train)
np.save('/content/drive/MyDrive/y_train.npy', y_train)

In [None]:
#validation set
x_val, y_val = convert_to_np_arrays(dataset = validation_data,
                                    split_name = 'validation',
                                    img_size = 224)

#check the shape
#(printed explicitly: a bare expression is only displayed when it is the last statement of a cell)
print(x_val.shape, y_val.shape)

#save the result
np.save('/content/drive/MyDrive/x_val.npy', x_val)
np.save('/content/drive/MyDrive/y_val.npy', y_val)

In [None]:
#test set
x_test, y_test = convert_to_np_arrays(dataset = test_data,
                                      split_name = 'test',
                                      img_size = 224)

#check the shape
#(printed explicitly: a bare expression is only displayed when it is the last statement of a cell)
print(x_test.shape, y_test.shape)

#save the result
np.save('/content/drive/MyDrive/x_test.npy', x_test)
np.save('/content/drive/MyDrive/y_test.npy', y_test)

Let's also take a look at the class distribution within each split to ensure that a stratified split occurred. To begin, let's get the label counts for each of the splits and the overall dataset.

In [None]:
#get the different label frequencies for each data split
#(np.unique with return_counts=True gives a (labels, counts) pair)
train_set_freq = np.unique(y_train, return_counts = True)
val_set_freq = np.unique(y_val, return_counts = True)
test_set_freq = np.unique(y_test, return_counts = True)

#save these in dictionary form (label -> count)
#pairing labels with counts via zip avoids assuming that exactly 5 classes are present in every split

#training set
train_dict = dict(zip(*train_set_freq))

#validation set
val_dict = dict(zip(*val_set_freq))

#test set
test_dict = dict(zip(*test_set_freq))

Let's first look at the overall class distribution:

In [None]:
#to also get an overall count of class labels before split occurred, we will concatenate the class labels and repeat the process
dataset_labels = np.concatenate((y_train, y_val, y_test))
dataset_labels_freq = np.unique(dataset_labels, return_counts = True)
#label -> count, without assuming that exactly 5 classes are present
dataset_dict = dict(zip(*dataset_labels_freq))

#convert the above dictionary into a Pandas dataframe with one row per class and a single 'Frequency' column
class_distro_df = (
    pd.DataFrame([dataset_dict])
    .rename(columns = {0:'CBB', 1:'CBSD', 2:'CGM', 3:'CMD', 4:'Healthy'})
    .transpose()
    .rename(columns = {0: 'Frequency'})
)

#plot the overall class distributions
class_distro_plt = class_distro_df.plot(kind='bar', legend=None, rot=0, color='#f08e70')
class_distro_plt.set_ylabel("Number of images", labelpad=10)
class_distro_plt.set_xlabel("Class label", labelpad=10)

Next, let's take a look at the class distribution by data split:

In [None]:
#label counts keyed by data split
freq_counts = {
    'training': train_dict,
    'validation': val_dict,
    'test': test_dict,
}

#create dataframe from dictionary: one row per split, one column per class
df = (
    pd.DataFrame.from_dict(freq_counts)
    .transpose()
    .rename(columns = {0:'CBB', 1:'CBSD', 2:'CGM', 3:'CMD', 4:'Healthy'})
)

#Plot the results
#the returned Axes gets its own name so that the matplotlib.pyplot alias `plt` is not overwritten
split_distro_plt = df.plot(kind='bar', rot=0, color={'CBB': '#3ff37e', 'CBSD': '#e1877f', 'CGM': '#f09f70', 'CMD': '#77a6e9', 'Healthy': '#8c81df'})
split_distro_plt.set_ylabel("Number of images", labelpad=10)
split_distro_plt.set_xlabel("Data split", labelpad=10)

One can see from the above that a stratified split of the class labels occurred. This helps to reduce the effects of class bias during training, whilst maintaining the distribution of the class labels across all data splits.