# Prepare Dataset

by Ian Flores & Alejandro Vega

### Loading Dependencies

In [1]:
%matplotlib inline
from six.moves import cPickle as pickle
import os
import shutil
from PIL import Image, ImageOps
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display as disp
from IPython.display import Image as Im 
from scipy import ndimage
import random
from scipy.ndimage.interpolation import shift 
from sklearn.model_selection import train_test_split

In [2]:
## image size 
image_size, size = 28, 28

## Shifts 
num_shifts = 0

## Number of imgs per class
min_imgs_per_class = 1

## Number of imgs per class after augmentation
min_augmentation = 20

### Cropping Spectrograms

Given the architectures we are using in our models, we want all spectrograms to have the same size, because the models don't allow for dynamic size input. 

In [3]:
def squareAndGrayImage(image, size, path, species, name):
    # open our image and convert to grayscale 
    # (needed since color channels add a third dimmension)
    im = Image.open(image).convert('L')
    # dimmensions of square image
    size = (size,size)
    # resize our image and adjust if image is not square. save our image
    squared_image = ImageOps.fit(im, size, Image.ANTIALIAS)
    squared_image.save(path + '/' + species + '/squared_' + name)
    squared_image.close()
    #print(ndimage.imread(path + '/' + species + '/squared_' + name).shape)
    
def squareAndGrayProcess(size, dataset_path, new_dataset_path):
    # if our dataset doesn't exist create it, otherwise overwrite
    if not os.path.exists(new_dataset_path):
        os.makedirs(new_dataset_path)
    else:
        shutil.rmtree(new_dataset_path)
        os.makedirs(new_dataset_path)
    
    # get a list of species folders in our dataset
    species_dataset = os.listdir(dataset_path)
    
    for species in species_dataset:
        os.makedirs(new_dataset_path + '/' + species)
        species_images = os.listdir(dataset_path + '/' + species)
        for image in species_images:
            image_path = dataset_path + '/' + species + '/' + image
            squareAndGrayImage(image_path, size, new_dataset_path, species, image)

dataset_path = '../dataset/spectrogram_roi_dataset'
new_dataset_path = '../dataset/squared_spectrogram_roi_dataset'
squareAndGrayProcess(size, dataset_path, new_dataset_path)

In [4]:
#new_dataset_path = '../dataset/augmented_spectrograms/'
#new_dataset_path = '../dataset/squared_spectrogram_roi_dataset/'

In [5]:
def getDatasetFolders(dataset_path):
    folders = os.listdir(dataset_path)
    dataset_folders = []
    for folder in folders:
        dataset_folders.append(dataset_path + '/' + folder)
    return dataset_folders

dataset_folders = getDatasetFolders(new_dataset_path)

In [6]:
pixel_depth = 255.0 # Number of levels per pixel.

def load_image(folder, min_num_images):
    """Load the data for a single letter label."""
    image_files = os.listdir(folder)
    dataset = np.ndarray(shape=(len(image_files), image_size, image_size), dtype=np.float32)
    num_images = 0

    for image in image_files:
        image_file = os.path.join(folder, image)
        try:
            image_data = (ndimage.imread(image_file).astype(float) - pixel_depth / 2) / pixel_depth
            #print(image_data.shape)
            # our images are RGBA so we would expect shape MxNx4
            # see: https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.ndimage.imread.html
            if (image_data.shape != (image_size, image_size)):
                raise Exception('Unexpected image shape: %s' % str(image_data.shape))
            dataset[num_images, :, :] = image_data
            num_images = num_images + 1
        except IOError as e:
            print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')

    dataset = dataset[0:num_images, :, :]
    #if num_images < min_num_images:
     #   raise Exception('Many fewer images than expected: %d < %d' % (num_images, min_num_images))

    print('Full dataset tensor:', dataset.shape)
    print('Mean:', np.mean(dataset))
    print('Standard deviation:', np.std(dataset))
    return dataset

In [None]:
a

In [7]:
def splitter(new_dataset_path):
    

SyntaxError: unexpected EOF while parsing (<ipython-input-7-af97bca0b823>, line 2)

In [8]:
def augmentation(new_dataset_path, num_shifts):    
    for folder in os.listdir(new_dataset_path):
        species_pictures = os.listdir(new_dataset_path + '/' + folder)
        os.makedirs('../dataset/augmented_spectrograms' + '/' + folder)
        for image in species_pictures:
            the_image = np.asarray(Image.open(new_dataset_path + '/' + folder + '/' + image))
            for i in range(num_shifts+1):
                pre_image = the_image.reshape((size,size))
                
                # shift up
                shifted_image_up = shift(pre_image, [(i*(-1)), 0]) 
                shifted_image_up.save('../dataset/augmented_spectrograms/' + folder + '/shifted_up' + str(i) + '_' + image)
                shifted_image_up.close()
                
                # shift_down
                shifted_image_down = shift(pre_image, [i, 0]) 
                shifted_image_down.save('../dataset/augmented_spectrograms/' + folder + '/shifted_down' + str(i) + '_' + image)
                shifted_image_down.close()
                
                #shift_left
                shifted_image_left = shift(pre_image, [0, (i*(-1))]) 
                shifted_image_left.save('../dataset/augmented_spectrograms/' + folder + '/shifted_left' + str(i) + '_' + image)
                shifted_image_left.close()
                
                #shift_right
                shifted_image_right = shift(pre_image, [0, i]) 
                shifted_image_right.save('../dataset/augmented_spectrograms/' + folder + '/shifted_right' + str(i) + '_' + image)
                shifted_image_right.close()
                
                pre_image.close()
            del the_image

### Pickling Data

We want to pickle the data by species, allowing for control of the minimum images per class. Beware that this will drastically influence the performance of your model.

In [9]:
def maybe_pickle(data_folders, min_num_images_per_class, pickles_path, force=False):
    
    if not os.path.exists(pickles_path):
        os.makedirs(pickles_path)
    else:
        shutil.rmtree(pickles_path)
        os.makedirs(pickles_path)
  
    dataset_names = []
    for folder in data_folders:
        class_name = folder.split('/')[-1] # species name
        set_filename = pickles_path + '/' + class_name + '.pickle'
        dataset_names.append(set_filename)
        if os.path.exists(set_filename) and not force:
            # You may override by setting force=True.
            print('%s already present - Skipping pickling.' % set_filename)
        else:
            image_files = os.listdir(folder)
            count = 0
            for image in image_files:
                count +=1
            if True:#count >= min_num_images_per_class:
                print('Pickling %s.' % set_filename)
                dataset = load_image(folder, min_num_images_per_class)
                try:
                    with open(set_filename, 'wb') as f:
                        pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
                except Exception as e:
                    print('Unable to save data to', set_filename, ':', e)

    return dataset_names

pickles_path = '../dataset/pickle_data'
datasets = maybe_pickle(dataset_folders, min_imgs_per_class, pickles_path)

Pickling ../dataset/pickle_data/Myrmeciza hemimelaena.pickle.
Full dataset tensor: (44, 28, 28)
Mean: -0.0248039
Standard deviation: 0.087640084
Pickling ../dataset/pickle_data/Microcerculus marginatus.pickle.
Full dataset tensor: (12, 28, 28)
Mean: 0.08082441
Standard deviation: 0.10161624
Pickling ../dataset/pickle_data/Eleutherodactylus coqui.pickle.
Full dataset tensor: (13, 28, 28)
Mean: -0.030509898
Standard deviation: 0.084106274
Pickling ../dataset/pickle_data/Epinephelus guttatus.pickle.
Full dataset tensor: (17, 28, 28)
Mean: 0.03140815
Standard deviation: 0.11902801
Pickling ../dataset/pickle_data/Megascops nudipes.pickle.
Full dataset tensor: (15, 28, 28)
Mean: 0.103591435
Standard deviation: 0.12395171
Pickling ../dataset/pickle_data/Basileuterus chrysogaster.pickle.
Full dataset tensor: (5, 28, 28)
Mean: 0.0352441
Standard deviation: 0.08039878
Pickling ../dataset/pickle_data/Eleutherodactylus cochranae.pickle.
Full dataset tensor: (25, 28, 28)
Mean: 0.053360745
Standard 

In [10]:
pickles = getDatasetFolders('../dataset/pickle_data')
#print(datasets)
num_classes = len(pickles)
print(f'We have {num_classes} classes')

We have 16 classes


### Classes 

We have to evaluate the number of classes and how are they distributed. Also, observe which species has a higher frequency, etc.  

In [11]:
def das_labeler(pickle_files):
    labels = []
    images = []
    for label, pickle_file in enumerate(pickle_files):
        try:
            with open(pickle_file, 'rb') as f:
                species_set = pickle.load(f)
                for image in species_set:
                    labels.append(label)
                    images.append(image)
        except Exception as e:
            print('Unable to process data from', pickle_file, ':', e)
            pass
    labels = np.asarray(labels)
    images = np.asarray(images)
    return labels, images

In [12]:
labels, images = das_labeler(datasets)

In [14]:
#X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size = 0.33, random_state = 42)

In [13]:
# Calculates the total of images per class
def class_is_balanced(pickles):
    total = 0
    for pckle in pickles:
        if (os.path.isfile(pckle)):
            pickle_class = pickle.load(open(pckle, "rb"))
        else:
            print("Error reading dataset %s. Exiting.", pickle_path)
            return -1
        class_name = pckle.split('/')[-1].split('.')[0]
        print("The total number of images in class %s is: %d" % (class_name, len(pickle_class)))
        total += len(pickle_class)
    print("For the dataset to be balanced, each class should have approximately %d images.\n" % (total / len(pickles)))
    return (total // len(pickles))
    
print("Let's see if the dataset is balanced:")
balance_num = class_is_balanced(pickles)


Let's see if the dataset is balanced:
The total number of images in class Megascops nudipes is: 15
The total number of images in class Eleutherodactylus cooki is: 16
The total number of images in class Epinephelus guttatus is: 17
The total number of images in class Microcerculus marginatus is: 12
The total number of images in class Basileuterus chrysogaster is: 5
The total number of images in class Myrmeciza hemimelaena is: 44
The total number of images in class Megascops guatemalae is: 6
The total number of images in class Eleutherodactylus cochranae is: 25
The total number of images in class Basileuterus bivittatus is: 15
The total number of images in class Eleutherodactylus brittoni is: 14
The total number of images in class Formicarius analis is: 11
The total number of images in class Eleutherodactylus juanariveroi is: 5
The total number of images in class Liosceles thoracicus is: 14
The total number of images in class Eleutherodactylus coqui is: 13
The total number of images in cl

### Training, Testing, and Validation Separation

As with every implementation of Supervised Learning, we separate the dataset into three components. The training, the testing, and the validation dataset. 

### Output Data

We output the data in a pickle format, to be used next on the models. 

In [15]:
pickle_file = '../dataset/arbimon_' + str(num_shifts) + '.pickle'

try:
  f = open(pickle_file, 'wb')
  save = {
    'train_dataset': X_train,
    'train_labels': y_train,
    'test_dataset': X_test,
    'test_labels': y_test,
    } 
  pickle.dump(save, f, pickle.HIGHEST_PROTOCOL) # save all out datasets in one pickle 
  f.close()
except Exception as e:
  print('Unable to save data to', pickle_file, ':', e)
  raise