In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os, sys

In [19]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Activation, BatchNormalization, Dropout
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.utils import Sequence
from tensorflow.keras.preprocessing import image as img_proc

In [5]:
'''
Define a dictionary that assigns a label to each data folder.
It maps a folder path to the label given to every image in that folder.

This is included to support the continuous labels that we
plan to allocate to certain samples.

One such mapping represents one 'Scenario'.

To create a new scenario, just add/remove/change
the mappings of folder paths and labels.
'''

# Folder path -> label, grouped by label value. Insertion order is kept:
# the 0.0 folders come first, followed by the 1.0 folders.
label_mapping = {
    **dict.fromkeys(
        ['data/Archive/BadData1', 'data/Archive/BadData2'],
        0.0,
    ),
    **dict.fromkeys(
        [
            'data/Archive/GoodData',
            'data/Archive/TrueData',
            'data/Archive1/TrueGoodData',
        ],
        1.0,
    ),
}

In [None]:
'''
We now define a custom DataGenerator class so that batches can be
loaded on the CPU in parallel while the GPU trains on a batch.

Images are read from disk one batch at a time, which prevents
overloading the RAM of the system.
This scheme acts similarly to the Dataset + DataLoader
interface of PyTorch.
'''

class DataGenerator(Sequence):
    '''
    Keras Sequence that serves batches of grayscale images and labels.

    The list of image paths and labels is built once, from a mapping of
    folder path -> label. Image files are only read from disk when a
    batch is requested through __getitem__.
    '''
    def __init__(self, label_mapping, batch_size=128, shuffle=True,
                 image_size=(256,256)):
        '''
        label_mapping: dict mapping a folder path to the label assigned
                       to every entry found in that folder.
        batch_size:    maximum number of samples per batch.
        shuffle:       if True, the sample order is reshuffled every time
                       on_epoch_end runs (including once here).
        image_size:    passed as target_size to load_img for every image.
        '''
        # Lets the Keras base class set up its own state (newer Keras
        # versions warn when this call is missing).
        super().__init__()

        self.batch_size = batch_size
        self.shuffle = shuffle
        self.data_df = self.get_data_df(label_mapping)
        self.image_size = image_size
        # Builds the initial self.indexes ordering.
        self.on_epoch_end()

    def get_data_df(self, label_mapping):
        '''
        Produces a dataframe with one row per entry found in the mapped
        folders: 'ImagePath' is the joined folder/file path and 'Label'
        is the label of the folder it came from.
        '''
        records = []

        for folder_path, label in label_mapping.items():
            # os.listdir returns entries in arbitrary order; sorting keeps
            # the row order reproducible between runs.
            for image_name in sorted(os.listdir(folder_path)):
                records.append({
                    'ImagePath': os.path.join(folder_path, image_name),
                    'Label': label,
                })

        # Explicit columns keep the schema intact when no files are found.
        return pd.DataFrame(records, columns=['ImagePath', 'Label'])

    def __len__(self):
        '''
        Number of batches per epoch. The last batch may hold fewer than
        batch_size samples.
        '''
        return int(np.ceil(len(self.data_df) / self.batch_size))

    def __getitem__(self, index):
        '''
        Get one batch of data: returns (X, y) for batch number `index`.
        '''
        start = index * self.batch_size
        indexes = self.indexes[start:start + self.batch_size]

        X, y = self.__data_generation(indexes)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'

        self.indexes = np.arange(len(self.data_df))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, indexes, image_preprocess=None):
        '''
        Reads the images at the given row positions and returns
        (images, labels) as two numpy arrays.

        indexes:          row positions into self.data_df.
        image_preprocess: optional callable applied to each image array
                          (the output of img_to_array); its return value
                          is what ends up in the batch.
        '''
        batch_df = self.data_df.iloc[indexes]
        image_list = []

        for image_path in batch_df['ImagePath']:
            # color_mode='grayscale' replaces the deprecated grayscale=True.
            image = img_proc.load_img(image_path, color_mode='grayscale',
                                      target_size=self.image_size)
            img = img_proc.img_to_array(image)
            if image_preprocess is not None:
                img = image_preprocess(img)
            image_list.append(img)

        # NOTE(review): the previous code also built a two-element
        # [1-label, label] vector that was never used. Labels are still
        # returned as scalars here - confirm which form the model expects.
        label_list = batch_df['Label'].tolist()

        return np.array(image_list), np.array(label_list)
        