# IMAGE RANDOMIZATION (Training, Validation & Testing)
### Code to generate the basic train/validation/testing image database for Keras  
#### by Luis Soenksen
#### Last Update: 01/08/2018

----------------------------

## ORIGINAL & CLAHE DATA IMAGE RANDOMIZATION (copies files)

In [None]:
"""
 BASIC IMAGE DATABASE TRAIN/VALIDATION/TEST RANDOMIZATION CODE
 ---------------------------------
 by Luis R Soenksen
 Last Update: 2017/04/23
"""
import glob
import cv2
import os
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import tensorflow as tf
#tf.Session(config=tf.ConfigProto(log_device_placement=True)) #To ensure activation of GPUs in TF Backend
# Look up the GPUs TensorFlow can see and report which device will be used.
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    # Nothing detected: TensorFlow falls back to the CPU.
    print("No GPUs. Using CPU")
else:
    print("GPU Available: ", gpus[0])
    # Only the first detected GPU gets memory growth enabled.
    tf.config.experimental.set_memory_growth(gpus[0], True)
from keras.preprocessing.image import ImageDataGenerator

#IMAGE RANDOMIZATION AND AUGMENTATION HELPER FUNCTIONS
# Print iterations progress
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)

    The bar is redrawn in place (the line starts and ends with a carriage
    return); a newline is printed once `iteration == total`.
    A `total` of 0 is drawn as a finished bar instead of raising
    ZeroDivisionError, and the drawn bar never exceeds `length` characters
    even if `iteration` falls outside the range 0..total.
    """
    if total:
        percentValue = 100 * (iteration / float(total))
        filledLength = int(length * iteration // total)
    else:
        # Nothing to do counts as "done": avoids dividing by zero
        percentValue = 100.0
        filledLength = length
    # Clamp so an out-of-range iteration cannot stretch or shrink the bar
    filledLength = max(0, min(length, filledLength))

    percent = ("{0:." + str(decimals) + "f}").format(percentValue)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '\r')
    # Print New Line on Complete
    if iteration == total:
        print()

def data_randomization (inputpath, outputpath):
    """
    Randomly split a class-per-folder PNG database into train/validation/test copies.

    Every immediate sub-directory of `inputpath` is treated as one class. The
    '*.png' files found directly inside a class folder are shuffled (unseeded,
    so each run gives a different split) and divided using the module-level
    fractions `train_p` and `validation_p`; whatever remains goes to the test
    subset. Files are copied with `shutil.copy2` into
        <outputpath>/train/<class>/
        <outputpath>/validation/<class>/
        <outputpath>/test/<class>/
    and the source database is left untouched.

    @params:
        inputpath   - Required  : root folder holding one sub-folder per class (Str)
        outputpath  - Required  : root folder that receives the three subsets (Str)
    """
    # Print Message specifying database
    print('Randomized Spliting of database in:'+ inputpath)

    subsets = ('train', 'validation', 'test')

    # Creation of required folders. makedirs (instead of mkdir) also creates
    # missing parent folders and tolerates folders left by a previous run.
    os.makedirs(outputpath, exist_ok=True)
    for subset in subsets:
        for dirpath, _dirnames, _filenames in os.walk(inputpath):
            # relpath keeps the mirrored path relative whether or not
            # inputpath ends with a path separator
            relative_dir = os.path.relpath(dirpath, inputpath)
            mirrored_dir = os.path.normpath(os.path.join(outputpath, subset, relative_dir))
            os.makedirs(mirrored_dir, exist_ok=True)

    # Decide, class by class, which files go to which subset
    class_splits = []
    for directory in glob.iglob(os.path.join(inputpath, '*')):
        if not os.path.isdir(directory):
            # Stray files next to the class folders are not classes
            continue
        class_file_list = list(glob.iglob(os.path.join(directory, '*.png')))
        df = pd.DataFrame({directory: class_file_list})
        shuffled_files = df.sample(frac=1)[directory].tolist()

        # Same cut points as before: [0, train_end) / [train_end, validation_end) / rest
        train_end = int(train_p * len(shuffled_files))
        validation_end = int((train_p + validation_p) * len(shuffled_files))
        class_splits.append({
            'train': shuffled_files[:train_end],
            'validation': shuffled_files[train_end:validation_end],
            'test': shuffled_files[validation_end:],
        })

    print('Total Number of classes: '+ str(len(class_splits)))

    # The progress total is exactly the number of files that will be copied,
    # so the bar ends at 100% and is never called with a total of zero.
    total_files = sum(len(files) for split in class_splits for files in split.values())
    n = 0
    for split in class_splits:
        # Per class: training files first, then validation, then testing
        for subset in subsets:
            for source_file in split[subset]:
                n += 1
                printProgressBar(n , total_files, prefix = 'Progress:', suffix = 'Complete', length = 50)
                # Target keeps the class folder and file name, e.g. <outputpath>/test/<class>/img.png
                target_file = os.path.join(outputpath, subset, os.path.relpath(source_file, inputpath))
                shutil.copy2(source_file, target_file)

    print('Total number of randomized files: '+ str(n))
        
        
# SPLITTING PERCENTAGES (TRAINING=60% / VALIDATION=20% / TESTING=20%)
# train_p and validation_p are read as globals by data_randomization();
# test_p is simply the remainder.
train_p = 0.6
validation_p = 0.20
# Rounded so binary floating-point residue (1.0 - 0.6 - 0.2 evaluates to
# 0.20000000000000007) does not leak into test_p or the message below.
test_p = round(1.0 - train_p - validation_p, 10)
print('SPLITING PERCENTAGES ARE: (TRAINING=' + str(round(train_p*100, 6)) +'% / VALIDATION=' + str(round(validation_p*100, 6)) +'% / TESTING=' + str(round(test_p*100, 6)) + '%)')

In [None]:
#BASE DATA RANDOMIZATION
#Definition of folder tree structure for converted files for original database and Randomization
# inputpath  : root of the original (non-CLAHE) image database
# outputpath : root that receives the train/, validation/ and test/ copies
inputpath = '/app/data/single_lesion_database/original_data/'
outputpath = '/app/data/single_lesion_database/original_data_randomized/'
# Runs unconditionally (unlike the CLAHE cell, there is no existence check on inputpath)
data_randomization(inputpath, outputpath)

In [None]:
#CLAHE DATA RANDOMIZATION
#Definition of folder tree structure for converted files for the CLAHE database and Randomization
# NOTE: inputpath/outputpath are module-level names and overwrite the values set for the original database.
inputpath = '/app/data/single_lesion_database/clahe_data/'
outputpath = '/app/data/single_lesion_database/clahe_data_randomized/'

#Randomize clahe database if it's available (by default, it will not be)
# When the folder is missing this cell does nothing.
if os.path.exists(inputpath):
    data_randomization(inputpath, outputpath)

------------------