Add more for streaming from files

Now should be mostly ready to go, need to test it more to be sure, but should be good. More limitations to using the generator if streaming from files, but should loop infinitely from the preprocessors, and different modes.
jacobbieker · Oct 30, 2018 · b810701 · b810701
1 parent 20b378a
commit b810701
Show file tree

Hide file tree

Showing 6 changed files with 239 additions and 142 deletions.
diff --git a/factnn/data/augment.py b/factnn/data/augment.py
@@ -2,6 +2,7 @@
 from sklearn.utils import shuffle
 import h5py
 
+
 def image_augmenter(images):
     """
     Augment images by rotating and flipping input images randomly
@@ -31,14 +32,16 @@ def image_augmenter(images):
     images = np.asarray(new_images)
     return images
 
+
 def sum_cubes(images):
     '''
     Takes the 3D data cubes and sums up along the time axis, creating a 2D image for faster processing
     :param images:
     :return:
     '''
 
-def common_step(batch_images, positions, labels=None, proton_images=None, augment=True, swap=True, shape=None):
+
+def common_step(batch_images, positions=None, labels=None, proton_images=None, augment=True, swap=True, shape=None):
     if augment:
         batch_images = image_augmenter(batch_images)
     if proton_images is not None:
@@ -53,7 +56,8 @@ def common_step(batch_images, positions, labels=None, proton_images=None, augmen
             batch_images, batch_image_label = shuffle(batch_images, batch_image_label)
         return batch_images, batch_image_label
     else:
-        labels = labels[positions]
+        if positions is not None:
+            labels = labels[positions]
         batch_image_label = labels
         batch_images = batch_images.reshape(shape)
         if swap:
@@ -97,7 +101,8 @@ def get_random_hdf5_chunk(start, stop, size, time_slice, total_slices, gamma, pr
                 training_data = images_one["Image"]
                 batch_images = training_data[start_pos:int(start_pos + size), time_slice:time_slice + total_slices, ::]
                 proton_images = proton_data[start_pos:int(start_pos + size), time_slice:time_slice + total_slices, ::]
-                return common_step(batch_images, positions, labels=labels, proton_images=proton_images, augment=augment, swap=swap, shape=shape)
+                return common_step(batch_images, positions, labels=labels, proton_images=proton_images, augment=augment,
+                                   swap=swap, shape=shape)
         else:
             training_data = images_one["Image"]
             batch_images = training_data[start_pos:int(start_pos + size), time_slice:time_slice + total_slices, ::]
@@ -137,15 +142,16 @@ def get_completely_random_hdf5(start, stop, size, time_slice, total_slices, gamm
                 training_data = images_one["Image"]
                 batch_images = training_data[positions, time_slice:time_slice + total_slices, ::]
                 proton_images = proton_data[positions, time_slice:time_slice + total_slices, ::]
-                return common_step(batch_images, positions, labels=labels, proton_images=proton_images, augment=augment, swap=swap, shape=shape)
+                return common_step(batch_images, positions, labels=labels, proton_images=proton_images, augment=augment,
+                                   swap=swap, shape=shape)
         else:
             training_data = images_one["Image"]
             batch_images = training_data[positions, time_slice:time_slice + total_slices, ::]
             return common_step(batch_images, positions, labels=labels, augment=augment, swap=swap, shape=shape)
 
 
 def get_random_from_list(indicies, size, time_slice, total_slices, gamma, proton_input=None, labels=None,
-                        augment=True, swap=True, shape=None):
+                         augment=True, swap=True, shape=None):
     '''
     Gets a random part of the HDF5 database within a list of given indicies
     This is to help with shuffling data, as currently all the ones come and go in the same
@@ -177,16 +183,16 @@ def get_random_from_list(indicies, size, time_slice, total_slices, gamma, proton
                 training_data = images_one["Image"]
                 batch_images = training_data[positions, time_slice:time_slice + total_slices, ::]
                 proton_images = proton_data[positions, time_slice:time_slice + total_slices, ::]
-                return common_step(batch_images, positions, labels=labels, proton_images=proton_images, augment=augment, swap=swap, shape=shape)
+                return common_step(batch_images, positions, labels=labels, proton_images=proton_images, augment=augment,
+                                   swap=swap, shape=shape)
         else:
             training_data = images_one["Image"]
             batch_images = training_data[positions, time_slice:time_slice + total_slices, ::]
             return common_step(batch_images, positions, labels=labels, augment=augment, swap=swap, shape=shape)
 
 
-
 def get_chunk_from_list(indicies, size, time_slice, total_slices, gamma, proton_input=None, labels=None,
-                         augment=True, swap=True, shape=None, current_step=0):
+                        augment=True, swap=True, shape=None, current_step=0):
     '''
     Gets a section of the HDF5 from the list of indicies, but not randomly,so can iterate through all options
     This is to help with shuffling data, as currently all the ones come and go in the same
@@ -210,7 +216,7 @@ def get_chunk_from_list(indicies, size, time_slice, total_slices, gamma, proton_
 
     # Get random positions within the start and stop sizes
     if (current_step + 1) * size < len(indicies):
-        positions = indicies[current_step*size:(current_step+1)*size]
+        positions = indicies[current_step * size:(current_step + 1) * size]
     else:
         if current_step * size < len(indicies):
             positions = indicies[current_step * size:]
@@ -227,14 +233,27 @@ def get_chunk_from_list(indicies, size, time_slice, total_slices, gamma, proton_
                 training_data = images_one["Image"]
                 batch_images = training_data[positions, time_slice:time_slice + total_slices, ::]
                 proton_images = proton_data[positions, time_slice:time_slice + total_slices, ::]
-                return common_step(batch_images, positions, labels=labels, proton_images=proton_images, augment=augment, swap=swap, shape=shape)
+                return common_step(batch_images, positions, labels=labels, proton_images=proton_images, augment=augment,
+                                   swap=swap, shape=shape)
         else:
             training_data = images_one["Image"]
             batch_images = training_data[positions, time_slice:time_slice + total_slices, ::]
             return common_step(batch_images, positions, labels=labels, augment=augment, swap=swap, shape=shape)
 
-def get_random_from_paths(paths, size, time_slice, total_slices, preprocessor, labels=None,
-                          proton_data=None, type_training=None, augment=True, swap=True):
+
+def euclidean_distance(x1, y1, x2, y2):
+    return np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)
+
+
+def true_delta(cog_y, source_y, cog_x, source_x):
+    return np.arctan2(
+        cog_y - source_y,
+        cog_x - source_x
+    )
+
+
+def get_random_from_paths(preprocessor, size, time_slice, total_slices,
+                          proton_preprocessor=None, type_training=None, augment=True, swap=True, shape=None):
     '''
     Gets a random part of the HDF5 database within start and stop endpoints
     This is to help with shuffling data, as currently all the ones come and go in the same
@@ -257,21 +276,37 @@ def get_random_from_paths(paths, size, time_slice, total_slices, preprocessor, l
     :return:
     '''
 
-    # Get random paths to use
-    used_paths = np.random.choice(paths, size=size, replace=False)
-
-    # Need to use preprocessors streaming to generate the data
-    # TODO Add streaming preprocessors to generate data and create training_data for use
-    # As not using data in HDF5, have to generate that first
-    # TODO Have to replace the training data with created data, so need to add generator
+    # For this, the single processors are assumed to infinitely iterate through their files, shuffling the order of the
+    # files after every go through of the whole file set, so some kind of shuffling, but not much
+    training_data = []
+    labels = None
+    data_format = {}
+    for i in range(size):
+        # Call processor size times to get the correct number for the batch
+        processed_data, data_format = next(preprocessor.single_processor())
+        training_data.append(processed_data)
 
-    # Uses the generator to generate data from random paths
-    preprocessor.paths = used_paths
+    # Use the type of data to determine what to keep
+    if type_training == "Separation":
+        training_data = [item[data_format["Image"]] for item in training_data]
+    elif type_training == "Energy":
+        labels = [item[data_format["Energy"]] for item in training_data]
+        training_data = [item[data_format["Image"]] for item in training_data]
+    elif type_training == "Disp":
+        labels = [euclidean_distance(item[data_format['Source_X']], item[data_format['Source_Y']],
+                                     item[data_format['COG_X']], item[data_format['COG_Y']]) for item in training_data]
+        training_data = [item[data_format["Image"]] for item in training_data]
+    elif type_training == "Sign":
+        labels = [true_delta(item[data_format['Source_X']], item[data_format['Source_Y']],
+                             item[data_format['COG_X']], item[data_format['COG_Y']]) for item in training_data]
+        training_data = [item[data_format["Image"]] for item in training_data]
 
-
-    #batch_images = training_data[used_paths, time_slice - total_slices:time_slice, ::]
-    #common_step(batch_images, positions, time_slice, total_slices, labels=labels,
-    #            proton_data=proton_data, type_training=type_training, augment=augment, swap=swap)
-
-    # TODO Actually do this, for now
-    return NotImplementedError
+    if proton_preprocessor is not None:
+        proton_data = [item[data_format["Image"]] for item in training_data]
+        batch_images = training_data[::, time_slice:time_slice + total_slices, ::]
+        proton_images = proton_data[::, time_slice:time_slice + total_slices, ::]
+        return common_step(batch_images, positions=None, labels=labels, proton_images=proton_images, augment=augment,
+                           swap=swap, shape=shape)
+    else:
+        batch_images = training_data[::, time_slice:time_slice + total_slices, ::]
+        return common_step(batch_images, positions=None, labels=labels, augment=augment, swap=swap, shape=shape)
diff --git a/factnn/data/base_generator.py b/factnn/data/base_generator.py
@@ -1,9 +1,7 @@
-from factnn.data.augment import get_completely_random_hdf5, get_random_hdf5_chunk, get_random_from_list, \
-    get_chunk_from_list
-from sklearn.model_selection import train_test_split
+from factnn.data.augment import get_random_from_list, \
+    get_chunk_from_list, get_random_from_paths, get_chunk_from_paths
 import numpy as np
 
-
 # TODO Add k-fold cross-validation generation
 
 class BaseGenerator(object):
@@ -42,6 +40,16 @@ def __init__(self, config):
         self.test_steps = None
         self.test_current_step = 0
 
+        # Now the preprocessor stuff, only used for streaming from files
+        self.train_preprocessor = None
+        self.validate_preprocessor = None
+        self.test_preprocessor = None
+
+        # Also need proton counterparts, has to be an easier way....
+        self.proton_train_preprocessor = None
+        self.proton_validate_preprocessor = None
+        self.proton_test_preprocessor = None
+
         if 'train_data' in config:
             self.train_data = config['train_data']
 
@@ -135,6 +143,41 @@ def __next__(self):
                     self.test_current_step += 1
                     self.test_current_step %= self.test_steps
                     return batch_images, batch_image_label
+        else:
+            # Now streaming from files, training, test, and validation need to be preprocessors set up for it.
+            if self.mode == "train":
+                batch_images, batch_image_label = get_random_from_paths(preprocessor=self.train_preprocessor,
+                                                                        size=self.batch_size,
+                                                                        time_slice=self.start_slice,
+                                                                        total_slices=self.number_slices,
+                                                                        augment=self.augment,
+                                                                        shape=self.input_shape,
+                                                                        type_training=self.type_gen,
+                                                                        proton_preprocessor=self.proton_train_preprocessor)
+                return batch_images, batch_image_label
+
+            elif self.mode == "validate":
+                batch_images, batch_image_label = get_random_from_paths(preprocessor=self.validate_preprocessor,
+                                                                        size=self.batch_size,
+                                                                        time_slice=self.start_slice,
+                                                                        total_slices=self.number_slices,
+                                                                        augment=False,
+                                                                        shape=self.input_shape,
+                                                                        swap=False,
+                                                                        type_training=self.type_gen,
+                                                                        proton_preprocessor=self.proton_validate_preprocessor)
+                return batch_images, batch_image_label
+            elif self.mode == "test":
+                batch_images, batch_image_label = get_random_from_paths(preprocessor=self.test_preprocessor,
+                                                                        size=self.batch_size,
+                                                                        time_slice=self.start_slice,
+                                                                        total_slices=self.number_slices,
+                                                                        augment=False,
+                                                                        shape=self.input_shape,
+                                                                        swap=False,
+                                                                        type_training=self.type_gen,
+                                                                        proton_preprocessor=self.proton_test_preprocessor)
+                return batch_images, batch_image_label
 
     def __str__(self):
         return NotImplemented

diff --git a/factnn/preprocess/__init__.py b/factnn/preprocess/__init__.py
@@ -0,0 +1,2 @@
+from factnn.preprocess.simulation_preprocessors import GammaDiffusePreprocessor, GammaPreprocessor, ProtonPreprocessor
+from factnn.preprocess.observation_preprocessors import ObservationPreprocessor
diff --git a/factnn/preprocess/base_preprocessor.py b/factnn/preprocess/base_preprocessor.py
@@ -52,6 +52,11 @@ def __init__(self, config):
             self.output_file = None
 
     def init(self):
+        """
+        Recalcs the file paths if called based on self.directories
+        :return:
+        """
+        # TODO Finish this
         return NotImplemented
 
     def generate_rebinning(self, size):

diff --git a/factnn/preprocess/observation_preprocessors.py b/factnn/preprocess/observation_preprocessors.py
@@ -105,7 +105,7 @@ def single_processor(self):
                         data.append([np.fliplr(np.rot90(input_matrix, 3)), energy, zd_deg, az_deg, source_pos_x,
                                      source_pos_y, sky_source_zd, sky_source_az, zd_deg1, az_deg1,
                                      event_num, night, run, cog_x, cog_y])
-                    # need to do the format thing here
+                    # need to do the format thing here, and add auxiliary structure
                     yield self.format(data)
 
             except Exception as e: