Start groundwork for K-fold validation

The generator still needs improvement and, ideally, streaming from a directory should be made to work as well.
jacobbieker · Oct 29, 2018 · acb14e7 · acb14e7
1 parent 522ee36
commit acb14e7
Show file tree

Hide file tree

Showing 6 changed files with 97 additions and 59 deletions.
diff --git a/examples/energy.py b/examples/energy.py
@@ -30,7 +30,7 @@
     'train_fraction': 0.6,
     'validate_fraction': 0.2,
     'mode': 'train',
-    'samples': 800000,
+    'samples': 20000,
     'chunked': False,
     'augment': True,
 }
@@ -45,21 +45,21 @@
 energy_model_configuration = {
     'conv_dropout': 0.1,
     'lstm_dropout': 0.2,
-    'fc_dropout': 0.5,
+    'fc_dropout': 0.3,
     'num_conv3d': 3,
     'kernel_conv3d': 2,
     'strides_conv3d': 1,
     'num_lstm': 1,
     'kernel_lstm': 2,
     'strides_lstm': 1,
-    'num_fc': 3,
+    'num_fc': 2,
     'pooling': True,
-    'neurons': [32, 32, 16, 16, 16, 32, 48, 64],
+    'neurons': [32, 64, 128, 64, 32, 64],
     'shape': [25, 38, 38, 1],
     'start_slice': 0,
     'number_slices': 25,
     'activation': 'relu',
-    'name': 'testEnergy',
+    'patience': 400,
 }
 
 energy_model = EnergyModel(config=energy_model_configuration)

diff --git a/examples/source_detection.py b/examples/source_detection.py
@@ -1,5 +1,6 @@
 from factnn import GammaDiffusePreprocessor, DispGenerator, DispModel, SignGenerator, SignModel
 import os.path
+import numpy as np
 
 base_dir = "../ihp-pc41.ethz.ch/public/phs/"
 obs_dir = [base_dir + "public/"]
@@ -23,6 +24,10 @@
 if not os.path.isfile(gamma_diffuse_configuration["output_file"]):
     gamma_diffuse_preprocessor.create_dataset()
 
+# The Gamma Diffuse simulations appear to be stored in order, so sample positions at random from the full range.
+# This allows testing on a small subset without using the whole dataset, while still getting a uniformly random sample of it.
+used_positions = list(np.random.randint(0, 550000, size=20000))
+
 source_generator_configuration = {
     'seed': 1337,
     'batch_size': 32,
@@ -32,7 +37,7 @@
     'train_fraction': 0.6,
     'validate_fraction': 0.2,
     'mode': 'train',
-    'samples': 550000,
+    'samples': used_positions,
     'chunked': False,
     'augment': True,
 }
@@ -67,12 +72,12 @@
     'strides_lstm': 1,
     'num_fc': 2,
     'pooling': True,
-    'neurons': [32, 16, 8, 16, 32, 48],
+    'neurons': [32, 32, 32, 64, 32, 48],
     'shape': [25, 38, 38, 1],
     'start_slice': 0,
     'number_slices': 25,
     'activation': 'relu',
-    'name': 'testDisp',
+    'patience': 200,
 }
 
 sign_model_configuration = {
@@ -109,11 +114,9 @@
 disp_model.test_generator = disp_test
 
 disp_model.train()
-disp_model.apply()
 
 sign_model.train_generator = sign_train
 sign_model.validate_generator = sign_validate
 sign_model.test_generator = sign_test
 
 sign_model.train()
-sign_model.apply()
diff --git a/factnn/data/augment.py b/factnn/data/augment.py
@@ -99,7 +99,7 @@ def get_random_hdf5_chunk(start, stop, size, time_slice, total_slices, gamma, pr
 
 
 def get_completely_random_hdf5(start, stop, size, time_slice, total_slices, gamma, proton_input=None, labels=None,
-                               type_training=None, augment=True, swap=True, shape=None):
+                               augment=True, swap=True, shape=None):
     '''
     Gets a random part of the HDF5 database within start and stop endpoints
     This is to help with shuffling data, as currently all the ones come and go in the same
@@ -139,9 +139,9 @@ def get_completely_random_hdf5(start, stop, size, time_slice, total_slices, gamm
 
 
 def get_random_from_list(indicies, size, time_slice, total_slices, gamma, proton_input=None, labels=None,
-                         type_training=None, augment=True, swap=True, shape=None):
+                        augment=True, swap=True, shape=None):
     '''
-    Gets a random part of the HDF5 database within start and stop endpoints
+    Gets a random part of the HDF5 database, drawn from a given list of indices
     This is to help with shuffling data, as currently all the ones come and go in the same
     order
     Does not guarantee that a given event will be used though, unlike before
@@ -174,6 +174,7 @@ def get_random_from_list(indicies, size, time_slice, total_slices, gamma, proton
                 return common_step(batch_images, positions, labels=labels, proton_images=proton_images, augment=augment, swap=swap, shape=shape)
         else:
             training_data = images_one["Image"]
+            positions = sorted(positions)
             batch_images = training_data[positions, time_slice:time_slice + total_slices, ::]
             return common_step(batch_images, positions, labels=labels, augment=augment, swap=swap, shape=shape)
 

diff --git a/factnn/data/base_generator.py b/factnn/data/base_generator.py
@@ -33,6 +33,8 @@ def __init__(self, config):
         self.items = config['samples']
         self.mode = config['mode']
 
+        # TODO Actually do something with the train and validate fractions
+
         if 'chunked' in config:
             self.chunked = config['chunked']
         else:
@@ -71,53 +73,88 @@ def __next__(self):
         :return:
         '''
         if not self.from_directory:
-            if self.chunked:
-                if self.mode == "train":
-                    while True:
-                        batch_images, batch_image_label = get_random_hdf5_chunk(0, self.items, size=self.batch_size,
-                                                                                time_slice=self.start_slice,
-                                                                                total_slices=self.number_slices,
-                                                                                labels=self.labels,
-                                                                                type_training=self.type_gen,
-                                                                                augment=self.augment,
-                                                                                gamma=self.input,
-                                                                                proton_input=self.second_input,
-                                                                                shape=self.input_shape)
-                        return batch_images, batch_image_label
-                elif self.mode == "validate":
-                    while True:
-                        batch_images, batch_image_label = get_random_hdf5_chunk(0, self.items, size=self.batch_size,
-                                                                                time_slice=self.start_slice,
-                                                                                total_slices=self.number_slices,
-                                                                                labels=self.labels,
-                                                                                type_training=self.type_gen,
-                                                                                augment=self.augment,
-                                                                                gamma=self.input,
-                                                                                proton_input=self.second_input,
-                                                                                shape=self.input_shape)
-                        return batch_images, batch_image_label
-
-                elif self.mode == "test":
-                    while True:
-                        batch_images, batch_image_label = get_random_hdf5_chunk(0, self.items, size=self.batch_size,
-                                                                                time_slice=self.start_slice,
-                                                                                total_slices=self.number_slices,
-                                                                                labels=self.labels,
-                                                                                type_training=self.type_gen,
-                                                                                augment=self.augment,
-                                                                                gamma=self.input,
-                                                                                proton_input=self.second_input,
-                                                                                shape=self.input_shape)
-                        return batch_images, batch_image_label
-            else:
-                # not chunked
+            if type(self.items) is int:
+                if self.chunked:
+                    if self.mode == "train":
+                        while True:
+                            batch_images, batch_image_label = get_random_hdf5_chunk(0, self.items, size=self.batch_size,
+                                                                                    time_slice=self.start_slice,
+                                                                                    total_slices=self.number_slices,
+                                                                                    labels=self.labels,
+                                                                                    type_training=self.type_gen,
+                                                                                    augment=self.augment,
+                                                                                    gamma=self.input,
+                                                                                    proton_input=self.second_input,
+                                                                                    shape=self.input_shape)
+                            return batch_images, batch_image_label
+                    elif self.mode == "validate":
+                        while True:
+                            batch_images, batch_image_label = get_random_hdf5_chunk(0, self.items, size=self.batch_size,
+                                                                                    time_slice=self.start_slice,
+                                                                                    total_slices=self.number_slices,
+                                                                                    labels=self.labels,
+                                                                                    type_training=self.type_gen,
+                                                                                    augment=self.augment,
+                                                                                    gamma=self.input,
+                                                                                    proton_input=self.second_input,
+                                                                                    shape=self.input_shape)
+                            return batch_images, batch_image_label
+
+                    elif self.mode == "test":
+                        while True:
+                            batch_images, batch_image_label = get_random_hdf5_chunk(0, self.items, size=self.batch_size,
+                                                                                    time_slice=self.start_slice,
+                                                                                    total_slices=self.number_slices,
+                                                                                    labels=self.labels,
+                                                                                    type_training=self.type_gen,
+                                                                                    augment=self.augment,
+                                                                                    gamma=self.input,
+                                                                                    proton_input=self.second_input,
+                                                                                    shape=self.input_shape)
+                            return batch_images, batch_image_label
+                else:
+                    # not chunked
+                    if self.mode == "train":
+                        while True:
+                            batch_images, batch_image_label = get_random_from_list(self.items, size=self.batch_size,
+                                                                                   time_slice=self.start_slice,
+                                                                                   total_slices=self.number_slices,
+                                                                                   labels=self.labels,
+                                                                                   augment=self.augment,
+                                                                                   gamma=self.input,
+                                                                                   proton_input=self.second_input,
+                                                                                   shape=self.input_shape)
+                            return batch_images, batch_image_label
+                    elif self.mode == "validate":
+                        while True:
+                            batch_images, batch_image_label = get_random_from_list(self.items, size=self.batch_size,
+                                                                                   time_slice=self.start_slice,
+                                                                                   total_slices=self.number_slices,
+                                                                                   labels=self.labels,
+                                                                                   augment=self.augment,
+                                                                                   gamma=self.input,
+                                                                                   proton_input=self.second_input,
+                                                                                   shape=self.input_shape)
+                            return batch_images, batch_image_label
+
+                    elif self.mode == "test":
+                        while True:
+                            batch_images, batch_image_label = get_random_from_list(self.items, size=self.batch_size,
+                                                                                   time_slice=self.start_slice,
+                                                                                   total_slices=self.number_slices,
+                                                                                   labels=self.labels,
+                                                                                   augment=self.augment,
+                                                                                   gamma=self.input,
+                                                                                   proton_input=self.second_input,
+                                                                                   shape=self.input_shape)
+                            return batch_images, batch_image_label
+            elif type(self.items) is list:
                 if self.mode == "train":
                     while True:
                         batch_images, batch_image_label = get_random_from_list(self.items, size=self.batch_size,
                                                                                time_slice=self.start_slice,
                                                                                total_slices=self.number_slices,
                                                                                labels=self.labels,
-                                                                               type_training=self.type_gen,
                                                                                augment=self.augment,
                                                                                gamma=self.input,
                                                                                proton_input=self.second_input,
@@ -129,7 +166,6 @@ def __next__(self):
                                                                                time_slice=self.start_slice,
                                                                                total_slices=self.number_slices,
                                                                                labels=self.labels,
-                                                                               type_training=self.type_gen,
                                                                                augment=self.augment,
                                                                                gamma=self.input,
                                                                                proton_input=self.second_input,
@@ -142,14 +178,12 @@ def __next__(self):
                                                                                time_slice=self.start_slice,
                                                                                total_slices=self.number_slices,
                                                                                labels=self.labels,
-                                                                               type_training=self.type_gen,
                                                                                augment=self.augment,
                                                                                gamma=self.input,
                                                                                proton_input=self.second_input,
                                                                                shape=self.input_shape)
                         return batch_images, batch_image_label
 
-
     def __str__(self):
         return NotImplemented
 

diff --git a/factnn/models/energy_models.py b/factnn/models/energy_models.py
@@ -20,7 +20,7 @@ def init(self):
         self.model_type = "Energy"
         self.auc = 0.0
         if self.name is None:
-            self.name = self.model_type + "_" + self.num_lstm + "LSTM_" + self.num_conv3d + "Conv3D_" + self.num_fc + \
+            self.name = self.model_type + "_" + str(self.num_lstm) + "LSTM_" + str(self.num_conv3d) + "Conv3D_" + str(self.num_fc) + \
                         "FC" + ".hdf5"
 
     def create(self):

diff --git a/factnn/models/source_models.py b/factnn/models/source_models.py
@@ -20,7 +20,7 @@ def init(self):
         self.model_type = "Source"
         self.auc = 0.0
         if self.name is None:
-            self.name = self.model_type + "_" + self.num_lstm + "LSTM_" + self.num_conv3d + "Conv3D_" + self.num_fc + \
+            self.name = self.model_type + "_" + str(self.num_lstm) + "LSTM_" + str(self.num_conv3d) + "Conv3D_" + str(self.num_fc) + \
                         "FC" + ".hdf5"
 
     def create(self):