Add flow from files
Mostly working now. One caveat: when using from_directory, the model now needs the number of training events and the number of validation events specified via keyword arguments, because inferring the event counts from the number of files does not work yet.
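For example (a minimal sketch, not part of this commit), a script that sets 'from_directory': True, like examples/flow_from_files.py below, would pass the event counts to train() explicitly through the new num_events and val_num keyword arguments added in base_model.py; the counts here are placeholder values:

    # Event totals cannot yet be inferred from the file lists when
    # from_directory is True, so supply them by hand (placeholder numbers).
    separation_model.train(train_generator=separation_train,
                           validate_generator=separation_validate,
                           num_events=100000,  # number of training events (assumed)
                           val_num=20000)      # number of validation events (assumed)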
jacobbieker committed Oct 30, 2018
1 parent b810701 commit 5c9f3b5
Showing 8 changed files with 150 additions and 25 deletions.
116 changes: 116 additions & 0 deletions examples/flow_from_files.py
@@ -0,0 +1,116 @@
from factnn import GammaPreprocessor, ProtonPreprocessor, SeparationGenerator, SeparationModel
import os.path
from factnn.data import kfold

base_dir = "../ihp-pc41.ethz.ch/public/phs/"
obs_dir = [base_dir + "public/"]
gamma_dir = [base_dir + "sim/gamma/"]
proton_dir = [base_dir + "sim/proton/"]

shape = [30,70]
rebin_size = 3

# Get paths from the directories
gamma_paths = []
for directory in gamma_dir:
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith("phs.jsonl.gz"):
                gamma_paths.append(os.path.join(root, file))


# Get paths from the directories
proton_paths = []
for directory in proton_dir:
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith("phs.jsonl.gz"):
                proton_paths.append(os.path.join(root, file))


# Now do the Kfold Cross validation Part for both sets of paths
gamma_indexes = kfold.split_data(gamma_paths, kfolds=5)
proton_indexes = kfold.split_data(proton_paths, kfolds=5)


gamma_configuration = {
    'rebin_size': rebin_size,
    'output_file': "../gamma.hdf5",
    'shape': shape,
    'paths': gamma_indexes[0][0]
}

proton_configuration = {
    'rebin_size': rebin_size,
    'output_file': "../proton.hdf5",
    'shape': shape,
    'paths': proton_indexes[0][0]
}


proton_train_preprocessor = ProtonPreprocessor(config=proton_configuration)
gamma_train_preprocessor = GammaPreprocessor(config=gamma_configuration)

gamma_configuration['paths'] = gamma_indexes[1][0]
proton_configuration['paths'] = proton_indexes[1][0]

proton_validate_preprocessor = ProtonPreprocessor(config=proton_configuration)
gamma_validate_preprocessor = GammaPreprocessor(config=gamma_configuration)


separation_generator_configuration = {
    'seed': 1337,
    'batch_size': 4,
    'start_slice': 0,
    'number_slices': 38,
    'mode': 'train',
    'chunked': False,
    'augment': True,
    'from_directory': True,
    'input_shape': [-1, gamma_train_preprocessor.shape[3]-2, gamma_train_preprocessor.shape[2], gamma_train_preprocessor.shape[1], 1],
}

separation_validate = SeparationGenerator(config=separation_generator_configuration)
separation_train = SeparationGenerator(config=separation_generator_configuration)

separation_validate.mode = "validate"
separation_train.mode = "train"

separation_train.proton_train_preprocessor = proton_train_preprocessor
separation_train.proton_validate_preprocessor = proton_validate_preprocessor
separation_train.train_preprocessor = gamma_train_preprocessor
separation_train.validate_preprocessor = gamma_validate_preprocessor

separation_model_configuration = {
    'conv_dropout': 0.1,
    'lstm_dropout': 0.2,
    'fc_dropout': 0.4,
    'num_conv3d': 3,
    'kernel_conv3d': 2,
    'strides_conv3d': 1,
    'num_lstm': 0,
    'kernel_lstm': 2,
    'strides_lstm': 1,
    'num_fc': 2,
    'pooling': True,
    'neurons': [16, 16, 16, 8, 32],
    'shape': [gamma_train_preprocessor.shape[3]-2, gamma_train_preprocessor.shape[2], gamma_train_preprocessor.shape[1], 1],
    'start_slice': 0,
    'number_slices': 25,
    'activation': 'relu',
}

separation_model = SeparationModel(config=separation_model_configuration)

print(separation_model)
"""
Now run the models with the generators!
"""

separation_model.train_generator = separation_train
separation_model.validate_generator = separation_validate

separation_model.train(train_generator=separation_train, validate_generator=separation_validate)

2 changes: 1 addition & 1 deletion examples/separation.py
@@ -97,6 +97,6 @@
separation_model.validate_generator = separation_validate
separation_model.train_generator = separation_test

separation_model.train()
separation_model.train(train_generator=separation_train, validate_generator=separation_validate)
separation_model.apply()

13 changes: 11 additions & 2 deletions factnn/data/augment.py
@@ -285,7 +285,6 @@ def get_random_from_paths(preprocessor, size, time_slice, total_slices,
# Call processor size times to get the correct number for the batch
processed_data, data_format = next(preprocessor.single_processor())
training_data.append(processed_data)

# Use the type of data to determine what to keep
if type_training == "Separation":
training_data = [item[data_format["Image"]] for item in training_data]
@@ -301,8 +300,18 @@
item[data_format['COG_X']], item[data_format['COG_Y']]) for item in training_data]
training_data = [item[data_format["Image"]] for item in training_data]

training_data = np.array(training_data)
training_data = training_data.reshape(-1,training_data.shape[2], training_data.shape[3], training_data.shape[4])

if proton_preprocessor is not None:
proton_data = [item[data_format["Image"]] for item in training_data]
proton_data = []
for i in range(size):
# Call processor size times to get the correct number for the batch
processed_data, data_format = next(proton_preprocessor.single_processor())
proton_data.append(processed_data)
proton_data = [item[data_format["Image"]] for item in proton_data]
proton_data = np.array(proton_data)
proton_data = proton_data.reshape(-1, proton_data.shape[2], proton_data.shape[3], proton_data.shape[4])
batch_images = training_data[::, time_slice:time_slice + total_slices, ::]
proton_images = proton_data[::, time_slice:time_slice + total_slices, ::]
return common_step(batch_images, positions=None, labels=labels, proton_images=proton_images, augment=augment,
14 changes: 9 additions & 5 deletions factnn/data/base_generator.py
@@ -1,5 +1,5 @@
from factnn.data.augment import get_random_from_list, \
get_chunk_from_list, get_random_from_paths, get_chunk_from_paths
get_chunk_from_list, get_random_from_paths
import numpy as np

# TODO Add k-fold cross-validation generation
@@ -15,7 +15,10 @@ def __init__(self, config):
self.seed = config['seed']

self.batch_size = config['batch_size']
self.input = config['input']
if 'input' in config:
self.input = config['input']
else:
self.input = None
if 'second_input' in config:
self.second_input = config['second_input']
else:
@@ -26,9 +29,10 @@ def __init__(self, config):
self.second_input_data = None
self.labels = None
self.type_gen = None
self.input_shape = None
# Items is either an int, the number of samples to use, or an array of indices for the generator
# If items is an array, then chunked must be False, and cannot be from_directory
if 'input_shape' in config:
self.input_shape = config['input_shape']
else:
self.input_shape = None
self.mode = config['mode']
self.train_data = None
self.validate_data = None
10 changes: 7 additions & 3 deletions factnn/models/base_model.py
@@ -177,7 +177,7 @@ def apply(self, test_generator):

return (predictions, truth)

def train(self, train_generator=None, validate_generator=None):
def train(self, train_generator=None, validate_generator=None, num_events=100000, val_num=20000):
'''
Train model
:return:
@@ -194,8 +194,12 @@ def train(self, train_generator=None, validate_generator=None):

tensorboard = keras.callbacks.TensorBoard(update_freq='epoch')

num_events = int(len(train_generator.train_data))
val_num = int(len(train_generator.validate_data))
if not train_generator.from_directory:
num_events = int(len(train_generator.train_data))
val_num = int(len(train_generator.validate_data))
else:
num_events = num_events
val_num = val_num

self.model.fit_generator(
generator=train_generator,
2 changes: 1 addition & 1 deletion factnn/models/separation_models.py
@@ -13,7 +13,7 @@ def init(self):
self.model_type = "Separation"
self.auc = 0.0
if self.name is None:
self.name = self.model_type + "_" + self.num_lstm + "LSTM_" + self.num_conv3d + "Conv3D_" + self.num_fc +\
self.name = self.model_type + "_" + str(self.num_lstm) + "LSTM_" + str(self.num_conv3d) + "Conv3D_" + str(self.num_fc) +\
"FC" + ".hdf5"

def create(self):
10 changes: 5 additions & 5 deletions factnn/preprocess/base_preprocessor.py
@@ -6,7 +6,8 @@
class BasePreprocessor(object):

def __init__(self, config):
self.directories = config['directories']
if 'directories' in config:
self.directories = config['directories']
if 'paths' in config:
self.paths = config['paths']
else:
@@ -26,9 +27,9 @@ def __init__(self, config):
if 'rebin_size' in config:
if config['rebin_size'] <= 10:
try:
self.rebinning = pickle.load(
os.path.join("factnn", "resources", "rebinning_" + config['rebin_size'] + ".p"))
except:
with open(os.path.join("..","factnn", "resources", "rebinning_" + str(config['rebin_size']) + ".p"), "rb") as rebinning_file:
self.rebinning = pickle.load(rebinning_file)
except Exception as e:
self.rebinning = self.generate_rebinning(config['rebin_size'])
else:
self.rebinning = self.generate_rebinning(config['rebin_size'])
@@ -102,7 +103,6 @@ def generate_rebinning(self, size):

list_of_squares = [square]
steps = int(np.ceil(np.abs(square_start * 2) / square_size))
print(steps)

pixel_index_to_grid = {}
pix_index = 0
8 changes: 0 additions & 8 deletions factnn/preprocess/simulation_preprocessors.py
@@ -9,11 +9,8 @@
class ProtonPreprocessor(BasePreprocessor):

def batch_processor(self):
print("Number of files: " + str(len(self.paths)))
for index, file in enumerate(self.paths):
print("Index: " + str(index))
mc_truth = file.split(".phs")[0] + ".ch.gz"
print(mc_truth)
try:
sim_reader = ps.SimulationReader(
photon_stream_path=file,
@@ -49,7 +46,6 @@ def single_processor(self):
self.paths = shuffle(self.paths)
for index, file in enumerate(self.paths):
mc_truth = file.split(".phs")[0] + ".ch.gz"
print(mc_truth)
try:
sim_reader = ps.SimulationReader(
photon_stream_path=file,
@@ -152,7 +148,6 @@ class GammaPreprocessor(BasePreprocessor):
def batch_processor(self):
for index, file in enumerate(self.paths):
mc_truth = file.split(".phs")[0] + ".ch.gz"
print(mc_truth)
try:
sim_reader = ps.SimulationReader(
photon_stream_path=file,
@@ -188,7 +183,6 @@ def single_processor(self):
self.paths = shuffle(self.paths)
for index, file in enumerate(self.paths):
mc_truth = file.split(".phs")[0] + ".ch.gz"
print(mc_truth)
try:
sim_reader = ps.SimulationReader(
photon_stream_path=file,
@@ -291,7 +285,6 @@ def batch_processor(self):
self.init()
for index, file in enumerate(self.paths):
mc_truth = file.split(".phs")[0] + ".ch.gz"
print(mc_truth)
try:
sim_reader = ps.SimulationReader(
photon_stream_path=file,
@@ -342,7 +335,6 @@ def single_processor(self):
self.paths = shuffle(self.paths)
for index, file in enumerate(self.paths):
mc_truth = file.split(".phs")[0] + ".ch.gz"
print(mc_truth)
try:
sim_reader = ps.SimulationReader(
photon_stream_path=file,
