Add flow from files
Mostly working now. One caveat: when using from_directory, the model now needs the number of training events and the number of validation events specified via keyword arguments, because inferring the event counts from the number of files does not work yet.
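For example (a minimal sketch, not part of this commit), a script that sets 'from_directory': True, like examples/flow_from_files.py below, would pass the event counts to train() explicitly through the new num_events and val_num keyword arguments added in base_model.py; the counts here are placeholder values:

    # Event totals cannot yet be inferred from the file lists when
    # from_directory is True, so supply them by hand (placeholder numbers).
    separation_model.train(train_generator=separation_train,
                           validate_generator=separation_validate,
                           num_events=100000,  # number of training events (assumed)
                           val_num=20000)      # number of validation events (assumed)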
jacobbieker committed Oct 30, 2018
1 parent b810701 commit 5c9f3b5
Showing 8 changed files with 150 additions and 25 deletions.
116 changes: 116 additions & 0 deletions examples/flow_from_files.py
@@ -0,0 +1,116 @@
from factnn import GammaPreprocessor, ProtonPreprocessor, SeparationGenerator, SeparationModel
import os.path
from factnn.data import kfold

base_dir = "../ihp-pc41.ethz.ch/public/phs/"
obs_dir = [base_dir + "public/"]
gamma_dir = [base_dir + "sim/gamma/"]
proton_dir = [base_dir + "sim/proton/"]

shape = [30,70]
rebin_size = 3

# Get paths from the directories
gamma_paths = []
for directory in gamma_dir:
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith("phs.jsonl.gz"):
                gamma_paths.append(os.path.join(root, file))


# Get paths from the directories
proton_paths = []
for directory in proton_dir:
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith("phs.jsonl.gz"):
                proton_paths.append(os.path.join(root, file))


# Now do the Kfold Cross validation Part for both sets of paths
gamma_indexes = kfold.split_data(gamma_paths, kfolds=5)
proton_indexes = kfold.split_data(proton_paths, kfolds=5)


gamma_configuration = {
    'rebin_size': rebin_size,
    'output_file': "../gamma.hdf5",
    'shape': shape,
    'paths': gamma_indexes[0][0]
}

proton_configuration = {
    'rebin_size': rebin_size,
    'output_file': "../proton.hdf5",
    'shape': shape,
    'paths': proton_indexes[0][0]
}


proton_train_preprocessor = ProtonPreprocessor(config=proton_configuration)
gamma_train_preprocessor = GammaPreprocessor(config=gamma_configuration)

gamma_configuration['paths'] = gamma_indexes[1][0]
proton_configuration['paths'] = proton_indexes[1][0]

proton_validate_preprocessor = ProtonPreprocessor(config=proton_configuration)
gamma_validate_preprocessor = GammaPreprocessor(config=gamma_configuration)


separation_generator_configuration = {
    'seed': 1337,
    'batch_size': 4,
    'start_slice': 0,
    'number_slices': 38,
    'mode': 'train',
    'chunked': False,
    'augment': True,
    'from_directory': True,
    'input_shape': [-1, gamma_train_preprocessor.shape[3]-2, gamma_train_preprocessor.shape[2], gamma_train_preprocessor.shape[1], 1],
}

separation_validate = SeparationGenerator(config=separation_generator_configuration)
separation_train = SeparationGenerator(config=separation_generator_configuration)

separation_validate.mode = "validate"
separation_train.mode = "train"

separation_train.proton_train_preprocessor = proton_train_preprocessor
separation_train.proton_validate_preprocessor = proton_validate_preprocessor
separation_train.train_preprocessor = gamma_train_preprocessor
separation_train.validate_preprocessor = gamma_validate_preprocessor

separation_model_configuration = {
    'conv_dropout': 0.1,
    'lstm_dropout': 0.2,
    'fc_dropout': 0.4,
    'num_conv3d': 3,
    'kernel_conv3d': 2,
    'strides_conv3d': 1,
    'num_lstm': 0,
    'kernel_lstm': 2,
    'strides_lstm': 1,
    'num_fc': 2,
    'pooling': True,
    'neurons': [16, 16, 16, 8, 32],
    'shape': [gamma_train_preprocessor.shape[3]-2, gamma_train_preprocessor.shape[2], gamma_train_preprocessor.shape[1], 1],
    'start_slice': 0,
    'number_slices': 25,
    'activation': 'relu',
}

separation_model = SeparationModel(config=separation_model_configuration)

print(separation_model)
"""
Now run the models with the generators!
"""

separation_model.train_generator = separation_train
separation_model.validate_generator = separation_validate

separation_model.train(train_generator=separation_train, validate_generator=separation_validate)

2 changes: 1 addition & 1 deletion examples/separation.py
@@ -97,6 +97,6 @@
separation_model.validate_generator = separation_validate
separation_model.train_generator = separation_test

separation_model.train()
separation_model.train(train_generator=separation_train, validate_generator=separation_validate)
separation_model.apply()

13 changes: 11 additions & 2 deletions factnn/data/augment.py
@@ -285,7 +285,6 @@ def get_random_from_paths(preprocessor, size, time_slice, total_slices,
# Call processor size times to get the correct number for the batch
processed_data, data_format = next(preprocessor.single_processor())
training_data.append(processed_data)

# Use the type of data to determine what to keep
if type_training == "Separation":
training_data = [item[data_format["Image"]] for item in training_data]
@@ -301,8 +300,18 @@
item[data_format['COG_X']], item[data_format['COG_Y']]) for item in training_data]
training_data = [item[data_format["Image"]] for item in training_data]

training_data = np.array(training_data)
training_data = training_data.reshape(-1,training_data.shape[2], training_data.shape[3], training_data.shape[4])

if proton_preprocessor is not None:
proton_data = [item[data_format["Image"]] for item in training_data]
proton_data = []
for i in range(size):
# Call processor size times to get the correct number for the batch
processed_data, data_format = next(proton_preprocessor.single_processor())
proton_data.append(processed_data)
proton_data = [item[data_format["Image"]] for item in proton_data]
proton_data = np.array(proton_data)
proton_data = proton_data.reshape(-1, proton_data.shape[2], proton_data.shape[3], proton_data.shape[4])
batch_images = training_data[::, time_slice:time_slice + total_slices, ::]
proton_images = proton_data[::, time_slice:time_slice + total_slices, ::]
return common_step(batch_images, positions=None, labels=labels, proton_images=proton_images, augment=augment,
14 changes: 9 additions & 5 deletions factnn/data/base_generator.py
@@ -1,5 +1,5 @@
from factnn.data.augment import get_random_from_list, \
get_chunk_from_list, get_random_from_paths, get_chunk_from_paths
get_chunk_from_list, get_random_from_paths
import numpy as np

# TODO Add k-fold cross-validation generation
@@ -15,7 +15,10 @@ def __init__(self, config):
self.seed = config['seed']

self.batch_size = config['batch_size']
self.input = config['input']
if 'input' in config:
self.input = config['input']
else:
self.input = None
if 'second_input' in config:
self.second_input = config['second_input']
else:
@@ -26,9 +29,10 @@ def __init__(self, config):
self.second_input_data = None
self.labels = None
self.type_gen = None
self.input_shape = None
# Items is either an int, the number of samples to use, or an array of indices for the generator
# If items is an array, then chunked must be False, and cannot be from_directory
if 'input_shape' in config:
self.input_shape = config['input_shape']
else:
self.input_shape = None
self.mode = config['mode']
self.train_data = None
self.validate_data = None
10 changes: 7 additions & 3 deletions factnn/models/base_model.py
@@ -177,7 +177,7 @@ def apply(self, test_generator):

return (predictions, truth)

def train(self, train_generator=None, validate_generator=None):
def train(self, train_generator=None, validate_generator=None, num_events=100000, val_num=20000):
'''
Train model
:return:
@@ -194,8 +194,12 @@ def train(self, train_generator=None, validate_generator=None):

tensorboard = keras.callbacks.TensorBoard(update_freq='epoch')

num_events = int(len(train_generator.train_data))
val_num = int(len(train_generator.validate_data))
if not train_generator.from_directory:
num_events = int(len(train_generator.train_data))
val_num = int(len(train_generator.validate_data))
else:
num_events = num_events
val_num = val_num

self.model.fit_generator(
generator=train_generator,
2 changes: 1 addition & 1 deletion factnn/models/separation_models.py
@@ -13,7 +13,7 @@ def init(self):
self.model_type = "Separation"
self.auc = 0.0
if self.name is None:
self.name = self.model_type + "_" + self.num_lstm + "LSTM_" + self.num_conv3d + "Conv3D_" + self.num_fc +\
self.name = self.model_type + "_" + str(self.num_lstm) + "LSTM_" + str(self.num_conv3d) + "Conv3D_" + str(self.num_fc) +\
"FC" + ".hdf5"

def create(self):
10 changes: 5 additions & 5 deletions factnn/preprocess/base_preprocessor.py
@@ -6,7 +6,8 @@
class BasePreprocessor(object):

def __init__(self, config):
self.directories = config['directories']
if 'directories' in config:
self.directories = config['directories']
if 'paths' in config:
self.paths = config['paths']
else:
@@ -26,9 +27,9 @@ def __init__(self, config):
if 'rebin_size' in config:
if config['rebin_size'] <= 10:
try:
self.rebinning = pickle.load(
os.path.join("factnn", "resources", "rebinning_" + config['rebin_size'] + ".p"))
except:
with open(os.path.join("..","factnn", "resources", "rebinning_" + str(config['rebin_size']) + ".p"), "rb") as rebinning_file:
self.rebinning = pickle.load(rebinning_file)
except Exception as e:
self.rebinning = self.generate_rebinning(config['rebin_size'])
else:
self.rebinning = self.generate_rebinning(config['rebin_size'])
@@ -102,7 +103,6 @@ def generate_rebinning(self, size):

list_of_squares = [square]
steps = int(np.ceil(np.abs(square_start * 2) / square_size))
print(steps)

pixel_index_to_grid = {}
pix_index = 0
8 changes: 0 additions & 8 deletions factnn/preprocess/simulation_preprocessors.py
@@ -9,11 +9,8 @@
class ProtonPreprocessor(BasePreprocessor):

def batch_processor(self):
print("Number of files: " + str(len(self.paths)))
for index, file in enumerate(self.paths):
print("Index: " + str(index))
mc_truth = file.split(".phs")[0] + ".ch.gz"
print(mc_truth)
try:
sim_reader = ps.SimulationReader(
photon_stream_path=file,
@@ -49,7 +46,6 @@ def single_processor(self):
self.paths = shuffle(self.paths)
for index, file in enumerate(self.paths):
mc_truth = file.split(".phs")[0] + ".ch.gz"
print(mc_truth)
try:
sim_reader = ps.SimulationReader(
photon_stream_path=file,
@@ -152,7 +148,6 @@ class GammaPreprocessor(BasePreprocessor):
def batch_processor(self):
for index, file in enumerate(self.paths):
mc_truth = file.split(".phs")[0] + ".ch.gz"
print(mc_truth)
try:
sim_reader = ps.SimulationReader(
photon_stream_path=file,
@@ -188,7 +183,6 @@ def single_processor(self):
self.paths = shuffle(self.paths)
for index, file in enumerate(self.paths):
mc_truth = file.split(".phs")[0] + ".ch.gz"
print(mc_truth)
try:
sim_reader = ps.SimulationReader(
photon_stream_path=file,
@@ -291,7 +285,6 @@ def batch_processor(self):
self.init()
for index, file in enumerate(self.paths):
mc_truth = file.split(".phs")[0] + ".ch.gz"
print(mc_truth)
try:
sim_reader = ps.SimulationReader(
photon_stream_path=file,
@@ -342,7 +335,6 @@ def single_processor(self):
self.paths = shuffle(self.paths)
for index, file in enumerate(self.paths):
mc_truth = file.split(".phs")[0] + ".ch.gz"
print(mc_truth)
try:
sim_reader = ps.SimulationReader(
photon_stream_path=file,
