Skip to content

Commit

Permalink
Add more for streaming from files
Browse files Browse the repository at this point in the history
Now should be mostly ready to go, need to test it more to be sure, but should be good. More limitations to using the generator if streaming from files, but should loop infinitely from the preprocessors, and different modes.
  • Loading branch information
jacobbieker committed Oct 30, 2018
1 parent 20b378a commit b810701
Show file tree
Hide file tree
Showing 6 changed files with 239 additions and 142 deletions.
91 changes: 63 additions & 28 deletions factnn/data/augment.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from sklearn.utils import shuffle
import h5py


def image_augmenter(images):
"""
Augment images by rotating and flipping input images randomly
Expand Down Expand Up @@ -31,14 +32,16 @@ def image_augmenter(images):
images = np.asarray(new_images)
return images


def sum_cubes(images):
'''
Takes the 3D data cubes and sums up along the time axis, creating a 2D image for faster processing
:param images:
:return:
'''

def common_step(batch_images, positions, labels=None, proton_images=None, augment=True, swap=True, shape=None):

def common_step(batch_images, positions=None, labels=None, proton_images=None, augment=True, swap=True, shape=None):
if augment:
batch_images = image_augmenter(batch_images)
if proton_images is not None:
Expand All @@ -53,7 +56,8 @@ def common_step(batch_images, positions, labels=None, proton_images=None, augmen
batch_images, batch_image_label = shuffle(batch_images, batch_image_label)
return batch_images, batch_image_label
else:
labels = labels[positions]
if positions is not None:
labels = labels[positions]
batch_image_label = labels
batch_images = batch_images.reshape(shape)
if swap:
Expand Down Expand Up @@ -97,7 +101,8 @@ def get_random_hdf5_chunk(start, stop, size, time_slice, total_slices, gamma, pr
training_data = images_one["Image"]
batch_images = training_data[start_pos:int(start_pos + size), time_slice:time_slice + total_slices, ::]
proton_images = proton_data[start_pos:int(start_pos + size), time_slice:time_slice + total_slices, ::]
return common_step(batch_images, positions, labels=labels, proton_images=proton_images, augment=augment, swap=swap, shape=shape)
return common_step(batch_images, positions, labels=labels, proton_images=proton_images, augment=augment,
swap=swap, shape=shape)
else:
training_data = images_one["Image"]
batch_images = training_data[start_pos:int(start_pos + size), time_slice:time_slice + total_slices, ::]
Expand Down Expand Up @@ -137,15 +142,16 @@ def get_completely_random_hdf5(start, stop, size, time_slice, total_slices, gamm
training_data = images_one["Image"]
batch_images = training_data[positions, time_slice:time_slice + total_slices, ::]
proton_images = proton_data[positions, time_slice:time_slice + total_slices, ::]
return common_step(batch_images, positions, labels=labels, proton_images=proton_images, augment=augment, swap=swap, shape=shape)
return common_step(batch_images, positions, labels=labels, proton_images=proton_images, augment=augment,
swap=swap, shape=shape)
else:
training_data = images_one["Image"]
batch_images = training_data[positions, time_slice:time_slice + total_slices, ::]
return common_step(batch_images, positions, labels=labels, augment=augment, swap=swap, shape=shape)


def get_random_from_list(indicies, size, time_slice, total_slices, gamma, proton_input=None, labels=None,
augment=True, swap=True, shape=None):
augment=True, swap=True, shape=None):
'''
Gets a random part of the HDF5 database within a list of given indicies
This is to help with shuffling data, as currently all the ones come and go in the same
Expand Down Expand Up @@ -177,16 +183,16 @@ def get_random_from_list(indicies, size, time_slice, total_slices, gamma, proton
training_data = images_one["Image"]
batch_images = training_data[positions, time_slice:time_slice + total_slices, ::]
proton_images = proton_data[positions, time_slice:time_slice + total_slices, ::]
return common_step(batch_images, positions, labels=labels, proton_images=proton_images, augment=augment, swap=swap, shape=shape)
return common_step(batch_images, positions, labels=labels, proton_images=proton_images, augment=augment,
swap=swap, shape=shape)
else:
training_data = images_one["Image"]
batch_images = training_data[positions, time_slice:time_slice + total_slices, ::]
return common_step(batch_images, positions, labels=labels, augment=augment, swap=swap, shape=shape)



def get_chunk_from_list(indicies, size, time_slice, total_slices, gamma, proton_input=None, labels=None,
augment=True, swap=True, shape=None, current_step=0):
augment=True, swap=True, shape=None, current_step=0):
'''
Gets a section of the HDF5 from the list of indicies, but not randomly,so can iterate through all options
This is to help with shuffling data, as currently all the ones come and go in the same
Expand All @@ -210,7 +216,7 @@ def get_chunk_from_list(indicies, size, time_slice, total_slices, gamma, proton_

# Get random positions within the start and stop sizes
if (current_step + 1) * size < len(indicies):
positions = indicies[current_step*size:(current_step+1)*size]
positions = indicies[current_step * size:(current_step + 1) * size]
else:
if current_step * size < len(indicies):
positions = indicies[current_step * size:]
Expand All @@ -227,14 +233,27 @@ def get_chunk_from_list(indicies, size, time_slice, total_slices, gamma, proton_
training_data = images_one["Image"]
batch_images = training_data[positions, time_slice:time_slice + total_slices, ::]
proton_images = proton_data[positions, time_slice:time_slice + total_slices, ::]
return common_step(batch_images, positions, labels=labels, proton_images=proton_images, augment=augment, swap=swap, shape=shape)
return common_step(batch_images, positions, labels=labels, proton_images=proton_images, augment=augment,
swap=swap, shape=shape)
else:
training_data = images_one["Image"]
batch_images = training_data[positions, time_slice:time_slice + total_slices, ::]
return common_step(batch_images, positions, labels=labels, augment=augment, swap=swap, shape=shape)

def get_random_from_paths(paths, size, time_slice, total_slices, preprocessor, labels=None,
proton_data=None, type_training=None, augment=True, swap=True):

def euclidean_distance(x1, y1, x2, y2):
return np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)


def true_delta(cog_y, source_y, cog_x, source_x):
return np.arctan2(
cog_y - source_y,
cog_x - source_x
)


def get_random_from_paths(preprocessor, size, time_slice, total_slices,
proton_preprocessor=None, type_training=None, augment=True, swap=True, shape=None):
'''
Gets a random part of the HDF5 database within start and stop endpoints
This is to help with shuffling data, as currently all the ones come and go in the same
Expand All @@ -257,21 +276,37 @@ def get_random_from_paths(paths, size, time_slice, total_slices, preprocessor, l
:return:
'''

# Get random paths to use
used_paths = np.random.choice(paths, size=size, replace=False)

# Need to use preprocessors streaming to generate the data
# TODO Add streaming preprocessors to generate data and create training_data for use
# As not using data in HDF5, have to generate that first
# TODO Have to replace the training data with created data, so need to add generator
# For this, the single processors are assumed to infinitely iterate through their files, shuffling the order of the
# files after every go through of the whole file set, so some kind of shuffling, but not much
training_data = []
labels = None
data_format = {}
for i in range(size):
# Call processor size times to get the correct number for the batch
processed_data, data_format = next(preprocessor.single_processor())
training_data.append(processed_data)

# Uses the generator to generate data from random paths
preprocessor.paths = used_paths
# Use the type of data to determine what to keep
if type_training == "Separation":
training_data = [item[data_format["Image"]] for item in training_data]
elif type_training == "Energy":
labels = [item[data_format["Energy"]] for item in training_data]
training_data = [item[data_format["Image"]] for item in training_data]
elif type_training == "Disp":
labels = [euclidean_distance(item[data_format['Source_X']], item[data_format['Source_Y']],
item[data_format['COG_X']], item[data_format['COG_Y']]) for item in training_data]
training_data = [item[data_format["Image"]] for item in training_data]
elif type_training == "Sign":
labels = [true_delta(item[data_format['Source_X']], item[data_format['Source_Y']],
item[data_format['COG_X']], item[data_format['COG_Y']]) for item in training_data]
training_data = [item[data_format["Image"]] for item in training_data]


#batch_images = training_data[used_paths, time_slice - total_slices:time_slice, ::]
#common_step(batch_images, positions, time_slice, total_slices, labels=labels,
# proton_data=proton_data, type_training=type_training, augment=augment, swap=swap)

# TODO Actually do this, for now
return NotImplementedError
if proton_preprocessor is not None:
proton_data = [item[data_format["Image"]] for item in training_data]
batch_images = training_data[::, time_slice:time_slice + total_slices, ::]
proton_images = proton_data[::, time_slice:time_slice + total_slices, ::]
return common_step(batch_images, positions=None, labels=labels, proton_images=proton_images, augment=augment,
swap=swap, shape=shape)
else:
batch_images = training_data[::, time_slice:time_slice + total_slices, ::]
return common_step(batch_images, positions=None, labels=labels, augment=augment, swap=swap, shape=shape)
51 changes: 47 additions & 4 deletions factnn/data/base_generator.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
from factnn.data.augment import get_completely_random_hdf5, get_random_hdf5_chunk, get_random_from_list, \
get_chunk_from_list
from sklearn.model_selection import train_test_split
from factnn.data.augment import get_random_from_list, \
get_chunk_from_list, get_random_from_paths, get_chunk_from_paths
import numpy as np


# TODO Add k-fold cross-validation generation

class BaseGenerator(object):
Expand Down Expand Up @@ -42,6 +40,16 @@ def __init__(self, config):
self.test_steps = None
self.test_current_step = 0

# Now the preprocessor stuff, only used for streaming from files
self.train_preprocessor = None
self.validate_preprocessor = None
self.test_preprocessor = None

# Also need proton counterparts, has to be an easier way....
self.proton_train_preprocessor = None
self.proton_validate_preprocessor = None
self.proton_test_preprocessor = None

if 'train_data' in config:
self.train_data = config['train_data']

Expand Down Expand Up @@ -135,6 +143,41 @@ def __next__(self):
self.test_current_step += 1
self.test_current_step %= self.test_steps
return batch_images, batch_image_label
else:
# Now streaming from files, training, test, and validation need to be preprocessors set up for it.
if self.mode == "train":
batch_images, batch_image_label = get_random_from_paths(preprocessor=self.train_preprocessor,
size=self.batch_size,
time_slice=self.start_slice,
total_slices=self.number_slices,
augment=self.augment,
shape=self.input_shape,
type_training=self.type_gen,
proton_preprocessor=self.proton_train_preprocessor)
return batch_images, batch_image_label

elif self.mode == "validate":
batch_images, batch_image_label = get_random_from_paths(preprocessor=self.validate_preprocessor,
size=self.batch_size,
time_slice=self.start_slice,
total_slices=self.number_slices,
augment=False,
shape=self.input_shape,
swap=False,
type_training=self.type_gen,
proton_preprocessor=self.proton_validate_preprocessor)
return batch_images, batch_image_label
elif self.mode == "test":
batch_images, batch_image_label = get_random_from_paths(preprocessor=self.test_preprocessor,
size=self.batch_size,
time_slice=self.start_slice,
total_slices=self.number_slices,
augment=False,
shape=self.input_shape,
swap=False,
type_training=self.type_gen,
proton_preprocessor=self.proton_test_preprocessor)
return batch_images, batch_image_label

def __str__(self):
return NotImplemented
Expand Down
2 changes: 2 additions & 0 deletions factnn/preprocess/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from factnn.preprocess.simulation_preprocessors import GammaDiffusePreprocessor, GammaPreprocessor, ProtonPreprocessor
from factnn.preprocess.observation_preprocessors import ObservationPreprocessor
5 changes: 5 additions & 0 deletions factnn/preprocess/base_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ def __init__(self, config):
self.output_file = None

def init(self):
"""
Recalcs the file paths if called based on self.directories
:return:
"""
# TODO Finish this
return NotImplemented

def generate_rebinning(self, size):
Expand Down
2 changes: 1 addition & 1 deletion factnn/preprocess/observation_preprocessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def single_processor(self):
data.append([np.fliplr(np.rot90(input_matrix, 3)), energy, zd_deg, az_deg, source_pos_x,
source_pos_y, sky_source_zd, sky_source_az, zd_deg1, az_deg1,
event_num, night, run, cog_x, cog_y])
# need to do the format thing here
# need to do the format thing here, and add auxiliary structure
yield self.format(data)

except Exception as e:
Expand Down

0 comments on commit b810701

Please sign in to comment.