Skip to content

Commit

Permalink
Update avocado_predict to be able to process chunks sequentially
Browse files Browse the repository at this point in the history
  • Loading branch information
kboone committed May 20, 2019
1 parent 12487a1 commit de9a6d2
Showing 1 changed file with 37 additions and 21 deletions.
58 changes: 37 additions & 21 deletions scripts/avocado_predict
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,30 @@
"""Generate predictions for a dataset using avocado."""

import argparse
from tqdm import tqdm

import avocado


def process_chunk(classifier, chunk, args, verbose=True):
    """Generate and write predictions for a single chunk of a dataset.

    Parameters
    ----------
    classifier : avocado classifier
        The trained classifier to generate predictions with.
    chunk : int
        The chunk index of the dataset to process (0-based, in the range
        ``[0, args.num_chunks)``).
    args : argparse.Namespace
        Parsed command-line arguments; ``args.dataset`` and
        ``args.num_chunks`` are read here.
    verbose : bool, optional
        If True (default), print progress messages for each stage. Set to
        False when processing many chunks in a loop with an outer progress
        bar.

    Returns
    -------
    The predictions produced by ``dataset.predict`` (returned so callers
    can use them programmatically; existing callers that ignore the return
    value are unaffected).
    """
    # Load only this chunk's metadata and its precomputed raw features.
    if verbose:
        print("Loading dataset...")
    dataset = avocado.load(args.dataset, metadata_only=True, chunk=chunk,
                           num_chunks=args.num_chunks)
    dataset.load_raw_features()

    # Generate predictions for the chunk.
    if verbose:
        print("Generating predictions...")
    predictions = dataset.predict(classifier)

    # Write the predictions to disk.
    # NOTE(review): write_predictions() takes no arguments, so it
    # presumably reads the predictions cached on the dataset by
    # predict() — confirm against the avocado Dataset API.
    if verbose:
        print("Writing out predictions...")
    dataset.write_predictions()

    return predictions


if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
Expand All @@ -17,37 +37,33 @@ if __name__ == "__main__":
help='Name of the classifier to use.'
)
parser.add_argument(
'--chunk',
'--num_chunks',
type=int,
default=None,
help='If set, process the dataset by chunks. This sets the chunk '
'number of the dataset to use.',
default=100,
help='The dataset will be processed in chunks to avoid loading all of '
'the data at once. This sets the total number of chunks to use. '
'(default: %(default)s)',
)
parser.add_argument(
'--num_chunks',
'--chunk',
type=int,
default=100,
help='If chunk is set, this is the total number of chunks to use for '
'processing the dataset. (default: %(default)s)',
default=None,
help='If set, only process this chunk of the dataset. This is '
'intended to be used to split processing into multiple jobs.'
)

args = parser.parse_args()

# Load the dataset
print("Loading dataset...")
dataset = avocado.load(args.dataset, metadata_only=True, chunk=args.chunk,
num_chunks=args.num_chunks)
dataset.load_raw_features()

# Load the classifier
classifier = avocado.load_classifier(args.classifier)

# Generate predictions.
print("Generating predictions...")
predictions = dataset.predict(classifier)

# Write the predictions to disk.
print("Writing out predictions...")
dataset.write_predictions()
if args.chunk:
# Process a single chunk
process_chunk(classifier, args.chunk, args)
else:
# Process all chunks
print("Processing the dataset in %d chunks..." % args.num_chunks)
for chunk in tqdm(range(args.num_chunks)):
process_chunk(classifier, chunk, args, verbose=False)

print("Done!")

0 comments on commit de9a6d2

Please sign in to comment.