Skip to content

Commit

Permalink
Update avocado_predict to be able to process chunks sequentially
Browse files Browse the repository at this point in the history
  • Loading branch information
kboone committed May 20, 2019
1 parent 12487a1 commit de9a6d2
Showing 1 changed file with 37 additions and 21 deletions.
58 changes: 37 additions & 21 deletions scripts/avocado_predict
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,30 @@
"""Generate predictions for a dataset using avocado."""

import argparse
from tqdm import tqdm

import avocado


def process_chunk(classifier, chunk, args, verbose=True):
    """Generate and write predictions for a single chunk of a dataset.

    Parameters
    ----------
    classifier : avocado classifier
        The trained classifier to generate predictions with.
    chunk : int
        The chunk index of the dataset to process (0-based, in the range
        ``[0, args.num_chunks)``).
    args : argparse.Namespace
        Parsed command-line arguments; ``args.dataset`` and
        ``args.num_chunks`` are read here.
    verbose : bool, optional
        If True (default), print progress messages for each stage. Set to
        False when processing many chunks in a loop with an outer progress
        bar.

    Returns
    -------
    The predictions produced by ``dataset.predict`` (returned so callers
    can use them programmatically; existing callers that ignore the return
    value are unaffected).
    """
    # Load only this chunk's metadata and its precomputed raw features.
    if verbose:
        print("Loading dataset...")
    dataset = avocado.load(args.dataset, metadata_only=True, chunk=chunk,
                           num_chunks=args.num_chunks)
    dataset.load_raw_features()

    # Generate predictions for the chunk.
    if verbose:
        print("Generating predictions...")
    predictions = dataset.predict(classifier)

    # Write the predictions to disk.
    # NOTE(review): write_predictions() takes no arguments, so it
    # presumably reads the predictions cached on the dataset by
    # predict() — confirm against the avocado Dataset API.
    if verbose:
        print("Writing out predictions...")
    dataset.write_predictions()

    return predictions


if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
Expand All @@ -17,37 +37,33 @@ if __name__ == "__main__":
help='Name of the classifier to use.'
)
parser.add_argument(
'--chunk',
'--num_chunks',
type=int,
default=None,
help='If set, process the dataset by chunks. This sets the chunk '
'number of the dataset to use.',
default=100,
help='The dataset will be processed in chunks to avoid loading all of '
'the data at once. This sets the total number of chunks to use. '
'(default: %(default)s)',
)
parser.add_argument(
'--num_chunks',
'--chunk',
type=int,
default=100,
help='If chunk is set, this is the total number of chunks to use for '
'processing the dataset. (default: %(default)s)',
default=None,
help='If set, only process this chunk of the dataset. This is '
'intended to be used to split processing into multiple jobs.'
)

args = parser.parse_args()

# Load the dataset
print("Loading dataset...")
dataset = avocado.load(args.dataset, metadata_only=True, chunk=args.chunk,
num_chunks=args.num_chunks)
dataset.load_raw_features()

# Load the classifier
classifier = avocado.load_classifier(args.classifier)

# Generate predictions.
print("Generating predictions...")
predictions = dataset.predict(classifier)

# Write the predictions to disk.
print("Writing out predictions...")
dataset.write_predictions()
if args.chunk:
# Process a single chunk
process_chunk(classifier, args.chunk, args)
else:
# Process all chunks
print("Processing the dataset in %d chunks..." % args.num_chunks)
for chunk in tqdm(range(args.num_chunks)):
process_chunk(classifier, chunk, args, verbose=False)

print("Done!")

0 comments on commit de9a6d2

Please sign in to comment.