Showing 4 changed files with 183 additions and 4 deletions.
@@ -0,0 +1,59 @@
#!/usr/bin/env python
"""Featurize a dataset using avocado"""

import argparse

import avocado


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        'dataset',
        help='Name of the dataset to featurize.'
    )
    parser.add_argument(
        '--chunk',
        type=int,
        default=None,
        help='If set, process the dataset by chunks. This sets the chunk '
        'number of the dataset to use.',
    )
    parser.add_argument(
        '--num_chunks',
        type=int,
        default=100,
        help='If chunk is set, this is the total number of chunks to use for '
        'processing the dataset. (default: %(default)s)',
    )
    parser.add_argument(
        '--tag',
        default=avocado.settings['features_tag'],
        help='The tag to use for these features. The default is set in '
        'avocado_settings.json. (default: %(default)s)',
    )

    args = parser.parse_args()

    # Load the reference dataset
    print("Loading dataset...")
    dataset = avocado.load(
        args.dataset,
        chunk=args.chunk,
        num_chunks=args.num_chunks,
    )

    # Load the featurizer. For now, we only have the PLAsTiCC featurizer
    # although this could be an option in the future.
    print("Loading featurizer...")
    featurizer = avocado.plasticc.PlasticcFeaturizer()

    # Featurize the dataset
    print("Featurizing the dataset...")
    dataset.extract_raw_features(featurizer)

    # Save the features.
    print("Saving the features...")
    dataset.write_raw_features(tag=args.tag)

    print("Done!")
@@ -0,0 +1,115 @@
#!/usr/bin/env python
"""Submit jobs to an SGE queue to featurize a dataset using avocado

This requires that avocado be installed and that the avocado_featurize script
is on the PATH.
"""

import argparse
import os
import subprocess

import avocado


sge_template = """
#!/bin/bash
#$ -V
#$ -S /bin/bash
#$ -N {job_name}
#$ -o {jobs_directory}/{job_name}.out
#$ -e {jobs_directory}/{job_name}.err

# Use a single core for each job. This parallelizes better than trying to use
# multiple cores per job.
export MKL_NUM_THREADS=1
export NUMEXPR_NUM_THREADS=1
export OMP_NUM_THREADS=1

cd {working_directory}

avocado_featurize \\
    {dataset} \\
    --chunk {job} \\
    --num_chunks {num_jobs} \\
    --tag {tag}
"""

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        'dataset',
        help='Name of the dataset to featurize.'
    )
    parser.add_argument(
        '--chunk',
        type=int,
        default=None,
        help='If set, process the dataset by chunks. This sets the chunk '
        'number of the dataset to use.',
    )
    parser.add_argument(
        '--num_jobs',
        type=int,
        default=100,
        help='Number of jobs to submit to process the dataset. '
        '(default: %(default)s)',
    )
    parser.add_argument(
        '--tag',
        default=avocado.settings['features_tag'],
        help='The tag to use for these features. The default is set in '
        'avocado_settings.json. (default: %(default)s)',
    )
    parser.add_argument(
        '--working_directory',
        default=None,
        help='Working directory. Default is the current directory.'
    )
    parser.add_argument(
        '--jobs_directory',
        default=None,
        help='Jobs directory for qsub scripts and output. Default is '
        '"[working_directory]/jobs/featurize_[dataset]/"'
    )
    parser.add_argument(
        '--qsub_arguments',
        default='',
        help='Additional arguments to pass to qsub'
    )

    raw_args = parser.parse_args()

    # Build a dictionary with the arguments that will be used to format the
    # submit script.
    args = vars(raw_args).copy()

    # Update the working directory if it wasn't set.
    if args['working_directory'] is None:
        args['working_directory'] = os.getcwd()

    # Update the jobs directory if it wasn't set, and make sure that it exists.
    if args['jobs_directory'] is None:
        args['jobs_directory'] = os.path.join(
            args['working_directory'], 'jobs', 'featurize_%s' % args['dataset']
        )
    os.makedirs(args['jobs_directory'], exist_ok=True)

    # Create and submit the jobs one by one.
    for job_id in range(args['num_jobs']):
        job_args = args.copy()

        job_args['job'] = job_id

        job_name = 'featurize_%04d_%s' % (job_id, args['dataset'])
        job_args['job_name'] = job_name

        job_path = '{jobs_directory}/{job_name}.sh'.format(**job_args)

        job_template = sge_template.format(**job_args)

        # Write the job script.
        with open(job_path, 'w') as job_file:
            job_file.write(job_template)

        # Submit the job.
        subprocess.call(["qsub"] + args['qsub_arguments'].split() + [job_path])
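To illustrate what the loop above produces (using a hypothetical dataset name of plasticc_train and --qsub_arguments "-q long.q"; neither value comes from this commit), the first iteration writes featurize_0000_plasticc_train.sh into jobs/featurize_plasticc_train/ under the working directory and then issues roughly:

    qsub -q long.q jobs/featurize_plasticc_train/featurize_0000_plasticc_train.sh

Each generated script pins its job to a single core via the MKL, NumExpr, and OpenMP environment variables and runs avocado_featurize on its own chunk, so the dataset is processed by num_jobs independent queue jobs.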