Ability to extract data from an arbitrary python function.

Using the importlib functionality in python 2.7/3.1+, we can import a user-specified module and hence run a function to extract the data rather than going via subprocess. If the data analysis code requires importing a large module (e.g. pandas) then this can be *substantially* faster, as the repeated imports are avoided.
jsspencer · Mar 21, 2015 · 2cc7597 · 2cc7597
1 parent 7ae3aa5
commit 2cc7597
Show file tree

Hide file tree

Showing 4 changed files with 75 additions and 13 deletions.
diff --git a/docs/userconfig.rst b/docs/userconfig.rst
@@ -22,7 +22,7 @@ benchmark [string]
 
         $ testcode.py make-benchmarks
 
-    The format of the benchmark files is'benchmark.out.ID.inp=INPUT_FILE.arg=ARGS'.  
+    The format of the benchmark files is 'benchmark.out.ID.inp=INPUT_FILE.arg=ARGS'.
     The 'inp' and/or 'arg' section is not included if it is empty.
 
     Multiple benchmarks can be used by providing a space-separated list of IDs.  The first
@@ -57,8 +57,15 @@ ignore_fields [space-separated list of strings]
     tests.  No default.
 exe [string]
     Path to the program executable.  No default.
+extract_fn [string]
+    A python function (in the form module_name.function_name) which extracts
+    data from test and benchmark outputs for comparison.  See :ref:`verification`
+    for details.  If a space-separated pair of strings is given, the first is
+    appended to sys.path before the module is imported.  Otherwise the desired
+    module **must** exist on PYTHONPATH.  The feature requires python 2.7 or
+    python 3.1+.
 extract_args [string]
-    Arguments to supply to the extraction program.  Default: null string. 
+    Arguments to supply to the extraction program.  Default: null string.
 extract_cmd_template [string]
     Template of command used to extract data from output(s) with the following
     substitutions made:
@@ -150,8 +157,9 @@ vcs [string]
     requested interactively when benchmarks are produced.  Default: None.
 
 Most settings are optional and need only be set if certain functionality is
-required or the default is not appropriate.  Note that either data_tag or
-extract_program must be supplied.
+required or the default is not appropriate.  Note that at least one of data_tag,
+extract_fn or extract_program must be supplied; if more than one is set, they
+are used in that order of precedence.
 
 In addition, the following variables are used, if present, as default settings
 for all tests of this type:
@@ -163,7 +171,7 @@ for all tests of this type:
 * output (no default)
 * run_concurrent (default: false)
 * submit_template
- 
+
 See :ref:`jobconfig` for more details.
 
 All other settings are assumed to be paths to other versions of the program

diff --git a/docs/verification.rst b/docs/verification.rst
@@ -7,7 +7,7 @@ testcode compares selected data from an output with previously obtained output
 (the 'benchmark'); a test passes if all data is within a desired tolerance.
 The data can be compared using an absolute tolerance and/or a relative
 tolerance.  testcode needs some way of knowing what data from the output files
-should be validated.  There are three options.
+should be validated.  There are four options.
 
 * label output with a 'data tag'
 
@@ -24,6 +24,31 @@ should be validated.  There are three options.
   item; lines with identical text but different values are handled but it is
   assumed that such lines always come in the same (relative) order.
 
+* user-supplied data extraction python function
+
+  An arbitrary python module can be imported and a function contained in the
+  module called with a test or benchmark output filename as its sole argument.
+  The function must return the extracted data from the output file as a python
+  dict with keys labelling each data item (corresponding to the keys used for
+  setting tolerances) and lists or tuples as values containing the data to be
+  compared.  For example::
+
+      {
+        'val 1': [1.2, 8.7],
+        'val 2': [2, 4],
+        'val 3': [3.32, 17.2],
+      }
+
+  Each entry need not contain the same amount of data::
+
+      {
+        'val 1': [1.2, 8.7],
+        'val 2': [2, 4],
+        'val 3': [3.32, 17.2],
+        'val 4': [11.22],
+        'val 5': [221.0],
+      }
+
 * user-supplied data extraction program
 
   An external program can be used to extract data from the test and benchmark

diff --git a/lib/testcode2/__init__.py b/lib/testcode2/__init__.py
@@ -14,13 +14,20 @@
 import shutil
 import subprocess
 import sys
+import warnings
 
 try:
     import yaml
     _HAVE_YAML = True
 except ImportError:
     _HAVE_YAML = False
 
+try:
+    import importlib
+    _HAVE_IMPORTLIB_ = True
+except ImportError:
+    _HAVE_IMPORTLIB_ = False
+
 import testcode2.dir_lock as dir_lock
 import testcode2.exceptions as exceptions
 import testcode2.queues  as queues
@@ -74,6 +81,7 @@ def __init__(self, name, exe, test_id, benchmark, **kwargs):
         self.skip_program = None
         self.skip_args = ''
         self.verify = False
+        self.extract_fn = None
 
         # Info
         self.vcs = None
@@ -87,6 +95,21 @@ def __init__(self, name, exe, test_id, benchmark, **kwargs):
         if self.verify and 'extract_cmd_template' not in kwargs:
             self.extract_cmd_template = 'tc.extract tc.args tc.test tc.bench'
 
+        if self.extract_fn:
+            if _HAVE_IMPORTLIB_:
+                self.extract_fn = self.extract_fn.split()
+                if len(self.extract_fn) == 2:
+                    sys.path.append(self.extract_fn[0])
+                (mod, fn) = self.extract_fn[-1].rsplit('.', 1)
+                mod = importlib.import_module(mod)
+                self.extract_fn = mod.__getattribute__(fn)
+            elif self.extract_program:
+                warnings.warn('importlib not available.  Will attempt to '
+                              'analyse data via an external script.')
+            else:
+                raise exceptions.TestCodeError('importlib not available and '
+                              'no data extraction program supplied.')
+
         # Can we actually extract the data?
         if self.extract_fmt == 'yaml' and not _HAVE_YAML:
             err = 'YAML data format cannot be used: PyYAML is not installed.'
@@ -530,18 +553,24 @@ def extract_data(self, input_file, args, verbose=1):
 
 Assume function is executed in self.path.'''
         tp_ptr = self.test_program
+        data_files = [
+                      tp_ptr.select_benchmark_file(self.path, input_file, args),
+                      util.testcode_filename(FILESTEM['test'],
+                      tp_ptr.test_id, input_file, args),
+                     ]
         if tp_ptr.data_tag:
             # Using internal data extraction function.
-            data_files = [
-                    tp_ptr.select_benchmark_file(self.path, input_file, args),
-                    util.testcode_filename(FILESTEM['test'],
-                            tp_ptr.test_id, input_file, args),
-                         ]
             if verbose > 2:
                 print('Analysing output using data_tag %s in %s on files %s.' %
                         (tp_ptr.data_tag, self.path, ' and '.join(data_files)))
             outputs = [util.extract_tagged_data(tp_ptr.data_tag, dfile)
                     for dfile in data_files]
+        elif tp_ptr.extract_fn:
+            if verbose > 2:
+                print('Analysing output using function %s in %s on files %s.' %
+                        (tp_ptr.extract_fn.__name__, self.path,
+                         ' and '.join(data_files)))
+            outputs = [tp_ptr.extract_fn(dfile) for dfile in data_files]
         else:
             # Using external data extraction script.
             # Get extraction commands.

diff --git a/lib/testcode2/config.py b/lib/testcode2/config.py
@@ -103,8 +103,8 @@ def parse_userconfig(config_file, executables=None, test_id=None,
 
     test_program_options = ('run_cmd_template',
         'launch_parallel', 'ignore_fields', 'data_tag', 'extract_cmd_template',
-        'extract_program', 'extract_args', 'extract_fmt', 'verify', 'vcs',
-        'skip_program', 'skip_args', 'skip_cmd_template')
+        'extract_fn', 'extract_program', 'extract_args', 'extract_fmt',
+        'verify', 'vcs', 'skip_program', 'skip_args', 'skip_cmd_template')
     default_test_options = ('inputs_args', 'output', 'nprocs',
         'min_nprocs', 'max_nprocs', 'submit_template',)
     test_programs = {}