From c1e3087ca35dbca1b0328954fe4769d666d3f934 Mon Sep 17 00:00:00 2001
From: Nate Coraor <nate@bx.psu.edu>
Date: Thu, 15 Dec 2016 12:34:18 -0500
Subject: [PATCH] Fix an ACE vulnerability with gff filter tools. Thanks to
 David Wyde for the disclosure.

---
 tools/filters/gff/gff_filter_by_attribute.py  | 152 ++++++++++++++++++
 tools/filters/gff/gff_filter_by_attribute.xml |   2 +-
 .../gff/gff_filter_by_feature_count.py        |  93 ++++++++++-
 .../gff/gff_filter_by_feature_count.xml       |   2 +-
 4 files changed, 246 insertions(+), 3 deletions(-)

diff --git a/tools/filters/gff/gff_filter_by_attribute.py b/tools/filters/gff/gff_filter_by_attribute.py
index 6b8a083ae10d..2eafa251987a 100644
--- a/tools/filters/gff/gff_filter_by_attribute.py
+++ b/tools/filters/gff/gff_filter_by_attribute.py
@@ -6,6 +6,7 @@
 
 from __future__ import division
 import sys
+
 from galaxy import eggs
 from galaxy.util.json import dumps, loads
 
@@ -17,6 +18,152 @@
 
 assert sys.version_info[:2] >= ( 2, 4 )
 
+from json import loads
+from ast import Module, parse, walk
+
+AST_NODE_TYPE_WHITELIST = [
+    'Expr', 'Load', 'Str', 'Num', 'BoolOp', 'Compare', 'And', 'Eq', 'NotEq',
+    'Or', 'GtE', 'LtE', 'Lt', 'Gt', 'BinOp', 'Add', 'Div', 'Sub', 'Mult', 'Mod',
+    'Pow', 'LShift', 'GShift', 'BitAnd', 'BitOr', 'BitXor', 'UnaryOp', 'Invert',
+    'Not', 'NotIn', 'In', 'Is', 'IsNot', 'List', 'Index', 'Subscript',
+    'Name',
+]
+
+
+BUILTIN_AND_MATH_FUNCTIONS = 'abs|all|any|bin|chr|cmp|complex|divmod|float|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor'.split('|')
+STRING_AND_LIST_METHODS = [ name for name in dir('') + dir([]) if not name.startswith('_') ]
+VALID_FUNCTIONS = BUILTIN_AND_MATH_FUNCTIONS + STRING_AND_LIST_METHODS
+# Name blacklist isn't strictly needed - but provides extra peace of mind.
+NAME_BLACKLIST = ["exec", "eval", "globals", "locals", "__import__", "__builtins__"]
+
+
+def __check_name( ast_node ):
+    name = ast_node.id
+    return name not in NAME_BLACKLIST
+
+
+def check_simple_name( text ):
+    """
+
+    >>> check_simple_name("col_name")
+    True
+    >>> check_simple_name("c1=='chr1' and c3-c2>=2000 and c6=='+'")
+    False
+    >>> check_simple_name("eval('1+1')")
+    False
+    >>> check_simple_name("import sys")
+    False
+    >>> check_simple_name("[].__str__")
+    False
+    >>> check_simple_name("__builtins__")
+    False
+    >>> check_simple_name("'x' in globals")
+    False
+    >>> check_simple_name("'x' in [1,2,3]")
+    False
+    >>> check_simple_name("c3=='chr1' and c5>5")
+    False
+    >>> check_simple_name("c3=='chr1' and d5>5")
+    False
+    >>> check_simple_name("c3=='chr1' and c5>5 or exec")
+    False
+    >>> check_simple_name("type(c1) != type(1)")
+    False
+    >>> check_simple_name("c1.split(',')[1] == '1'")
+    False
+    >>> check_simple_name("exec 1")
+    False
+    >>> check_simple_name("str(c2) in [\\\"a\\\",\\\"b\\\"]")
+    False
+    >>> check_simple_name("__import__('os').system('touch /tmp/OOPS')")
+    False
+    """
+    try:
+        module = parse( text )
+    except SyntaxError:
+        return False
+
+    if not isinstance(module, Module):
+        return False
+    statements = module.body
+    if not len( statements ) == 1:
+        return False
+    expression = statements[0]
+    if expression.__class__.__name__ != 'Expr':
+        return False
+
+    for ast_node in walk( expression ):
+        ast_node_class = ast_node.__class__.__name__
+        if ast_node_class not in ["Expr", "Name", "Load"]:
+            return False
+
+        if ast_node_class == "Name" and not __check_name(ast_node):
+            return False
+
+    return True
+
+
+def check_expression( text ):
+    """
+
+    >>> check_expression("c1=='chr1' and c3-c2>=2000 and c6=='+'")
+    True
+    >>> check_expression("eval('1+1')")
+    False
+    >>> check_expression("import sys")
+    False
+    >>> check_expression("[].__str__")
+    False
+    >>> check_expression("__builtins__")
+    False
+    >>> check_expression("'x' in globals")
+    False
+    >>> check_expression("'x' in [1,2,3]")
+    True
+    >>> check_expression("c3=='chr1' and c5>5")
+    True
+    >>> check_expression("c3=='chr1' and d5>5")
+    True
+    >>> check_expression("c3=='chr1' and c5>5 or exec")
+    False
+    >>> check_expression("type(c1) != type(1)")
+    False
+    >>> check_expression("c1.split(',')[1] == '1'")
+    False
+    >>> check_expression("exec 1")
+    False
+    >>> check_expression("str(c2) in [\\\"a\\\",\\\"b\\\"]")
+    False
+    >>> check_expression("__import__('os').system('touch /tmp/OOPS')")
+    False
+    """
+    try:
+        module = parse( text )
+    except SyntaxError:
+        return False
+
+    if not isinstance(module, Module):
+        return False
+    statements = module.body
+    if not len( statements ) == 1:
+        return False
+    expression = statements[0]
+    if expression.__class__.__name__ != 'Expr':
+        return False
+
+    for ast_node in walk( expression ):
+        ast_node_class = ast_node.__class__.__name__
+
+        # Toss out everything that is not a "simple" expression,
+        # imports, error handling, etc...
+        if ast_node_class not in AST_NODE_TYPE_WHITELIST:
+            return False
+
+        if ast_node_class == "Name" and not __check_name(ast_node):
+            return False
+
+    return True
+
 #
 # Helper functions.
 #
@@ -57,6 +204,8 @@ def check_for_executable( text, description='' ):
 # Convert types from str to type objects.
 for name, a_type in attribute_types.items():
     check_for_executable(a_type)
+    if not check_simple_name( a_type ):
+        stop_err("Problem with attribute type [%s]" % a_type)
     attribute_types[ name ] = eval( a_type )
 
 # Unescape if input has been escaped
@@ -76,6 +225,9 @@ def check_for_executable( text, description='' ):
 # Attempt to determine if the condition includes executable stuff and, if so, exit.
 check_for_executable( cond_text, 'condition')
 
+if not check_expression(cond_text):
+    stop_err( "Illegal/invalid in condition '%s'" % ( cond_text ) )
+
 # Prepare the column variable names and wrappers for column data types. Only
 # prepare columns up to largest column in condition.
 attrs, type_casts = [], []
diff --git a/tools/filters/gff/gff_filter_by_attribute.xml b/tools/filters/gff/gff_filter_by_attribute.xml
index 4e64b84126ea..32a7376687b0 100644
--- a/tools/filters/gff/gff_filter_by_attribute.xml
+++ b/tools/filters/gff/gff_filter_by_attribute.xml
@@ -1,4 +1,4 @@
-<tool id="gff_filter_by_attribute" name="Filter GFF data by attribute" version="0.1">
+<tool id="gff_filter_by_attribute" name="Filter GFF data by attribute" version="0.1.1">
   <description>using simple expressions</description>
   <command interpreter="python">
     gff_filter_by_attribute.py $input $out_file1 "$cond" '${input.metadata.attribute_types}'
diff --git a/tools/filters/gff/gff_filter_by_feature_count.py b/tools/filters/gff/gff_filter_by_feature_count.py
index 6dd648fe4dea..140d5b8f9770 100644
--- a/tools/filters/gff/gff_filter_by_feature_count.py
+++ b/tools/filters/gff/gff_filter_by_feature_count.py
@@ -8,8 +8,94 @@
 import sys
 from galaxy import eggs
 from galaxy.datatypes.util.gff_util import GFFReaderWrapper
+
 from bx.intervals.io import GenomicInterval
 
+from ast import Module, parse, walk
+
+AST_NODE_TYPE_WHITELIST = [
+    'Expr', 'Load', 'Str', 'Num', 'BoolOp', 'Compare', 'And', 'Eq', 'NotEq',
+    'Or', 'GtE', 'LtE', 'Lt', 'Gt', 'BinOp', 'Add', 'Div', 'Sub', 'Mult', 'Mod',
+    'Pow', 'LShift', 'GShift', 'BitAnd', 'BitOr', 'BitXor', 'UnaryOp', 'Invert',
+    'Not', 'NotIn', 'In', 'Is', 'IsNot', 'List', 'Index', 'Subscript',
+    'Name',
+]
+
+
+BUILTIN_AND_MATH_FUNCTIONS = 'abs|all|any|bin|chr|cmp|complex|divmod|float|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor'.split('|')
+STRING_AND_LIST_METHODS = [ name for name in dir('') + dir([]) if not name.startswith('_') ]
+VALID_FUNCTIONS = BUILTIN_AND_MATH_FUNCTIONS + STRING_AND_LIST_METHODS
+# Name blacklist isn't strictly needed - but provides extra peace of mind.
+NAME_BLACKLIST = ["exec", "eval", "globals", "locals", "__import__", "__builtins__"]
+
+
+def __check_name( ast_node ):
+    name = ast_node.id
+    return name not in NAME_BLACKLIST
+
+
+def check_expression( text ):
+    """
+
+    >>> check_expression("c1=='chr1' and c3-c2>=2000 and c6=='+'")
+    True
+    >>> check_expression("eval('1+1')")
+    False
+    >>> check_expression("import sys")
+    False
+    >>> check_expression("[].__str__")
+    False
+    >>> check_expression("__builtins__")
+    False
+    >>> check_expression("'x' in globals")
+    False
+    >>> check_expression("'x' in [1,2,3]")
+    True
+    >>> check_expression("c3=='chr1' and c5>5")
+    True
+    >>> check_expression("c3=='chr1' and d5>5")
+    True
+    >>> check_expression("c3=='chr1' and c5>5 or exec")
+    False
+    >>> check_expression("type(c1) != type(1)")
+    False
+    >>> check_expression("c1.split(',')[1] == '1'")
+    False
+    >>> check_expression("exec 1")
+    False
+    >>> check_expression("str(c2) in [\\\"a\\\",\\\"b\\\"]")
+    False
+    >>> check_expression("__import__('os').system('touch /tmp/OOPS')")
+    False
+    """
+    try:
+        module = parse( text )
+    except SyntaxError:
+        return False
+
+    if not isinstance(module, Module):
+        return False
+    statements = module.body
+    if not len( statements ) == 1:
+        return False
+    expression = statements[0]
+    if expression.__class__.__name__ != 'Expr':
+        return False
+
+    for ast_node in walk( expression ):
+        ast_node_class = ast_node.__class__.__name__
+
+        # Toss out everything that is not a "simple" expression,
+        # imports, error handling, etc...
+        if ast_node_class not in AST_NODE_TYPE_WHITELIST:
+            return False
+
+        if ast_node_class == "Name" and not __check_name(ast_node):
+            return False
+
+    return True
+
+
 # Valid operators, ordered so that complex operators (e.g. '>=') are
 # recognized before simple operators (e.g. '>')
 ops = [
@@ -68,7 +154,12 @@ def __main__():
         for interval in feature.intervals:
             if interval.feature == feature_name:
                 count += 1
-        if eval( '%s %s' % ( count, condition ) ):
+        eval_text = '%s %s' % ( count, condition )
+        if not check_expression(eval_text):
+            print >> sys.stderr, "Invalid condition: %s, cannot filter." % condition
+            sys.exit(1)
+
+        if eval(eval_text):
             # Keep feature.
             for interval in feature.intervals:
                 out.write( "\t".join(interval.fields) + '\n' )
diff --git a/tools/filters/gff/gff_filter_by_feature_count.xml b/tools/filters/gff/gff_filter_by_feature_count.xml
index 90fbd87c12fc..fdfc2e297503 100644
--- a/tools/filters/gff/gff_filter_by_feature_count.xml
+++ b/tools/filters/gff/gff_filter_by_feature_count.xml
@@ -1,4 +1,4 @@
-<tool id="gff_filter_by_feature_count" name="Filter GFF data by feature count" version="0.1">
+<tool id="gff_filter_by_feature_count" name="Filter GFF data by feature count" version="0.1.1">
   <description>using simple expressions</description>
   <command interpreter="python">
     gff_filter_by_feature_count.py $input_file1 $out_file1 "$feature_name" "$cond"