diff --git a/tools/filters/gff/gff_filter_by_attribute.py b/tools/filters/gff/gff_filter_by_attribute.py index 6125df5381bc..cd6600cf9162 100644 --- a/tools/filters/gff/gff_filter_by_attribute.py +++ b/tools/filters/gff/gff_filter_by_attribute.py @@ -7,7 +7,150 @@ import sys from json import loads +from ast import Module, parse, walk +AST_NODE_TYPE_WHITELIST = [ + 'Expr', 'Load', 'Str', 'Num', 'BoolOp', 'Compare', 'And', 'Eq', 'NotEq', + 'Or', 'GtE', 'LtE', 'Lt', 'Gt', 'BinOp', 'Add', 'Div', 'Sub', 'Mult', 'Mod', + 'Pow', 'LShift', 'GShift', 'BitAnd', 'BitOr', 'BitXor', 'UnaryOp', 'Invert', + 'Not', 'NotIn', 'In', 'Is', 'IsNot', 'List', 'Index', 'Subscript', + 'Name', +] + + +BUILTIN_AND_MATH_FUNCTIONS = 'abs|all|any|bin|chr|cmp|complex|divmod|float|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor'.split('|') +STRING_AND_LIST_METHODS = [ name for name in dir('') + dir([]) if not name.startswith('_') ] +VALID_FUNCTIONS = BUILTIN_AND_MATH_FUNCTIONS + STRING_AND_LIST_METHODS +# Name blacklist isn't strictly needed - but provides extra peace of mind. +NAME_BLACKLIST = ["exec", "eval", "globals", "locals", "__import__", "__builtins__"] + + +def __check_name( ast_node ): + name = ast_node.id + return name not in NAME_BLACKLIST + + +def check_simple_name( text ): + """ + + >>> check_simple_name("col_name") + True + >>> check_simple_name("c1=='chr1' and c3-c2>=2000 and c6=='+'") + False + >>> check_simple_name("eval('1+1')") + False + >>> check_simple_name("import sys") + False + >>> check_simple_name("[].__str__") + False + >>> check_simple_name("__builtins__") + False + >>> check_simple_name("'x' in globals") + False + >>> check_simple_name("'x' in [1,2,3]") + False + >>> check_simple_name("c3=='chr1' and c5>5") + False + >>> check_simple_name("c3=='chr1' and d5>5") + False + >>> check_simple_name("c3=='chr1' and c5>5 or exec") + False + >>> check_simple_name("type(c1) != type(1)") + False + >>> check_simple_name("c1.split(',')[1] == '1'") + False + >>> check_simple_name("exec 1") + False + >>> check_simple_name("str(c2) in [\\\"a\\\",\\\"b\\\"]") + False + >>> check_simple_name("__import__('os').system('touch /tmp/OOPS')") + False + """ + try: + module = parse( text ) + except SyntaxError: + return False + + if not isinstance(module, Module): + return False + statements = module.body + if not len( statements ) == 1: + return False + expression = statements[0] + if expression.__class__.__name__ != 'Expr': + return False + + for ast_node in walk( expression ): + ast_node_class = ast_node.__class__.__name__ + if ast_node_class not in ["Expr", "Name", "Load"]: + return False + + if ast_node_class == "Name" and not __check_name(ast_node): + return False + + return True + + +def check_expression( text ): + """ + + >>> check_expression("c1=='chr1' and c3-c2>=2000 and c6=='+'") + True + >>> check_expression("eval('1+1')") + False + >>> check_expression("import sys") + False + >>> check_expression("[].__str__") + False + >>> check_expression("__builtins__") + False + >>> check_expression("'x' in globals") + False + >>> check_expression("'x' in [1,2,3]") + True + >>> check_expression("c3=='chr1' and c5>5") + True + >>> check_expression("c3=='chr1' and d5>5") + True + >>> check_expression("c3=='chr1' and c5>5 or exec") + False + >>> check_expression("type(c1) != type(1)") + False + >>> check_expression("c1.split(',')[1] == '1'") + False + >>> check_expression("exec 1") + False + >>> check_expression("str(c2) in [\\\"a\\\",\\\"b\\\"]") + False + >>> check_expression("__import__('os').system('touch /tmp/OOPS')") + False + """ + try: + module = parse( text ) + except SyntaxError: + return False + + if not isinstance(module, Module): + return False + statements = module.body + if not len( statements ) == 1: + return False + expression = statements[0] + if expression.__class__.__name__ != 'Expr': + return False + + for ast_node in walk( expression ): + ast_node_class = ast_node.__class__.__name__ + + # Toss out everything that is not a "simple" expression, + # imports, error handling, etc... + if ast_node_class not in AST_NODE_TYPE_WHITELIST: + return False + + if ast_node_class == "Name" and not __check_name(ast_node): + return False + + return True # # Helper functions. @@ -50,6 +193,8 @@ def check_for_executable( text, description='' ): # Convert types from str to type objects. for name, a_type in attribute_types.items(): check_for_executable(a_type) + if not check_simple_name( a_type ): + stop_err("Problem with attribute type [%s]" % a_type) attribute_types[ name ] = eval( a_type ) # Unescape if input has been escaped @@ -69,6 +214,9 @@ def check_for_executable( text, description='' ): # Attempt to determine if the condition includes executable stuff and, if so, exit. check_for_executable( cond_text, 'condition') +if not check_expression(cond_text): + stop_err( "Illegal/invalid in condition '%s'" % ( cond_text ) ) + # Prepare the column variable names and wrappers for column data types. Only # prepare columns up to largest column in condition. attrs, type_casts = [], [] diff --git a/tools/filters/gff/gff_filter_by_attribute.xml b/tools/filters/gff/gff_filter_by_attribute.xml index 42571fee3200..2ef27f4558e0 100644 --- a/tools/filters/gff/gff_filter_by_attribute.xml +++ b/tools/filters/gff/gff_filter_by_attribute.xml @@ -1,4 +1,4 @@ - + using simple expressions gff_filter_by_attribute.py $input $out_file1 "$cond" '${input.metadata.attribute_types}' diff --git a/tools/filters/gff/gff_filter_by_feature_count.py b/tools/filters/gff/gff_filter_by_feature_count.py index 4b824d672476..778527972bcb 100644 --- a/tools/filters/gff/gff_filter_by_feature_count.py +++ b/tools/filters/gff/gff_filter_by_feature_count.py @@ -10,6 +10,90 @@ from bx.intervals.io import GenomicInterval from galaxy.datatypes.util.gff_util import GFFReaderWrapper +from ast import Module, parse, walk + +AST_NODE_TYPE_WHITELIST = [ + 'Expr', 'Load', 'Str', 'Num', 'BoolOp', 'Compare', 'And', 'Eq', 'NotEq', + 'Or', 'GtE', 'LtE', 'Lt', 'Gt', 'BinOp', 'Add', 'Div', 'Sub', 'Mult', 'Mod', + 'Pow', 'LShift', 'GShift', 'BitAnd', 'BitOr', 'BitXor', 'UnaryOp', 'Invert', + 'Not', 'NotIn', 'In', 'Is', 'IsNot', 'List', 'Index', 'Subscript', + 'Name', +] + + +BUILTIN_AND_MATH_FUNCTIONS = 'abs|all|any|bin|chr|cmp|complex|divmod|float|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor'.split('|') +STRING_AND_LIST_METHODS = [ name for name in dir('') + dir([]) if not name.startswith('_') ] +VALID_FUNCTIONS = BUILTIN_AND_MATH_FUNCTIONS + STRING_AND_LIST_METHODS +# Name blacklist isn't strictly needed - but provides extra peace of mind. +NAME_BLACKLIST = ["exec", "eval", "globals", "locals", "__import__", "__builtins__"] + + +def __check_name( ast_node ): + name = ast_node.id + return name not in NAME_BLACKLIST + + +def check_expression( text ): + """ + + >>> check_expression("c1=='chr1' and c3-c2>=2000 and c6=='+'") + True + >>> check_expression("eval('1+1')") + False + >>> check_expression("import sys") + False + >>> check_expression("[].__str__") + False + >>> check_expression("__builtins__") + False + >>> check_expression("'x' in globals") + False + >>> check_expression("'x' in [1,2,3]") + True + >>> check_expression("c3=='chr1' and c5>5") + True + >>> check_expression("c3=='chr1' and d5>5") + True + >>> check_expression("c3=='chr1' and c5>5 or exec") + False + >>> check_expression("type(c1) != type(1)") + False + >>> check_expression("c1.split(',')[1] == '1'") + False + >>> check_expression("exec 1") + False + >>> check_expression("str(c2) in [\\\"a\\\",\\\"b\\\"]") + False + >>> check_expression("__import__('os').system('touch /tmp/OOPS')") + False + """ + try: + module = parse( text ) + except SyntaxError: + return False + + if not isinstance(module, Module): + return False + statements = module.body + if not len( statements ) == 1: + return False + expression = statements[0] + if expression.__class__.__name__ != 'Expr': + return False + + for ast_node in walk( expression ): + ast_node_class = ast_node.__class__.__name__ + + # Toss out everything that is not a "simple" expression, + # imports, error handling, etc... + if ast_node_class not in AST_NODE_TYPE_WHITELIST: + return False + + if ast_node_class == "Name" and not __check_name(ast_node): + return False + + return True + # Valid operators, ordered so that complex operators (e.g. '>=') are # recognized before simple operators (e.g. '>') @@ -69,7 +153,12 @@ def __main__(): for interval in feature.intervals: if interval.feature == feature_name: count += 1 - if eval( '%s %s' % ( count, condition ) ): + eval_text = '%s %s' % ( count, condition ) + if not check_expression(eval_text): + print("Invalid condition: %s, cannot filter." % condition, file=sys.stderr) + sys.exit(1) + + if eval(eval_text): # Keep feature. for interval in feature.intervals: out.write( "\t".join(interval.fields) + '\n' ) diff --git a/tools/filters/gff/gff_filter_by_feature_count.xml b/tools/filters/gff/gff_filter_by_feature_count.xml index 75886432fb97..586103641996 100644 --- a/tools/filters/gff/gff_filter_by_feature_count.xml +++ b/tools/filters/gff/gff_filter_by_feature_count.xml @@ -1,4 +1,4 @@ - + using simple expressions gff_filter_by_feature_count.py $input_file1 $out_file1 "$feature_name" "$cond"