# code-book rules for the "default ruleset for Java, by r2c" 

There are 28 rules in this ruleset. In this notebook, you'll find
parallel implementations of each of the rules for `code-book`. This is
to help me get an idea of what our high-level API should look like, what
kinds of operations we need to support, and where we might be able to
"do better" than Semgrep. It also serves as a nice performance comparison
(eventually, we will need to set Semgrep up on the same data).

In [None]:
!pip3 install regex xxhash

In [None]:
!gandiva-build.sh

In [None]:
%load_ext autoreload
%autoreload 2

In [9]:
from utils.cb.java import *

# Baseline query: calls named 'println', labelled as the match. The trace
# below shows this compiling to a single query_java step over
# method_invocation nodes.
query = (
  call(with_name('println')) % label_as_match()
)

# debug=True presumably turns on the per-step "eval [...]" timing trace that
# appears in this cell's output — TODO confirm against Evaluator.
results = Evaluator(query).evaluate(debug=True)
display_results(results)

eval [0.0000s]: v_1 = query_java('"println"=method_invocation')
eval [6.4480s]: rs_1 = get_results(v_1, [('$match', 'method_invocation')])
eval [6.4717s]: COMPLETE.


In [8]:
from utils.cb.java import *

# SemGrep: httpservlet-path-traversal
#
# Read top-down: an object creation (the match) where some argument is a
# call named 'getParameter' ('the call'), whose receiver is a parameter
# ('the param'), whose type is named 'HttpServletRequest'.
# NOTE(review): this reading assumes each indented |where| constrains the
# node introduced by the |isa| just above it; Python itself parses the whole
# chain as one left-to-right sequence of `|` operators (with `%` binding
# tighter), so the nesting is a DSL convention — confirm.
query = (
 new() % label_as_match()
 |where| any_arg()
   |isa| call(with_name('getParameter')) % label('the call')
   |where| the_receiver()
     |isa| param() % label('the param')
     |where| the_type()
       |isa| type_(with_name('HttpServletRequest')) 
)

# debug=True presumably produces the "eval [...]" trace shown in the cell
# output (query_java / merge_paths / get_results steps) — TODO confirm.
results = Evaluator(query).evaluate(debug=True)
display_results(results)

eval [0.0000s]: v_1 = query_java('(variable_declarator_ref).f_child{=2}object_creation_expression')
eval [5.1818s]: v_2 = query_java('(variable_declarator_ref).f_object{=1}"getParameter"=(method_invocation).f_value{=1}variable_declarator')
eval [7.6971s]: m_3 = merge_paths(v_1, v_2, on=("defs.1", "gids.3"))
eval [7.8131s]: v_4 = query_java('(formal_parameter_ref).f_value{=1}variable_declarator')
eval [8.8734s]: m_5 = merge_paths(m_3, v_4, on=("right.defs.1", "gids.2"))
eval [8.8777s]: v_6 = query_java('"HttpServletRequest"=type_identifier.f_type{=1}formal_parameter')
eval [11.5931s]: m_7 = merge_paths(m_5, v_6, on=("right.defs.1", "gids.2"))
eval [11.5983s]: rs_1 = get_results(m_7, {'left': {'left': {'left': [(None, 'variable_declarator_ref'), ('$match', 'object_creation_expression')], 'right': [(None, 'variable_declarator_ref'), (None, 'method_invocation'), ('the call', 'variable_declarator')]}, 'right': [(None, 'formal_parameter_ref'), ('the param', 'variable_declarator')]}, 'right':

In [None]:
from utils.cb.java import *

# Exploratory query (no SemGrep rule is cited in this cell): a call (the
# match) where some argument is an object creation, labelled 'because (1)'.
query = (
  call() % label_as_match()
  |where| any_arg()
    |isa| new() % label('because (1)')
)

# Evaluated without debug=True, unlike the cells that show an eval trace.
results = Evaluator(query).evaluate()
display_results(results)

In [None]:
from utils.cb.java import *

# SemGrep: servletresponse-writer-xss
#
# A call (the match) constrained on two sides, joined by |and_w|:
#   1. some argument is a 'getParameter' call whose receiver is a parameter
#      whose type is named 'HttpServletRequest';
#   2. the call's own receiver is a 'getWriter' call whose receiver is a
#      parameter whose type is named 'HttpServletResponse'.
# NOTE(review): this cell mixes the wrapped form `the_receiver_is(...)` /
# `any_arg_is(...)` with the infix form `the_receiver() |isa| ...`; they are
# presumably equivalent spellings — confirm.
query = (
  call() % label_as_match()
  |where| any_arg_is(
    call(with_name('getParameter')) % label('arg was')
    |where| the_receiver_is(
      param() % label("arg's receiver was")
      |where| the_type()
        |isa| type_(with_name('HttpServletRequest'))
    )
  )
  |and_w| the_receiver_is(
    call(with_name('getWriter')) % label('receiver was')
    |where| the_receiver()
      |isa| param() % label("receiver's receiver was")
      |where| the_type()
        |isa| type_(with_name('HttpServletResponse'))
  )
)

results = Evaluator(query).evaluate()
display_results(results)


In [10]:
from utils.cb.java import *

# SemGrep: anonymous-ldap-bind
#
# An object creation (the match) whose first argument is a call named 'put'
# ('because 1'), whose first argument is a string ('because 2').
# NOTE(review): the with_text('none') filter is only a trailing comment, so
# as written the string is unconstrained and any string first argument
# qualifies — confirm whether the filter should be enabled.
query = (
  new() % label_as_match()
  |where| the_first_arg_is(
    call(with_name('put')) % label('because 1')
    |where| the_first_arg_is(
      string() % label('because 2') # with_text('none')
    )
  )
)

# debug=True presumably produces the "eval [...]" trace shown in the cell
# output — TODO confirm.
results = Evaluator(query).evaluate(debug=True)
display_results(results)

eval [0.0000s]: v_1 = query_java('(variable_declarator_ref).f_child[1]{=2}object_creation_expression')
eval [3.9122s]: v_2 = query_java('(variable_declarator_ref).f_child[1]{=2}"put"=(method_invocation).f_value{=1}variable_declarator')
eval [6.4315s]: m_3 = merge_paths(v_1, v_2, on=("defs.1", "gids.3"))
eval [6.4782s]: v_4 = query_java('(string_literal).f_value{=1}variable_declarator')
eval [8.3385s]: m_5 = merge_paths(m_3, v_4, on=("right.defs.1", "gids.2"))
eval [8.3744s]: rs_1 = get_results(m_5, {'left': {'left': [(None, 'variable_declarator_ref'), ('$match', 'object_creation_expression')], 'right': [(None, 'variable_declarator_ref'), (None, 'method_invocation'), ('because 1', 'variable_declarator')]}, 'right': [(None, 'string_literal'), ('because 2', 'variable_declarator')]})
eval [8.3758s]: v_6 = query_java('(string_literal).f_child[1]{=2}"put"=(method_invocation).f_value{=1}variable_declarator')
eval [9.2704s]: m_7 = merge_paths(v_1, v_6, on=("defs.1", "gids.3"))
eval [9.3197s]: 

In [None]:
# Exploratory query (no SemGrep rule is cited in this cell): label whatever
# comment() selects as the match. This cell has no import of its own, so it
# presumably relies on `from utils.cb.java import *` from an earlier cell.
query = (comment() % label_as_match())
results = Evaluator(query).evaluate()
display_results(results)

In [None]:
# SemGrep: bad-hexa-conversion
#
# NOTE(review): `cb` is not imported or defined in any cell visible here, so
# this reads as a sketch of a fluent API rather than runnable code — confirm.

# Step 1: calls named 'digest' whose receiver is a variable of type
# 'MessageDigest'; both the variable and the call are bound.
digest_results = cb.calls('digest').receiver_is(
    cb.vars(type='MessageDigest').bind()
).bind()

# Step 2: `for` loops whose target container is one of the digest results
# from step 1.
for_over_results = cb.fors().target_container(digest_results).bind()

# Step 3: calls to 'Integer.toHexString' where some argument is a
# deep_ref(...) of the loops from step 2. The exact meaning of deep_ref is
# not shown here — presumably a transitive reference; verify.
matches = cb.calls('Integer.toHexString').any_arg_is(
    cb.deep_ref(for_over_results)
)

In [None]:
# SemGrep: cbc-padding-oracle
#
# Calls named 'getInstance' whose first argument is a string naming a CBC
# mode with PKCS5 padding, e.g. "AES/CBC/PKCS5Padding".
#
# The regex must not end in "/": in the Semgrep rule the pattern is written
# as "=~/<regex>/", where the outer slashes are delimiters rather than part
# of the expression. Carrying the trailing slash over would require a literal
# "/" after "PKCS5Padding", which real transformation strings never have.

matches = cb.calls('getInstance').first_arg_is(
    cb.str(regex=r".*/CBC/PKCS5Padding")
)


In [None]:
# SemGrep: command-injection-formatted-runtime-call

# This one is tricky! Trying to say no exec( ... "sh", "-c", user_supplied, ...)

# Case 1: calls named 'exec' or 'loadLibrary' whose first argument is a
# string concatenation or format expression, and whose receiver is a
# 'getRuntime' call.
matches1 = cb.calls(['exec', 'loadLibrary']).first_arg_is(
    cb.str_concat_or_format()
).receiver_is(cb.calls('getRuntime').bind())

# Case 2: 'exec' calls where some argument is a deep_ref(...) of a sibling
# group made of: a string matching a shell name, the string '-c', and a
# variable with no initializer (standing in for the "user_supplied" value
# mentioned above).
# NOTE(review): the shell-name regex has no anchors; whether cb.str applies
# it as a full match or a search is not shown here — confirm.
matches2 = cb.calls('exec').any_arg_is(
    cb.deep_ref(cb.siblings(
        cb.str(regex=r"(sh|bash|ksh|csh|tcsh|zsh)"),
        cb.str('-c'),
        cb.var().has_no_init()
    ))
)


In [None]:
# SemGrep: formatted-sql-string

# TODO: this one is also quite complex (just long...)
# we can probably make it a lot shorter!

# https://semgrep.dev/editor?registry=java.lang.security.audit.formatted-sql-string.formatted-sql-string

In [None]:
# SemGrep: http-response-splitting

# Source 1: a `new Cookie(...)` where some argument is a 'getParameter' call.
bad_cookie1 = cb.new('Cookie').any_arg_is(
    cb.calls('getParameter').bind()
)

# Source 2: a `new Cookie(...)` where some argument is a method parameter
# annotated with '@PathVariable'.
bad_cookie2 = cb.new('Cookie').any_arg_is(
    cb.method_params().annotated_with('@PathVariable').bind()
)

# Sink: 'addCookie' calls whose first argument is either kind of cookie.
matches = cb.calls('addCookie').first_arg_is(
    cb.either(bad_cookie1, bad_cookie2)
)



In [None]:
# SemGrep: ldap-injection

# Variables declared with one of the LDAP/directory context types. The list
# is passed as `type=` to agree with the other type-constrained selectors in
# this notebook (cb.vars(type='MessageDigest'), cb.var(type='ScriptEngine'));
# a positional list elsewhere (cb.calls([...])) denotes names, not types.
context_var = cb.var(type=[
    'InitialDirContext',
    'DirContext',
    'InitialLdapContext',
    'LdapContext',
    'LdapCtx',
    'EventDirContext'
]).bind()

# 'search' calls on such a context variable whose second argument is
# anything other than a string.
matches = cb.calls('search').receiver_is(
    context_var
).second_arg_is(
    cb.anything_but(cb.str())
)


In [None]:
# SemGrep: object-deserialization

# Every `new ObjectInputStream(...)` is a match; no further constraints are
# applied in this sketch.
matches = cb.new('ObjectInputStream')


In [None]:
# SemGrep: script-engine-injection

# 'eval' calls whose receiver is either a field or a variable of type
# 'ScriptEngine', and whose first argument is anything other than a string.
matches = cb.calls('eval').receiver_is(cb.either(
    cb.field(type='ScriptEngine').bind(),
    cb.var(type='ScriptEngine').bind()
)).first_arg_is(
    cb.anything_but(cb.str())
)


In [None]:

# Sketch (no SemGrep rule is cited in this cell): two calls, 'c1' and 'c2',
# whose receivers are tied to the same formal-parameter reference 'r1'.
# The final ref('c1') != ref('c2') term presumably keeps a call from being
# paired with itself — TODO confirm against merge().
# NOTE(review): labels here are bare strings (% 'c1') whereas the other DSL
# cells use % label('...'); confirm both spellings are accepted.
# The result is bound to matches1 and is not evaluated or displayed here.
matches1 = merge(
  call() % 'c1'
  |where| the_receiver()
    |isa| formal_parameter_ref() % 'r1',
  
  call() % 'c2'
  |where| the_receiver()
    |isa| ref('r1'),
  
  ref('c1') != ref('c2')
)