In [29]:
from toolz_graph import *

In [47]:
import sys
import json
import pprint
from toolz.curried import *

The introspector-data-linux directory contains data parsed from a .tu file and then cleaned up with scripts to normalize it. Later we will emit that clean JSON directly, but for now this file is the interface used.

In [31]:
# Load the cleaned node list. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the whole session.
with open("introspector-data-linux/linux_clean.json") as input_file:
    nodes = json.load(input_file)['nodes']

An example node: `_id` is the node id and `_type` is the type of the node.
The other attributes are either references to other nodes or literals. The data does not mark which is which, but the field names are clearly defined.

In [32]:
def field_report(data):
    """Return the field names used in ``data``, most frequently used first.

    Nothing is printed; the ordered list is returned so the notebook can
    display it or pass it on.

    Args:
        data: iterable of dict-like records (for example the parsed nodes).

    Returns:
        list of the distinct field names, ordered by the number of records
        that contain them, descending. Names with equal counts appear in
        reverse order of first appearance. Empty input yields ``[]``.
    """
    # Local import keeps the helper self-contained and independent of the
    # star imports at the top of the notebook.
    from collections import Counter

    # Iterating a record yields its keys, so each record contributes one
    # count per field it carries. Counter keeps first-seen insertion order.
    counts = Counter(field for record in data for field in record)

    # Stable ascending sort followed by a reversal: this reproduces the
    # original tie order (later-seen names first among equal counts).
    ascending = sorted(counts, key=counts.get)
    return ascending[::-1]


In [33]:
# Pretty-print the first node as a sample of the record layout.
pprint.pprint(nodes[0])

{'_id': '1', '_type': 'type_decl', 'chain': '4', 'name': '2', 'type': '3'}


In [34]:
# Field names across all nodes, ordered by how many nodes use them.
field_report(nodes)


['_type',
 '_id',
 'type',
 'name',
 'srcp',
 'scpe',
 'OP0 :',
 'chain',
 'algn',
 'size',
 '_string_len',
 '_string',
 'valu',
 'chan',
 'OP1',
 'body',
 'link',
 'bpos',
 'retn',
 'prms',
 'used',
 'E0',
 'purp',
 'cnst',
 'E1',
 'fn',
 'value',
 'ptd',
 'argt',
 'E2',
 'expr',
 'tag',
 'unql',
 'flds',
 'mngl',
 'prec',
 'sign',
 'min',
 'max',
 'args',
 'labl',
 'E3',
 'init',
 'val',
 'vars',
 'elts',
 'domn',
 'E4',
 'qual',
 'E5',
 'csts',
 'OP2',
 'E6',
 'E7',
 'E8',
 'E9',
 'E10',
 'low',
 'E11',
 'E12',
 'decl',
 'E13',
 'E14',
 'E15',
 'idx',
 'E16',
 'E17',
 'E18',
 'E19',
 'E20',
 'E21',
 'E22',
 'E23',
 'E25',
 'E24',
 'E26',
 'E27',
 'E28',
 'E29',
 'E31',
 'E30',
 'E35',
 'E34',
 'E33',
 'E32',
 'E39',
 'E38',
 'E37',
 'E36',
 'cond',
 'E43',
 'E42',
 'E41',
 'E40',
 'E47',
 'E46',
 'E45',
 'E44',
 'E54',
 'E53',
 'E52',
 'E51',
 'E50',
 'E49',
 'E48',
 'E62',
 'E61',
 'E60',
 'E59',
 'E58',
 'E57',
 'E56',
 'E55',
 'refd',
 'E97',
 'E96',
 'E95',
 'E94',
 'E93',
 'E92

In [79]:
def join_field(role_from, role_to, field_name, nodes, exclude_left=(), exclude_right=()):
    """Join ``nodes`` with themselves, matching ``field_name`` against node ids.

    The left side is built from the nodes that contain ``field_name``; the
    right side is built from every node. Both sides go through ``pushdown``
    with their role before being handed to ``keyjoin``.

    Args:
        role_from: role given to ``pushdown`` for the left side. When truthy,
            the left join key is ``role_from + "_" + field_name``; otherwise
            the join key is ``field_name`` unchanged.
        role_to: role given to ``pushdown`` for the right side; the right
            join key is ``role_to + "__id"``.
        field_name: name of the field that is matched against ``_id``.
        nodes: collection of dict-like node records. It is iterated twice
            (once per side), so it must be re-iterable, e.g. a list.
        exclude_left: forwarded to ``pushdown`` for the left side.
        exclude_right: forwarded to ``pushdown`` for the right side.

    Returns:
        list of the records produced by ``keyjoin``; both join-key fields
        are left in the records.
    """
    from_field_name = field_name
    if role_from:
        from_field_name = role_from + "_" + field_name
    to_field_name = role_to + "__id"

    # Membership is tested inline instead of through the `_hasattr` helper:
    # that helper is only defined in a later cell, so relying on it makes
    # this function fail when the notebook is run top to bottom.
    having_field = (node for node in nodes if field_name in node)

    # Pushed-down records for which get_in(field_name) is truthy are dropped.
    # NOTE(review): presumably this removes records that still expose the
    # un-prefixed field after pushdown -- confirm against toolz_graph.
    left = filterfalse(
        get_in(field_name),
        map(pushdown(role_from, field_name, exclude_left), having_field),
    )
    right = map(pushdown(role_to, "_id", exclude_right), nodes)

    # Both id fields are deliberately kept in the joined records.
    return list(keyjoin(from_field_name, left, to_field_name, right))

Now that we have a list of nodes, we can join the nodes with themselves on the field named `type`. We call the records on the left `typed` and the records on the right `typed_type`, meaning it is the type behind the typed object. This gives us a nice table structure.

In [80]:
# Self-join on the `type` field: left rows take the `typed` role,
# right rows take the `typed_type` role.
types_list = join_field(
    role_from="typed",
    role_to="typed_type",
    field_name="type",
    nodes=nodes,
)

In [81]:
# Self-join on the `name` field: left rows take the `named` role,
# right rows take the `name` role.
names = join_field(
    role_from="named",
    role_to="name",
    field_name="name",
    nodes=nodes,
)

In [82]:
# Fields present in the name join, ordered by how many rows use them.
field_report(names)

['name__type',
 'name__id',
 'named_name',
 'named__type',
 'named__id',
 'name__string_len',
 'name__string',
 'named_type',
 'named_srcp',
 'named_scpe',
 'named_chain',
 'named_algn',
 'named_size',
 'named_link',
 'named_body',
 'named_bpos',
 'named_used',
 'named_cnst',
 'named_argt',
 'name_type',
 'name_scpe',
 'named_mngl',
 'name_srcp',
 'name_name',
 'named_tag',
 'name_chain',
 'named_unql',
 'named_args',
 'named_flds',
 'named_prec',
 'named_max',
 'named_min',
 'named_sign',
 'named_init',
 'named_qual',
 'named_csts',
 'named_low',
 'named_ptd',
 'named_prms',
 'named_retn',
 'named_domn',
 'named_elts']

This next block handles the names of things where the name itself is not a string:
some names are type decls, and the name of that type decl is the actual string.
After the join the actual string is found in the field 'name__string'.
 

In [83]:

# Second-level lookup: match the `name_name` field of the name join against
# the `_id` of the raw nodes, which are given the `name` role. `type` and
# `_type` are passed as exclusions for the right-hand side.
names2 = join_field_extra(
    role_from=None,
    role_to="name",
    from_field_name="name_name",
    to_field_name="_id",
    nodes_from=names,
    nodes_to=nodes,
    debug=True,
    exclude_right=("type", "_type"),
)

{'left examples': {'name__id': '1',
                   'name__type': 'type_decl',
                   'name_chain': '4',
                   'name_name': '2',
                   'name_type': '3',
                   'named__id': '3',
                   'named__type': 'integer_type',
                   'named_algn': 32,
                   'named_max': '7',
                   'named_min': '6',
                   'named_name': '1',
                   'named_prec': 32,
                   'named_sign': 'signed',
                   'named_size': '5'},
 'left_key': 'name_name',
 'new_left_key': 'name_name',
 'new_right_key': 'name__id',
 'right example': {'name__id': '1', 'name_chain': '4', 'name_name': '2'},
 'right_key': '_id'}
{'results-len': 501}
{'results-example': {'name__string': 'int',
                     'name__string_len': 3,
                     'name__type': 'type_decl',
                     'name_chain': '4',
                     'name_type': '3',
                     'named__id': 

Here is a unique list of the strings used in the names

In [84]:
# Distinct name strings found by the second-level name join.
{row['name__string'] for row in names2}

{'DIR',
 'Elf',
 'Elf32_Addr',
 'Elf32_Chdr',
 'Elf32_Conflict',
 'Elf32_Dyn',
 'Elf32_Ehdr',
 'Elf32_Half',
 'Elf32_Lib',
 'Elf32_Move',
 'Elf32_Nhdr',
 'Elf32_Off',
 'Elf32_Phdr',
 'Elf32_RegInfo',
 'Elf32_Rel',
 'Elf32_Rela',
 'Elf32_Section',
 'Elf32_Shdr',
 'Elf32_Sword',
 'Elf32_Sxword',
 'Elf32_Sym',
 'Elf32_Syminfo',
 'Elf32_Verdaux',
 'Elf32_Verdef',
 'Elf32_Vernaux',
 'Elf32_Verneed',
 'Elf32_Versym',
 'Elf32_Word',
 'Elf32_Xword',
 'Elf32_auxv_t',
 'Elf32_gptab',
 'Elf64_Addr',
 'Elf64_Chdr',
 'Elf64_Dyn',
 'Elf64_Ehdr',
 'Elf64_Half',
 'Elf64_Lib',
 'Elf64_Move',
 'Elf64_Nhdr',
 'Elf64_Off',
 'Elf64_Phdr',
 'Elf64_Rel',
 'Elf64_Rela',
 'Elf64_Section',
 'Elf64_Shdr',
 'Elf64_Sword',
 'Elf64_Sxword',
 'Elf64_Sym',
 'Elf64_Syminfo',
 'Elf64_Verdaux',
 'Elf64_Verdef',
 'Elf64_Vernaux',
 'Elf64_Verneed',
 'Elf64_Versym',
 'Elf64_Word',
 'Elf64_Xword',
 'Elf64_auxv_t',
 'Elf_Arhdr',
 'Elf_Arsym',
 'Elf_Cmd',
 'Elf_Data',
 'Elf_Kind',
 'Elf_MIPS_ABIFlags_v0',
 'Elf_Options',
 'El

In [85]:
# Self-join on the `scpe` field: left rows take the `scoped` role,
# right rows take the `scope` role.
scpe_list = join_field(
    role_from="scoped",
    role_to="scope",
    field_name="scpe",
    nodes=nodes,
)
field_report(scpe_list)

['scope__type',
 'scope__id',
 'scoped_scpe',
 'scoped_type',
 'scoped__type',
 'scoped__id',
 'scoped_srcp',
 'scope_name',
 'scoped_name',
 'scoped_chain',
 'scoped_algn',
 'scoped_size',
 'scoped_link',
 'scoped_body',
 'scope_flds',
 'scope_tag',
 'scope_algn',
 'scope_size',
 'scoped_bpos',
 'scope_body',
 'scope_link',
 'scope_srcp',
 'scope_scpe',
 'scope_type',
 'scope_chain',
 'scope_args',
 'scoped_used',
 'scoped_cnst',
 'scoped_argt',
 'scoped_mngl',
 'scoped_args',
 'scoped_init',
 'scope_mngl']

Now we can merge the simple and complex names together into one list for lookup

In [86]:
# Concatenate both name tables into a single list for lookups.
all_names = names + names2
# Alphabetical listing of every field present in the combined table.
sorted(field_report(all_names))

['name__id',
 'name__string',
 'name__string_len',
 'name__type',
 'name_chain',
 'name_name',
 'name_scpe',
 'name_srcp',
 'name_type',
 'named__id',
 'named__type',
 'named_algn',
 'named_args',
 'named_argt',
 'named_body',
 'named_bpos',
 'named_chain',
 'named_cnst',
 'named_csts',
 'named_domn',
 'named_elts',
 'named_flds',
 'named_init',
 'named_link',
 'named_low',
 'named_max',
 'named_min',
 'named_mngl',
 'named_name',
 'named_prec',
 'named_prms',
 'named_ptd',
 'named_qual',
 'named_retn',
 'named_scpe',
 'named_sign',
 'named_size',
 'named_srcp',
 'named_tag',
 'named_type',
 'named_unql',
 'named_used']

In [92]:
@curry
def _hasattr(atr, obj):
    """Tell whether ``atr`` is contained in ``obj`` (an ``in`` test).

    Despite the name this does not call ``hasattr``; it checks membership.
    The function is curried, so ``_hasattr(name)`` can be used directly as
    a one-argument predicate.
    """
    return atr in obj
def filter_nodes(role, field_name, nodes, excludes):
    """Build one side of a join from the nodes that contain ``field_name``.

    Nodes containing ``field_name`` are passed through ``pushdown`` with the
    given role and exclusions; pushed-down records for which
    ``get_in(field_name)`` is truthy are then discarded.

    Returns:
        list of the remaining pushed-down records.
    """
    reject = get_in(field_name)
    push = pushdown(role, field_name, excludes)
    candidates = filter(_hasattr(field_name), nodes)
    return list(filterfalse(reject, map(push, candidates)))
    
def join_field_extra(role_from, role_to, from_field_name, to_field_name, nodes_from, nodes_to,
                     debug=True,
                     exclude_left=(),
                     exclude_right=()):
    """Join ``nodes_from`` to ``nodes_to`` on a pair of role-prefixed fields.

    Each side is prepared with ``filter_nodes`` using its own role, field
    name and exclusions. The join keys are the field names prefixed with
    ``role + "_"`` when the corresponding role is truthy, and the plain
    field names otherwise.

    Args:
        role_from: role for the left side, or a falsy value for no prefix.
        role_to: role for the right side, or a falsy value for no prefix.
        from_field_name: field of ``nodes_from`` used as the left join key.
        to_field_name: field of ``nodes_to`` used as the right join key.
        nodes_from: records forming the left side.
        nodes_to: records forming the right side.
        debug: when true, pretty-print one example record per side, the
            number of results and the first result.
        exclude_left: forwarded to ``filter_nodes`` for the left side.
        exclude_right: forwarded to ``filter_nodes`` for the right side.

    Returns:
        list of joined records after ``mdissoc`` has been applied with both
        join keys (presumably removing them -- confirm against toolz_graph).
        An empty list is returned when either side has no records, so that
        callers can always concatenate or iterate the result.
    """
    new_from_field_name = from_field_name
    if role_from:
        new_from_field_name = role_from + "_" + from_field_name
    new_to_field_name = to_field_name
    if role_to:
        new_to_field_name = role_to + "_" + to_field_name

    left = filter_nodes(role_from, from_field_name, nodes_from, exclude_left)
    right = filter_nodes(role_to, to_field_name, nodes_to, exclude_right)

    if debug:
        if left:
            pprint.pprint({
                "left examples": left[0],
                "new_left_key": new_from_field_name,
                "left_key": from_field_name,
            })
        if right:
            pprint.pprint({
                "right example": right[0],
                "right_key": to_field_name,
                "new_right_key": new_to_field_name,
            })

    # Nothing to join. This used to fall through to a bare `return` (None),
    # which breaks callers that do `names + names2` or iterate the result.
    if not left or not right:
        return []

    joined = keyjoin(
        new_from_field_name,
        left,
        new_to_field_name,
        right,
    )
    results_list = list(map(mdissoc((new_to_field_name, new_from_field_name)), joined))

    if debug:
        pprint.pprint({
            'results-len': len(results_list),
        })
        if results_list:
            pprint.pprint({
                'results-example': results_list[0],
            })

    return results_list


In [93]:
# Fields of the combined name table, ordered by how many rows use them.
field_report(all_names)

['name__type',
 'named_name',
 'named__type',
 'named__id',
 'name__id',
 'name__string_len',
 'name__string',
 'named_type',
 'named_srcp',
 'named_scpe',
 'named_chain',
 'named_algn',
 'named_size',
 'named_link',
 'named_body',
 'named_bpos',
 'named_used',
 'named_cnst',
 'name_type',
 'name_scpe',
 'name_srcp',
 'name_chain',
 'named_unql',
 'named_argt',
 'named_prec',
 'named_max',
 'named_min',
 'named_sign',
 'named_tag',
 'named_mngl',
 'named_flds',
 'name_name',
 'named_args',
 'named_init',
 'named_qual',
 'named_csts',
 'named_low',
 'named_ptd',
 'named_prms',
 'named_retn',
 'named_domn',
 'named_elts']

In [94]:
# Attach names to the scope side: match the `scope_name` field of the scope
# join against `name__id` of the combined name table, whose fields are given
# the `scope_name` role. (The former leading `field_report(all_names)` call
# was dropped: its result was discarded because it was not the cell's last
# expression.)
scope_name = join_field_extra(
    role_from=None,
    role_to='scope_name',
    from_field_name='scope_name',
    to_field_name='name__id',
    nodes_from=scpe_list,
    nodes_to=all_names,
    debug=True,
)
field_report(scope_name)

{'left examples': {'scope__id': '158',
                   'scope__type': 'record_type',
                   'scope_algn': 64,
                   'scope_flds': '164',
                   'scope_name': '163',
                   'scope_size': '157',
                   'scope_tag': 'struct',
                   'scoped__id': '164',
                   'scoped__type': 'field_decl',
                   'scoped_algn': 32,
                   'scoped_bpos': '20',
                   'scoped_chain': '171',
                   'scoped_name': '170',
                   'scoped_scpe': '158',
                   'scoped_size': '5',
                   'scoped_srcp': '<built-in>:0',
                   'scoped_type': '26'},
 'left_key': 'scope_name',
 'new_left_key': 'scope_name'}
{'new_right_key': 'scope_name_name__id',
 'right example': {'scope_name_name__id': '1',
                   'scope_name_name__type': 'type_decl',
                   'scope_name_name_chain': '4',
                   'scope_name_name_name

['scope_name_name__type',
 'scope_name_named_name',
 'scope_name_named__type',
 'scope_name_named__id',
 'scope__type',
 'scope__id',
 'scoped_scpe',
 'scoped_type',
 'scoped__type',
 'scoped__id',
 'scope_name_name__string_len',
 'scope_name_name__string',
 'scoped_srcp',
 'scoped_name',
 'scoped_chain',
 'scoped_algn',
 'scoped_size',
 'scoped_link',
 'scoped_body',
 'scope_flds',
 'scope_tag',
 'scope_algn',
 'scope_size',
 'scoped_bpos',
 'scope_name_named_algn',
 'scope_name_named_size',
 'scope_name_named_srcp',
 'scope_name_named_scpe',
 'scope_name_named_type',
 'scope_name_named_chain',
 'scope_name_named_flds',
 'scope_name_named_tag',
 'scope_name_named_link',
 'scope_name_named_body',
 'scope_link',
 'scope_body',
 'scope_srcp',
 'scope_scpe',
 'scope_type',
 'scope_chain',
 'scope_name_named_args',
 'scope_args',
 'scoped_used',
 'scoped_cnst',
 'scoped_argt',
 'scope_name_named_used',
 'scoped_mngl',
 'scope_name_named_argt',
 'scoped_args',
 'scope_name_named_unql',
 'sc

In [95]:
# Attach names to the scoped side: match the `scoped_name` field of the scope
# join against `name__id` of the combined name table, whose fields are given
# the `scoped_name` role.
scoped_name = join_field_extra(
    role_from=None,
    role_to='scoped_name',
    from_field_name='scoped_name',
    to_field_name='name__id',
    nodes_from=scpe_list,
    nodes_to=all_names,
    debug=True,
)
field_report(scoped_name)

{'left examples': {'scope__id': '158',
                   'scope__type': 'record_type',
                   'scope_algn': 64,
                   'scope_flds': '164',
                   'scope_name': '163',
                   'scope_size': '157',
                   'scope_tag': 'struct',
                   'scoped__id': '164',
                   'scoped__type': 'field_decl',
                   'scoped_algn': 32,
                   'scoped_bpos': '20',
                   'scoped_chain': '171',
                   'scoped_name': '170',
                   'scoped_scpe': '158',
                   'scoped_size': '5',
                   'scoped_srcp': '<built-in>:0',
                   'scoped_type': '26'},
 'left_key': 'scoped_name',
 'new_left_key': 'scoped_name'}
{'new_right_key': 'scoped_name_name__id',
 'right example': {'scoped_name_name__id': '1',
                   'scoped_name_name__type': 'type_decl',
                   'scoped_name_name_chain': '4',
                   'scoped_name_na

['scoped_name_name__string_len',
 'scoped_name_name__string',
 'scoped_name_name__type',
 'scoped_name_named_name',
 'scoped_name_named__type',
 'scoped_name_named__id',
 'scope__type',
 'scope__id',
 'scoped_srcp',
 'scoped_scpe',
 'scoped_type',
 'scoped__type',
 'scoped__id',
 'scoped_name_named_srcp',
 'scoped_name_named_type',
 'scoped_name_named_scpe',
 'scope_name',
 'scoped_name_named_algn',
 'scoped_algn',
 'scoped_size',
 'scoped_name_named_size',
 'scoped_chain',
 'scoped_name_named_chain',
 'scope_body',
 'scope_link',
 'scope_srcp',
 'scope_scpe',
 'scope_type',
 'scope_chain',
 'scope_args',
 'scoped_used',
 'scoped_name_named_used',
 'scoped_argt',
 'scoped_name_named_argt',
 'scope_flds',
 'scope_tag',
 'scope_algn',
 'scope_size',
 'scoped_bpos',
 'scoped_name_named_bpos',
 'scoped_init',
 'scoped_name_named_init',
 'scoped_link',
 'scoped_body',
 'scoped_name_named_link',
 'scoped_name_named_body',
 'scoped_name_named_cnst',
 'scoped_cnst',
 'scoped_mngl',
 'scoped_na

Now we join the table of named scoped items with the table of named scopes.

In [105]:
# Match `scoped__id` of the named scoped rows (left, re-prefixed with the
# `scope` role) against `scope__id` of the named scope rows (right,
# re-prefixed with the `scoped` role).
scoped_scope = join_field_extra(
    role_from='scope',
    role_to='scoped',
    from_field_name='scoped__id',
    to_field_name='scope__id',
    nodes_from=scoped_name,
    nodes_to=scope_name,
    debug=True,
)

{'left examples': {'scope_scope__id': '158',
                   'scope_scope__type': 'record_type',
                   'scope_scope_algn': 64,
                   'scope_scope_flds': '164',
                   'scope_scope_name': '163',
                   'scope_scope_size': '157',
                   'scope_scope_tag': 'struct',
                   'scope_scoped__id': '164',
                   'scope_scoped__type': 'field_decl',
                   'scope_scoped_algn': 32,
                   'scope_scoped_bpos': '20',
                   'scope_scoped_chain': '171',
                   'scope_scoped_name_name__string': 'gp_offset',
                   'scope_scoped_name_name__string_len': 9,
                   'scope_scoped_name_name__type': 'identifier_node',
                   'scope_scoped_name_named__id': '164',
                   'scope_scoped_name_named__type': 'field_decl',
                   'scope_scoped_name_named_algn': 32,
                   'scope_scoped_name_named_bpos': '20',
 

In [106]:
# Distinct 4-tuples of (name string, type) taken from both sides of the
# scoped/scope join.
scope_list = {
    get(
        [
            'scope_scoped_name_name__string',
            'scope_scope__type',
            'scoped_scope_name_name__string',
            'scoped_scoped__type',
        ],
        row,
    )
    for row in scoped_scope
}

{('print_stat', 'translation_unit_decl', 'print_stat', 'parm_decl'),
 ('__cpu_to_le64p', 'translation_unit_decl', '__cpu_to_le64p', 'result_decl'),
 ('target__has_cpu',
  'translation_unit_decl',
  'target__has_cpu',
  'result_decl'),
 ('fgets_unlocked', 'translation_unit_decl', 'fgets_unlocked', 'parm_decl'),
 ('print_pmu_mappings',
  'translation_unit_decl',
  'print_pmu_mappings',
  'parm_decl'),
 ('process_sample_time',
  'translation_unit_decl',
  'process_sample_time',
  'parm_decl'),
 ('write_total_mem', 'translation_unit_decl', 'write_total_mem', 'parm_decl'),
 ('process_cache', 'translation_unit_decl', 'process_cache', 'parm_decl'),
 ('machine__kernel_ip',
  'translation_unit_decl',
  'machine__kernel_ip',
  'parm_decl'),
 ('strncpy', 'translation_unit_decl', 'strncpy', 'result_decl'),
 ('machine__kernel_start',
  'translation_unit_decl',
  'machine__kernel_start',
  'result_decl'),
 ('perf_header__process_sections',
  'translation_unit_decl',
  'perf_header__process_sections'

In [110]:
# Group the joined rows by the two type fields and list only the group keys,
# i.e. the distinct (scope type, scoped type) pairs that occur.
list(groupby(['scope_scope__type','scoped_scoped__type'],scoped_scope))

[('translation_unit_decl', 'parm_decl'),
 ('translation_unit_decl', 'result_decl'),
 ('translation_unit_decl', 'label_decl'),
 ('translation_unit_decl', 'type_decl'),
 ('translation_unit_decl', 'var_decl')]