In [2]:
import pandas as pd
import os
import re
from tqdm.notebook import tqdm
tqdm.pandas()
from tree_sitter import Language, Parser
from copy import deepcopy

In [3]:
tree_sitter_path = "../assets/tree-sitter-cpp/"

Language.build_library(
  # Store the library in the `build` directory
  "../assets/build/cpp.so", 
   [tree_sitter_path])

False

In [4]:
CP_LANGUAGE = Language('../tree_sitter/cpp.so', 'cpp')
parser = Parser()
parser.set_language(CP_LANGUAGE)

# get header files

In [18]:
def preprocess_appfile(cpp_file):
    cpp_file = re.sub(r"(/\*.+?(?=\*/)\*/)", "", cpp_file, flags=re.DOTALL)
    cpp_file = re.sub(r"(//.+?\n)", r"\n", cpp_file, flags=re.DOTALL)
    cpp_file = re.sub(r"(#.*?\n)|(#(endif|else))", r"\n", cpp_file)
    cpp_file = cpp_file.split('\n')
    cpp_file = [x.strip() for x in cpp_file if x.strip() != '']
    cpp_file = '\n'.join(cpp_file)
    return cpp_file

def get_h_files(path):
    dir_list = os.listdir(path)
    
    codes = []
    for id_ in dir_list:
        temp_path = os.path.join(path, id_)
        
        for root, dirnames, filenames in os.walk(temp_path):
            
            for name in filenames:    
                if name.endswith(".h"):
                    filepath = os.path.join(root, name)
                    library = root.split("/")[3]
                    with open(filepath, 'r', errors='ignore') as f:
                        code = f.read()
                    code = preprocess_appfile(code)
                    codes.append({
                        'id': id_,
                        'library': library,
                        'path': filepath,
                        'code': code,
                    })
    df = pd.DataFrame(codes)
    return df

In [16]:
df = get_h_files(path)

In [240]:
df.head(3)

Unnamed: 0,id,library,path,code,len,constructor,methods,processed_methods,fname,parents
0,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/s...,namespace Xbox360Controller_LEDs {\nenum class...,720,AnimationBase LED_Animation XboxLEDAnimations ...,{'AnimationBase': ['constexpr AnimationBase(ui...,"{'AnimationBase': {'AnimationBase', 'getFrame'...",X360ControllerLEDs,"{'LED_Animation': ['AnimationBase'], 'XboxLED_..."
1,260585944,Adafruit_LIS331,../libraries/260585944/Adafruit_LIS331/Adafrui...,0x32\n0x0F\n0x22\ntypedef enum hpf_cutoff {\nL...,191,Adafruit_LIS331,{'Adafruit_LIS331': ['uint8_t getDeviceID(void...,"{'Adafruit_LIS331': {'getDeviceID', 'writeRang...",Adafruit_LIS331,{'Adafruit_LIS331': ['Adafruit_Sensor']}
2,260585944,Adafruit_LIS331,../libraries/260585944/Adafruit_LIS331/Adafrui...,"typedef enum {\nLIS331HH_RANGE_6_G = 0x0,\nLIS...",71,Adafruit_LIS331HH,{'Adafruit_LIS331HH': ['bool begin_I2C(uint8_t...,"{'Adafruit_LIS331HH': {'setRange', 'begin_I2C'...",Adafruit_LIS331HH,{'Adafruit_LIS331HH': ['Adafruit_LIS331']}


In [None]:
df_core = get_h_files("../ArduinoCore-avr")

In [241]:
df_core.head(3)

Unnamed: 0,id,library,path,code,len,constructor,methods,processed_methods,fname,parents
0,bootloaders,stk500v2,../ArduinoCore-avr/bootloaders/stk500v2/avr_cp...,,0,,,,avr_cpunames,
1,bootloaders,stk500v2,../ArduinoCore-avr/bootloaders/stk500v2/command.h,,0,,,,command,
2,bootloaders,stk500v2,../ArduinoCore-avr/bootloaders/stk500v2/avrint...,"prog_char\tgAvrInt_RESET[]\t\t\tPROGMEM\t=\t""R...",1064,,,,avrinterruptnames,


# helper traverse the AST

In [6]:
translator = str.maketrans('', '', '<>"\'')

### Helper to traverse treesitter output
def is_terminal(node):
    if len(node.children) == 0:
        return True
    else:
        return False
    
def get_start_tuple(node):
    return node.start_point

def get_end_tuple(node):
    return node.end_point

def get_substring_of_loc(start_tuple, end_tuple, loc_list):
    line_number_start, id_begin = start_tuple
    line_number_end, id_end = end_tuple
    
    temp_list = []
    if line_number_start != line_number_end:
        for idx in range(line_number_start, line_number_end+1):
            if idx == line_number_start:
                temp_loc = loc_list[idx][id_begin:]
            elif idx == line_number_end:
                temp_loc = loc_list[idx][:id_end]
            else:
                temp_loc= loc_list[idx]
            
            if temp_loc != '':
                temp_list.append(temp_loc)
    else:
        temp_list.append(loc_list[line_number_start][id_begin:id_end])
    out = '\n'.join(temp_list)
    return out

def fix_punctuation(str_input):
    out = str_input.translate(str.maketrans('"', "'", ';'))
    return out

def get_node_name(node, loc_list):
    if is_terminal(node):
        start_tuple = get_start_tuple(node)
        end_tuple = get_end_tuple(node)
        substr = get_substring_of_loc(start_tuple=start_tuple, end_tuple=end_tuple, loc_list=loc_list).strip()
        substr = fix_punctuation(substr)
        return node.type, substr
    else:
        return (node.type, '')

def resolve_string_literal(node, loc_list):
    start_tuple = get_start_tuple(node)
    end_tuple = get_end_tuple(node)
    substr = get_substring_of_loc(start_tuple=start_tuple, end_tuple=end_tuple, loc_list=loc_list).strip()
    substr = fix_punctuation(substr)
    return substr

def get_substring(node, loc_list):
    start_tuple = get_start_tuple(node)
    end_tuple = get_end_tuple(node)
    return get_substring_of_loc(start_tuple=start_tuple, end_tuple=end_tuple, loc_list=loc_list).strip()
###

### Traverse the AST
def traverse_tree_with_path(tree):
    cursor = tree.walk()
    current_path = []
    current_path.append(cursor.node.type)
    reached_root = False
    while reached_root == False:
        yield cursor.node, current_path

        if cursor.goto_first_child():
            current_path.append(cursor.node.type)
            continue
        
        if cursor.goto_next_sibling():
            if current_path:
                current_path.pop(-1)
            current_path.append(cursor.node.type)
            continue

        retracing = True
        while retracing:
            
            if not cursor.goto_parent():
                current_path.pop(-1)
                retracing = False
                reached_root = True
                
            if current_path:
                current_path.pop(-1)
                # print("a")
                
            if cursor.goto_next_sibling():
                retracing = False
                if current_path:
                    current_path.pop(-1)
                    
                current_path.append(cursor.node.type)
###

# process header files

In [20]:
def get_constructor(code):
    try:
        classes = []
        query = CP_LANGUAGE.query("""
        (class_specifier (type_identifier) @constructor)
        """)
        tree = parser.parse(bytes(code, "utf8"))
        query_results = query.captures(tree.root_node)
        
        for node, _ in query_results:
            temp1, temp2 = get_node_name(node=node, loc_list=code.split("\n"))
            classes.append(temp2)
        
        if len(classes) > 0:
            return ' '.join(classes)
        else: 
            return 'null'
    
    except Exception as e:
        return 'null'
    
def get_methods(code):
    try:
        methods_dict = {}
        query = CP_LANGUAGE.query("""
        ((class_specifier) @constructor)
        """)
        tree = parser.parse(bytes(code, "utf8"))
        query_results = query.captures(tree.root_node)
        is_public = False
        temp_exp = ''
        
        for item, _ in query_results:
            methods = []
            
            for node, path in traverse_tree_with_path(item):
                temp1, temp2 = get_node_name(node=node, loc_list=code.split("\n"))
                
                if path[-1] == 'public':
                    is_public = True

                elif path[-1] == 'private':
                    is_public = False

                if len(path)>=2:
                    if path[-2] == 'class_specifier' and path[-1] == 'type_identifier':
                        classname = temp2
                    
                    if is_public:
                        
                        if path[-1] in ('function_definition'):
                            temp_exp = get_substring(node, code.split("\n"))
                            methods.append(temp_exp)
                        elif path[-1] == 'field_declaration':
                            temp_exp = get_substring(node, code.split("\n"))

                        if path[-1] == 'function_declarator' and temp_exp != '':
                            methods.append(temp_exp)
                        elif path[-1] == ';':
                            temp_exp = ''
            
            methods_dict[classname] = methods.copy()
        
        if len(methods_dict) > 0:
            return methods_dict
        else:
            return 'null'
    
    except Exception as e:
        # print(e)
        return 'null'
    
def process_methods(methods_dict):
    try:
        new_methods_dict = {}
        # new_methods = []

        for constructor, methods in methods_dict.items():
            new_methods_dict[constructor] = []

            for method in methods:
                tree = parser.parse(bytes(method, "utf8"))

                for node, path in traverse_tree_with_path(tree.root_node):
                    temp1, temp2 = get_node_name(node=node, loc_list=method.split("\n"))

                    if len(path) >= 2:
                        if path[-2] == 'function_declarator' and path[-1] == 'identifier':
                            new_methods_dict[constructor].append(temp2)

            new_methods_dict[constructor] = set(new_methods_dict[constructor])
        
        if len(new_methods_dict)> 0:
            return new_methods_dict
        
        else:
            return "null"
        
    except:
        return 'null'
            

In [22]:
df['len'] = df.code.str.split().str.len()
df = df[df.len <= 5000].copy()
df['constructor'] = df.code.progress_apply(lambda x: get_constructor(x)) 
df['methods'] = df.code.progress_apply(lambda x: get_methods(x)) 
df['processed_methods'] = df.methods.progress_apply(lambda x: process_methods(x)) 
df['fname'] = df.path.progress_apply(lambda x: x.split("/")[-1].replace(".h", "").strip())

  0%|          | 0/16391 [00:00<?, ?it/s]

  0%|          | 0/16391 [00:00<?, ?it/s]

  0%|          | 0/16391 [00:00<?, ?it/s]

  0%|          | 0/16391 [00:00<?, ?it/s]

In [26]:
df.head(5)

Unnamed: 0,id,library,path,code,len,constructor,methods,processed_methods,fname
0,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/s...,namespace Xbox360Controller_LEDs {\nenum class...,720,AnimationBase LED_Animation XboxLEDAnimations ...,{'AnimationBase': ['constexpr AnimationBase(ui...,"{'AnimationBase': {'AnimationBase', 'getFrame'...",X360ControllerLEDs
1,260585944,Adafruit_LIS331,../libraries/260585944/Adafruit_LIS331/Adafrui...,0x32\n0x0F\n0x22\ntypedef enum hpf_cutoff {\nL...,191,Adafruit_LIS331,{'Adafruit_LIS331': ['uint8_t getDeviceID(void...,"{'Adafruit_LIS331': {'getDeviceID', 'writeRang...",Adafruit_LIS331
2,260585944,Adafruit_LIS331,../libraries/260585944/Adafruit_LIS331/Adafrui...,"typedef enum {\nLIS331HH_RANGE_6_G = 0x0,\nLIS...",71,Adafruit_LIS331HH,{'Adafruit_LIS331HH': ['bool begin_I2C(uint8_t...,"{'Adafruit_LIS331HH': {'setRange', 'begin_I2C'...",Adafruit_LIS331HH
3,260585944,Adafruit_LIS331,../libraries/260585944/Adafruit_LIS331/Adafrui...,"typedef enum {\nH3LIS331_RANGE_100_G = 0x0,\nH...",71,Adafruit_H3LIS331,{'Adafruit_H3LIS331': ['bool begin_I2C(uint8_t...,"{'Adafruit_H3LIS331': {'setRange', 'begin_I2C'...",Adafruit_H3LIS331
4,271201127,leaphy-extensions-extra,../libraries/271201127/leaphy-extensions-extra...,extern int _snelHeid;\nextern int _delayTime;\...,61,LEDSTRIP,"{'LEDSTRIP': ['void runFunction(uint8_t, uint8...","{'LEDSTRIP': {'breathe', 'basis', 'rainbow', '...",ledstrip


In [25]:
df_core['len'] = df_core.code.str.split().str.len()
df_core['constructor'] = df_core.code.progress_apply(lambda x: get_constructor(x)) 
df_core['methods'] = df_core.code.progress_apply(lambda x: get_methods(x)) 
df_core['processed_methods'] = df_core.methods.progress_apply(lambda x: process_methods(x))
df_core['fname'] = df_core.path.progress_apply(lambda x: x.split("/")[-1].replace(".h", "").strip())

  0%|          | 0/205 [00:00<?, ?it/s]

  0%|          | 0/205 [00:00<?, ?it/s]

  0%|          | 0/205 [00:00<?, ?it/s]

  0%|          | 0/205 [00:00<?, ?it/s]

In [27]:
df_core.head(5)

Unnamed: 0,id,library,path,code,len,constructor,methods,processed_methods,fname
0,bootloaders,stk500v2,../ArduinoCore-avr/bootloaders/stk500v2/avr_cp...,,0,,,,avr_cpunames
1,bootloaders,stk500v2,../ArduinoCore-avr/bootloaders/stk500v2/command.h,,0,,,,command
2,bootloaders,stk500v2,../ArduinoCore-avr/bootloaders/stk500v2/avrint...,"prog_char\tgAvrInt_RESET[]\t\t\tPROGMEM\t=\t""R...",1064,,,,avrinterruptnames
3,bootloaders,caterina,../ArduinoCore-avr/bootloaders/caterina/Descri...,typedef struct\n{\nUSB_Descriptor_Configuratio...,36,,,,Descriptors
4,bootloaders,caterina,../ArduinoCore-avr/bootloaders/caterina/Cateri...,typedef void (*AppPtr_t)(void) ATTR_NO_RETURN;...,27,,,,Caterina


# resolve parent class

In [54]:
def get_parent_class(code):
    try:
        parents_dict = {}
        query = CP_LANGUAGE.query("""
        ((class_specifier) @constructor)
        """)
        tree = parser.parse(bytes(code, "utf8"))
        query_results = query.captures(tree.root_node)
        classname = ''
        
        for item, _ in query_results:
            parents = []
            
            for node, path in traverse_tree_with_path(item):
                temp1, temp2 = get_node_name(node=node, loc_list=code.split("\n"))

                if path == ['class_specifier', 'type_identifier']:
                    classname = temp2

                elif 'base_class_clause' in path and path[-1] == 'type_identifier':
                    parents.append(temp2)

                elif len(parents) > 0 and 'base_class_clause' not in path:
                    parents_dict[classname] = parents.copy()
                    classname = ''
                    parents = []
                    break

        if len(parents_dict) > 0:
            return parents_dict
        else: 
            return 'null'
    
    except Exception as e:
        print(e)
        return 'null'
    
def get_all_core_method_dict_from_repo(df, is_resolved=False):
    df_cp = deepcopy(df)
    df_cp = df_cp[(df_cp.methods!='null')].copy()
    methods_all = {}
    
    if is_resolved:
        
        for id_, repo_name, path, code, methods, constructor, fname in df_cp.values:

            if type(methods) == dict:

                for key, method_list in methods.items():

                    if key not in methods_all:
                        methods_all[key] = []

                    for method in method_list:
                        methods_all[key].append(method)

                    methods_all[key] = set(methods_all[key])
    else:
         for id_, repo_name, path, code, _, constructor, methods, processed_methods, fname, parents in df_cp.values:

            if type(processed_methods) == dict:

                for key, method_list in processed_methods.items():

                    if key not in methods_all:
                        methods_all[key] = []

                    for method in method_list:
                        methods_all[key].append(method)

                    methods_all[key] = set(methods_all[key])
        
    return methods_all

def resolve_inheritance_core(df, is_resolved = False):
    df_cp = df.copy()
    df_list = []
    
    for id_, repo_name, path, code, _, constructor, methods, processed_methods, fname, parents in tqdm(df_cp.values, total=len(df_cp)):
        class_to_methods_all = get_all_core_method_dict_from_repo(df, is_resolved)
        class_to_parents_all = get_all_parent_dict_from_repo(df, repo_name)
        method_dict = deepcopy(processed_methods)
        resolved_method_dict = {}
        
        if type(parents) == dict:
            resolved_method_dict =  resolve_parent_class_iter(parents, class_to_parents_all, class_to_methods_all)

        if len(resolved_method_dict) > 0 :
            
            for classname, method_list in resolved_method_dict.items():
                resolved_method_dict[classname] = set(method_list)
            
            if type(method_dict) == str:
                method_dict = {}
            
            for classname, method_set in resolved_method_dict.items():
                # method_dict.pop(classname, None)
                # method_dict[classname] = resolved_method_dict.get(classname)
                if classname not in method_dict:
                    method_dict[classname] = set()
                    
                method_dict[classname] = method_dict[classname].union(method_set)

                
        temp_dict = {'id': id_,
                    'repo_name': repo_name,
                    'path': path,
                    'code': code,
                    'methods': method_dict,
                    'constructor': constructor,
                    'fname': fname}    
        
        df_list.append(temp_dict)
    
    df_out = pd.DataFrame(df_list)
    return df_out

def get_all_method_dict_from_repo(df, repo_name):
    df_cp = deepcopy(df)
    df_cp = df_cp[(df_cp.library==repo_name) & (df_cp.processed_methods!='null')].copy()
    methods_all = {}

    for id_, repo_name, path, code, _, constructor, methods, processed_methods, fname, parents in df_cp.values:
        
        if type(processed_methods) == dict:
            
            for key, method_set in processed_methods.items():

                if key not in methods_all:
                    methods_all[key] = set()
                
                methods_all[key] = methods_all[key].union(method_set)
                   
    return methods_all

def get_all_parent_dict_from_repo(df, repo_name, is_core=False):
    df_cp = deepcopy(df)
    
    if is_core:
        df_cp = df_cp[(df_cp.processed_methods!='null')].copy()
    
    else:
        df_cp = df_cp[(df_cp.library==repo_name) & (df_cp.processed_methods!='null')].copy()

    parents_all = {}
    
    for id_, repo_name, path, code, _, constructor, methods, processed_methods, fname, parents in df_cp.values:
        
        if type(parents) == dict:
            parents_all.update(parents)
        
    new_parents_all = {}
    
    for key, val in parents_all.items():
        new_parents_all[key] = []
        
        for temp_val in val:
            
            if temp_val != key:
                new_parents_all[key].append(temp_val)
        
    return new_parents_all
            
def resolve_inheritance(df, class_to_methods_all_core):
    df_cp = df.copy()
    df_list = []
    
    for id_, repo_name, path, code, _, constructor, methods, processed_methods, fname, parents in tqdm(df_cp.values, total=len(df_cp)):
        class_to_methods_all = get_all_method_dict_from_repo(df, repo_name)
        # print(class_to_methods_all)
        for classname, method_set in class_to_methods_all_core.items():
            
            if classname not in class_to_methods_all:
                class_to_methods_all[classname] = set()
            
            class_to_methods_all[classname] = class_to_methods_all[classname].union(method_set)
        
        class_to_parents_all = get_all_parent_dict_from_repo(df, repo_name)
        method_dict = deepcopy(processed_methods)
        resolved_method_dict = {}
        
        if type(parents) == dict:
            resolved_method_dict =  resolve_parent_class_iter(parents, class_to_parents_all, class_to_methods_all)

        if len(resolved_method_dict) > 0 :
            
            for classname, method_list in resolved_method_dict.items():
                resolved_method_dict[classname] = set(method_list)
            
            if type(method_dict) == str:
                method_dict = {}
            
            for classname, method_set in resolved_method_dict.items():
                # method_dict.pop(classname, None)
                
                if classname not in method_dict:
                    method_dict[classname] = set()
                    
                method_dict[classname] = method_dict[classname].union(method_set) 

                
        temp_dict = {'id': id_,
                    'repo_name': repo_name,
                    'path': path,
                    'code': code,
                    'methods': method_dict,
                    'methods_exp': methods,
                    'constructor': constructor,
                    'fname': fname}    
        
        df_list.append(temp_dict)
    
    df_out = pd.DataFrame(df_list)
    return df_out 

def resolve_parent_class_iter(parent_dict, all_parent_dict, all_method_dict):
    all_parent_dict_cp = deepcopy(all_parent_dict)

    all_method_dict_cp = deepcopy(all_method_dict)
    
    method_dict = {}
    
    
    for classname, parent_classname_list in parent_dict.items():
        temp_method_list = list(all_method_dict_cp.get(classname, {}))
        
        if classname not in method_dict:
            method_dict[classname] = []
        # print(method_dict)
        for method in temp_method_list:
            
            if  method not in method_dict[classname]: 
                
                method_dict[classname].append(method)
        
        parent_methods = resolve_parent_class(parent_dict, all_parent_dict, all_method_dict)
        # print(method_dict)
        for method in parent_methods:

            if method not in method_dict[classname]:
                method_dict[classname].append(method)
                
    return method_dict

def resolve_parent_class(parent_dict, all_parent_dict, all_method_dict):
    all_parent_dict_cp = deepcopy(all_parent_dict)

    all_method_dict_cp = deepcopy(all_method_dict)

    parent_dict_cp = deepcopy(parent_dict)
    
    output_list = []
    
    for classname, parent_classname_list in parent_dict.items():

        for parent_classname in parent_classname_list:
            parent_methods = all_method_dict_cp.get(parent_classname, {})
            
            for method in parent_methods:
                
                if method != classname:
                    output_list.append(method)

            parent_parent_classname_list = all_parent_dict_cp.get(parent_classname, [])
            temp_parent_dict = {parent_classname: parent_parent_classname_list}
            temp_methods = resolve_parent_class(temp_parent_dict, all_parent_dict, all_method_dict)
            
            for method in temp_methods:
                output_list.append(method)
                
    return output_list 

In [29]:
df['parents'] = df.code.progress_apply(lambda x: get_parent_class(x))
df_cp = deepcopy(df)

df_core['parents'] = df_core.code.progress_apply(lambda x: get_parent_class(x))
df_core_cp = deepcopy(df_core)

  0%|          | 0/16391 [00:00<?, ?it/s]

  0%|          | 0/205 [00:00<?, ?it/s]

In [46]:
df_core_resolved = resolve_inheritance_core(df_core_cp)

  0%|          | 0/205 [00:00<?, ?it/s]

In [64]:
# test_method_dict = get_all_core_method_dict_from_repo(df_core_cp)
# test_parent_dict = get_all_parent_dict_from_repo(df_core_cp, "UDP", True)
# resolve_parent_class_iter({"UDP": ["Stream"]}, test_parent_dict, test_method_dict)

In [63]:
# df_core_cp[(df_core_cp.fname=="Print") & (df_core_cp.processed_methods!='null')].iloc[0].processed_methods

In [62]:
# df_core_cp[(df_core_cp.fname=="Stream") & (df_core_cp.processed_methods!='null')].iloc[0].processed_methods

In [61]:
# df_core_cp[(df_core_cp.fname=="Udp") & (df_core_cp.processed_methods!='null')].iloc[0].processed_methods

In [1079]:
# df_core_cp[(df_core_cp.repo_name=="cores") & (df_core_cp.processed_methods!='null')].copy()

In [1064]:
# df_core_cp[df_core_cp.methods!='null'].iloc[0].processed_methods

In [22]:
# df_core_resolved[df_core_resolved.methods!='null'].iloc[0].methods

In [52]:
class_to_methods_all_core = get_all_core_method_dict_from_repo(df_core_resolved, is_resolved=True)
# class_to_methods_all_core = {key: val for key, val in class_to_methods_all_core.items()}

In [55]:
df_resolved = resolve_inheritance(df_cp, class_to_methods_all_core)

  0%|          | 0/16391 [00:00<?, ?it/s]

In [73]:
df_resolved.head(5)

Unnamed: 0,id,repo_name,path,code,methods,methods_exp,constructor,fname
0,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/s...,namespace Xbox360Controller_LEDs {\nenum class...,"{'AnimationBase': {'AnimationBase', 'getFrame'...",{'AnimationBase': ['constexpr AnimationBase(ui...,AnimationBase LED_Animation XboxLEDAnimations ...,X360ControllerLEDs
1,260585944,Adafruit_LIS331,../libraries/260585944/Adafruit_LIS331/Adafrui...,0x32\n0x0F\n0x22\ntypedef enum hpf_cutoff {\nL...,"{'Adafruit_LIS331': {'getSensor', 'getEvent', ...",{'Adafruit_LIS331': ['uint8_t getDeviceID(void...,Adafruit_LIS331,Adafruit_LIS331
2,260585944,Adafruit_LIS331,../libraries/260585944/Adafruit_LIS331/Adafrui...,"typedef enum {\nLIS331HH_RANGE_6_G = 0x0,\nLIS...","{'Adafruit_LIS331HH': {'HPFReset', 'getRange',...",{'Adafruit_LIS331HH': ['bool begin_I2C(uint8_t...,Adafruit_LIS331HH,Adafruit_LIS331HH
3,260585944,Adafruit_LIS331,../libraries/260585944/Adafruit_LIS331/Adafrui...,"typedef enum {\nH3LIS331_RANGE_100_G = 0x0,\nH...","{'Adafruit_H3LIS331': {'HPFReset', 'getRange',...",{'Adafruit_H3LIS331': ['bool begin_I2C(uint8_t...,Adafruit_H3LIS331,Adafruit_H3LIS331
4,271201127,leaphy-extensions-extra,../libraries/271201127/leaphy-extensions-extra...,extern int _snelHeid;\nextern int _delayTime;\...,"{'LEDSTRIP': {'basis', 'breathe', 'rainbow', '...","{'LEDSTRIP': ['void runFunction(uint8_t, uint8...",LEDSTRIP,ledstrip


# filter df

In [209]:
df_filtered = df_resolved[~(df_resolved.methods=='null') | ~(df_resolved.methods=='null')].copy()

In [210]:
len(df_filtered), df_filtered.repo_name.nunique()

(6596, 2230)

In [117]:
def extract_constructor_perfname(df):
    df_cp = df.copy()
    df_list = []
    for id_, library, path, code, methods, methods_exp, constructors, fname in df_cp.values:
        constructors = constructors.split(" ")
        for constructor in constructors:
            temp_methods = methods.get(constructor, set())
            df_list.append(
                {
                    'id': id_,
                    'library': library,
                    'path': path,
                    # 'code': code,
                    'methods': "###".join(list(temp_methods)),
                    'len_methods': len(temp_methods),
                    'constructor': constructor,
                    'fname': fname
                }
            )
    df_out = pd.DataFrame(df_list)
    df_out = df_out[df_out.len_methods > 0].copy()
    df_out.drop(columns='len_methods', inplace=True)
    return df_out

In [211]:
df_filtered = extract_constructor_perfname(df_filtered)
len(df_filtered)

8743

In [212]:
df_filtered.drop_duplicates(inplace=True)
len(df_filtered)

8442

In [213]:
df_filtered.head(5)

Unnamed: 0,id,library,path,methods,constructor,fname
0,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/s...,AnimationBase###getFrame###getNumFrames,AnimationBase,X360ControllerLEDs
1,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/s...,AnimationBase###rewriteFrame###getFrame###resu...,LED_Animation,X360ControllerLEDs
2,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/s...,getAnimation,XboxLEDAnimations,X360ControllerLEDs
3,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/s...,setPattern###rewriteFrame###getAnimation###pau...,XboxLEDHandler,X360ControllerLEDs
4,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/s...,AnimationBase###rewriteFrame###getFrame###Xbox...,XboxLED_IndividualPins,X360ControllerLEDs


# get valid include

In [122]:
def preprocess_appfile(cpp_file):
    cpp_file = re.sub(r"(/\*.+?(?=\*/)\*/)", "", cpp_file, flags=re.DOTALL)
    cpp_file = re.sub(r"(//.+?\n)", r"\n", cpp_file, flags=re.DOTALL)
    # cpp_file = re.sub(r"(#.*?\n)|(#(endif|else))", r"\n", cpp_file)
    cpp_file = cpp_file.split('\n')
    cpp_file = [x.strip() for x in cpp_file if x.strip() != '']
    cpp_file = '\n'.join(cpp_file)
    return cpp_file

def get_ino_files(path):
    dir_list = os.listdir(path)
    
    codes = []
    for id_ in dir_list:
        temp_path = os.path.join(path, id_)
        
        for root, dirnames, filenames in os.walk(temp_path):
            
            for name in filenames:    
                if name.endswith(".ino"):
                    filepath = os.path.join(root, name)
                    library = root.split("/")[3]
                    with open(filepath, 'r', errors='ignore') as f:
                        code = f.read()
                    code = preprocess_appfile(code)
                    codes.append({
                        'id': id_,
                        'library': library,
                        'path': filepath,
                        'code': code,
                    })
    df = pd.DataFrame(codes)
    return df

def get_h_files(path):
    dir_list = os.listdir(path)   
    codes = []
    for id_ in dir_list:
        temp_path = os.path.join(path, id_)
        
        for root, dirnames, filenames in os.walk(temp_path):
            
            for name in filenames:    
                if name.endswith(".h"):
                    filepath = os.path.join(root, name)
                    library = root.split("/")[3]
                    with open(filepath, 'r', errors='ignore') as f:
                        code = f.read()
                    code = preprocess_appfile(code)
                    codes.append({
                        'id': id_,
                        'library': library,
                        'path': filepath,
                        'code': code,
                    })
    df = pd.DataFrame(codes)
    return df

In [123]:
df_ino = get_ino_files(path)

In [125]:
df_ino.head(5)

Unnamed: 0,id,library,path,code
0,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/e...,#include <X360ControllerLEDs.h>\n#if !defined(...
1,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/e...,#include <X360ControllerLEDs.h>\n#include <XIn...
2,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/e...,#include <X360ControllerLEDs.h>\n#include <Fas...
3,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/e...,#include <X360ControllerLEDs.h>\nconst uint8_t...
4,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/e...,#include <X360ControllerLEDs.h>\nconst uint8_t...


In [129]:
declarator = ['init_declarator', 
              'function_declarator']

val_list = ['argument_list', 
            'parameter_list', 
            'initializer_list']

# 'unsigned', 'signed', 'long', 'short' == sized_type_identifier
type_list = ['primitive_type', 
             'type_identifier', 
             'sized_type_specifier']

sized_type_specifier_list = ['unsigned', 
                             'signed', 
                             'long', 
                             'short']

expression_list = ['conditional_expression',
                   'assignment_expression',
                   'binary_expression',
                   'unary_expression',
                   'cast_expression',
                   'pointer_expression',
                   'sizeof_expression',
                   'subscript_expression',
                   'call_expression',
                   'field_expression',
                   'compound_literal_expression',
                   'string_literal',
                   'number_literal',
                   'char_literal',
                   'true',
                   'false',
                   'null',
                   'concatenated_string',
                   'parenthesized_expression']

for_expression = ['conditional_expression',
                   'assignment_expression',
                   'binary_expression',
                   'unary_expression',
                   'cast_expression',
                   'pointer_expression',
                   'sizeof_expression',
                   'subscript_expression',
                   'call_expression',
                   'field_expression',
                   'parenthesized_expression']

exclusion_list = val_list + expression_list

built_in_api = ["digitalRead", 
                "digitalWrite", 
                'analogRead', 
                'analogWrite', 
                'noTone', 
                'tone', 
                'pulseIn', 
                'pulseInLong', 
                'shiftIn', 
                'shiftOut', 
                'analogWrite', 
                'analogReadResolution', 
                'analogWriteResolution',
                'attachInterrupt', 
                'detachInterrupt']

exclusion_lib = ['SPI', 
                 'Wire']

In [204]:
def get_declarations(code):
    declaration_list = []
    declaration_dict = {}
    includes = []
    constant_dict = {}
    
    code_splitted = code.split("\n")
    
    query = CP_LANGUAGE.query("""
    ((preproc_include) @libname)
    (translation_unit (declaration) @declaration_pattern1)
    (translation_unit (preproc_ifdef (declaration) @declaration_pattern2))
    """)
    
    tree = parser.parse(bytes(code, "utf8"))
    query_results = query.captures(tree.root_node)
    translator = str.maketrans('', '', '<>"\'')            
    
    for node, node_type in query_results:
        if node_type in ('libname'):
            for element, path in traverse_tree_with_path(node):
                temp1, temp2 = get_node_name(node=element, loc_list=code_splitted)
            
                if path[-1] == 'system_lib_string':
                    includes.append(temp2.translate(translator).split(".")[0])
                
                elif path[-1] == 'string_literal':
                    temp2 = resolve_string_literal(node=element, loc_list=code_splitted)
                    includes.append(temp2.translate(translator).split(".")[0])
            
        if node_type in ('declaration_pattern1', 'declaration_pattern2'):
            declaration = get_substring(node=node, loc_list=code_splitted)
            item_dict = {'type': 'init',
                        'statement': declaration, 
                        'obj_type_identifier': [],
                        'obj_name': None,
                        'identifiers': [],}
            is_const = False
            temp_identifier = ''
            
            for element, path in traverse_tree_with_path(node):
                temp1, temp2 = get_node_name(node=element, loc_list=code_splitted)
                intersection = set(path) & set(exclusion_list)
                
                if path[-1] == 'identifier':                
                    if len(intersection) == 0 and not item_dict['obj_name']:
                        item_dict['obj_name'] = temp2
                    else:
                        item_dict['identifiers'].append(temp2)
                
                elif path[-1] == 'init_declarator':
                    item_dict['type'] = 'init_declarator'
                
                elif path[-1] == 'type_identifier':
                    item_dict['obj_type_identifier'].append(temp2)
                
                elif path[-1] == 'const':
                    is_const = True
                    
                if item_dict.get('type', '') == 'init_declarator' and path[-2] == 'init_declarator' and path[-1] != '=':
                    
                    if path[-1] == 'identifier' and temp_identifier == '':
                        temp_identifier = temp2
                        
                    else:
                        temp_str = get_substring(node=element, loc_list=code_splitted) 
                        
                        if is_const == True:
                            constant_dict[temp_identifier] = temp_str
                        else:
                            declaration_dict[temp_identifier] = temp_str
                        
                        is_const = False
                        temp_identifier = ''
            
            declaration_list.append(item_dict)
    
    declaration_list = [subitem for item in declaration_list for subitem in item['obj_type_identifier'] if len(item['obj_type_identifier']) > 0]
    
    return declaration_list, includes

In [205]:
df_ino['temp'] = df_ino.code.progress_apply(lambda x: get_declarations(x))
df_ino['declarations'] = df_ino.temp.progress_apply(lambda x: x[0])
# df_ino['declarations'] = df_ino.code.progress_apply(lambda x: get_declarations(x))
df_ino["n_declarations"] = df_ino.declarations.progress_apply(lambda x: len(x))
df_ino['includes'] = df_ino.temp.progress_apply(lambda x: x[1])

  0%|          | 0/13321 [00:00<?, ?it/s]

  0%|          | 0/13321 [00:00<?, ?it/s]

  0%|          | 0/13321 [00:00<?, ?it/s]

  0%|          | 0/13321 [00:00<?, ?it/s]

In [206]:
df_ino_filtered = df_ino[df_ino.n_declarations > 0].copy()
len(df_ino), len(df_ino_filtered)

(13321, 11059)

In [207]:
df_ino_filtered.head(3)

Unnamed: 0,id,library,path,code,declarations,n_declarations,temp,includes
0,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/e...,#include <X360ControllerLEDs.h>\n#if !defined(...,"[XboxControllerLEDs, LED_Pin]",2,"([XboxControllerLEDs, LED_Pin], [X360Controlle...",[X360ControllerLEDs]
1,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/e...,#include <X360ControllerLEDs.h>\n#include <XIn...,"[boolean, XboxControllerLEDs, LED_Pin_1, LED_P...",8,"([boolean, XboxControllerLEDs, LED_Pin_1, LED_...","[X360ControllerLEDs, XInput]"
2,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/e...,#include <X360ControllerLEDs.h>\n#include <Fas...,[MyCustomLEDClass],1,"([MyCustomLEDClass], [X360ControllerLEDs, Fast...","[X360ControllerLEDs, FastLED]"


In [214]:
df_filtered.head(3)

Unnamed: 0,id,library,path,methods,constructor,fname
0,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/s...,AnimationBase###getFrame###getNumFrames,AnimationBase,X360ControllerLEDs
1,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/s...,AnimationBase###rewriteFrame###getFrame###resu...,LED_Animation,X360ControllerLEDs
2,168110269,Xbox360ControllerLEDs,../libraries/168110269/Xbox360ControllerLEDs/s...,getAnimation,XboxLEDAnimations,X360ControllerLEDs


In [226]:
def check_constructor(x, df_ino):
    df_ino_cp = df_ino.copy()
    id_ = x.id
    
    df_ino_cp = df_ino_cp[df_ino_cp.id == id_]
    if len(df_ino_cp) > 0:
        declarations = df_ino_cp.declarations.tolist()
        declarations = [subitem for item in declarations for subitem in item]
        
        includes = df_ino_cp.includes.tolist()
        includes = [subitem for item in includes for subitem in item]
        
        constructor = x.constructor
        fname = x.fname

        if constructor in declarations :
            return 'constructor'
        
        # elif fname in includes:
        #     return 'fname'
        
        else:
            return 'null'
    
    else:
        return 'null'

In [227]:
df_filtered['is_pass'] = df_filtered.progress_apply(lambda x: check_constructor(x, df_ino_filtered), axis=1)

  0%|          | 0/8442 [00:00<?, ?it/s]

In [228]:
df_filtered.is_pass.value_counts()

null           4980
constructor    3462
Name: is_pass, dtype: int64

In [231]:
df_filtered = df_filtered[df_filtered.is_pass!='null'].copy()
df_filtered.head(3)

Unnamed: 0,id,library,path,methods,constructor,fname,is_pass
7,260585944,Adafruit_LIS331,../libraries/260585944/Adafruit_LIS331/Adafrui...,HPFReset###getRange###getSensor###getEvent###g...,Adafruit_LIS331HH,Adafruit_LIS331HH,constructor
8,260585944,Adafruit_LIS331,../libraries/260585944/Adafruit_LIS331/Adafrui...,HPFReset###getRange###getSensor###getEvent###g...,Adafruit_H3LIS331,Adafruit_H3LIS331,constructor
15,329785239,BigFont01,../libraries/329785239/BigFont01/BigFont01.h,clear###writechar###begin###writeint,BigFont01,BigFont01,constructor


In [233]:
df_filtered.drop(columns=['is_pass', 'fname', 'path'], inplace=True)

In [234]:
df_filtered.to_csv("lib_to_constructor.csv", index=False)

In [239]:
df_filtered

Unnamed: 0,id,library,methods,constructor
7,260585944,Adafruit_LIS331,HPFReset###getRange###getSensor###getEvent###g...,Adafruit_LIS331HH
8,260585944,Adafruit_LIS331,HPFReset###getRange###getSensor###getEvent###g...,Adafruit_H3LIS331
15,329785239,BigFont01,clear###writechar###begin###writeint,BigFont01
16,44492416,BlueRobotics_MS5837_Library,pressure###depth###init###setFluidDensity###al...,MS5837
17,4726670,Adafruit_MPL115A2,getTemperature###getPT###begin###getPressure,Adafruit_MPL115A2
...,...,...,...,...
9304,132886310,arduino-lib-oled,GetParent###SetName###Select###Back###Down###I...,DMenu
9305,132886310,arduino-lib-oled,clear###print###useOffset###Print###display###...,OLED
9306,316802078,Adafruit_LTR390,setGain###readUVS###setThresholds###getGain###...,Adafruit_LTR390
9310,365739877,EmbedUI-Library,clear###setFontWeight###setTextNormal###setBac...,EmbedUI
