In [None]:
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()
import time
import os
import numpy as np
from ast import literal_eval

# create df

In [None]:
# Library categories; one metadata CSV exists per category.
categories = ("display", "sensors", "signal-input-output", "device-control")

# read_csv is given explicit column names, so the first row of every file is
# read as data: five named fields followed by eleven positional ones ("0".."10").
csv_columns = ["id", "url1", "url2", "fullname", "repo_desc"] + [str(i) for i in range(11)]
kept_columns = ["id", "url1", "url2", "fullname", "repo_desc", "cat"]

per_category_frames = []
for category in categories:
    filepath_metadata = f"../lib_metadata/lib_url_{category}.csv"
    # Tag every row with the category its file belongs to.
    temp_df = pd.read_csv(filepath_metadata, names=csv_columns).assign(cat=category)
    per_category_frames.append(temp_df)

# Stack all categories and keep only the named fields plus the category tag.
df_metadata = pd.concat(per_category_frames)[kept_columns].copy()

In [None]:
df_metadata

In [None]:
df_metadata.drop_duplicates(subset="id", inplace=True)
df_metadata.id.nunique()

In [None]:
# df_metadata = pd.merge(left=df_metadata, right=df_crawler, on=["url2"], how="inner")

In [None]:
# df_metadata["repo_name"] = df_metadata.fullname.progress_apply(lambda x: x.split("/")[-1])
# df_metadata

In [None]:
# Load the protocol labels, keep one row per clone URL, rename the key column
# to "url1" (the name used in df_metadata) and drop the columns not needed
# for the merge.
df_labels = (
    pd.read_csv("protocol_labels.csv")
    .drop_duplicates(subset=["git_url"])
    .rename(columns={'git_url': 'url1'})
    .drop(columns=["cat", "repo_name", "url", "path_repo"])
)
df_labels

In [None]:
df_labels

In [None]:
df_merge = pd.merge(left=df_metadata, right=df_labels, on=["url1"], how="left")

In [None]:
df_merge

# download repo

In [None]:
os.getcwd()

In [None]:
def download_repo(df, target_path):
    """Clone every repository listed in ``df`` into ``<target_path>/<id>/``.

    Args:
        df: DataFrame with an ``id`` column (used as the per-repository
            directory name) and a ``url1`` column (the URL handed to
            ``git clone``).
        target_path: Directory under which one sub-directory per id is
            created. A relative path is resolved against the current working
            directory once, when the function is called.

    Returns:
        A list with exactly one entry per row of ``df``, in row order, so it
        can be assigned back as a column:

        * ``"success"``       - ``git clone`` exited with status 0,
        * ``"duplicate"``     - the same URL was already cloned for an earlier row,
        * an exception object - the clone (or directory creation) failed,
        * ``"not_attempted"`` - the run was interrupted before this row.

    Notes:
        The working directory of the process is never changed; git is run with
        ``cwd`` set to the destination instead of chaining ``os.chdir`` calls
        through a hard-coded absolute path.
    """
    # Function-scope import keeps this cell self-contained.
    import subprocess

    target_root = os.path.abspath(target_path)
    total_rows = len(df)
    cloned_urls = set()  # URLs that were cloned successfully (kept apart from statuses)
    downloaded = []

    for id_, url1 in tqdm(zip(df["id"], df["url1"]), total=total_rows):
        try:
            if url1 in cloned_urls:
                downloaded.append("duplicate")
                continue
            repo_parent = os.path.join(target_root, str(id_))
            os.makedirs(repo_parent, exist_ok=True)
            # Argument list, no shell: the URL cannot be interpreted as shell
            # syntax. check=True turns a failed clone into an exception so it
            # is not recorded as a success.
            subprocess.run(["git", "clone", str(url1)], cwd=repo_parent, check=True)
            cloned_urls.add(url1)
            downloaded.append("success")
            # Short pause between clones (presumably to go easy on the remote host).
            time.sleep(1)
        except KeyboardInterrupt:
            break
        except Exception as e:
            downloaded.append(e)

    # Keep the result aligned with df even when the loop was interrupted.
    downloaded.extend(["not_attempted"] * (total_rows - len(downloaded)))
    return downloaded

In [None]:
os.chdir("/data/fix_arduino_project/dataset_raw")
downloaded = download_repo(df_merge, "libraries")

In [None]:
df_merge['is_downloaded'] = downloaded

# check protocol label

In [None]:
df_merge.is_downloaded.value_counts()

In [None]:
# A row has a label only when all four protocol flags are present (non-null).
# Vectorised null check instead of a Python-level lambda per row.
label_columns = ["is_uart", "is_spi", "is_i2c", "is_none"]
df_merge['has_label'] = df_merge[label_columns].notna().all(axis=1)
df_merge.has_label.value_counts()

In [None]:
df_merge[["is_uart", "is_spi", "is_i2c", "is_none"]] = df_merge[["is_uart", "is_spi", "is_i2c", "is_none"]].fillna(value=0)

In [None]:
for col in ('is_uart', "is_spi", "is_i2c", "is_none"):
    df_merge[col] = df_merge[col].astype(int)

In [None]:
len(df_merge.columns)

In [None]:
rename_dict = {
    "url1": "url_clone",
    "url2": "url",
    "repo_desc": "desc"
}
df_merge.rename(columns=rename_dict, inplace=True)
df_merge = df_merge[["id", "fullname", "desc", "url", "url_clone", "is_uart", "is_spi", "is_i2c", "is_none", "has_label", "is_downloaded", "cat"]]
df_merge

In [None]:
# df_merge_cp = df_merge.copy()

# get folder name

In [None]:
basepath = "../libraries"

In [None]:
def get_folder_name(basepath):
    """Map every id directory under ``basepath`` to the single entry it contains.

    Args:
        basepath: Directory whose children are named after integer ids and
            each hold exactly one entry.

    Returns:
        ``{"id": [...], "dirname": [...]}`` with parallel lists: the directory
        name converted to ``int`` and the name of its only entry.

    Raises:
        AssertionError: if an id directory does not contain exactly one entry.
        ValueError: if a directory name is not an integer.
    """
    ids = []
    dirnames = []
    for id_dir in os.listdir(basepath):
        entries = os.listdir(os.path.join(basepath, id_dir))
        assert len(entries) == 1
        ids.append(int(id_dir))
        dirnames.append(entries[0])
    return {"id": ids, "dirname": dirnames}

In [None]:
id_to_dirname = get_folder_name(basepath)

In [None]:
temp_df = pd.DataFrame.from_dict(id_to_dirname)

In [None]:
temp_df

In [None]:
df_merge = pd.merge(left=df_merge, right=temp_df, on="id", how="inner")

In [None]:
df_merge

In [None]:
df_merge.id.nunique()

# get valid include

In [None]:
import re

def preprocess_appfile(cpp_file):
    """Strip comments and blank lines from C/C++/Arduino source text.

    Args:
        cpp_file: Full source text as one string.

    Returns:
        The text with ``/* ... */`` and ``// ...`` comments removed, every
        line stripped of surrounding whitespace, and empty lines dropped.

    Notes:
        The removal is purely textual: a ``//`` or ``/*`` inside a string
        literal (for example a URL) is treated as a comment as well.
    """
    # `.*?` instead of `.+?`: an empty block comment `/**/` must end at its own
    # terminator rather than running on to the next `*/` in the file.
    cpp_file = re.sub(r"/\*.*?\*/", "", cpp_file, flags=re.DOTALL)
    # A line comment ends at the end of its own line. Matching `[^\n]*` keeps an
    # empty `//` from swallowing the next line and also removes a trailing
    # comment that has no final newline.
    cpp_file = re.sub(r"//[^\n]*", "", cpp_file)
    stripped_lines = (line.strip() for line in cpp_file.split('\n'))
    return '\n'.join(line for line in stripped_lines if line != '')

def get_ino_files(path):
    """Collect every Arduino sketch (``.ino`` / ``.pde``) below ``path``.

    Args:
        path: Directory whose children are named after integer ids. Each
            child is walked recursively.

    Returns:
        DataFrame with one row per sketch file and the columns ``id`` (the
        child directory name as ``int``), ``path`` (path of the sketch) and
        ``code`` (file content after ``preprocess_appfile``). The columns are
        present even when no sketch is found, so ``df.id`` stays valid.

    Raises:
        ValueError: if a directory containing a sketch is not named after an
            integer.
    """
    sketch_suffixes = (".ino", ".pde")

    records = []
    for dirname in os.listdir(path):
        repo_root = os.path.join(path, dirname)
        for root, _dirnames, filenames in os.walk(repo_root):
            for name in filenames:
                if not name.endswith(sketch_suffixes):
                    continue
                filepath = os.path.join(root, name)
                # errors='ignore': undecodable bytes are dropped instead of raising.
                with open(filepath, 'r', errors='ignore') as f:
                    code = f.read()
                records.append({
                    'id': int(dirname),
                    'path': filepath,
                    'code': preprocess_appfile(code),
                })
    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(records, columns=["id", "path", "code"])

In [None]:
df_ino = get_ino_files("libraries")

In [None]:
df_ino

In [None]:
df_ino.id.nunique()

In [None]:
df_merge[df_merge.id.isin(df_ino.id)].id.nunique()

In [None]:
from tree_sitter import Language, Parser
# Location of the tree-sitter C++ grammar sources used to build the parser library.
tree_sitter_path = "../assets/tree-sitter-cpp/"

# Compile the grammar into a shared library that `Language` can load below.
Language.build_library(
  # Store the library in the `build` directory
  "../assets/build/cpp.so", 
   [tree_sitter_path])

# Load the compiled C++ grammar and create one parser that is shared by the
# functions defined later in this notebook (they read `parser` / `CP_LANGUAGE`
# as globals).
CP_LANGUAGE = Language('../assets/build/cpp.so', 'cpp')
parser = Parser()
parser.set_language(CP_LANGUAGE)

# Translation table that deletes the characters < > " ' from a string.
# NOTE(review): get_include builds its own identical local table, so this
# module-level one looks unused in the visible code.
translator = str.maketrans('', '', '<>"\'')

### Helper to traverse treesitter output
def is_terminal(node):
    """Return True when ``node`` has no children (it is a leaf), False otherwise."""
    return len(node.children) == 0
    
def get_start_tuple(node):
    """Return ``node.start_point``; callers in this file unpack it as (line index, offset)."""
    start_point = node.start_point
    return start_point

def get_end_tuple(node):
    """Return ``node.end_point``; callers in this file unpack it as (line index, offset)."""
    end_point = node.end_point
    return end_point

def get_substring_of_loc(start_tuple, end_tuple, loc_list):
    """Return the text between two (line index, offset) positions.

    Args:
        start_tuple: ``(line_index, offset)`` of the first character to keep.
        end_tuple: ``(line_index, offset)`` one past the last character to keep.
        loc_list: The source split into lines.

    Returns:
        For a span inside a single line, the slice of that line. For a span
        over several lines, the tail of the first line, every line in between
        and the head of the last line, joined with newlines; pieces that are
        empty strings are left out.
    """
    first_row, first_col = start_tuple
    last_row, last_col = end_tuple

    # Single-line span: a plain slice, returned even when it is empty.
    if first_row == last_row:
        return loc_list[first_row][first_col:last_col]

    pieces = []
    for row in range(first_row, last_row + 1):
        line = loc_list[row]
        if row == first_row:
            line = line[first_col:]
        elif row == last_row:
            line = line[:last_col]
        if line != '':
            pieces.append(line)
    return '\n'.join(pieces)

def fix_punctuation(str_input):
    """Replace every double quote with a single quote and delete every semicolon."""
    requoted = str_input.replace('"', "'")
    return requoted.replace(';', '')

def get_node_name(node, loc_list):
    """Return ``(node.type, text)`` for a node.

    For a leaf node ``text`` is the source text it spans, stripped and passed
    through ``fix_punctuation``. For a node with children ``text`` is ``''``.

    Args:
        node: Node exposing ``type``, ``children``, ``start_point`` and ``end_point``.
        loc_list: The source split into lines.
    """
    if not is_terminal(node):
        return (node.type, '')

    text = get_substring_of_loc(
        start_tuple=get_start_tuple(node),
        end_tuple=get_end_tuple(node),
        loc_list=loc_list,
    ).strip()
    return node.type, fix_punctuation(text)

def resolve_string_literal(node, loc_list):
    """Return the source text spanned by ``node``, stripped and passed through ``fix_punctuation``.

    Unlike ``get_node_name`` this does not require ``node`` to be a leaf.
    """
    raw_text = get_substring_of_loc(
        start_tuple=get_start_tuple(node),
        end_tuple=get_end_tuple(node),
        loc_list=loc_list,
    )
    return fix_punctuation(raw_text.strip())

def get_substring(node, loc_list):
    """Return the source text spanned by ``node`` with surrounding whitespace stripped."""
    span_start = get_start_tuple(node)
    span_end = get_end_tuple(node)
    text = get_substring_of_loc(start_tuple=span_start, end_tuple=span_end, loc_list=loc_list)
    return text.strip()
###

### Traverse the AST
def traverse_tree_with_path(tree):
    """Walk ``tree`` depth-first (parents before children), yielding each node with its type path.

    Args:
        tree: Object exposing ``walk()`` that returns a cursor with ``node``,
            ``goto_first_child()``, ``goto_next_sibling()`` and
            ``goto_parent()``. Callers in this file pass tree-sitter nodes.

    Yields:
        ``(node, current_path)`` where ``current_path`` lists the node type
        names from the starting node down to ``node``. The same list object is
        yielded every time and is mutated between yields, so copy it if it has
        to outlive the current iteration.
    """
    cursor = tree.walk()
    current_path = []
    # The path starts with the type of the node the walk begins at.
    current_path.append(cursor.node.type)
    reached_root = False
    while reached_root == False:
        yield cursor.node, current_path

        # Descend first: the child's type is pushed onto the path.
        if cursor.goto_first_child():
            current_path.append(cursor.node.type)
            continue
        
        # No child: move sideways, replacing the last path entry with the sibling's type.
        if cursor.goto_next_sibling():
            if current_path:
                current_path.pop(-1)
            current_path.append(cursor.node.type)
            continue

        # Neither child nor sibling: climb until an ancestor has a next sibling,
        # or until the cursor cannot climb any further.
        retracing = True
        while retracing:
            
            if not cursor.goto_parent():
                # Cannot go higher: the walk is finished after this pass.
                current_path.pop(-1)
                retracing = False
                reached_root = True
                
            # Drop the entry of the level that was just left (guarded because
            # the path may already be empty at this point).
            if current_path:
                current_path.pop(-1)
                # print("a")
                
            if cursor.goto_next_sibling():
                # Continue the walk from the ancestor's sibling: swap the last
                # path entry for that sibling's type.
                retracing = False
                if current_path:
                    current_path.pop(-1)
                    
                current_path.append(cursor.node.type)
###

def get_include(code, parser):
    """Extract the names of the files included by ``#include`` directives in ``code``.

    Args:
        code: Source text to analyse.
        parser: Parser used to parse ``code``; the query itself is built from
            the module-level ``CP_LANGUAGE``.

    Returns:
        List of include names in the order they are found, with the
        characters ``< > " '`` removed and everything from the first ``.``
        onwards cut off (``<Wire.h>`` becomes ``Wire``). Duplicates are kept.
    """
    query = CP_LANGUAGE.query("""
    ((preproc_include) @libname)
    """)
    tree = parser.parse(bytes(code, "utf8"))
    # Split once: the line list is the same for every node of every capture.
    loc_list = code.split("\n")
    translator = str.maketrans('', '', '<>"\'')

    output_list = []
    for include_node, capture_name in query.captures(tree.root_node):
        if capture_name != 'libname':
            continue

        for element, path in traverse_tree_with_path(include_node):
            node_kind = path[-1]
            # Only these two node kinds carry the included name, so text is
            # extracted for them alone.
            if node_kind == 'system_lib_string':
                _, raw_name = get_node_name(node=element, loc_list=loc_list)
            elif node_kind == 'string_literal':
                raw_name = resolve_string_literal(node=element, loc_list=loc_list)
            else:
                continue
            output_list.append(raw_name.translate(translator).split(".")[0])

    return output_list

In [None]:
df_ino['valid_include'] = df_ino.code.progress_apply(lambda x: get_include(x, parser))

In [None]:
df_ino["len_include"] = df_ino.valid_include.progress_apply(lambda x: len(x))

In [None]:
df_ino = df_ino[df_ino.len_include>0].copy()

In [None]:
df_ino

In [None]:
# Includes that are not recorded in the per-library include lists.
excluded_includes = ("Wire", "SPI", "Arduino", "SoftwareSerial")

# id -> ordered list of distinct include names found in that library's sketches.
valid_include_dict = {}
for id_, path, code, valid_include, len_include in df_ino.values:
    known_includes = valid_include_dict.setdefault(id_, [])
    for include in valid_include:
        if include in known_includes or include in excluded_includes:
            continue
        known_includes.append(include)

In [None]:
valid_include_dict

# get_filepath

In [None]:
basepath = "../libraries"

In [None]:

def get_valid_header_filepath(basepath):
    """List the ``.h`` / ``.cpp`` / ``.hpp`` files of every library under ``basepath``.

    Expected layout: ``basepath/<id>/<repo>/``, where each ``<id>`` directory
    holds exactly one repository directory. Files are looked up directly
    inside the repository directory. When that finds no ``.h`` files or no
    ``.cpp`` files and the repository has a ``src`` directory, all three
    lists are taken from ``src`` instead.

    Args:
        basepath: Directory containing the integer-named id directories.

    Returns:
        Dict of parallel lists (one entry per id directory) with the keys
        ``id``, ``headers``, ``cpps``, ``hpps``, ``len_headers``, ``len_cpps``,
        ``len_hpps`` and ``total_len``, in that order.

    Raises:
        AssertionError: if an id directory does not contain exactly one entry.
        ValueError: if an id directory name is not an integer.
    """

    def _files_with_suffix(dirpath, entries, suffix):
        """Return full paths of the regular files among ``entries`` that end with ``suffix``."""
        return [
            os.path.join(dirpath, entry)
            for entry in entries
            if entry.endswith(suffix) and os.path.isfile(os.path.join(dirpath, entry))
        ]

    def _collect_sources(dirpath):
        """Return the (headers, cpps, hpps) path lists for one directory."""
        entries = os.listdir(dirpath)
        return (
            _files_with_suffix(dirpath, entries, ".h"),
            _files_with_suffix(dirpath, entries, ".cpp"),
            _files_with_suffix(dirpath, entries, ".hpp"),
        )

    # Key order matters: it becomes the column order of the DataFrame that is
    # built from this dict.
    valid_headers_dict = {
        "id": [],
        "headers": [],
        "cpps": [],
        "hpps": [],
        "len_headers": [],
        "len_cpps": [],
        "len_hpps": [],
        "total_len": [],
    }

    for directory in os.listdir(basepath):
        temp_path = os.path.join(basepath, directory)
        repo_entries = os.listdir(temp_path)
        assert len(repo_entries) == 1

        hw_dirname = os.path.join(temp_path, repo_entries[0])
        headers, cpps, hpps = _collect_sources(hw_dirname)

        # Fall back to <repo>/src when the top level lacks headers or sources.
        # isdir guards against a plain file that happens to be named "src".
        src_dir = os.path.join(hw_dirname, "src")
        if (len(headers) == 0 or len(cpps) == 0) and os.path.isdir(src_dir):
            headers, cpps, hpps = _collect_sources(src_dir)

        valid_headers_dict["id"].append(int(directory))
        valid_headers_dict["headers"].append(headers)
        valid_headers_dict["len_headers"].append(len(headers))
        valid_headers_dict["cpps"].append(cpps)
        valid_headers_dict["len_cpps"].append(len(cpps))
        # Store the .hpp list here (not the .cpp list) so it matches len_hpps.
        valid_headers_dict["hpps"].append(hpps)
        valid_headers_dict["len_hpps"].append(len(hpps))
        valid_headers_dict["total_len"].append(len(cpps) + len(headers) + len(hpps))

    return valid_headers_dict

In [None]:
df_merge[df_merge.id==409346135]

In [None]:
temp_dict = get_valid_header_filepath(basepath)

In [None]:
temp_df = pd.DataFrame(temp_dict)

In [None]:
temp_df[temp_df.total_len>0]

In [None]:
temp_df.len_cpps.value_counts()[temp_df.len_cpps.value_counts().index<5]

In [None]:
# df_merge.to_csv("ckpt_files/df_merge_1.csv", index=False)

# get features

In [None]:
def extract_features(cpp_file):
    """Return the source text of selected top-level function definitions.

    A function definition that is a direct child of the translation unit is
    kept when its subtree contains an ``identifier`` node whose parent is a
    ``qualified_identifier`` (as in ``Class::method``). The match may occur
    anywhere inside the definition, not only in its declarator.

    Args:
        cpp_file: Source text, parsed with the module-level ``parser``.

    Returns:
        The kept definitions joined with ``'[SEP]'``, or the string ``'null'``
        when none is kept.
    """
    cpp_file_splitted = cpp_file.split("\n")

    tree = parser.parse(bytes(cpp_file, "utf8"))
    query = CP_LANGUAGE.query("""
    (translation_unit (function_definition) @function_def)
    """)

    features = []
    for function_node, _ in query.captures(tree.root_node):
        # any() stops at the first match, like the break in a manual loop. Only
        # the type path is inspected, so no node text has to be extracted here.
        has_qualified_identifier = any(
            len(path) >= 2
            and path[-2] == 'qualified_identifier'
            and path[-1] == 'identifier'
            for _node, path in traverse_tree_with_path(function_node)
        )
        if has_qualified_identifier:
            features.append(get_substring(node=function_node, loc_list=cpp_file_splitted))

    if len(features) > 0:
        return '[SEP]'.join(features)
    return 'null'

In [None]:
def preprocess_appfile(cpp_file):
    """Strip comments and blank lines from C/C++/Arduino source text.

    Args:
        cpp_file: Full source text as one string.

    Returns:
        The text with ``/* ... */`` and ``// ...`` comments removed, every
        line stripped of surrounding whitespace, and empty lines dropped.

    Notes:
        The removal is purely textual: a ``//`` or ``/*`` inside a string
        literal (for example a URL) is treated as a comment as well.
    """
    # `.*?` instead of `.+?`: an empty block comment `/**/` must end at its own
    # terminator rather than running on to the next `*/` in the file.
    cpp_file = re.sub(r"/\*.*?\*/", "", cpp_file, flags=re.DOTALL)
    # A line comment ends at the end of its own line. Matching `[^\n]*` keeps an
    # empty `//` from swallowing the next line and also removes a trailing
    # comment that has no final newline.
    cpp_file = re.sub(r"//[^\n]*", "", cpp_file)
    stripped_lines = (line.strip() for line in cpp_file.split('\n'))
    return '\n'.join(line for line in stripped_lines if line != '')

# def get_features(df):
#     df_cp = df.copy()
#     features = []
#     len_features = []
#     for id_, headers, cpps, hpps, len_headers, len_cpps, len_hpps, total_len in df_cp.values:
#         temp_list = []
#         if len_cpps > 0:
#             for path in cpps:
#                 with open(path, "r", errors='ignore') as f:
#                     temp_file = preprocess_appfile(f.read())
                    
#                     temp_list.append(temp_file)
#         elif len_headers > 0:
#             for path in headers:
#                 with open(path, "r", errors='ignore') as f:
#                     temp_list.append(preprocess_appfile(f.read()))
                    
#         elif len_hpps > 0:
#             for path in hpps:
#                 with open(path, "r", errors='ignore') as f:
#                     temp_list.append(preprocess_appfile(f.read()))
        
#         features.append(temp_list)
#         len_features.append(len(temp_list))
#     df_cp["features"] = features
#     df_cp["len_features"] = len_features
#     return df_cp

def get_features(df):
    """Add ``features`` / ``len_features`` columns computed from each row's source files.

    For every row exactly one group of files is read: the ``cpps`` list if it
    is non-empty, otherwise ``headers``, otherwise ``hpps``. Each file is
    cleaned with ``preprocess_appfile`` and passed to ``extract_features``;
    results equal to ``'null'`` are dropped.

    Args:
        df: DataFrame with the columns ``cpps``, ``headers`` and ``hpps``,
            each holding a list of file paths per row.

    Returns:
        A copy of ``df`` with ``features`` (list of feature strings, one per
        file that produced any) and ``len_features`` (length of that list).
        ``df`` itself is not modified.
    """

    def _features_from_files(paths):
        """Return the non-'null' feature strings extracted from ``paths``."""
        collected = []
        for path in paths:
            with open(path, "r", errors='ignore') as f:
                temp_file = preprocess_appfile(f.read())
            # Separate name for the per-file result: it must never rebind the
            # per-row `features` list that is appended to below.
            file_features = extract_features(temp_file)
            if file_features != 'null':
                collected.append(file_features)
        return collected

    df_cp = df.copy()
    features = []
    len_features = []
    for cpps, headers, hpps in zip(df_cp["cpps"], df_cp["headers"], df_cp["hpps"]):
        # First non-empty group wins, in the priority order cpps > headers > hpps.
        sources = next(
            (group for group in (cpps, headers, hpps)
             if isinstance(group, (list, tuple)) and len(group) > 0),
            [],
        )
        temp_list = _features_from_files(sources)
        features.append(temp_list)
        len_features.append(len(temp_list))

    df_cp["features"] = features
    df_cp["len_features"] = len_features
    return df_cp

In [None]:
temp_df = get_features(temp_df)

In [None]:
temp_df = temp_df[temp_df.len_features > 0].copy()

In [None]:
temp_df = temp_df[["id", "headers", "cpps", "hpps", "features", "len_features"]].copy()

In [None]:
df_merge = pd.merge(left=df_merge, right=temp_df, on="id", how="inner")

In [None]:
# df_merge.to_csv("ckpt_files/df_merge_2.csv", index=False)

In [None]:
# Same categories as for the metadata files; one crawler CSV per category.
categories = ("display", "sensors", "signal-input-output", "device-control")

crawler_frames = []
for category in categories:
    filepath_crawler = f"lib_crawler/lib_url_{category}.csv"
    # Tag every row with the category its file belongs to.
    temp_df = pd.read_csv(filepath_crawler).assign(cat=category)
    crawler_frames.append(temp_df)

df_crawler = pd.concat(crawler_frames)

In [None]:
df_crawler.drop_duplicates(subset="url", inplace=True)

In [None]:
df_merge.rename(columns={'desc':'desc_repo'}, inplace=True)
df_crawler.drop(columns=["cat", "url", "sensor"], inplace=True)
df_crawler.rename(columns={'description':'desc_ardulib', 'git_link':'url'}, inplace=True)

In [None]:
df_merge = pd.merge(left=df_merge, right=df_crawler, on="url", how="left")

In [None]:
len(df_merge.columns)

In [None]:
col = ["id", "fullname", "desc_repo", "desc_ardulib", "url", "url_clone", "is_uart", "is_spi", "is_i2c", "is_none", "has_label", "is_downloaded", "cat", "dirname", "headers", "cpps", "hpps", "features", "len_features"]
len(col)

In [None]:
df_merge = df_merge[col].copy()

In [None]:
df_merge

In [None]:
df_merge.info()

# get readme

In [None]:
def get_valid_header_filepath(basepath):
    """Read the top-level ``README.md`` of every library under ``basepath``.

    NOTE(review): this reuses the name of the earlier source-file lister and
    replaces it from this cell onwards; despite the name it collects README
    text, not header paths.

    Expected layout: ``basepath/<id>/<repo>/``, where each ``<id>`` directory
    holds exactly one repository directory. Only a regular file named exactly
    ``README.md`` directly inside the repository directory is read.

    Args:
        basepath: Directory containing the integer-named id directories.

    Returns:
        ``{"id": [...], "readme": [...], "len_readme": [...]}`` with parallel
        lists: the id as ``int``, a list with the README text (empty when no
        README was found) and the length of that list.

    Raises:
        AssertionError: if an id directory does not contain exactly one entry.
    """
    ids, readmes, readme_counts = [], [], []

    for directory in os.listdir(basepath):
        id_dir = os.path.join(basepath, directory)
        repo_entries = os.listdir(id_dir)
        assert len(repo_entries) == 1

        hw_dirname = os.path.join(id_dir, repo_entries[0])

        texts = []
        for entry in os.listdir(hw_dirname):
            candidate = os.path.join(hw_dirname, entry)
            if os.path.isfile(candidate) and entry == "README.md":
                # errors="ignore": undecodable bytes are dropped instead of raising.
                with open(candidate, "r", errors="ignore") as f:
                    texts.append(f.read())

        ids.append(int(directory))
        readmes.append(texts)
        readme_counts.append(len(texts))

    return {"id": ids, "readme": readmes, "len_readme": readme_counts}
        

In [None]:
temp_dict = get_valid_header_filepath(basepath)

In [None]:
temp_df = pd.DataFrame(temp_dict)
temp_df

In [None]:
temp_df.drop_duplicates(subset="id")

In [None]:
temp_df.len_readme.value_counts()

In [None]:
df_merge = pd.merge(left=df_merge, right=temp_df, on="id", how="left")

In [None]:
df_merge

In [None]:
# df_merge.to_csv("ckpt_files/df_merge_3.csv", index=False)

In [None]:
df_merge

# revise feature 8 october

In [None]:
import re

In [None]:
df = pd.read_csv("ckpt_files/df_merge_3.csv")

In [None]:
df.columns

In [None]:
def convert_string(x):
    """Parse the string form of a Python literal (as stored in a CSV cell).

    Args:
        x: Cell value, normally a string such as ``"['a.cpp', 'b.cpp']"``.

    Returns:
        The parsed literal, or the string ``'null'`` when ``x`` cannot be
        parsed (malformed text, or a non-string value such as NaN).
    """
    try:
        return literal_eval(x)
    # Only the failures literal_eval is documented to raise are caught. A bare
    # except would also swallow KeyboardInterrupt and SystemExit.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return 'null'

In [None]:
for colname in ("cpps", "headers", "hpps"):
    df[colname] = df[colname].progress_apply(lambda x: convert_string(x))

In [None]:
def extract_features(cpp_file):
    """Return the source text of selected top-level function definitions.

    A function definition that is a direct child of the translation unit is
    kept when its subtree contains an ``identifier`` node whose parent is a
    ``qualified_identifier`` (as in ``Class::method``). The match may occur
    anywhere inside the definition, not only in its declarator.

    Args:
        cpp_file: Source text, parsed with the module-level ``parser``.

    Returns:
        The kept definitions joined with ``'[SEP]'``, or the string ``'null'``
        when none is kept.
    """
    cpp_file_splitted = cpp_file.split("\n")

    tree = parser.parse(bytes(cpp_file, "utf8"))
    query = CP_LANGUAGE.query("""
    (translation_unit (function_definition) @function_def)
    """)

    features = []
    for function_node, _ in query.captures(tree.root_node):
        # any() stops at the first match, like the break in a manual loop. Only
        # the type path is inspected, so no node text has to be extracted here.
        has_qualified_identifier = any(
            len(path) >= 2
            and path[-2] == 'qualified_identifier'
            and path[-1] == 'identifier'
            for _node, path in traverse_tree_with_path(function_node)
        )
        if has_qualified_identifier:
            features.append(get_substring(node=function_node, loc_list=cpp_file_splitted))

    if len(features) > 0:
        return '[SEP]'.join(features)
    return 'null'

In [None]:
def preprocess_appfile(cpp_file):
    """Strip comments and blank lines from C/C++/Arduino source text.

    Args:
        cpp_file: Full source text as one string.

    Returns:
        The text with ``/* ... */`` and ``// ...`` comments removed, every
        line stripped of surrounding whitespace, and empty lines dropped.

    Notes:
        The removal is purely textual: a ``//`` or ``/*`` inside a string
        literal (for example a URL) is treated as a comment as well.
    """
    # `.*?` instead of `.+?`: an empty block comment `/**/` must end at its own
    # terminator rather than running on to the next `*/` in the file.
    cpp_file = re.sub(r"/\*.*?\*/", "", cpp_file, flags=re.DOTALL)
    # A line comment ends at the end of its own line. Matching `[^\n]*` keeps an
    # empty `//` from swallowing the next line and also removes a trailing
    # comment that has no final newline.
    cpp_file = re.sub(r"//[^\n]*", "", cpp_file)
    stripped_lines = (line.strip() for line in cpp_file.split('\n'))
    return '\n'.join(line for line in stripped_lines if line != '')

# def get_features(df):
#     df_cp = df.copy()
#     features = []
#     len_features = []
#     for id_, headers, cpps, hpps, len_headers, len_cpps, len_hpps, total_len in df_cp.values:
#         temp_list = []
#         if len_cpps > 0:
#             for path in cpps:
#                 with open(path, "r", errors='ignore') as f:
#                     temp_file = preprocess_appfile(f.read())
                    
#                     temp_list.append(temp_file)
#         elif len_headers > 0:
#             for path in headers:
#                 with open(path, "r", errors='ignore') as f:
#                     temp_list.append(preprocess_appfile(f.read()))
                    
#         elif len_hpps > 0:
#             for path in hpps:
#                 with open(path, "r", errors='ignore') as f:
#                     temp_list.append(preprocess_appfile(f.read()))
        
#         features.append(temp_list)
#         len_features.append(len(temp_list))
#     df_cp["features"] = features
#     df_cp["len_features"] = len_features
#     return df_cp

def get_features(df):
    """Recompute the ``features`` / ``len_features`` columns from each row's source files.

    For every row exactly one group of files is read: the ``cpps`` list if it
    is non-empty, otherwise ``headers``, otherwise ``hpps``. Each file is
    cleaned with ``preprocess_appfile`` and passed to ``extract_features``;
    results equal to ``'null'`` are dropped.

    Args:
        df: DataFrame with the columns ``cpps``, ``headers`` and ``hpps``.
            The columns are looked up by name, so other columns and their
            order do not matter. A cell that is not a list or tuple (for
            example the ``'null'`` placeholder produced by ``convert_string``)
            is treated as "no files".

    Returns:
        A copy of ``df`` whose ``features`` column holds the list of feature
        strings per row and whose ``len_features`` column holds its length.
        ``df`` itself is not modified.
    """

    def _features_from_files(paths):
        """Return the non-'null' feature strings extracted from ``paths``."""
        collected = []
        for path in paths:
            with open(path, "r", errors='ignore') as f:
                temp_file = preprocess_appfile(f.read())
            temp_features = extract_features(temp_file)
            if temp_features != 'null':
                collected.append(temp_features)
        return collected

    df_cp = df.copy()
    features = []
    len_features = []
    for cpps, headers, hpps in zip(df_cp["cpps"], df_cp["headers"], df_cp["hpps"]):
        # First non-empty group wins, in the priority order cpps > headers > hpps.
        # The isinstance check keeps a placeholder string from being iterated
        # character by character as if it were a list of paths.
        sources = next(
            (group for group in (cpps, headers, hpps)
             if isinstance(group, (list, tuple)) and len(group) > 0),
            [],
        )
        temp_list = _features_from_files(sources)
        features.append(temp_list)
        len_features.append(len(temp_list))

    df_cp["features"] = features
    df_cp["len_features"] = len_features
    return df_cp

In [None]:
temp_df = get_features(df)

In [None]:
temp_df

In [None]:
temp_df.to_csv("ckpt_files/df_merge_4.csv", index=False)

# exclude not found feature 31 oct

In [None]:
df = pd.read_csv("../ckpt_files/df_merge_4.csv")

In [None]:
def convert_string(x):
    """Parse the string form of a Python literal (as stored in a CSV cell).

    Args:
        x: Cell value, normally a string such as ``"['a', 'b']"``.

    Returns:
        The parsed literal, or the string ``'null'`` when ``x`` cannot be
        parsed (malformed text, or a non-string value such as NaN).
    """
    try:
        return literal_eval(x)
    # Only the failures literal_eval is documented to raise are caught. A bare
    # except would also swallow KeyboardInterrupt and SystemExit.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return 'null'

In [None]:
df['features'] = df.features.progress_apply(lambda x: convert_string(x))

In [None]:
df.head(5)

In [None]:
df = df[df.len_features > 0].copy()

In [None]:
len(df)

In [None]:
constructor_mapping = pd.read_csv("../generate_constructor_mapping/lib_to_constructor.csv")

In [None]:
constructor_mapping

In [None]:
df = df[df.id.isin(constructor_mapping.id)].copy()

In [None]:
df.drop(columns=['url_clone', 'is_uart', 'is_spi', 'is_uart', 'is_i2c', 'is_none', 'is_downloaded', 'headers', 'cpps', 'hpps', 'len_features', 'readme', 'len_readme', 'has_label', 'fullname'], inplace=True)

In [None]:
df = df[['id', 'dirname', 'desc_repo', 'desc_ardulib', 'url', 'cat', 'features']].copy()
df.rename(columns={'dirname':'library'}, inplace=True)

In [None]:
df.fillna("null", inplace=True)

In [None]:
# Join the per-file feature strings of each library with "###". A value that
# is not a list (for example the 'null' placeholder string) is passed through
# unchanged, because str.join on a plain string would interleave the
# separator between its characters ("n###u###l###l").
df['features'] = df.features.progress_apply(
    lambda x: "###".join(x) if isinstance(x, (list, tuple)) else x
)

In [None]:
df.head(5)

In [None]:
df.to_csv("lib_to_features.csv", index=False)