In [120]:
import os
from solcx import compile_files, install_solc, get_installed_solc_versions
import re
import os
import pandas as pd

# Function to extract the pragma version from a Solidity file, defaulting to 0.4.23
def extract_solidity_version(file_path, default_version='0.4.23'):
    """Return the compiler version named by a file's ``pragma solidity`` line.

    The file is scanned line by line and the first pragma that names a full
    ``X.Y.Z`` version wins. An optional ``^``, ``~``, ``>=`` or ``=`` prefix is
    accepted and discarded; for a range such as ``>=0.4.22 <0.6.0`` the first
    (lower-bound) version is returned. Pragmas that start with ``>``, ``<`` or
    ``<=`` are deliberately not matched, because the version they mention does
    not itself satisfy (or is not guaranteed to satisfy) the constraint.

    Args:
        file_path: Path of the ``.sol`` file to inspect.
        default_version: Version string returned when no usable pragma is found.

    Returns:
        The version as a string, e.g. ``'0.4.24'``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # \s+ / \s* tolerate extra whitespace; no trailing ';' is required so that
    # range pragmas and "0.4.24 ;" are recognised as well.
    pragma_pattern = re.compile(
        r'pragma\s+solidity\s*(?:\^|~|>=|=)?\s*(\d+\.\d+\.\d+)'
    )
    # Decode explicitly as UTF-8 instead of the platform default, and replace
    # undecodable bytes rather than aborting: only the ASCII pragma line matters.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        for line in file:
            pragma_match = pragma_pattern.search(line)
            if pragma_match:
                return pragma_match.group(1)
    # Return default version if no pragma is found
    return default_version


# Expected number of .sol files in the dataset. process_directory() counts the
# files itself and no longer reads this value; the assignment is kept only so
# that any other cell referring to the global `total` keeps working.
total = 1713


def _find_solidity_files(parent_directory):
    """Return a list of (subdirectory, file_path) for every .sol file under parent_directory."""
    found = []
    for root, _dirs, files in os.walk(parent_directory):
        # Path of the current folder relative to the dataset root ('.' for the root itself).
        subdirectory = os.path.relpath(root, parent_directory)
        for filename in files:
            if filename.endswith('.sol'):
                found.append((subdirectory, os.path.join(root, filename)))
    return found


# Function to traverse directories and process .sol files
def process_directory(parent_directory):
    """Compile every ``.sol`` file under *parent_directory* and collect the ASTs.

    Each file is compiled on its own with the solc version chosen by
    ``extract_solidity_version``; a missing compiler version is installed
    first. Progress and a final summary are printed to stdout.

    Args:
        parent_directory: Root folder that is walked recursively.

    Returns:
        A dict mapping each subdirectory (relative to *parent_directory*) to a
        dict of ``{key returned by compile_files: AST}``.

    Any exception raised while installing a compiler or compiling a file
    propagates to the caller and aborts the run.
    """
    # Collect the files up front so the progress denominator is the real file
    # count rather than a hard-coded module-level number.
    sol_files = _find_solidity_files(parent_directory)
    total_files = len(sol_files)

    asts = {}
    processed_files = 0  # Counter for the number of ASTs stored (can exceed the file count)

    # py-solc-x reports installed compilers as Version objects, so compare on
    # their string form; comparing the raw str against them never matches.
    # Queried once here instead of once per file.
    installed_versions = {str(version) for version in get_installed_solc_versions()}

    for index, (subdirectory, file_path) in enumerate(sol_files, start=1):
        print(f"Converting Solidity Files to AST : {index}/{total_files}", end="\r", flush=True)

        sol_version = extract_solidity_version(file_path)

        if sol_version not in installed_versions:
            install_solc(sol_version)
            installed_versions.add(sol_version)

        compiled_sol = compile_files([file_path], solc_version=sol_version, output_values=["ast"])

        # Create the per-subdirectory bucket on first use.
        subdirectory_asts = asts.setdefault(subdirectory, {})

        for contract_name, compiled_data in compiled_sol.items():
            if 'ast' in compiled_data:
                subdirectory_asts[contract_name] = compiled_data['ast']
                processed_files += 1  # Increment only when an AST is successfully added

    print("")
    print(f"Total files processed: {total_files}")
    print(f"ASTs successfully added: {processed_files}")
    return asts

In [121]:
# Root folder whose subdirectories contain the .sol files to compile
parent_directory = 'dataset'  # Update this to your parent directory
asts = process_directory(parent_directory)

# Print a separator followed by a confirmation that loading has finished
print("\n\n----------\n", "Contracts and their ASTs loaded", sep="\n")

# Layout of `asts` (one outer entry per subdirectory / class):
#
#   asts = {
#       class: {
#           "CONTRACT NAME":  {AST1},
#           "CONTRACT NAME2": {AST2},
#           ...
#       },
#       ...
#   }


Converting Solidity Files to AST : 1713/1713
Total files processed: 1713
ASTs successfully added: 3441


----------

Contracts and their ASTs loaded


In [127]:
def flatten(ast, current_key=None):
    """Flatten a nested AST into a single list of ``(key, value)`` pairs.

    Dicts and lists are walked depth-first in their own order. Every leaf
    (anything that is neither a dict nor a list) is emitted together with the
    nearest enclosing dict key; items of a list inherit the key the list was
    stored under.

    Args:
        ast: The nested structure (dict, list or a leaf value).
        current_key: Key attached to leaves found directly at this level;
            ``None`` when there is no enclosing dict key.

    Returns:
        A list of ``(key, leaf_value)`` tuples.
    """
    node_list = []

    if isinstance(ast, list):
        for item in ast:
            # List items have no key of their own, so pass the parent's key down.
            node_list.extend(flatten(item, current_key))
    elif isinstance(ast, dict):
        for key, value in ast.items():
            node_list.extend(flatten(value, key))
    else:
        node_list.append((current_key, ast))

    return node_list

def get_common_nodes(ast_a, ast_b):
    """Return the flattened nodes of *ast_a* that also occur in *ast_b*.

    Both ASTs are flattened with ``flatten``. The result keeps the order and
    the duplicates of *ast_a*'s flattened nodes.

    Args:
        ast_a: First nested AST.
        ast_b: Second nested AST.

    Returns:
        A list of ``(key, value)`` tuples present in both flattened ASTs.
    """
    nodes_a = flatten(ast_a)
    nodes_b = flatten(ast_b)

    # A set makes each membership test O(1); fall back to scanning the list if
    # some leaf value is unhashable.
    try:
        lookup = set(nodes_b)
    except TypeError:
        lookup = nodes_b

    common_nodes = [node for node in nodes_a if node in lookup]
    return common_nodes


In [134]:
# Replace every AST in `asts` with its flattened list of (key, value) pairs.
i = 1
# Derive the progress total from the loaded data instead of a hard-coded count.
total = sum(len(contracts) for contracts in asts.values())
# Iterate through each classification class
for class_name, contracts in asts.items():
    print(f"Processing class: {class_name}")

    # Iterate through each contract within this class
    for contract_name, ast in contracts.items():
        print(f"Flattened ASTs : {i}/{total}", end="\r", flush=True)
        i += 1

        # flatten() returns a list, so a list here means this entry was already
        # flattened by an earlier run of this cell; flattening it again would
        # wrap every pair as (None, pair).
        # NOTE(review): assumes a raw AST from the compiler is never a list - confirm.
        if isinstance(ast, list):
            continue

        asts[class_name][contract_name] = flatten(ast)
        


Processing class: safe
Processing class: reentrancy
Processing class: unchecked_low_level_calls
Processing class: denial_of_service
Processing class: access_control
Processing class: arithmetic
Processing class: time_manipulation
Flattened ASTs : 3441/3441

<class 'list'>
