In [253]:
import os
from solcx import compile_files, install_solc, get_installed_solc_versions
import re
import os
import pandas as pd

def extract_solidity_version(file_path):
    """Return the compiler version named by a Solidity file's version pragma.

    The file is scanned line by line and the first ``pragma solidity``
    directive carrying a full ``major.minor.patch`` version wins. A leading
    ``^``, ``~``, ``>=`` or ``=`` operator is dropped, and for a range such as
    ``>=0.4.22 <0.6.0`` the first (lower-bound) version is returned.

    Args:
        file_path: Path of the ``.sol`` file to inspect.

    Returns:
        The version as a string such as ``'0.4.24'``, or the default
        ``'0.4.23'`` when no usable pragma is found.
    """
    # Whitespace is matched loosely and no trailing ';' is required, so
    # pragmas like "pragma solidity >=0.4.22 <0.6.0;" or
    # "pragma solidity ^0.4.24 ;" no longer fall through to the default.
    pragma_pattern = re.compile(
        r'pragma\s+solidity\s*(?:\^|~|>=|=)?\s*(\d+\.\d+\.\d+)'
    )
    # Only the ASCII pragma matters here, so bytes that cannot be decoded
    # elsewhere in the file are replaced instead of aborting the scan.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        for line in file:
            pragma_match = pragma_pattern.search(line)
            if pragma_match:
                return pragma_match.group(1)
    # Return default version if no pragma is found
    return '0.4.23'


# Dataset size that used to serve as the progress denominator.
# process_directory now counts the .sol files itself; the constant is kept
# only so that any cell still reading `total` keeps working.
total = 1426


def process_directory(parent_directory):
    """Compile every ``.sol`` file under ``parent_directory`` and collect ASTs.

    The directory tree is walked once to list the Solidity files, so the
    progress line shows the real file count. Each file is then compiled with
    the solc version named by its pragma (see ``extract_solidity_version``),
    installing that compiler first when it is not available yet.

    Args:
        parent_directory: Root folder to search recursively for ``.sol`` files.

    Returns:
        A nested dict ``{subdirectory: {contract_name: ast}}`` where
        ``subdirectory`` is the file's folder relative to ``parent_directory``
        and ``contract_name`` is the key returned by ``compile_files``.
        Folders without ``.sol`` files do not appear.
    """
    # Build the work list up front; os.walk order is preserved.
    sol_files = []
    for root, _dirs, files in os.walk(parent_directory):
        subdirectory = os.path.relpath(root, parent_directory)
        for filename in files:
            if filename.endswith('.sol'):
                sol_files.append((subdirectory, os.path.join(root, filename)))

    file_count = len(sol_files)
    asts = {}
    ast_count = 0  # Number of contract ASTs stored (one file can yield several)

    # Installed compiler versions as bare "X.Y.Z" strings, loaded on first use.
    # get_installed_solc_versions() yields Version objects (or "v"-prefixed
    # strings in old py-solc-x releases), which do not compare equal to the
    # bare string from the pragma, so they are normalised before comparing.
    installed_versions = None

    for index, (subdirectory, file_path) in enumerate(sol_files, start=1):
        print(f"Converting Solidity Files to AST : {index}/{file_count}", end="\r", flush=True)

        sol_version = extract_solidity_version(file_path)

        if installed_versions is None:
            installed_versions = {
                str(version).lstrip('v') for version in get_installed_solc_versions()
            }
        if sol_version not in installed_versions:
            install_solc(sol_version)
            installed_versions.add(sol_version)

        compiled_sol = compile_files([file_path], solc_version=sol_version, output_values=["ast"])

        class_asts = asts.setdefault(subdirectory, {})
        for contract_name, compiled_data in compiled_sol.items():
            if 'ast' in compiled_data:
                class_asts[contract_name] = compiled_data['ast']
                ast_count += 1  # Increment only when an AST is actually stored

    print("")
    print(f"Total files processed: {file_count}")
    print(f"ASTs successfully added: {ast_count}")
    return asts

In [254]:
# Root folder whose subdirectories contain the .sol files to compile.
parent_directory = 'dataset'  # Update this to your parent directory
asts = process_directory(parent_directory)

# Separator followed by a completion message, emitted after the progress output.
print("\n\n----------\n", "Contracts and their ASTs loaded", sep="\n")

# Layout of `asts` (outer key = subdirectory, used as the class label):
# asts = {
#     "<class>": {
#         "<contract name>": <AST>,
#         "<contract name 2>": <AST>,
#         ...
#     },
#     ...
# }


Converting Solidity Files to AST : 1426/1573
Total files processed: 1426
ASTs successfully added: 1777


----------

Contracts and their ASTs loaded


In [255]:
def flatten(ast, current_key=None):
    """Flatten a nested AST into a flat list of ``(key, value)`` leaf pairs.

    Lists and dicts are descended into recursively; every other value is
    treated as a leaf and paired with the dict key it was found under.
    Items of a list inherit the key of the list itself.

    Args:
        ast: The (sub)tree to flatten: a dict, a list, or a leaf value.
        current_key: Key under which ``ast`` was found; ``None`` at the root.

    Returns:
        A list of ``(key, value)`` tuples in traversal order.
    """
    node_list = []

    # The recursion must call flatten itself: the previous helper name
    # (get_all_nodes_and_values) is not defined on a fresh kernel.
    if isinstance(ast, list):
        for item in ast:
            node_list.extend(flatten(item, current_key))
    elif isinstance(ast, dict):
        for key, value in ast.items():
            node_list.extend(flatten(value, key))
    else:
        node_list.append((current_key, ast))

    return node_list

def get_number_of_common_nodes(ast_a, ast_b):
    """Count the flattened nodes of ``ast_a`` that also occur in ``ast_b``.

    Both ASTs are flattened with ``flatten``. Each ``(key, value)`` pair of
    ``ast_a`` counts once per occurrence in ``ast_a`` if an equal pair exists
    anywhere in ``ast_b``, so the measure is not symmetric.

    Args:
        ast_a: AST whose nodes are counted.
        ast_b: AST the nodes are looked up in.

    Returns:
        The number of matching nodes as an int.
    """
    nodes_a = flatten(ast_a)
    nodes_b = flatten(ast_b)

    try:
        # Set lookup replaces the list scan per node; the count is the same.
        lookup = set(nodes_b)
        return sum(1 for node in nodes_a if node in lookup)
    except TypeError:
        # A leaf value was unhashable; fall back to the linear scan.
        return sum(1 for node in nodes_a if node in nodes_b)

def build_feature_vector(asts, ast, class_of_ast, ast_name, total_asts, j):
    """Build one feature row describing how similar ``ast`` is to each class.

    For every class in ``asts`` the feature is the mean of
    ``get_number_of_common_nodes(ast, other)`` over all contracts of that
    class, skipping the contract whose name equals ``ast_name``. A class with
    no contract left to compare against gets ``0``.

    Args:
        asts: Nested dict ``{class_name: {contract_name: contract_ast}}``.
        ast: The AST the feature vector is built for.
        class_of_ast: Label stored under the ``"class"`` key of the result.
        ast_name: Contract name of ``ast``, used to skip the self-comparison.
        total_asts: Unused; kept so existing callers keep working.
        j: Unused; kept so existing callers keep working.

    Returns:
        A dict mapping each class name to its mean common-node count, plus
        ``"class": class_of_ast``.
    """
    # The per-comparison line-clearing print was dropped: the progress message
    # it made room for is disabled, so it only produced walls of blank output.
    features = {}
    for class_name, contracts in asts.items():
        overlaps = [
            get_number_of_common_nodes(ast, contract_ast)
            for contract_name, contract_ast in contracts.items()
            if contract_name != ast_name
        ]
        features[class_name] = sum(overlaps) / len(overlaps) if overlaps else 0

    features["class"] = class_of_ast
    return features


In [None]:
# One feature column per class, followed by the label column.
cols = list(asts) + ['class']

# Derived from the loaded data so the progress counter cannot go stale.
total_asts = sum(len(contracts) for contracts in asts.values())

# Rows are collected in a list and turned into a DataFrame once at the end;
# growing the frame with pd.concat on every iteration copies all earlier rows
# each time and leaves the columns with object dtype.
feature_rows = []
i = 0
#THIS TAKES OVER 10 HOURS, make sure to leave computer plugged in and with screen sleep off (if running on local machine)
for class_name, contracts in asts.items():
    for contract_name, contract_ast in contracts.items():
        print(f"Building feature vector for {contract_name} ... | " +
              f"Feature vectors built: {i}/{total_asts}")
        feature_rows.append(
            build_feature_vector(asts, contract_ast, class_name, contract_name, total_asts, i)
        )
        i += 1

df = pd.DataFrame(feature_rows, columns=cols)


Building feature vector for dataset/safe/28479.sol:Ownable ... | Feature vectors built: 0/1777
Building feature vector for dataset/safe/28479.sol:RobotCoinSeller ... | Feature vectors built: 1/1777
Building feature vector for dataset/safe/28479.sol:token ... | Feature vectors built: 2/1777        
Building feature vector for dataset/safe/28337.sol:Token ... | Feature vectors built: 3/1777        
Building feature vector for dataset/safe/28337.sol:WOCoin ... | Feature vectors built: 4/1777       
Building feature vector for dataset/safe/28054.sol:GexCryptoIco ... | Feature vectors built: 5/1777 
Building feature vector for dataset/safe/28054.sol:SafeMath ... | Feature vectors built: 6/1777     
Building feature vector for dataset/safe/28054.sol:owned ... | Feature vectors built: 7/1777        
Building feature vector for dataset/safe/28444.sol:TradeIO ... | Feature vectors built: 8/1777      
Building feature vector for dataset/safe/28491.sol:Owned ... | Feature vectors built: 9/1777   

Building feature vector for dataset/safe/28353.sol:token ... | Feature vectors built: 80/1777       
Building feature vector for dataset/safe/28384.sol:AXLCrowdsale ... | Feature vectors built: 81/1777
Building feature vector for dataset/safe/28384.sol:Token ... | Feature vectors built: 82/1777       
Building feature vector for dataset/safe/28179.sol:AirDrop ... | Feature vectors built: 83/1777     
Building feature vector for dataset/safe/28179.sol:ERC20 ... | Feature vectors built: 84/1777       
Building feature vector for dataset/safe/28151.sol:FundsKeeper ... | Feature vectors built: 85/1777 
Building feature vector for dataset/safe/28151.sol:InterfaceDeusETH ... | Feature vectors built: 86/1777
Building feature vector for dataset/safe/28151.sol:SafeMath ... | Feature vectors built: 87/1777    
Building feature vector for dataset/safe/28153.sol:InterfaceDeusETH ... | Feature vectors built: 88/1777
Building feature vector for dataset/safe/28153.sol:StockExchange ... | Feature vect

Building feature vector for dataset/safe/28265.sol:Ownable ... | Feature vectors built: 160/1777    
Building feature vector for dataset/safe/28265.sol:token ... | Feature vectors built: 161/1777      
Building feature vector for dataset/safe/28107.sol:CEOThrone ... | Feature vectors built: 162/1777  
Building feature vector for dataset/safe/28107.sol:Ownable ... | Feature vectors built: 163/1777    
Building feature vector for dataset/safe/28113.sol:GuessTheNumber ... | Feature vectors built: 164/1777
Building feature vector for dataset/safe/28111.sol:Q1SCrowdsale ... | Feature vectors built: 165/1777
Building feature vector for dataset/safe/28111.sol:Token ... | Feature vectors built: 166/1777      
Building feature vector for dataset/safe/28299.sol:IERC20 ... | Feature vectors built: 167/1777     
Building feature vector for dataset/safe/28299.sol:POS ... | Feature vectors built: 168/1777        
Building feature vector for dataset/safe/28138.sol:EtherReceiver ... | Feature vectors 

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

Building feature vector for dataset/reentrancy/19925.sol:ERC223TokenCompatible ... | Feature vectors built: 61/1777
Building feature vector for dataset/reentrancy/reentrance.sol:Reentrance ... | Feature vectors built: 62/1777
Building feature vector for dataset/reentrancy/18145.sol:CrowdsaleProxy ... | Feature vectors built: 63/1777
Building feature vector for dataset/reentrancy/22636.sol:LuckyETH ... | Feature vectors built: 64/1777
Building feature vector for dataset/reentrancy/14353.sol:ICOBuyer ... | Feature vectors built: 65/1777
Building feature vector for dataset/reentrancy/reentrancy_dao.sol:ReentrancyDAO ... | Feature vectors built: 66/1777
Building feature vector for dataset/reentrancy/40299.sol:YesNo ... | Feature vectors built: 67/1777 
Building feature vector for dataset/reentrancy/4598.sol:mnyminer ... | Feature vectors built: 68/1777
Building feature vector for dataset/reentrancy/modifier_reentrancy.sol:Bank ... | Feature vectors built: 69/1777
Building feature vector fo

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

Building feature vector for dataset/reentrancy/40415.sol:EtherDelta ... | Feature vectors built: 136/1777
Building feature vector for dataset/reentrancy/35713.sol:MoldCoin ... | Feature vectors built: 137/1777
Building feature vector for dataset/reentrancy/37676.sol:DeadMansSwitch ... | Feature vectors built: 138/1777
Building feature vector for dataset/reentrancy/22247.sol:PIGGY_BANK ... | Feature vectors built: 139/1777
Building feature vector for dataset/reentrancy/0x941d225236464a25eb18076df7da6a91d0f95e9e.sol:ETH_FUND ... | Feature vectors built: 140/1777
Building feature vector for dataset/reentrancy/0x941d225236464a25eb18076df7da6a91d0f95e9e.sol:Log ... | Feature vectors built: 141/1777
Building feature vector for dataset/reentrancy/40366.sol:BranchWallet ... | Feature vectors built: 142/1777
Building feature vector for dataset/reentrancy/14284.sol:Halo3D ... | Feature vectors built: 143/1777
Building feature vector for dataset/reentrancy/40789.sol:SendBalance ... | Feature vect

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

Building feature vector for dataset/reentrancy/18510.sol:A2ACrowdsale ... | Feature vectors built: 211/1777
Building feature vector for dataset/reentrancy/40784.sol:SimpleDAO ... | Feature vectors built: 212/1777
Building feature vector for dataset/reentrancy/8873.sol:ELTWagerLedger ... | Feature vectors built: 213/1777
Building feature vector for dataset/reentrancy/22075.sol:SIMPLE_PIGGY_BANK ... | Feature vectors built: 214/1777
Building feature vector for dataset/reentrancy/39327.sol:Congress ... | Feature vectors built: 215/1777
Building feature vector for dataset/reentrancy/40357.sol:AmIOnTheFork ... | Feature vectors built: 216/1777
Building feature vector for dataset/reentrancy/40357.sol:SellETCSafely ... | Feature vectors built: 217/1777
Building feature vector for dataset/reentrancy/40425.sol:MyEtherBank ... | Feature vectors built: 218/1777
Building feature vector for dataset/reentrancy/30101.sol:TelcoinSaleCapEscrow ... | Feature vectors built: 219/1777
Building feature vect

Building feature vector for dataset/reentrancy/40782.sol:Token ... | Feature vectors built: 284/1777
Building feature vector for dataset/reentrancy/0x7541b76cb60f4c60af330c208b0623b7f54bf615.sol:Log ... | Feature vectors built: 285/1777
Building feature vector for dataset/reentrancy/0x7541b76cb60f4c60af330c208b0623b7f54bf615.sol:U_BANK ... | Feature vectors built: 286/1777
Building feature vector for dataset/reentrancy/0x96edbe868531bd23a6c05e9d0c424ea64fb1b78b.sol:LogFile ... | Feature vectors built: 287/1777
Building feature vector for dataset/reentrancy/0x96edbe868531bd23a6c05e9d0c424ea64fb1b78b.sol:PENNY_BY_PENNY ... | Feature vectors built: 288/1777
Building feature vector for dataset/reentrancy/36563.sol:SharkProxy ... | Feature vectors built: 289/1777
Building feature vector for dataset/reentrancy/39644.sol:AddressOwnershipVerification ... | Feature vectors built: 290/1777
Building feature vector for dataset/reentrancy/27334.sol:AddressLottery ... | Feature vectors built: 291/17

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

Building feature vector for dataset/reentrancy/simple_dao.sol:SimpleDAO ... | Feature vectors built: 357/1777
Building feature vector for dataset/reentrancy/2308.sol:Kleros ... | Feature vectors built: 358/1777
Building feature vector for dataset/reentrancy/40736.sol:EtherStore ... | Feature vectors built: 359/1777
Building feature vector for dataset/reentrancy/0xaae1f51cf3339f18b6d3f3bdc75a5facd744b0b8.sol:DEP_BANK ... | Feature vectors built: 360/1777
Building feature vector for dataset/reentrancy/0xaae1f51cf3339f18b6d3f3bdc75a5facd744b0b8.sol:LogFile ... | Feature vectors built: 361/1777
Building feature vector for dataset/reentrancy/14993.sol:DividendToken ... | Feature vectors built: 362/1777
Building feature vector for dataset/reentrancy/17518.sol:RipioOracle ... | Feature vectors built: 363/1777
Building feature vector for dataset/reentrancy/33450.sol:VVToken ... | Feature vectors built: 364/1777
Building feature vector for dataset/reentrancy/40092.sol:PullPaymentCapable ... | F

Building feature vector for dataset/unchecked_low_level_calls/0xe82f0742a71a02b9e9ffc142fdcb6eb1ed06fb87.sol:Freebie ... | Feature vectors built: 43/1777
Building feature vector for dataset/unchecked_low_level_calls/458.sol:BintechToken ... | Feature vectors built: 44/1777
Building feature vector for dataset/unchecked_low_level_calls/458.sol:ContractReceiver ... | Feature vectors built: 45/1777
Building feature vector for dataset/unchecked_low_level_calls/458.sol:ERC223 ... | Feature vectors built: 46/1777
Building feature vector for dataset/unchecked_low_level_calls/458.sol:Ownable ... | Feature vectors built: 47/1777
Building feature vector for dataset/unchecked_low_level_calls/458.sol:SafeMath ... | Feature vectors built: 48/1777
Building feature vector for dataset/unchecked_low_level_calls/0x2972d548497286d18e92b5fa1f8f9139e5653fd2.sol:demo ... | Feature vectors built: 49/1777
Building feature vector for dataset/unchecked_low_level_calls/0xd5967fed03e85d1cce44cab284695b41bc675b5c.s

Building feature vector for dataset/unchecked_low_level_calls/1710.sol:MiddleSaleService ... | Feature vectors built: 108/1777
Building feature vector for dataset/unchecked_low_level_calls/20892.sol:OrganizeFunds ... | Feature vectors built: 109/1777
Building feature vector for dataset/unchecked_low_level_calls/0xbebbfe5b549f5db6e6c78ca97cac19d1fb03082c.sol:Proxy ... | Feature vectors built: 110/1777
Building feature vector for dataset/unchecked_low_level_calls/0xbebbfe5b549f5db6e6c78ca97cac19d1fb03082c.sol:VaultProxy ... | Feature vectors built: 111/1777
Building feature vector for dataset/unchecked_low_level_calls/20313.sol:MultiSigWallet ... | Feature vectors built: 112/1777
Building feature vector for dataset/unchecked_low_level_calls/20313.sol:MultiSigWalletWithDailyLimit ... | Feature vectors built: 113/1777
Building feature vector for dataset/unchecked_low_level_calls/1275.sol:MultiSigWallet ... | Feature vectors built: 114/1777
Building feature vector for dataset/unchecked_low_

Building feature vector for dataset/unchecked_low_level_calls/21397.sol:owned ... | Feature vectors built: 172/1777
Building feature vector for dataset/unchecked_low_level_calls/21397.sol:tokenRecipient ... | Feature vectors built: 173/1777
Building feature vector for dataset/unchecked_low_level_calls/1064.sol:ContractReceiver ... | Feature vectors built: 174/1777
Building feature vector for dataset/unchecked_low_level_calls/1064.sol:ERC20Interface ... | Feature vectors built: 175/1777
Building feature vector for dataset/unchecked_low_level_calls/1064.sol:Owned ... | Feature vectors built: 176/1777
Building feature vector for dataset/unchecked_low_level_calls/1064.sol:SafeMath ... | Feature vectors built: 177/1777
Building feature vector for dataset/unchecked_low_level_calls/1064.sol:XToken ... | Feature vectors built: 178/1777
Building feature vector for dataset/unchecked_low_level_calls/255.sol:ContractReceiver ... | Feature vectors built: 179/1777
Building feature vector for dataset

Building feature vector for dataset/unchecked_low_level_calls/21287.sol:TrueToken ... | Feature vectors built: 236/1777
Building feature vector for dataset/unchecked_low_level_calls/0x9d06cbafa865037a01d322d3f4222fa3e04e5488.sol:Delta ... | Feature vectors built: 237/1777
Building feature vector for dataset/unchecked_low_level_calls/21697.sol:SPRING_BOARD_1_ETH ... | Feature vectors built: 238/1777
Building feature vector for dataset/unchecked_low_level_calls/4.sol:Wrapped4 ... | Feature vectors built: 239/1777
Building feature vector for dataset/unchecked_low_level_calls/22629.sol:Factory ... | Feature vectors built: 240/1777
Building feature vector for dataset/unchecked_low_level_calls/22629.sol:MultiSigWallet ... | Feature vectors built: 241/1777
Building feature vector for dataset/unchecked_low_level_calls/22629.sol:MultiSigWalletWithDailyLimit ... | Feature vectors built: 242/1777
Building feature vector for dataset/unchecked_low_level_calls/22629.sol:MultiSigWalletWithDailyLimitF