In [3]:
#Dataset: https://github.com/sujeetc/ScrawlD/tree/main?tab=readme-ov-file

In [1]:
import json
import os
from time import sleep

import pandas as pd
import requests
from transformers import AutoTokenizer

# 1. Get the Source Code from the Etherscan API

In [2]:
# The Etherscan API key is read from the environment so that the secret is
# never stored in (or committed with) the notebook. Set it before starting
# Jupyter, e.g.:  export ETHERSCAN_API_KEY="<your key>"
# NOTE: a key was previously hard-coded in this cell; that key should be
# revoked and regenerated.
ETHERSCAN_API_KEY = os.environ.get("ETHERSCAN_API_KEY", "")
if not ETHERSCAN_API_KEY:
    print("Warning: ETHERSCAN_API_KEY is not set; requests to Etherscan are likely to fail.")

In [3]:
def get_contract_source(address, timeout=10):
    """Fetch a contract's verified source record from the Etherscan API.

    Args:
        address: Contract address to look up.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block the whole
            download loop indefinitely.

    Returns:
        The first entry of the API's ``result`` list (a dict) when the API
        reports status "1" / message "OK"; ``None`` on any request failure,
        malformed response, or API-level error. Failures are printed rather
        than raised so a long batch run keeps going.
    """
    url = "https://api.etherscan.io/api"
    params = {
        "module": "contract",
        "action": "getsourcecode",
        "address": address,
        "apikey": ETHERSCAN_API_KEY
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException: network/HTTP failures; ValueError: body is not valid JSON.
        print(f"Exception occurred while fetching contract {address}: {str(e)}")
        return None

    if not isinstance(data, dict):
        print(f"Error fetching contract {address}: unexpected response format")
        return None

    result = data.get("result")
    succeeded = data.get("status") == "1" and data.get("message") == "OK"
    # On errors the API may put a plain string in "result", so only index into
    # it when it really is a non-empty list.
    if succeeded and isinstance(result, list) and result:
        return result[0]

    print(f"Error fetching contract {address}: {data.get('message')}")
    return None

In [None]:
# Addresses of the contracts whose source code should be downloaded.
df = pd.read_csv('contracts.csv')

# One record per contract for which the API returned data.
contract_data_list = []
total_contracts = len(df)

for position, address in enumerate(df['address'], start=1):
    print(f"Processing contract {position}/{total_contracts}: {address}")

    fetched = get_contract_source(address)

    if fetched:
        # Keep only the fields needed downstream.
        name = fetched.get("ContractName", f"contract_{address}")
        code = fetched.get("SourceCode", "")

        record = {
            'contract_address': address,
            'contract_name': name,
            'source_code': code
        }
        contract_data_list.append(record)

        print(f"Processed contract: {name}")

    # Pause between requests to avoid being rate limited.
    sleep(0.2)

# Build the table once from the accumulated records.
contracts_df = pd.DataFrame(contract_data_list)

# Persist the result so later sections can reload it without re-downloading.
output_file = "contract_sources.csv"
contracts_df.to_csv(output_file, index=False)
print(f"\nSaved {len(contracts_df)} contracts to {output_file}")


Processing contract 1/9252: 0x0000000000075efbee23fe2de1bd0b7690883cc9
Processed contract: OwnedUpgradeabilityProxy
Processing contract 2/9252: 0x00000000000da14c27c155bb7c1ac9bd7519eb3b
Processed contract: DepositAddressRegistrar
Processing contract 3/9252: 0x0000000000b3f879cb30fe243b4dfee438691c04
Processed contract: GasToken2
Processing contract 4/9252: 0x000000002bb43c83ece652d161ad0fa862129a2c
Processed contract: AccountRegistry
Processing contract 5/9252: 0x00000000441378008ea67f4284a57932b1c000a5
Processed contract: TrueGBP
Processing contract 6/9252: 0x00000000bbcf7700a1b403c9eb666f350707b900
Processed contract: TGBPController
Processing contract 7/9252: 0x00000000e86b5156e8fd624255bf7a6d722a8f1f
Processed contract: ARIYAX
Processing contract 8/9252: 0x0000009a317684a5f840484357fa587aca76454c


In [5]:
# Sanity check: (rows, columns) of the collected contract table.
contracts_df.shape

(9252, 3)

In [6]:
# Preview the first two rows to confirm the expected columns are present.
contracts_df.head(2)

Unnamed: 0,contract_address,contract_name,source_code
0,0x0000000000075efbee23fe2de1bd0b7690883cc9,OwnedUpgradeabilityProxy,/**\r\n *Submitted for verification at Ethersc...
1,0x00000000000da14c27c155bb7c1ac9bd7519eb3b,DepositAddressRegistrar,pragma solidity ^0.4.23;\r\n\r\n// File: contr...


# 2. Add the Vulnerability Labels from the ScrawlD JSON file

## 2.1 Aux Functions:

In [4]:
def load_vulnerabilities(path='vulnerabilities.json'):
    """Load the vulnerability labels from a JSON file.

    Args:
        path: Location of the JSON file. Defaults to 'vulnerabilities.json'
            in the working directory, which is what callers without an
            argument get.

    Returns:
        The parsed JSON content, or an empty dict if the file cannot be
        opened or does not contain valid JSON (the error is printed).
    """
    try:
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's default text encoding.
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file; ValueError: invalid JSON or bad encoding.
        print(f"Error loading vulnerabilities file: {str(e)}")
        return {}

def get_all_vulnerability_lines(contract_vulns, vuln_type):
    """Return the sorted, de-duplicated line numbers reported for one vulnerability type.

    The entry stored under ``vuln_type`` may be either a dict whose values are
    lists of lines (non-list values are ignored) or a list of lines directly.
    A missing key, or an entry of any other type, yields an empty list.
    """
    if vuln_type not in contract_vulns:
        return []

    entry = contract_vulns[vuln_type]
    if isinstance(entry, dict):
        # Per-tool mapping: keep only the values that are lists of lines.
        line_groups = [lines for lines in entry.values() if isinstance(lines, list)]
    elif isinstance(entry, list):
        # The entry already is a flat list of lines.
        line_groups = [entry]
    else:
        line_groups = []

    return sorted(set().union(*line_groups))

## 2.2  Append Vulnerability Labels:

- Load the dataset saved in the previous step:

In [None]:
# Reload the contract sources from 'contract_sources.csv' so the labelling
# below can run without calling the API again.
contracts_df = pd.read_csv('contract_sources.csv')

In [None]:
print(f"\nSaved initial data with {len(contracts_df)} contracts")

# Step 2: attach the labelled line numbers to every contract
print("\nStep 2: Adding vulnerability information...")
vulnerabilities = load_vulnerabilities()

# One "<type>_lines" column is created for each of these categories.
vulnerability_types = ['ARTHM', 'DOS', 'LE', 'RENT', 'TimeM', 'TimeO', 'Tx-Origin', 'UE']

addresses = contracts_df['contract_address']
for vuln_type in vulnerability_types:

    def lines_for_address(address, kind=vuln_type):
        # Labels are looked up under "<address>_ext.sol"; contracts without
        # an entry fall back to an empty dict and therefore get no lines.
        found = vulnerabilities.get(f"{address}_ext.sol", {})
        return get_all_vulnerability_lines(found, kind)

    contracts_df[f'{vuln_type}_lines'] = addresses.apply(lines_for_address)

# Write the labelled table to disk.
output_file = "contract_sources_with_vulnerabilities.csv"
contracts_df.to_csv(output_file, index=False)
print(f"\nSaved {len(contracts_df)} contracts with vulnerability information to {output_file}")

- Filter out smart contracts whose tokenized `source_code` is longer than 2048 tokens (only contracts with at most 2048 tokens are kept)

In [None]:
# Tokenizer for the "microsoft/codebert-base" checkpoint; it is used below to
# count how many tokens each contract's source code produces.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

def filter_by_token_length(source_code, tokenizer, max_length=2048):
    """Return True when ``source_code`` tokenizes to at most ``max_length`` tokens.

    Args:
        source_code: Contract source text. Non-string values (e.g. the
            NaN/None that an empty CSV field is read back as) are treated
            as empty source, matching what an in-memory empty string gives.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``
            that returns a sequence of tokens.
        max_length: Largest allowed token count, special tokens included.

    Returns:
        bool: whether the token count is within the limit.
    """
    if not isinstance(source_code, str):
        # Passing a float NaN / None to the tokenizer would raise; an absent
        # source is equivalent to an empty one for length purposes.
        source_code = ""
    tokens = tokenizer.encode(source_code, add_special_tokens=True)
    return len(tokens) <= max_length

# Build a boolean mask marking the contracts that fit the token budget,
# then keep only those rows.
within_limit = contracts_df['source_code'].apply(
    lambda code: filter_by_token_length(code, tokenizer, max_length=2048)
)
filtered_df = contracts_df[within_limit]

# Report how many contracts were kept and how many were dropped.
original_count = len(contracts_df)
kept_count = len(filtered_df)
print(f"Original number of contracts: {original_count}")
print(f"Number of contracts after filtering: {kept_count}")
print(f"Removed {original_count - kept_count} contracts that were too long")

In [None]:
# (rows, columns) of the table that remains after the token-length filter.
filtered_df.shape

In [None]:
# Persist the length-filtered table (vulnerability columns included).
output_file = "contract_sources_with_vulnerabilities_2048_token_size.csv"
saved_rows = len(filtered_df)
filtered_df.to_csv(output_file, index=False)
print(f"\nSaved {saved_rows} contracts with vulnerability information to {output_file}")