In [1]:
import json

In [4]:
# Parse one saved search-result page and show how many test cases it holds.
with open("data/data-1600-9.json", "r") as page_file:
    json_data = json.loads(page_file.read())
len(json_data["testCases"])

100

In [1]:
import subprocess

In [2]:
# Fetch 16 result pages (25..40) of the SARD Java search with curl.
for i in range(25, 25 + 16):
    url = f"https://samate.nist.gov/SARD/api/test-cases/search?language%5B%5D=java&page={i}&limit=100"
    # Pass the command as an argument list: a single command string without
    # shell=True is treated as the program name on POSIX, and the list form
    # also keeps the '&' characters in the URL away from any shell.
    completed = subprocess.run(["curl", url, "-o", f"data/data-{i}.json"])
    if completed.returncode != 0:
        # Report the failure instead of silently moving on to the next page.
        print(f"curl failed for page {i} (exit code {completed.returncode})")

In [None]:
import os
import json
import requests
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Make sure the folder that receives the JSON result pages is present
json_output_dir = "data_cpp/json"
os.makedirs(json_output_dir, exist_ok=True)

def get_test_cases(page, timeout=30):
    """
    Download one page of SARD search results and save it as JSON.

    Args:
        page (int): 1-based page number to request (100 results per page).
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout a stalled connection would block its worker
            thread forever.

    Returns:
        str: A status message. It starts with "Downloaded" on success,
        "Failed" for a non-200 HTTP status and "Error" when an exception
        occurred; callers rely on these prefixes.
    """
    url = f"https://samate.nist.gov/SARD/api/test-cases/search?language%5B%5D=java&state%5B%5D=mixed&page={page}&limit=100"
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return f"Failed to download page {page}: HTTP {response.status_code}"

        data = response.json()
        # Written below data_cpp/json; the directory must already exist.
        filename = f"data_cpp/json/data-mixed-{page}.json"
        with open(filename, 'w') as f:
            json.dump(data, f, indent=2)
        return f"Downloaded page {page}"
    except Exception as e:
        # Best-effort: report the problem as a message so one bad page does
        # not abort the whole parallel run.
        return f"Error downloading page {page}: {str(e)}"

# Ask for the first page only to learn how many records the search matches
first_page = requests.get(
    "https://samate.nist.gov/SARD/api/test-cases/search?language%5B%5D=java&state%5B%5D=mixed&page=1&limit=100",
    timeout=30,  # never hang the notebook on a stalled connection
)
# Fail with a clear HTTP error instead of a confusing JSON/KeyError below
first_page.raise_for_status()
total_records = min(first_page.json()['total'], 2000)  # cap the crawl at 2000 records
print(total_records)
total_pages = (total_records + 99) // 100  # Ceiling division (100 records per page)

# Download pages in parallel
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(get_test_cases, page) for page in range(1, total_pages + 1)]

    for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading pages"):
        try:
            result = future.result()
            # get_test_cases reports problems through its message text
            if "Error" in result or "Failed" in result:
                print(f"\n{result}")
        except Exception as e:
            print(f"\nUnexpected error: {str(e)}")

2000


Downloading pages: 100%|██████████| 20/20 [00:05<00:00,  3.49it/s]


In [14]:
import os
import json
import requests
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
import time
import urllib3

# Tune the shared default Retry object of urllib3: more attempts, with backoff
default_retry = urllib3.util.retry.Retry.DEFAULT
default_retry.backoff_factor = 1
default_retry.total = 5

def download_file(url, download_dir, max_retries=3, retry_delay=2):
    """
    Download one archive into download_dir, retrying on failure.

    The local name is the last path segment of the URL with "-bad" inserted
    before the extension (e.g. "501239-v1.0.0.zip" -> "501239-v1.0.0-bad.zip").
    Data is streamed into a temporary "<name>.part" file that is renamed to
    the final name only after the whole body was received, so an interrupted
    transfer can never be mistaken for a finished download on the next run.

    Args:
        url (str): Address of the file to download.
        download_dir (str): Existing directory that receives the file.
        max_retries (int): Number of attempts made by this function (each
            attempt additionally uses the adapter's own retry policy).
        retry_delay (float): Base delay in seconds; the pause before the
            next attempt is retry_delay * attempt_number.

    Returns:
        str: A status message starting with "Skipped", "Successfully
        downloaded", "Error downloading" or "Failed to download"; callers
        rely on these prefixes.
    """
    filename = url.split('/')[-1]
    # Add the -bad suffix before the extension; a name without any dot simply
    # gets the suffix appended instead of raising IndexError.
    stem, dot, extension = filename.rpartition('.')
    filename = f"{stem}-bad.{extension}" if dot else f"{filename}-bad"

    filepath = os.path.join(download_dir, filename)
    partial_path = filepath + ".part"

    # Skip if file already exists and has size > 0
    if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
        return f"Skipped {filename} (already exists)"

    # One session serves all attempts; created outside the try block so the
    # final close() can never hit an unbound name.
    session = requests.Session()
    try:
        session.mount('https://', requests.adapters.HTTPAdapter(
            max_retries=urllib3.util.Retry(
                total=5,
                backoff_factor=1,
                status_forcelist=[500, 502, 503, 504]
            )
        ))

        for attempt in range(max_retries):
            try:
                response = session.get(url, stream=True, timeout=30)
                try:
                    if response.status_code != 200:
                        raise requests.exceptions.RequestException(f"HTTP {response.status_code}")
                    with open(partial_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)
                finally:
                    # Release the streamed connection even when the body was
                    # not consumed completely.
                    response.close()

                # Verify something was written before publishing the file
                if os.path.getsize(partial_path) == 0:
                    raise Exception("File was not written correctly")
                os.replace(partial_path, filepath)
                return f"Successfully downloaded {filename}"

            except Exception as e:
                # Drop whatever was written so far; a truncated file must not
                # survive under any name.
                if os.path.exists(partial_path):
                    os.remove(partial_path)
                if attempt < max_retries - 1:
                    time.sleep(retry_delay * (attempt + 1))  # Linearly growing delay
                    continue
                return f"Error downloading {filename} after {max_retries} attempts: {str(e)}"
    finally:
        session.close()

    # Only reached when max_retries is 0 or negative
    return f"Failed to download {filename} after all retries"

# Gather the 'download' link of every test case listed in the data-bad-* JSON pages
json_dir = "data_cpp/json"
download_urls = []
for page_name in os.listdir(json_dir):
    if not page_name.startswith("data-bad-"):
        continue
    with open(os.path.join(json_dir, page_name), 'r') as page_file:
        page = json.load(page_file)
    # Entries without a (truthy) download link are ignored
    links = (case.get('download') for case in page.get('testCases', []))
    download_urls.extend(link for link in links if link)

# Directory the archives are written to. Create exactly the directory that is
# handed to download_file, otherwise every open() fails on a fresh checkout.
download_dir = 'data_java1/zips'
os.makedirs(download_dir, exist_ok=True)

# Keep concurrency modest to avoid overwhelming the connection
max_workers = 20  # worker threads downloading at the same time
batch_size = 10   # downloads submitted between two throttle pauses

# Create a partial function with the download directory
download_func = partial(download_file, download_dir=download_dir)

# Keep track of failed downloads for potential retry
failed_downloads = []


def _failed_filename(result):
    """Return the filename named in a failure message of download_file, else None.

    Handles both failure formats ("Error downloading <name> after ..." and
    "Failed to download <name> after ...") and only looks at the message
    prefix, so a successfully downloaded file whose name contains "Error" or
    "Failed" is not reported as a failure.
    """
    for prefix in ("Error downloading ", "Failed to download "):
        if result.startswith(prefix):
            return result[len(prefix):].split(" after ", 1)[0]
    return None


# Use ThreadPoolExecutor for parallel downloads
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = []

    # Submit in small batches with a pause in between to throttle the start-up
    for i in range(0, len(download_urls), batch_size):
        batch = download_urls[i:i + batch_size]
        futures.extend(executor.submit(download_func, url) for url in batch)

        # Small delay between batches
        time.sleep(1)

    # Process completed downloads with progress bar
    with tqdm(total=len(download_urls), desc="Retrying failed downloads") as pbar:
        for future in as_completed(futures):
            try:
                result = future.result()
                failed_name = _failed_filename(result)
                if failed_name is not None:
                    print(f"\n{result}")
                    # Remember the file for a potential future retry
                    failed_downloads.append(failed_name)
            except Exception as e:
                print(f"\nUnexpected error: {str(e)}")
            finally:
                pbar.update(1)

# Print summary
print("\nDownload retry completed")
if failed_downloads:
    print(f"Files that still failed to download ({len(failed_downloads)}):")
    for file in failed_downloads:
        print(f"- {file}")
else:
    print("All files were downloaded successfully!")

Retrying failed downloads:  97%|█████████▋| 1939/2000 [00:00<00:00, 15876.26it/s]


Error downloading 501239-v1.0.0-bad.zip after 3 attempts: HTTP 404

Error downloading 501242-v1.0.0-bad.zip after 3 attempts: HTTP 404

Error downloading 501204-v1.0.0-bad.zip after 3 attempts: HTTP 404

Error downloading 501202-v1.0.0-bad.zip after 3 attempts: HTTP 404

Error downloading 501234-v1.0.0-bad.zip after 3 attempts: HTTP 404

Error downloading 501232-v1.0.0-bad.zip after 3 attempts: HTTP 404

Error downloading 501210-v1.0.0-bad.zip after 3 attempts: HTTP 404

Error downloading 501196-v1.0.0-bad.zip after 3 attempts: HTTP 404

Error downloading 501209-v1.0.0-bad.zip after 3 attempts: HTTP 404

Error downloading 501190-v1.0.0-bad.zip after 3 attempts: HTTP 404

Error downloading 501231-v1.0.0-bad.zip after 3 attempts: HTTP 404

Error downloading 501220-v1.0.0-bad.zip after 3 attempts: HTTP 404

Error downloading 501192-v1.0.0-bad.zip after 3 attempts: HTTP 404

Error downloading 501205-v1.0.0-bad.zip after 3 attempts: HTTP 404

Error downloading 501188-v1.0.0-bad.zip after 3

Retrying failed downloads: 100%|██████████| 2000/2000 [00:06<00:00, 300.77it/s]  


Download retry completed
Files that still failed to download (60):
- 501239-v1.0.0-bad.zip
- 501242-v1.0.0-bad.zip
- 501204-v1.0.0-bad.zip
- 501202-v1.0.0-bad.zip
- 501234-v1.0.0-bad.zip
- 501232-v1.0.0-bad.zip
- 501210-v1.0.0-bad.zip
- 501196-v1.0.0-bad.zip
- 501209-v1.0.0-bad.zip
- 501190-v1.0.0-bad.zip
- 501231-v1.0.0-bad.zip
- 501220-v1.0.0-bad.zip
- 501192-v1.0.0-bad.zip
- 501205-v1.0.0-bad.zip
- 501188-v1.0.0-bad.zip
- 501237-v1.0.0-bad.zip
- 501208-v1.0.0-bad.zip
- 501184-v1.0.0-bad.zip
- 501186-v1.0.0-bad.zip
- 501195-v1.0.0-bad.zip
- 501222-v1.0.0-bad.zip
- 501233-v1.0.0-bad.zip
- 501203-v1.0.0-bad.zip
- 501199-v1.0.0-bad.zip
- 501193-v1.0.0-bad.zip
- 501227-v1.0.0-bad.zip
- 501191-v1.0.0-bad.zip
- 501235-v1.0.0-bad.zip
- 501217-v1.0.0-bad.zip
- 501187-v1.0.0-bad.zip
- 501215-v1.0.0-bad.zip
- 501225-v1.0.0-bad.zip
- 501212-v1.0.0-bad.zip
- 501206-v1.0.0-bad.zip
- 501214-v1.0.0-bad.zip
- 501236-v1.0.0-bad.zip
- 501240-v1.0.0-bad.zip
- 501183-v1.0.0-bad.zip
- 501229-v1.0.0-bad.




In [4]:
# Unzip files from data_java1/zips to data_java1/unzips using ThreadPoolExecutor
import os
import zipfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def unzip_file(zip_filename, base_dir='data_java1'):
    """
    Unzip a single zip file into its own folder.

    The archive is read from <base_dir>/zips/<zip_filename> and extracted to
    <base_dir>/unzips/<zip_filename without the trailing ".zip">.

    Args:
        zip_filename (str): Name of the zip file to unzip
        base_dir (str): Dataset root that holds the 'zips' and 'unzips'
            folders. Defaults to 'data_java1'.

    Returns:
        str: "Successfully unzipped: <name>" on success, otherwise
        "Error unzipping <name>: <reason>". Errors are reported through the
        message rather than raised so a bad archive does not stop a batch.
    """
    zip_path = os.path.join(base_dir, 'zips', zip_filename)
    # Remove only the trailing ".zip"; cutting at the first ".zip" anywhere in
    # the name would send e.g. "a.zip.b.zip" and "a.zip" to the same folder.
    if zip_filename.endswith('.zip'):
        folder_name = zip_filename[:-len('.zip')]
    else:
        folder_name = zip_filename
    target_dir = os.path.join(base_dir, 'unzips', folder_name)
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Extract to the destination directory
            zip_ref.extractall(target_dir)
        return f"Successfully unzipped: {zip_filename}"
    except Exception as e:
        return f"Error unzipping {zip_filename}: {str(e)}"

def main():
    """Extract every .zip in data_java1/zips in parallel, then list data_java1/unzips."""
    unzips_dir = os.path.join('data_java1', 'unzips')
    zips_dir = os.path.join('data_java1', 'zips')

    # The destination must exist before any worker starts extracting
    os.makedirs(unzips_dir, exist_ok=True)

    # Only names ending in .zip are handed to the workers
    zip_files = [name for name in os.listdir(zips_dir) if name.endswith('.zip')]

    print(f"Found {len(zip_files)} zip files to process")

    def report(done_future, progress):
        # Print failures as they arrive and always advance the progress bar
        try:
            outcome = done_future.result()
            if "Error" in outcome:
                print(f"\n{outcome}")
        except Exception as e:
            print(f"\nUnexpected error: {str(e)}")
        finally:
            progress.update(1)

    # 20 worker threads unzip concurrently
    with ThreadPoolExecutor(max_workers=20) as executor:
        futures = [executor.submit(unzip_file, zip_name) for zip_name in zip_files]

        with tqdm(total=len(zip_files), desc="Unzipping files") as pbar:
            for finished in as_completed(futures):
                report(finished, pbar)

    # Show what ended up in the destination directory
    print("\nExtracted files:")
    for extracted_name in os.listdir(unzips_dir):
        print(f"- {extracted_name}")

if __name__ == '__main__':
    main()

Found 3848 zip files to process


Unzipping files: 100%|██████████| 3848/3848 [00:24<00:00, 157.38it/s]



Extracted files:
- 155089-v1.0.0-bad
- 155090-v1.0.0-bad
- 155091-v1.0.0-bad
- 155092-v1.0.0-bad
- 155093-v1.0.0-bad
- 155094-v1.0.0-bad
- 155095-v1.0.0-bad
- 155096-v1.0.0-bad
- 155097-v1.0.0-bad
- 155098-v1.0.0-bad
- 155099-v1.0.0-bad
- 155100-v1.0.0-bad
- 155101-v1.0.0-bad
- 155102-v1.0.0-bad
- 155103-v1.0.0-bad
- 155104-v1.0.0-bad
- 155105-v1.0.0-bad
- 155106-v1.0.0-bad
- 155107-v1.0.0-bad
- 155108-v1.0.0-bad
- 155109-v1.0.0-bad
- 155110-v1.0.0-bad
- 155111-v1.0.0-bad
- 155112-v1.0.0-bad
- 155113-v1.0.0-bad
- 155114-v1.0.0-bad
- 155115-v1.0.0-bad
- 155116-v1.0.0-bad
- 155117-v1.0.0-bad
- 155118-v1.0.0-bad
- 155119-v1.0.0-bad
- 155120-v1.0.0-bad
- 155121-v1.0.0-bad
- 155122-v1.0.0-bad
- 155123-v1.0.0-bad
- 155124-v1.0.0-bad
- 155125-v1.0.0-bad
- 155126-v1.0.0-bad
- 155127-v1.0.0-bad
- 155128-v1.0.0-bad
- 155129-v1.0.0-bad
- 155130-v1.0.0-bad
- 155131-v1.0.0-bad
- 155132-v1.0.0-bad
- 155133-v1.0.0-bad
- 155134-v1.0.0-bad
- 155135-v1.0.0-bad
- 155136-v1.0.0-bad
- 155137-v1.0.0-bad
- 

In [6]:
import os
# Look at the top level of the unzips tree only (first item produced by os.walk)
top_level = next(os.walk('data_java1/unzips'), None)
if top_level is not None:
    root, dirs, files = top_level
    print(f"Root: {root}")
    print(f"Directories: {dirs}")
    print(f"Files: {len(files)}")

Root: data_java1/unzips
Directories: ['155089-v1.0.0-bad', '155090-v1.0.0-bad', '155091-v1.0.0-bad', '155092-v1.0.0-bad', '155093-v1.0.0-bad', '155094-v1.0.0-bad', '155095-v1.0.0-bad', '155096-v1.0.0-bad', '155097-v1.0.0-bad', '155098-v1.0.0-bad', '155099-v1.0.0-bad', '155100-v1.0.0-bad', '155101-v1.0.0-bad', '155102-v1.0.0-bad', '155103-v1.0.0-bad', '155104-v1.0.0-bad', '155105-v1.0.0-bad', '155106-v1.0.0-bad', '155107-v1.0.0-bad', '155108-v1.0.0-bad', '155109-v1.0.0-bad', '155110-v1.0.0-bad', '155111-v1.0.0-bad', '155112-v1.0.0-bad', '155113-v1.0.0-bad', '155114-v1.0.0-bad', '155115-v1.0.0-bad', '155116-v1.0.0-bad', '155117-v1.0.0-bad', '155118-v1.0.0-bad', '155119-v1.0.0-bad', '155120-v1.0.0-bad', '155121-v1.0.0-bad', '155122-v1.0.0-bad', '155123-v1.0.0-bad', '155124-v1.0.0-bad', '155125-v1.0.0-bad', '155126-v1.0.0-bad', '155127-v1.0.0-bad', '155128-v1.0.0-bad', '155129-v1.0.0-bad', '155130-v1.0.0-bad', '155131-v1.0.0-bad', '155132-v1.0.0-bad', '155133-v1.0.0-bad', '155134-v1.0.0-ba

In [None]:
import shutil
import os

# Make sure the destination root for the copied C/C++ sources exists
os.makedirs('data_cpp/cpp-src', exist_ok=True)

# For every directory in the unzips tree whose name ends with "good", copy all
# .cpp/.c files found anywhere below it into data_cpp/cpp-src/<directory name>/
for parent, subdirs, _ in os.walk('data_cpp/unzips'):
    for case_dir in subdirs:
        if not case_dir.endswith("good"):
            continue
        for current, _, names in os.walk(os.path.join(parent, case_dir)):
            for name in names:
                if not name.endswith((".cpp", ".c")):
                    continue
                source_file_path = os.path.join(current, name)
                # The per-case folder is created lazily, only when there is a file to copy
                os.makedirs("data_cpp/cpp-src" + "/" + case_dir, exist_ok=True)
                dest_file_path = os.path.join("data_cpp/cpp-src", case_dir, f"{name}")
                print(dest_file_path)
                shutil.copy2(source_file_path, dest_file_path)

print("Done")

data_cpp/cpp-src\1448-v1.0.0-good\badfree_024.c
data_cpp/cpp-src\1458-v1.0.0-good\memoryleak_002.c
data_cpp/cpp-src\1487-v1.0.0-good\Figure2-3-windows.cpp
data_cpp/cpp-src\1489-v1.0.0-good\Figure2-5-windows.cpp
data_cpp/cpp-src\1495-v1.0.0-good\Figure2-29-windows.cpp
data_cpp/cpp-src\1498-v1.0.0-good\Figure2-33-windows.cpp
data_cpp/cpp-src\1503-v1.0.0-good\Figure3-10-windows.cpp
data_cpp/cpp-src\1953-v1.0.0-good\HeapOverflow_good.cpp
data_cpp/cpp-src\1956-v1.0.0-good\HeapOverflow_Scope_good.cpp
data_cpp/cpp-src\1959-v1.0.0-good\HeapOverflow_ArrayAddress_good.cpp
data_cpp/cpp-src\1960-v1.0.0-good\LeftOverDebug_good.cpp
data_cpp/cpp-src\1962-v1.0.0-good\HeapOverflow_ArrayIndex_good.cpp
data_cpp/cpp-src\1963-v1.0.0-good\memory_leak_basic_good.cpp
data_cpp/cpp-src\1966-v1.0.0-good\xss_basic_good.cpp
data_cpp/cpp-src\1968-v1.0.0-good\memory_leak_container_good.cpp
data_cpp/cpp-src\1970-v1.0.0-good\memory_leak_control_flow_good.cpp
data_cpp/cpp-src\1972-v1.0.0-good\StackOverflow_good.cpp
dat

In [3]:
import os
import shutil
import json

def extract_mixed_java_files(unzips_dir='data_cpp/unzips', cpp_src_dir='data_cpp/cpp-src', extensions=('.cpp',)):
    """
    Copy the source files referenced by the SARIF manifest of each "mixed" test case.

    For each folder in unzips_dir whose name ends with 'mixed', parse its
    manifest.sarif, read the file paths from the
    'runs' -> 'results' -> 'locations' -> 'physicalLocation' ->
    'artifactLocation' -> 'uri' fields, and copy every referenced file whose
    name ends with one of `extensions` into cpp_src_dir/<unzipped_folder>/.
    Files are stored under their base name only, so two referenced files with
    the same name in one test case end up as a single file.

    The function name is historical; with the default arguments it collects
    C++ (.cpp) files.

    Args:
        unzips_dir (str): Directory containing the extracted test-case folders.
        cpp_src_dir (str): Destination root; created if missing.
        extensions (str or iterable of str): File-name suffix(es) to copy.
            Defaults to ('.cpp',).

    Returns:
        None. Progress, warnings and a final "Done" are printed.
    """
    # Accept a single suffix as well as a collection of suffixes
    if isinstance(extensions, str):
        extensions = (extensions,)
    else:
        extensions = tuple(extensions)

    os.makedirs(cpp_src_dir, exist_ok=True)
    for entry in os.listdir(unzips_dir):
        if not entry.endswith('mixed'):
            continue

        mixed_folder = os.path.join(unzips_dir, entry)
        manifest_path = os.path.join(mixed_folder, 'manifest.sarif')
        if not os.path.isfile(manifest_path):
            print(f"Warning: manifest.sarif not found in {mixed_folder}")
            continue
        try:
            with open(manifest_path, 'r', encoding='utf-8') as f:
                manifest = json.load(f)
        except Exception as e:
            # Best-effort: one unreadable manifest must not stop the batch
            print(f"Error reading {manifest_path}: {e}")
            continue

        # Defensive: any level of the SARIF structure may be missing or null
        for run in manifest.get('runs') or []:
            for result in run.get('results') or []:
                for loc in result.get('locations') or []:
                    physical_loc = loc.get('physicalLocation') or {}
                    artifact_loc = physical_loc.get('artifactLocation') or {}
                    uri = artifact_loc.get('uri')
                    if not uri or not uri.endswith(extensions):
                        continue
                    # URIs use '/', convert to the local path separator
                    src_file_path = os.path.join(mixed_folder, uri.replace('/', os.sep))
                    if not os.path.isfile(src_file_path):
                        print(f"Source file not found: {src_file_path}")
                        continue
                    # Create destination directory for this mixed folder
                    dest_dir = os.path.join(cpp_src_dir, entry)
                    os.makedirs(dest_dir, exist_ok=True)
                    dest_file_path = os.path.join(dest_dir, os.path.basename(uri))
                    print(f"Copying {src_file_path} -> {dest_file_path}")
                    shutil.copy2(src_file_path, dest_file_path)
    print("Done")

# Run the extraction with the function's default source and destination directories.
extract_mixed_java_files()

Copying data_cpp/unzips\233705-v2.0.0-mixed\src\testcases\CWE122_Heap_Based_Buffer_Overflow\s09\CWE122_Heap_Based_Buffer_Overflow__c_CWE806_wchar_t_memcpy_81_bad.cpp -> data_cpp/cpp-src\233705-v2.0.0-mixed\CWE122_Heap_Based_Buffer_Overflow__c_CWE806_wchar_t_memcpy_81_bad.cpp
Copying data_cpp/unzips\233843-v2.0.0-mixed\src\testcases\CWE122_Heap_Based_Buffer_Overflow\s10\CWE122_Heap_Based_Buffer_Overflow__c_CWE806_wchar_t_snprintf_72b.cpp -> data_cpp/cpp-src\233843-v2.0.0-mixed\CWE122_Heap_Based_Buffer_Overflow__c_CWE806_wchar_t_snprintf_72b.cpp
Copying data_cpp/unzips\234107-v2.0.0-mixed\src\testcases\CWE122_Heap_Based_Buffer_Overflow\s10\CWE122_Heap_Based_Buffer_Overflow__c_src_wchar_t_cpy_82_bad.cpp -> data_cpp/cpp-src\234107-v2.0.0-mixed\CWE122_Heap_Based_Buffer_Overflow__c_src_wchar_t_cpy_82_bad.cpp
Copying data_cpp/unzips\234115-v2.0.0-mixed\src\testcases\CWE122_Heap_Based_Buffer_Overflow\s11\CWE122_Heap_Based_Buffer_Overflow__placement_new_06.cpp -> data_cpp/cpp-src\234115-v2.0.0-

In [5]:
import shutil
import os

# Delete every .cpp file found anywhere below data_c/c-src.
# NOTE: the files are removed, not moved; nothing is copied elsewhere first.
for root, dirs, files in os.walk('data_c/c-src'):
    for file in files:
        if file.endswith('.cpp'):
            # Delete the file
            os.remove(os.path.join(root, file))
            print(f"Deleted {file} from {root}")

# Prune directories that are empty, visiting the deepest ones first so that a
# parent emptied by removing its children can be removed as well
for parent, subdirs, _ in os.walk('data_c/c-src', topdown=False):
    for subdir in subdirs:
        candidate = os.path.join(parent, subdir)
        try:
            os.rmdir(candidate)
            print(f"Removed empty directory: {candidate}")
        except OSError:
            # rmdir refuses directories that still have content; leave them
            continue

# Finally attempt to drop the c-src root itself
try:
    os.rmdir('data_c/c-src')
    print("\nRemoved c-src directory")
except OSError:
    print("\nCould not remove c-src directory (may not be empty)")


Deleted Figure2-1-windows.cpp from data_c/c-src\1485-v1.0.0
Deleted Figure2-2-windows.cpp from data_c/c-src\1486-v1.0.0
Deleted Figure2-3-windows.cpp from data_c/c-src\1487-v1.0.0
Deleted Figure2-4-windows.cpp from data_c/c-src\1488-v1.0.0
Deleted RenderListMarker.cpp from data_c/c-src\148819-v1.0.0
Deleted Geolocation.cpp from data_c/c-src\148820-v1.0.0
Deleted Element.cpp from data_c/c-src\148821-v1.0.0
Deleted markup.cpp from data_c/c-src\148822-v1.0.0
Deleted Element.cpp from data_c/c-src\148823-v1.0.0
Deleted FixedTableLayout.cpp from data_c/c-src\148826-v1.0.0
Deleted EventHandler.cpp from data_c/c-src\148827-v1.0.0
Deleted HTMLFrameElementBase.cpp from data_c/c-src\148827-v1.0.0
Deleted Element.cpp from data_c/c-src\148828-v1.0.0
Deleted EventHandler.cpp from data_c/c-src\148828-v1.0.0
Deleted FixedTableLayout.cpp from data_c/c-src\148828-v1.0.0
Deleted Geolocation.cpp from data_c/c-src\148828-v1.0.0
Deleted HTMLFrameElementBase.cpp from data_c/c-src\148828-v1.0.0
Deleted markup