In [35]:
import os
import re
import csv
from pathlib import Path

def clean_description(description):
    # Remove attribution lines (e.g., "_Submitted by ..._")
    description = re.sub(r"_Submitted by.*?_", "", description, flags=re.DOTALL)
    
    # Remove markdown bold links (e.g., **[username confirmed](link)**)
    description = re.sub(r"\*\*\[.*?\]\(.*?\)\*\*", "", description)
    
    # Remove dispute/confirmation messages and any following quoted text
    description = re.sub(r"\*\*\[[^\]]+\]\(https:\/\/github\.com\/[^)]+\):\*\*\s*(?:>.*(?:\n|$))*", "", description)

    # Remove extra whitespace
    return description.strip()

def extract_vulnerabilities(md_file):
    with open(md_file, "r", encoding="utf-8") as file:
        content = file.read()
    
    severity_levels = {"H": "high", "M": "medium", "L": "low"}
    
    # Improved regex to properly separate vulnerabilities
    findings = re.findall(
        r"## \[\[(H|M|L)-\d+\] (.*?)\]\(.*?\)\s*\n+"  # Capture severity & title
        r"([\s\S]*?)(?=\n## \[\[|\n# |\Z)",  # Capture description until the next vulnerability header (## [[)
        content, re.DOTALL
    )
    
    extracted_data = []
    for severity, title, description in findings:
        cleaned_description = clean_description(description)

        severity_final = severity_levels.get(severity)
        
        extracted_data.append([
            os.path.basename(md_file), title.strip(), severity_final, cleaned_description
        ])
    
    return extracted_data

def process_markdown_files(directory):
    vulnerabilities = []
    
    for md_file in Path(directory).rglob("*.md"):
        vulnerabilities.extend(extract_vulnerabilities(md_file))
    
    with open("vulnerabilities.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["file_name", "vuln_title", "severity", "description"])
        writer.writerows(vulnerabilities)
    
    print(f"Extracted {len(vulnerabilities)} vulnerabilities into vulnerabilities.csv")

# Example usage
process_markdown_files("Web3Bugs/reports")

Extracted 1744 vulnerabilities into vulnerabilities.csv


In [36]:
import pandas as pd 

df = pd.read_csv("vulnerabilities.csv")
df

Unnamed: 0,file_name,vuln_title,severity,description
0,35.md,Unsafe cast in `ConcentratedLiquidityPool.burn...,high,The `ConcentratedLiquidityPool.burn` function ...
1,35.md,Wrong usage of `positionId` in `ConcentratedLi...,high,#### Impact\nIn the `subscribe` function of `C...
2,35.md,`ConcentratedLiquidityPoolManager`'s incentive...,high,The `ConcentratedLiquidityPoolManager` keeps a...
3,35.md,Overflow in the `mint` function of `Concentrat...,high,#### Impact\nSimilar to a previous finding in ...
4,35.md,Incorrect usage of typecasting in `_getAmounts...,high,#### Impact\nThe `_getAmountsForLiquidity` fun...
...,...,...,...,...
1739,52.md,Unclear `TwapOracle.consult` algorithm,medium,The `TWAPOracle.consult` function is unclear t...
1740,52.md,Tokens with fee on transfer are not supported,medium,There are ERC20 tokens that charge fee for eve...
1741,52.md,VaderPoolV2.rescue results in loss of funds ra...,medium,#### Impact\n\nAny unaccounted for tokens on `...
1742,52.md,No way to remove GasThrottle after deployment,medium,#### Impact\n\nPotential DOS on swaps\n\n#### ...
