In [10]:
import os
import pandas as pd

# Column layout of the frame returned by create_cpv_dataframe. Passing it
# explicitly keeps the schema stable even when no cpv directory is found.
_CPV_COLUMNS = [
    "cpv_id",
    "CPVINFO",
    "pov_pou_info",
    "private_test",
    "bad_patch",
    "good_patch",
    "test_input",
]


def _existing_path(*parts):
    """Join ``parts`` into one path and return it if it exists, else None."""
    candidate = os.path.join(*parts)
    return candidate if os.path.exists(candidate) else None


def _first_entry(directory, suffixes=None):
    """Return the path of the alphabetically first entry inside ``directory``.

    Args:
        directory (str): Directory to list.
        suffixes (tuple[str, ...] | None): When given, only entries whose name
            ends with one of these suffixes are considered.

    Returns:
        str | None: Path of the selected entry, or None when ``directory`` is
        not a directory or has no matching entry.
    """
    if not os.path.isdir(directory):
        return None
    # os.listdir returns names in arbitrary order; sort so that the same
    # entry is picked on every run and on every platform.
    names = sorted(os.listdir(directory))
    if suffixes is not None:
        names = [name for name in names if name.endswith(suffixes)]
    return os.path.join(directory, names[0]) if names else None


def create_cpv_dataframe(root_dir):
    """
    Crawls a directory structure, collects file paths for each cpv folder,
    and returns a pandas DataFrame.

    Every sub-directory of ``root_dir`` whose name starts with ``cpv`` yields
    one row. For each of them the following paths are looked up (None when
    absent):

    * ``CPVINFO``      -> ``<cpv>/CPVINFO.md``
    * ``pov_pou_info`` -> ``<cpv>/pov_pou_info``
    * ``private_test`` -> first entry (alphabetically) of ``<cpv>/private_tests``
    * ``bad_patch``    -> ``<cpv>/patches/nginx/bad_patch.diff``
    * ``good_patch``   -> ``<cpv>/patches/nginx/good_patch.diff``
    * ``test_input``   -> first ``.bin``/``.blob`` entry (alphabetically) of
      ``<cpv>/blobs``

    Args:
        root_dir (str): The path to the root directory to start crawling from.

    Returns:
        pandas.DataFrame: One row per cpv directory, ordered by directory
        name, with the columns ``cpv_id``, ``CPVINFO``, ``pov_pou_info``,
        ``private_test``, ``bad_patch``, ``good_patch`` and ``test_input``.
        The columns are present even when no cpv directory exists.
    """
    records = []
    # Sorted for a reproducible row order (plain string order: cpv1, cpv10, ..., cpv2).
    for entry in sorted(os.listdir(root_dir)):
        cpv_dir = os.path.join(root_dir, entry)
        if not (entry.startswith('cpv') and os.path.isdir(cpv_dir)):
            continue

        patches_dir = os.path.join(cpv_dir, "patches", "nginx")
        records.append({
            "cpv_id": entry,
            "CPVINFO": _existing_path(cpv_dir, "CPVINFO.md"),
            "pov_pou_info": _existing_path(cpv_dir, "pov_pou_info"),
            "private_test": _first_entry(os.path.join(cpv_dir, "private_tests")),
            "bad_patch": _existing_path(patches_dir, "bad_patch.diff"),
            "good_patch": _existing_path(patches_dir, "good_patch.diff"),
            "test_input": _first_entry(os.path.join(cpv_dir, "blobs"), ('.bin', '.blob')),
        })

    return pd.DataFrame(records, columns=_CPV_COLUMNS)

In [11]:
# Raw string literal: the Windows path contains backslash sequences
# (\M, \w, \c, \.) that are not valid escapes and would otherwise trigger an
# invalid-escape-sequence warning. The resulting string value is unchanged.
df = create_cpv_dataframe(r"D:\My Notes\My Vault\work\challenge-004-nginx-cp\.internal_only")

In [12]:
# Display the DataFrame of collected file paths (one row per cpv directory).
df

Unnamed: 0,cpv_id,CPVINFO,pov_pou_info,private_test,bad_patch,good_patch,test_input
0,cpv1,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...
1,cpv10,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...
2,cpv11,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...
3,cpv12,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...
4,cpv13,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...
5,cpv14,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...
6,cpv15,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...
7,cpv17,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...
8,cpv2,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...
9,cpv3,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...,D:\My Notes\My Vault\work\challenge-004-nginx-...


In [13]:
def read_files_to_dataframe(df_paths):
    """
    Reads the content of files from a DataFrame of paths.

    The ``cpv_id`` column is copied through unchanged. Every other cell is
    treated as a file path and replaced by the raw bytes of that file.

    Args:
        df_paths (pandas.DataFrame): A DataFrame with a ``cpv_id`` column where
            each cell in the remaining columns is a file path (or a missing
            value such as None/NaN).

    Returns:
        pandas.DataFrame: A DataFrame with ``cpv_id`` first, followed by the
        other input columns. Each cell holds the file content as ``bytes``,
        ``None`` when the cell is not a path or the path does not exist, or the
        string ``"Error reading file: ..."`` when the path exists but cannot be
        read.

    Raises:
        KeyError: If ``df_paths`` has rows but no ``cpv_id`` column.
    """
    # cpv_id always comes first, matching the order in which row dicts are built.
    columns = ["cpv_id"] + [name for name in df_paths.columns if name != "cpv_id"]

    data = []
    for _, row in df_paths.iterrows():
        row_data = {"cpv_id": row["cpv_id"]}
        for col_name, file_path in row.items():
            if col_name == "cpv_id":
                continue

            content = None
            # Missing values may arrive as NaN (a truthy float); passing that
            # to os.path.exists raises TypeError, so only accept path-like values.
            is_path = isinstance(file_path, (str, bytes, os.PathLike))
            if is_path and file_path and os.path.exists(file_path):
                try:
                    # Using 'rb' for read binary to handle all file types without error
                    with open(file_path, 'rb') as f:
                        content = f.read()
                except OSError as e:
                    # Best effort: keep going and record why this file could not be read
                    # (e.g. the path is a directory or permission is denied).
                    content = f"Error reading file: {e}"
            row_data[col_name] = content
        data.append(row_data)

    # Explicit columns keep the schema intact when df_paths has no rows.
    return pd.DataFrame(data, columns=columns)

In [15]:
# Replace every path in the frame with the content of the file it points to.
# NOTE(review): this rebinds `df`, so the cell is not idempotent — running it a
# second time feeds file contents (not paths) back into the reader. Re-run the
# cell that builds the path frame before re-running this one.
df = read_files_to_dataframe(df)

In [17]:
# Write the frame to sample_errors.csv without the index column.
# NOTE(review): the cells hold raw bytes (files are read in 'rb' mode), so they
# are presumably written as their Python repr (b'...') — confirm this is the
# intended on-disk format.
df.to_csv("sample_errors.csv", index=False)

In [18]:
# Preview the first three rows of the file-content frame.
df.head(3)

Unnamed: 0,cpv_id,CPVINFO,pov_pou_info,private_test,bad_patch,good_patch,test_input
0,cpv1,"b""# Overview\r\n\r\nID: cpv1 \r\nType: [CWE-...","b'pov_harness, AddressSanitizer: heap-buffer-o...",b'#!/usr/bin/perl\r\n\r\n# (C) Maxim Dounin\r\...,"b""diff --git a/src/http/ngx_http_request.c b/s...","b""diff --git a/src/http/ngx_http_request.c b/s...",b'GET / HTTP/1.1\r\nHost: localhost\r\nUser-Ag...
1,cpv10,b'# Overview\r\n\r\nID: cpv10<br>\r\nType: [CW...,"b'pov_harness, AddressSanitizer: attempting do...","b""#!/usr/bin/perl\r\n\r\n# (C) Maxim Dounin\r\...",b'diff --git a/src/http/ngx_http_request.c b/s...,b'diff --git a/src/http/ngx_http_request.c b/s...,b'GET / HTTP/1.1\r\nHost: localhost\r\nPrefer:...
2,cpv11,b'# Overview\r\n\r\nID: cpv11<br>\r\nType: [CW...,"b'pov_harness, AddressSanitizer: heap-use-afte...",b'#!/usr/bin/perl\r\n\r\n# (C) Maxim Dounin\r\...,b'diff --git a/src/core/ngx_cycle.c b/src/core...,b'diff --git a/src/core/ngx_cycle.c b/src/core...,b'GET /host_specs HTTP/1.1\r\nHost: localhost\...


# column explanation

1. cpv_id

This column serves as a unique identifier for each entry in the CSV file. "CPV" most likely stands for "Challenge Project Vulnerability" (the dataset root is named challenge-004-nginx-cp, i.e. a challenge project). Each cpv entry, like cpv1, cpv10, etc., represents a distinct vulnerability case.

2. CPVINFO

This column contains a detailed, human-readable description of the vulnerability. It typically includes:

An Overview: A summary of the vulnerability, including its ID and type, often referencing a Common Weakness Enumeration (CWE) identifier. For example, CWE-787 for "Out-of-bounds Write".

Details: An in-depth explanation of how the vulnerability is introduced into the code. This section describes the new functionality being added and how that new code contains a flaw.

Vulnerability Explanation: A specific breakdown of the nature of the vulnerability, explaining how it can be triggered. For instance, in cpv1, it mentions that two consecutive dots (..) at the beginning of an email address can cause a buffer underrun.

Difficulty Assessment: For some entries, it provides an estimation of how difficult the vulnerability is to discover and to patch (e.g., "Hard" to discover, "Easy" to patch).

3. pov_pou_info

This column appears to provide information related to the "Proof of Vulnerability" (POV) and "Proof of Understanding" (POU). It often names the harness and the sanitizer report that detected the vulnerability, such as "pov_harness, AddressSanitizer: heap-buffer-overflow". This indicates the tool and the specific error it reported, confirming the presence of the vulnerability.

4. private_test

This column contains a test script, often written in Perl, used to verify the functionality related to the patched code. It's considered a "private" test in the sense that it's part of the development and testing process and not typically exposed to end-users. These scripts automate the process of sending specific requests to the NGINX server to check its behavior.

5. bad_patch

This column displays a code diff that introduces the vulnerability. The diff format shows the changes made to the source code files. It highlights the lines of code that were added or modified, which result in the security flaw described in the CPVINFO column. This is essentially the "vulnerable" version of the code.

6. good_patch

Conversely, this column contains a code diff that represents the fix for the vulnerability. It shows the necessary changes to correct the flawed logic introduced in the "bad_patch". This is the secure version of the code that mitigates the identified vulnerability.

7. test_input

This column provides the specific input data required to trigger the vulnerability. This is a crucial piece of information for demonstrating the exploit. The input is often in the form of a raw HTTP request. For example, in cpv2, the test_input is a GET request with a very long, specially crafted Authorization header designed to cause a buffer overflow.

# high-level analysis

The vulnerabilities presented are exclusively classic C-style memory safety and logic errors introduced into the NGINX codebase. The analysis reveals three dominant patterns:
- New Feature Flaws: Every vulnerability is introduced while adding a new feature, highlighting the inherent risk of extending a complex, mature system.
- Input Validation Failures: The vast majority of bugs stem from insufficient validation of user-controlled input, whether it's the input's size, format, or sequence.
- Context-Dependent Complexity: Many vulnerabilities are simple to patch (often a one-line fix) but are rated as "hard" to discover. This difficulty arises from complex control flows, state management, and resource lifecycles, where the error's context is more critical than the code itself.

# categorization

## category 1: Buffer Overflows (Out-of-Bounds Writes)

This is the most prevalent category, where data is written past the allocated boundary of a buffer. The root cause is consistently a failure to validate the size of user-provided input before a copy operation.
- cpv2 (Heap-based Buffer Overflow): A fixed-size buffer is used for decoding a Base64 Authorization header. A long encoded string causes an overflow. This is a fundamental input validation error.
- cpv3 (Out of Bounds Write): A new TRACE request handler allocates a small, 200-byte fixed buffer to copy all incoming headers. A request with many or long headers easily overflows this buffer.
- cpv8 (Out of Bounds Write): In the POP3 mail handler, a buffer for a username is allocated with a fixed size of 100 bytes, but a longer username from the client can be copied into it, causing an overflow.
- cpv12 (Heap-based Buffer Overflow): A feature to serve a reversed byte-range of a file allocates a buffer using a hard-coded maximum size instead of the requested size, leading to an overflow.
- cpv15 (Out-of-bounds write): An overflow is caused by removing a length check on a uid cookie value. An overly long cookie leads to an out-of-bounds write when the value is base64-decoded.
- cpv1 (Out-of-bounds write): An interesting variant where a logical flaw in handling .. at the start of an email in the From header leads to a buffer underrun (writing before the buffer's start).
- cpv4 (Heap buffer overflow): A more subtle overflow where the buffer size calculation for a new Browser-Cookie for Safari users fails to account for the length of a pre-existing cookie that gets appended to it. This requires multiple conditions to be met: a specific User-Agent and the presence of a Cookie header.

### how to reveal it
- taint analysis (static)
    1. Sources: The analysis tool identifies where untrusted, external data enters the program. This data is marked as "tainted."
    2. Sinks: The tool has a list of "dangerous" functions (sinks) where tainted data can cause harm if not handled properly. These are typically memory copy or formatting functions.
    3. Flow Tracking: The tool analyzes all possible execution paths to see if tainted data from a source can reach a dangerous sink without first being validated or "sanitized."

- fuzzing 
    - how to smartly mutate the input? 
    - type-aware mutation. For example, a Base64 string input is an interesting target: mutate it into values that are longer than the buffer the decoder expects for this data type

- instrumentations
    - adds memory safety checks to the compiled code (this is basically what AddressSanitizer (ASan) does)
    - when memory is allocated (e.g., by ngx_pnalloc), ASan surrounds this memory block with small, inaccessible "redzones."
    - if the program attempts to write or read even one byte into a redzone, ASan immediately stops the program and reports a buffer overflow, providing the exact location of the error.

## category 2: Use-After-Free (UAF) and Double Free

These vulnerabilities involve the incorrect handling of memory lifecycles, leading to the use of pointers that no longer point to valid memory. They are generally more complex to identify as they often involve interactions between different functions or states.
- cpv9 (Use-After-Free): Occurs when removing an IP from a custom blacklist. The code frees the memory for the list entry but fails to correctly re-link the surrounding entries in the doubly-linked list, leaving a dangling pointer.
- cpv10 (Double Free): Triggered when a request contains more than one Prefer header. The code frees the memory associated with the header but does not nullify the pointer, allowing a second free operation on the same memory.
- cpv11 (Use-After-Free): A subtle logic flaw where memory for host specs is allocated before checking if the feature is enabled. If the feature is off, the memory is freed, but the pointers are not nulled, leading to a UAF if the pointers are accessed later. This is notable for its dependency on runtime configuration.
- cpv17 (Use-After-Free): An error in the SMTP NOOP command handler. The function prematurely closes the connection (which frees the associated memory pool) and returns an error. The calling function, unaware of the cleanup, then tries to access the freed connection structure, causing a crash.

### how to reveal it
- instrumentation + fuzzing
    - fuzzing to explore the path
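    - with ASan instrumentation, when free() is called on a piece of memory, the memory is not immediately made available for reuse.
    - instead, ASan places the memory in a "quarantine" for a short period.
    - the quarantined memory is poisoned, so any access to it is reported as invalid (use-after-free), and a second free() of it is reported as a double free.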



## category 3: Null Pointer Dereferences & Logic Flaws

These vulnerabilities stem from incorrect program logic, often related to iterating over data structures or handling control flow.
- cpv5 (Null Pointer Dereference): A faulty for loop attempts to retrieve connection history from a linked list. On the second request for this data, the loop iterates past the end of the list and dereferences a NULL pointer. This requires two sequential requests to trigger.
- cpv13 (Null-Pointer Dereference): In the POP3 mail handler, a combination of a needlessly complex pointer-to-a-pointer and an incorrect loop condition for iterating through a list of users causes the code to read past the end and dereference a NULL pointer.
- cpv14 (Out of Bounds Read): A subtle control-flow bug. A function correctly identifies an error (a rewritten URI is too long) and sets the script engine's instruction pointer to an exit routine, but fails to return. The program flow continues, incorrectly using the now-invalid instruction pointer, leading to a crash.

### how to reveal it
- pointer analysis
    - Sources of Null: The analysis knows that certain operations can result in a NULL pointer.
    - Sinks (Dereferences): The tool identifies every place a pointer is dereferenced (e.g., p->member or *p).
    - Path Analysis: It checks every path from a NULL source to a dereference sink. If it finds a path where a pointer can be NULL and is then used without a prior if (pointer != NULL) check, it flags a "Potential Null Pointer Dereference."

- fuzzing
    - the challenge is that the fuzzer must be stateful, as some errors (e.g., cpv14 and cpv5) can only be triggered by a sequence of inputs

# ideas to tackle using LLMs: 
- LLM-guided fuzzing target identification
    - the idea here is to have the LLM identify which parts of the codebase to fuzz, so that the fuzzer can focus on those parts of the code

- targeted QA
    - iteratively ask the LLM which parts of the code are highly sensitive to buffer overflows
    - perform analysis using llms for that part of the code

- The Agentic Security Analysis Team
    - The "Manager" agent directs the workflow, assigning tasks to specialist agents.
    - The Planner Agent (The Manager): The orchestrator. It takes the high-level goal ("Find and verify all buffer overflows in this NGINX module") and breaks it down into a sequence of tasks.
    - The Code Analyst Agent (The Brains): The core semantic expert. This agent reads and understands C code. Its "tool" is its deep understanding of logic, pointers, and memory operations.
    - The SAST Agent (The Scout): A tool-using agent. It runs traditional, fast static analysis tools (like Semgrep or CodeQL) to find potential areas of interest and report them to the Planner. This is a "first pass" filter.
    - The Proof-of-Concept (PoC) Generator Agent (The Attacker): This agent's job is to take a vulnerability hypothesis and write a concrete input that will trigger the bug.
    - The Verification Agent (The Tester): A sandboxed agent that can compile code with sanitizers (like ASan), run the server, and execute the PoC to confirm a crash.