# Challenge: identify bugs in pieces of code (or lack thereof)

In [21]:
import openai
from dataclasses import dataclass

client = openai.Client(
    api_key="-vLvwj3RvYcDz2uODM_GwokEFQWw-R-0HzgXmDms6uc",
    base_url="https://api.poe.com/v1"
)

model_list = ["gpt-5.1-codex-mini", "llama-3.1-8b-di", "gemini-2.5-flash-lite", "grok-code-fast-1"]

def ask_ai(code: str, language: str, model: str) -> str | None:
    instruction = f"The following is a {language} program. Is there a bug? Explain why or why not."
    
    result = client.chat.completions.create(
        model = model,
        messages = [
            {
                "content": instruction,
                "role": "system"
            },
            {
                "content": code,
                "role": "user"
            }
        ],
        max_tokens=512,
        temperature=0.7
    )
    answer = result.choices[0].message.content
    return answer

@dataclass
class TestResult:
    is_correct: bool | None = None
    llm_answer: str | None = None
    is_correct_explanation: str | None = None

@dataclass
class TestProblem:
    code: str
    language: str
    model: str
    real_answer: str

    def __hash__(self) -> int:
        return (self.code + self.language + self.model + self.real_answer).__hash__()

test_results: dict[TestProblem, TestResult] = {}

def evaluate(problem: TestProblem, rerun=False) -> TestResult:
    if not rerun:
        cached = test_results.get(problem)
        if cached is not None:
            return cached
    
    llm_answer = ask_ai(problem.code, problem.language, problem.model)
    
    instruction = f"You will be given a large language model's answer and the true answer to whether a piece of code has a bug. Evaluate whether the LLM's answer is correct. You must start your answer with 'Yes' or 'No'."

    result = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "content": instruction,
                "role": "system"
            },
            {
                "content": f"Large language model's answer: \n'{llm_answer}'\nTrue answer: \n'{problem.real_answer}'",
                "role": "user"
            }
        ],
    )

    is_correct_explanation = result.choices[0].message.content
    matches = None
    if is_correct_explanation is None or is_correct_explanation == "":
        pass
    elif is_correct_explanation.strip().lower().startswith("yes"):
        matches = True
    elif is_correct_explanation.strip().lower().startswith("no"):
        matches = False
    test_results[problem] = TestResult(
        llm_answer=llm_answer,
        is_correct_explanation=is_correct_explanation,
        is_correct=matches
    )
    return test_results[problem]


In [45]:
def get_code_piece(code_piece: str) -> str:
    with open(code_piece, encoding="utf8") as source_code:
        return source_code.read()

problems = [
    [TestProblem(
        code=get_code_piece("code_pieces/null_deref.c"),
        language="C",
        real_answer="Use after free/ null pointer dereferencing bug",
        model=model
    ) for model in model_list],
    [TestProblem(
        code=get_code_piece("code_pieces/use_before_init.cpp"),
        language="C++",
        real_answer="Missing check for emptiness for tasks, resulting in use out of bounds access",
        model=model
    ) for model in model_list],
    [TestProblem(
        code=get_code_piece("code_pieces/number_overflow.cpp"),
        language="C++",
        real_answer="int overflow when calculating investment value",
        model=model
    ) for model in model_list],
    [TestProblem(
        code=get_code_piece("code_pieces/memory_leak.c"),
        language="C",
        real_answer="Line buffer is leaked in every loop iteration as it is allocated on the heap but never freed",
        model=model
    ) for model in model_list],
    [TestProblem(
        code=get_code_piece("code_pieces/auth_gate.dart"),
        language="Flutter/Dart",
        real_answer="Did not check if snapshot has data, resulting in lack of authentication enforcement and allowing users not logged in to view app content",
        model=model
    ) for model in model_list],
    [TestProblem(
        code=get_code_piece("code_pieces/sql_injection.php"),
        language="PHP",
        real_answer="SQL injection vulnerability in getUserData, since $username is unsanitised and directly concatenated into the SQL query",
        model=model
    ) for model in model_list],
    [TestProblem(
        code=get_code_piece("code_pieces/arbitrary_code_execution.py"),
        language="Python",
        real_answer="Arbitrary code execution in eval() chat completion response, potentially exploited by unsanitized input",
        model=model
    ) for model in model_list],
    [TestProblem(
        code=get_code_piece("code_pieces/PublicApiServer.java"),
        language="Java",
        real_answer="Exposed API endpoint, potential denial of service attempt, no rate limiting",
        model=model
    ) for model in model_list],
]

control = [
    [TestProblem(
        code=get_code_piece("code_pieces/null_deref_safe.c"),
        language="C",
        real_answer="Use after free/ null pointer dereferencing bug",
        model=model
    ) for model in model_list],
    [TestProblem(
        code=get_code_piece("code_pieces/use_before_init_safe.cpp"),
        language="C++",
        real_answer="Missing check for emptiness for tasks, resulting in use out of bounds access",
        model=model
    ) for model in model_list],
    [TestProblem(
        code=get_code_piece("code_pieces/number_overflow_safe.cpp"),
        language="C++",
        real_answer="int overflow when calculating investment value",
        model=model
    ) for model in model_list],
    [TestProblem(
        code=get_code_piece("code_pieces/memory_leak_safe.c"),
        language="C",
        real_answer="Line buffer is leaked in every loop iteration as it is allocated on the heap but never freed",
        model=model
    ) for model in model_list],
    [TestProblem(
        code=get_code_piece("code_pieces/auth_gate_safe.dart"),
        language="Flutter/Dart",
        real_answer="Did not check if snapshot has data, resulting in lack of authentication enforcement and allowing users not logged in to view app content",
        model=model
    ) for model in model_list],
    [TestProblem(
        code=get_code_piece("code_pieces/sql_injection_safe.php"),
        language="PHP",
        real_answer="SQL injection vulnerability in getUserData, since $username is unsanitised and directly concatenated into the SQL query",
        model=model
    ) for model in model_list],
    [TestProblem(
        code=get_code_piece("code_pieces/arbitrary_code_execution_safe.py"),
        language="Python",
        real_answer="Arbitrary code execution in eval() chat completion response, potentially exploited by unsanitized input",
        model=model
    ) for model in model_list],
]

def evaluateSafe(problem: TestProblem) -> TestResult:
    result = evaluate(problem)
    return TestResult(
        is_correct=not result.is_correct,
        llm_answer=result.llm_answer,
        is_correct_explanation=result.is_correct_explanation
    )

## Test

In [11]:
[evaluate(problem) for problem in problems[0]]

[TestResult(is_correct=True, llm_answer='Yes, there is a bug. In `main`, after `freeList(&head)` is called, the list nodes are freed but `head` is not updated to `NULL`. Immediately after that, `sumList(head)` is called. Since `', is_correct_explanation='Yes'),
 TestResult(is_correct=True, llm_answer='Yes, there is a bug in the provided C program.\n\nThe bug is in the `sumList` function. After calling `freeList(&head)` in the `main` function, the `head` pointer is set to `NULL` because the entire linked list is deallocated. However, in the `sumList` function, the code tries to access the `data` member of the `head` node, which is now `NULL`.\n\nWhen `sumList(head)` is called after `freeList(&head)`, the program will crash with a segmentation fault because it tries to access memory that has already been freed.\n\nTo fix this bug, you should add a check in the `sumList` function to return an error or a special value when the list is empty (i.e., `head` is `NULL`).\n\nHere\'s the correcte

In [18]:
[evaluate(problem) for problem in problems[1]]

[TestResult(is_correct=True, llm_answer='Yes, there is a bug.\n\nIn `TaskManager::getLastTask()`, the code unconditionally calls `tasks.back()` and returns the result. If `tasks` is empty, calling `back()` is undefined behavior. In `main()`, after starting with an empty `TaskManager`, the code calls `printLastTaskDescription()`, and inside that function `getLastTask()` is called—even though there are no tasks. This causes undefined behavior (likely a crash) because `tasks.back()` is being called on an', is_correct_explanation='Yes'),
 TestResult(is_correct=False, llm_answer="The provided C++ program has a bug. \n\nThe bug is in the `TaskManager` class's `removeTask` method. \n\nWhen `removeTask` is called, it first deletes the task at the specified index, and then it erases that task from the `tasks` vector. However, this can lead to undefined behavior if the task being deleted is the last one in the vector, because `tasks.back()` will return a dangling pointer after the task is delete

In [22]:
[evaluate(problem) for problem in problems[2]]

[TestResult(is_correct=False, llm_answer='', is_correct_explanation='No'),
 TestResult(is_correct=False, llm_answer='The provided C++ program appears to be a simple financial calculator with methods to calculate compound interest, portfolio value, and simulate investment growth. However, there are a few potential issues and areas for improvement:\n\n1.  **Precision**: The program uses a multiplier (`PRECISION_MULTIPLIER`) to convert between dollars and cents. While this is a good practice for maintaining precision, it might be more readable to use a constant like `CentsPerDollar` or `DOLLARS_TO_CENTS` instead of `PRECISION_MULTIPLIER`.\n\n2.  **Naming**: Some variable names could be more descriptive. For example, `amount` in the `calculateCompoundInterest` function could be renamed to `finalAmount` to better convey its purpose.\n\n3.  **Code Organization**: The `calculateCompoundInterest` function has a loop that calculates the compound interest for each day. While this is a simple cas

In [24]:
[evaluate(problem) for problem in problems[3]]

[TestResult(is_correct=True, llm_answer='Yes, there is a bug: the program leaks memory on every loop iteration. `make_line` allocates a new buffer with `malloc`, and `process_line` returns that `Line`, but the caller (`main`) never calls `free` on `l.buffer`. Because the `for` loop is `for (int i = 0;; i++)` with no exit, this happens repeatedly forever, so the process keeps allocating more memory until it runs out.', is_correct_explanation='Yes'),
 TestResult(is_correct=False, llm_answer='Yes, there is a bug in the provided C program.\n\nThe issue lies in the `process_line` function where it creates a temporary `Line` object `temp` using the `make_line` function. Then, it assigns the address of `temp` to `out` using `Line out = temp;`. This is problematic because `temp` is a local variable and will be destroyed when the function returns. \n\nHowever, the `out` variable in `main` is assigned the address of `temp`, which is now a dangling pointer. This means that `out` is pointing to me

In [26]:
[evaluate(problem) for problem in problems[4]]

[TestResult(is_correct=True, llm_answer='', is_correct_explanation='Yes'),
 TestResult(is_correct=False, llm_answer='The provided code is generally well-structured and clean. However, there are a few potential issues and improvements that can be suggested:\n\n1. **Missing null safety checks**: In the `StreamBuilder` widget, the `snapshot.data` is used without checking if it\'s null. This can lead to a runtime error if the snapshot doesn\'t contain any data. You should add null checks to handle this scenario.\n\n2. **Missing error handling**: In the `StreamBuilder` widget, the `snapshot.hasError` is checked, but the error is not handled. You should add error handling to provide a meaningful error message to the user.\n\n3. **Potential NPE**: In the `AuthInfo` class, the `uid` and `email` properties are not checked for null before being used. You should add null checks to handle this scenario.\n\n4. **Magic strings**: The code uses magic strings like `"users"` and `"uid"`. You should con

LLM evaluation failure: result[0] should be None and result[3] should be True

In [29]:
[evaluate(problem) for problem in problems[5]]

[TestResult(is_correct=True, llm_answer='Yes, there is a serious bug: the code is vulnerable to SQL injection because it directly interpolates user input into the SQL query without any sanitization or parameterization. An attacker could supply a specially crafted `username` in the query string (e.g., `?username=foo\' OR \'1\'=\'1`) and manipulate the query to return or modify data they shouldn’t have access to.\n\nTo fix it, use prepared statements (e.g., `$stmt = $conn->prepare("SELECT * FROM users WHERE username = ?");` and bind the parameter) or at least escape the input with `$conn->real_escape_string()` before including it in the query.', is_correct_explanation='Yes'),
 TestResult(is_correct=True, llm_answer='Yes, there is a bug in this PHP program.\n\nThe bug lies in the way the user input is being used in the SQL query. The input is being directly concatenated into the SQL query without any sanitization or escaping. This makes the program vulnerable to SQL injection attacks.\n\n

In [38]:
[evaluate(problem) for problem in problems[6]]

[TestResult(is_correct=True, llm_answer='Yes, there is a bug — it’s a security vulnerability in the way the LLM’s response is validated and executed.\n\nThe code tries to guard against arbitrary code execution by checking whether the response starts with certain tokens (`"True"`, `"False"`, `"0"`, etc.). However, this check is superficial: an attacker-controlled model (or a prompt that causes a malicious response) can still return something like `True; __import__(\'os\').system(\'rm -rf /\')`. Because it begins with `"True"`, it passes the `validate_response` check, and when `eval` runs it, the destructive code is executed.\n\nIn short, the validation is too weak to prevent arbitrary code execution via `eval`. A safer approach would be to avoid `eval` entirely or to use a much stronger validation', is_correct_explanation='Yes'),
 TestResult(is_correct=False, llm_answer='The provided Python program appears to be well-structured and follows good practices. However, there are a few potent

In [42]:
[evaluate(problem) for problem in problems[7]]

[TestResult(is_correct=False, llm_answer='', is_correct_explanation='No'),
 TestResult(is_correct=False, llm_answer='The provided Java program appears to be a simple HTTP server that listens for GET requests on the `/compute` endpoint. It simulates a resource-intensive computation by summing up numbers from 0 to the input number.\n\nHowever, there are a few potential issues with this code:\n\n1. **Resource leak**: In the `handle` method, the `OutputStream os` is closed after writing to it. However, this is not necessary because the `HttpExchange` class takes care of closing the output stream for you. Closing it manually can lead to resource leaks if an exception occurs before the stream is closed.\n\n2. **Potential `NullPointerException`**: In the `handle` method, the `query` string is split using the `split` method without checking if the result array has at least two elements. If the query string does not contain an `=` character, the `query.split("=")[1]` expression will throw a `Nu

In [44]:
[evaluate(problem) for problem in problems[8]]

[TestResult(is_correct=True, llm_answer='Yes, there is a bug: the program has a data race on the global `counter`. Each thread performs `counter++`, but that operation is not atomic—it involves a read, increment, and write. When multiple threads execute it concurrently, their reads/writes can interleave in ways that corrupt the result, leading to lost increments and undefined behavior.\n\nThe `std::this_thread::sleep_for` in the loop doesn’t prevent the race; it only introduces delays, but the underlying shared access is still unsynchronized.\n\nTo fix it, protect `counter` with synchronization, for example:\n\n```cpp\nstd::atomic<int> counter = 0;\n```\n\nor guard the increment with a `std::mutex`.', is_correct_explanation='Yes'),
 TestResult(is_correct=True, llm_answer='The provided C++ program appears to be a multi-threaded program that increments a shared counter variable by 10,000 times (10 threads x 1,000 increments per thread) using 10 threads. However, there is a potential bug 

In [46]:
[[evaluateSafe(problem) for problem in problems] for problems in control]

[[TestResult(is_correct=True, llm_answer='There is no bug in the code as written. The only place where the list is', is_correct_explanation='No'),
  TestResult(is_correct=False, llm_answer='Yes, there is a bug in the provided C program.\n\nThe bug is in the `sumList` function. After calling `freeList(&head)`, the `head` pointer is set to `NULL`. However, in the `sumList` function, the code tries to access the `head` pointer to start traversing the list. Since `head` is `NULL`, this will result in a segmentation fault.\n\nTo fix this bug, you should move the call to `sumList` before calling `freeList`. This way, the list is still intact when you try to calculate its sum.\n\nHere\'s the corrected code:\n\n```c\nint main() {\n    Node* head = NULL;\n\n    insertNode(&head, 5);\n    insertNode(&head, 10);\n    insertNode(&head, 15);\n\n    printList(head);\n\n    int total = sumList(head);\n    printf("Sum of list: %d\\n", total);\n\n    freeList(&head);\n\n    return 0;\n}\n```\n\nWith th

In [41]:
from pprint import pprint
print("LLM answers:")
# pprint(llm_answer)
print("Test results:")
test_results

LLM answers:
Test results:


{'There is no bug in this program. The linked list is properly built, printed, freed, and then the sum function is called afterward. Because `freeList` sets `*head = NULL` after releasing the nodes, the call to `sumList(head)` sees a `head` of `NULL` and simply returns 0; no\xa0freed memory is accessed, so the program runs correctly.Use after free/ null pointer dereferencing bug': False,
 'Yes, there is a bug.\n\nAfter calling `freeList(&head);`, the list nodes are freed, but `freeList` does not set `*head` to `NULL`. As a result, `head` in `main` still holds a stale pointer value that points to memory that has been freed. The subsequent call to `sumList(head)` dereferences this pointer and attempts to traverse the freed list, which is undefined behavior.\n\nTo fix it, `freeList` should set `*head = NULL;` after the loop (or you should avoid using the list after freeing it).Use after free/ null pointer dereferencing bug': True,
 'Yes, there is a **critical bug** in the `main` function.