# initial tests

In [2]:
pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [189]:
from dotenv import load_dotenv
import os
import requests

# Let python-dotenv populate the environment before any setting is read from it.
load_dotenv()

# Secret used to build the "Authorization" header of every API request below.
API_KEY = os.environ.get("API_KEY")

# Model identifier sent in the chat-completions payload.
# MODEL = "gpt-3.5-turbo-0125"
MODEL = "gpt-4-0125-preview"

# Switches read by gpt() while it assembles the system prompt:
# whether feedback from earlier attempts is appended, and whether the extra
# description of the function under test is appended.
previous_attempts_enabled = True
additional_context_enabled = True

def listModels(timeout=30):
    """Print the JSON body returned by the OpenAI "list models" endpoint.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` can block indefinitely on a stalled connection.

    Raises:
        RuntimeError: If API_KEY was not loaded from the environment.
    """
    if not API_KEY:
        # Fail with a readable message instead of a TypeError from "Bearer " + None.
        raise RuntimeError("API_KEY is not set; define it in the environment or in a .env file.")
    url = "https://api.openai.com/v1/models"
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + API_KEY,
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    print(response.json())

def gpt(code, additional_context = '', error = '', ae = '', previous_code = '', timeout = 120):
    """Ask the chat-completions API to generate a unit test for `code`.

    The request consists of a system prompt (the generation rules plus any
    optional feedback), one worked example (a user/assistant pair for `add`)
    and finally `code` as the user message.

    Args:
        code: Source of the function under test; sent as the last user message.
        additional_context: Extra description of the function; appended to the
            system prompt only when `additional_context_enabled` is true.
        error: Message from a previous attempt that failed to run.
        ae: Message from a previous attempt that hit a failed assertion.
        previous_code: The test generated by the previous attempt.
            `error`, `ae` and `previous_code` are appended only when
            `previous_attempts_enabled` is true.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The text content of the first choice in the API response.

    Raises:
        RuntimeError: If API_KEY is missing or the API answers with a non-2xx status.
    """
    if not API_KEY:
        # Fail with a readable message instead of a TypeError from "Bearer " + None.
        raise RuntimeError("API_KEY is not set; define it in the environment or in a .env file.")

    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + API_KEY,
    }
    # The guard named in the prompt must be "__main__": a test guarded by
    # __name__ == "main" would never execute and would be reported as passing.
    context = "You are a tool used to automatically generate source code for unit tests. Source code for a function that is to be tested will be provided and you should generate an appropriate unit test that can be ran to ensure the code is correct. Code input will be in Python and you should respond with a Python code block containing the unit test as output. Do not use any testing frameworks. Do not start output with \"```python\". Don't use Parametrize. For assertions dealing with numbers, allow a tolerance of 1e^-5. Ensure the file can be run directly using a check for __name__==\"__main__\". Add assertion messages for failures. Do not include the original function in the response. Use the global keyword with all function names and required imports that need to be called in the test as the first lines of the test function."
    if additional_context and additional_context_enabled:
        context += " Some additional context about the method under test is \""+str(additional_context)+"\"."
    if previous_code and previous_attempts_enabled:
        context += " Previously, you generated \""+str(previous_code)+"\"."
    # str() keeps the concatenation safe when a caller hands over an exception
    # object rather than its message.
    if error and previous_attempts_enabled:
        context += " This was executed and the following was the result \""+str(error)+"\". Provide a new solution in the same format, which fixes the problems."
    if ae and previous_attempts_enabled:
        context += " This was executed and there was a failed assertion \""+str(ae)+"\". Try fix the test for the failed assertion, leaving all the passing assertions unchanged and return the updated result."
    data = {
        "model": MODEL,
        "messages": [
          {
            "role": "system",
            "content": context
          },
          {
            # Worked example, part 1: the function handed to the model.
            "role": "user",
            "content": """
              import math
              
              def add(a, b):
                return a + b
              """
          },
          {
            # Worked example, part 2: the expected answer. The `global` line
            # names the function under test (`add`), as the system prompt asks.
            "role": "assistant",
            "content": "def test_add():\n    global add, math\n    assert add(3, 5) == 8, \"Test case 1 failed\"\n    assert add(-3, -5) == -8, \"Test case 2 failed\"\n    assert add(3, -5) == -2, \"Test case 3 failed\"\nif __name__ == \"__main__\":\n    test_add()"
          },
          {
            "role": "user",
            "content": code
          },
        ],
        "max_tokens": 1000,
        "temperature": 1.0,
    }
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    if not response.ok:
        # Error bodies have no 'choices' key; report the server's answer
        # instead of failing later with an unrelated KeyError.
        raise RuntimeError(f"OpenAI API request failed with HTTP {response.status_code}: {response.text}")
    output = response.json()['choices'][0]['message']['content']

    return output

In [191]:
import json
import traceback
import re
import time    
# Line number in HumanEval.jsonl after which the driver loop stops
# (the loop breaks once `count > SAMPLES_TO_RUN-1`).
SAMPLES_TO_RUN = 164
# exec_code() keeps asking gpt() for a new test until `attempts_made` exceeds this value.
ADDITIONAL_ATTEMPTS = 2
# Label embedded in the results file name; the commented line is the alternative
# label for the gpt-3.5 configuration.
# TEST_NAME = "gpt-3.5-turbo-0125-2attempt-context"
TEST_NAME = "gpt-4-0125-preview-2attempt-context"
# Running count of dataset lines read by the driver loop.
count = 0
def exec_code(instance):
#     eval_program, prompt, check_program, out, attempts=0
    error = ''
    ae = ''
    instance2j = json.dumps(instance)
    instance2= json.loads(instance2j)
    check_program = (
                instance2['prompt'] + 
                instance2['canonical_solution'] + "\n" + 
                instance2['test'] + "\n"
#                 "def test_below_zero():\n    assert not below_zero([1, 2, 3]), \"Test case 1 failed: should not fall below zero\"\n    assert below_zero([1, 2, -4, 5]), \"Test case 2 failed: should fall below zero\"\n    assert not below_zero([]), \"Test case 3 failed: empty list should not fall below zero\"\n    assert below_zero([-1, 2, -3, 4]), \"Test case 4 failed: should fall below zero on first operation\"\n    assert not below_zero([100, -50, -25]), \"Test case 5 failed: should not fall below zero\"\n    assert below_zero([-1, -2, -3]), \"Test case 6 failed: should fall below zero on the first operation\"\n\nif __name__ == \"__main__\":\n    test_below_zero()"
#                 "def test_below_zero():\n    global below_zero\n    assert not below_zero([1, 2, 3]), \"Test case 1 failed: should not fall below zero\"\n    assert below_zero([1, 2, -4, 5]), \"Test case 2 failed: should fall below zero\"\n    assert not below_zero([]), \"Test case 3 failed: empty list should not fall below zero\"\n    assert below_zero([-1, 2, -3, 4]), \"Test case 4 failed: should fall below zero on first operation\"\n    assert not below_zero([100, -50, -25]), \"Test case 5 failed: should not fall below zero\"\n    assert below_zero([-1, -2, -3]), \"Test case 6 failed: should fall below zero on the first operation\"\n\nif __name__ == \"__main__\":\n    test_below_zero()"

#                 "if __name__ == \"__main__\":\n    below_zero([-1, 1])"
            )
    
    try:
        compiled_code = compile(check_program, "<string>", "exec")
        exec(compiled_code)
        instance["runs"] = True
        instance["passes"] = True
        print(f"{instance['task_id']}, passes(attempt {instance['attempts_made']})")
        return instance
    except AssertionError as err:
        tb = traceback.format_exc()
        instance['runs'] = True
        instance['passes'] = False
        print(f"{instance['task_id']} - attempt {instance['attempts_made']}, failed assertion")
        try:
            ae = re.match(r'.*(AssertionError: .*)', tb.replace("\n","")).group(1)
        except Exception as f:
            ae = err
        instance['error'] = ae
    except Exception as e:
        instance['runs'] = False
        instance['passes'] = False
        print(f"{instance['task_id']} - attempt {instance['attempts_made']}, failed to run")
#         print("\'''\n" + check_program + "\n'''")
#         print(str(e))
        error = str(e)
        instance['error'] = error
    if instance['attempts_made'] > ADDITIONAL_ATTEMPTS:
        if not instance['passes']:
            print(error)
        return instance
    next_instance = {
        'task_id': instance['task_id'],
        'prompt': instance['prompt'],
        'canonical_solution': instance['canonical_solution'],
        'attempts_made': instance['attempts_made']+1,
        'previous_attempt': instance
    }
#     next_attempt = "      def has_close_elemnts(numbers, threshold):\n    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n    return False\ndef test_has_close_elements():\n    assert has_close_elements([1.0, 2.0, 3.0], 0.5) == False, \"Test case 1 failed\"\n    assert has_close_elements([1.0, 2.8, 3.0], 0.21) == True, \"Test case 2 failed\"\n    assert has_close_elements([0.1, 0.2, 0.3], 0.11) == True, \"Test case 3 failed\"\n    assert has_close_elements([10, 20, 30], 10.1) == True, \"Test case 4 failed\"\n    assert has_close_elements([100, 200, 300], 100.1) == True, \"Test case 5 failed\"\n    assert has_close_elements([], 0.5) == False, \"Test case 6 failed\"\n    assert has_close_elements([1.000001, 1.000002], 0.0000001) == False, \"Test case 7 failed\"\n    assert has_close_elements([1.000001, 1.000002], 0.000001) == True, \"Test case 8 failed\"\nif __name__ == \"__main__\":\n    test_has_close_elements()"
#     if instance['attempts_made'] > 1:
#         next_attempt = "def has_close_elements(numbers, threshold):\n    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n    return False\ndef test_has_close_elements():\n    assert has_close_elements([1.0, 2.0, 3.0], 0.5) == False, \"Test case 1 failed\"\n    assert has_close_elements([1.0, 2.8, 3.0], 0.21) == True, \"Test case 2 failed\"\n    assert has_close_elements([0.1, 0.2, 0.3], 0.11) == True, \"Test case 3 failed\"\n    assert has_close_elements([10, 20, 30], 10.1) == True, \"Test case 4 failed\"\n    assert has_close_elements([100, 200, 300], 100.1) == True, \"Test case 5 failed\"\n    assert has_close_elements([], 0.5) == False, \"Test case 6 failed\"\n    assert has_close_elements([1.000001, 1.000002], 0.0000001) == False, \"Test case 7 failed\"\nif __name__ == \"__main__\":\n    test_has_close_elements()"
    next_attempt = gpt(instance['canonical_solution'], instance["prompt"], error, ae, instance["test"])
    next_instance['test'] = next_attempt
    return exec_code(next_instance)

# 1-based line number of the first dataset line to process; earlier lines are
# skipped. Set to 1 to run the dataset from the beginning.
FIRST_SAMPLE_TO_RUN = 30

with open('HumanEval.jsonl') as dataset:
    timestamp = time.time()
    os.makedirs("results", exist_ok=True)
    filename = f"results/{TEST_NAME}_{timestamp}.jsonl"
    # Mode "x" fails if the file already exists, so an earlier run is never
    # overwritten; the handle is closed straight away instead of being leaked.
    with open(filename, "x"):
        pass

    for count, line in enumerate(dataset, start=1):
        if count < FIRST_SAMPLE_TO_RUN:
            continue
        instance = json.loads(line)
        instance['attempts_made'] = 1
        # Split the prompt at the end of its last `def ...:` line: everything up
        # to and including that line is prepended to the canonical solution,
        # the remainder (with 4-space runs removed) becomes the context text
        # handed to gpt().
        split_prompt = re.match(r"^(.*def .*?:\n)(.*)", instance['prompt'], re.M|re.DOTALL)
        instance['canonical_solution'] = split_prompt.group(1) + instance['canonical_solution']
        instance['prompt'] = split_prompt.group(2).replace("    ","")
        instance['test'] = gpt(instance['canonical_solution'], instance["prompt"])
        out = exec_code(instance)

        # Append after every sample so partial results survive an interrupted run.
        # A distinct name keeps the dataset handle above from being shadowed.
        with open(filename, 'a') as results_file:
            json.dump(out, results_file)
            results_file.write('\n')
        if count > SAMPLES_TO_RUN-1:
            break
        

HumanEval/29, passes(attempt 1)
HumanEval/30, passes(attempt 1)
HumanEval/31, passes(attempt 1)
HumanEval/32 - attempt 1, failed assertion
HumanEval/32 - attempt 2, failed assertion
HumanEval/32 - attempt 3, failed assertion

HumanEval/33 - attempt 1, failed assertion
HumanEval/33 - attempt 2, failed assertion
HumanEval/33 - attempt 3, failed assertion

HumanEval/34, passes(attempt 1)
HumanEval/35, passes(attempt 1)
HumanEval/36, passes(attempt 1)
HumanEval/37, passes(attempt 1)
HumanEval/38 - attempt 1, failed assertion
HumanEval/38 - attempt 2, failed assertion
HumanEval/38 - attempt 3, failed assertion

HumanEval/39, passes(attempt 1)
HumanEval/40, passes(attempt 1)
HumanEval/41, passes(attempt 1)
HumanEval/42, passes(attempt 1)
HumanEval/43, passes(attempt 1)
HumanEval/44 - attempt 1, failed assertion
HumanEval/44 - attempt 2, failed assertion
HumanEval/44, passes(attempt 3)
HumanEval/45 - attempt 1, failed to run
HumanEval/45, passes(attempt 2)
HumanEval/46, passes(attempt 1)
Huma