# Initial tests: LLM-generated unit tests for HumanEval solutions

In [2]:
pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [69]:
from dotenv import load_dotenv
import os
import requests

# Load variables from a local .env file into the process environment.
load_dotenv()

# Bearer token sent to the OpenAI API; os.getenv returns None when API_KEY is unset.
API_KEY = os.getenv("API_KEY")
# MODEL="gpt-3.5-turbo-0125"
MODEL = "gpt-4-0125-preview"

# Prompt feature flags.
# NOTE: gpt() has a parameter that is also called `additional_context`.
additional_context = True
# Must be an assignment: `previous_attempts: True` is only an annotation and
# leaves the name unbound, which makes gpt() fail with NameError on a retry.
previous_attempts = True

def listModels():
    """Print the decoded JSON body returned by the OpenAI ``/v1/models`` endpoint.

    Authenticates with the module-level ``API_KEY`` as a bearer token.
    Returns nothing; the response is only printed.
    """
    endpoint = "https://api.openai.com/v1/models"
    auth_value = "Bearer " + API_KEY
    request_headers = {"Content-Type": "application/json", "Authorization": auth_value}
    reply = requests.get(endpoint, headers=request_headers)
    print(reply.json())
def _feedback_text(value):
    """Return ``value`` as prompt-ready text; exceptions become ``"TypeName: message"``."""
    if isinstance(value, BaseException):
        return f"{type(value).__name__}: {value}"
    return str(value) if value else ''


def gpt(code, additional_context = '', error = '', ae = '', previous_code = ''):
    """Ask the OpenAI chat-completions endpoint to generate a unit test for ``code``.

    Args:
        code: Source of the function under test; sent as the final user message.
        additional_context: Optional description of the function, quoted in the
            system prompt.
        error: Message from a previous attempt that failed to run.
        ae: Failed-assertion message (or exception) from a previous attempt.
        previous_code: Test code generated by the previous attempt.

    Returns:
        The ``content`` string of the first choice in the API response.

    Raises:
        RuntimeError: if the module-level ``API_KEY`` is empty or missing.
        requests.HTTPError: if the API answers with an error status.
    """
    if not API_KEY:
        raise RuntimeError("API_KEY is not set; add it to the environment or a .env file.")

    # The feature flags live at module level. The `additional_context` flag is
    # shadowed by the parameter of the same name, and `previous_attempts` may be
    # unbound, so both are looked up explicitly and default to enabled.
    notebook_flags = globals()
    include_context = bool(notebook_flags.get("additional_context", True))
    include_previous = bool(notebook_flags.get("previous_attempts", True))

    # Callers may hand over exception objects; normalise everything to text so
    # the string concatenations below cannot raise TypeError.
    context_text = _feedback_text(additional_context)
    error_text = _feedback_text(error)
    assertion_text = _feedback_text(ae)
    previous_text = _feedback_text(previous_code)

    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + API_KEY,
    }
    # The guard must be __name__=="__main__": a check against "main" never fires,
    # so a generated test would be defined but never executed.
    context = "You are a tool used to automatically generate source code for unit tests. Source code for a function that is to be tested will be provided and you should generate an appropriate unit test that can be ran to ensure the code is correct. Code input will be in Python and you should respond with a Python code block containing the unit test as output. Do not use any testing frameworks. Do not start output with \"```python\". Don't use Parametrize. For numeric equality assertions, allow a tolerance of 1e^-5. Ensure the file can be run directly using a check for __name__==\"__main__\". Add assertion messages for failures. Do not include the original function in the response."
    if include_context and context_text:
        context += " Some additional context about the method under test is \""+context_text+"\"."
    if include_previous and previous_text:
        context += " Previously, you generated \""+previous_text+"\"."
    if include_previous and error_text:
        context += " This was executed and the following was the result \""+error_text+"\". Provide a new solution in the same format, which fixes the problems."
    if include_previous and assertion_text:
        context += " This was executed and there was a failed assertion \""+assertion_text+"\". Try fix the test for the failed assertion, leaving all the passing assertions unchanged and return the updated result."

    # One worked example (function in, test out) to pin the response format.
    example_function = "\n              def add(a, b):\n                return a + b\n              "
    example_test = "def test_add():\n    assert add(3, 5) == 8, \"Test case 1 failed\"\n    assert add(-3, -5) == -8, \"Test case 2 failed\"\n    assert add(3, -5) == -2, \"Test case 3 failed\"\nif __name__ == \"__main__\":\n    test_add()"
    data = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": context},
            {"role": "user", "content": example_function},
            {"role": "assistant", "content": example_test},
            {"role": "user", "content": code},
        ],
        "max_tokens": 1000,
        "temperature": 1.0,
    }
    # A timeout stops a stalled connection from hanging the notebook forever.
    response = requests.post(url, headers=headers, json=data, timeout=120)
    # Surface API errors (bad key, rate limit, ...) instead of a KeyError on 'choices'.
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']



In [52]:
import json
import traceback
import re
# Run settings for this cell.
ADDITIONAL_ATTEMPTS = 2   # retries allowed after the first attempt
SAMPLES_TO_RUN = 10       # number of dataset lines to process before stopping
count = 0                 # dataset lines processed so far
out = [[], []]            # out[0]: pass records, out[1]: failure records
def exec_code(eval_program, prompt, check_program, out, attempts=0):
    attempts += 1
    error = ''
    ae = ''
    try:
        exec(check_program)
        out[0].append(f"{d['task_id']}, passes(attempt {attempts})")
        return
    except AssertionError as err:
        tb = traceback.format_exc()

        print("Failing Test\n\n")
        print("\'''\n" + check_program + "\n'''")
        out[1].append(f"{d['task_id']}, failed assertion")
        try:
            ae = re.match(r'.*(AssertionError: .*)', tb.replace("\n","")).group(1)
        except Exception as f:
            ae = err
        print(ae)
    except Exception as e:
        out[1].append(f"{d['task_id']} - attempt {attempts}, failed to run")
        print("\'''\n" + check_program + "\n'''")
        print(str(e))
        error = str(e)
    if attempts > ADDITIONAL_ATTEMPTS:
        return
    next_attempt = gpt(eval_program, prompt, error, ae, check_program)
    check_program = (
                next_attempt + "\n"
            )
    return exec_code(eval_program, prompt, check_program, out, attempts)

# Generate and execute one unit test per dataset line, stopping once
# SAMPLES_TO_RUN lines have been processed.
with open('HumanEval.jsonl') as file:
    for line in file:
        count = count + 1
        # Keep the name `d`: exec_code reads d['task_id'] from the notebook globals.
        d = json.loads(line)
        d['soln'] = gpt(d['canonical_solution'], d["prompt"])
        check_program = d["soln"] + "\n"
        exec_code(d['canonical_solution'], d["prompt"], check_program, out)
        if count >= SAMPLES_TO_RUN:
            break

print(out)


[['HumanEval/0, passes(attempt 1)', 'HumanEval/1, passes(attempt 1)', 'HumanEval/2, passes(attempt 1)'], []]


In [83]:
import json
import traceback
import re
import time    
# Run settings for this cell.
TEST_NAME = "testing"     # prefix of the results file name
ADDITIONAL_ATTEMPTS = 2   # retries allowed after the first attempt
SAMPLES_TO_RUN = 3        # number of dataset lines to process before stopping
count = 0                 # dataset lines processed so far
def exec_code(instance):
#     eval_program, prompt, check_program, out, attempts=0
    error = ''
    ae = ''
    check_program = (
                instance['prompt'] + 
                instance['canonical_solution'] + "\n" + 
                instance['test'] + "\n"
            )
    try:
        exec(check_program)
        instance["runs"] = True
        instance["passes"] = True
        print(f"{instance['task_id']}, passes(attempt {instance['attempts_made']})")
        return instance
    except AssertionError as err:
        tb = traceback.format_exc()
        instance['runs'] = True
        instance['passes'] = False
        print(f"{instance['task_id']} - attempt {instance['attempts_made']}, failed assertion")
        try:
            ae = re.match(r'.*(AssertionError: .*)', tb.replace("\n","")).group(1)
        except Exception as f:
            ae = err
        instance['error'] = ae
    except Exception as e:
        instance['runs'] = False
        instance['passes'] = False
        print(f"{instance['task_id']} - attempt {instance['attempts_made']}, failed to run")
#         print("\'''\n" + check_program + "\n'''")
#         print(str(e))
        error = str(e)
        instance['error'] = error
    if instance['attempts_made'] > ADDITIONAL_ATTEMPTS:
        return instance
    next_instance = {
        'task_id': instance['task_id'],
        'prompt': instance['prompt'],
        'canonical_solution': instance['canonical_solution'],
        'attempts_made': instance['attempts_made']+1,
        'previous_attempt': instance
    }
#     next_attempt = "      def has_close_elemnts(numbers, threshold):\n    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n    return False\ndef test_has_close_elements():\n    assert has_close_elements([1.0, 2.0, 3.0], 0.5) == False, \"Test case 1 failed\"\n    assert has_close_elements([1.0, 2.8, 3.0], 0.21) == True, \"Test case 2 failed\"\n    assert has_close_elements([0.1, 0.2, 0.3], 0.11) == True, \"Test case 3 failed\"\n    assert has_close_elements([10, 20, 30], 10.1) == True, \"Test case 4 failed\"\n    assert has_close_elements([100, 200, 300], 100.1) == True, \"Test case 5 failed\"\n    assert has_close_elements([], 0.5) == False, \"Test case 6 failed\"\n    assert has_close_elements([1.000001, 1.000002], 0.0000001) == False, \"Test case 7 failed\"\n    assert has_close_elements([1.000001, 1.000002], 0.000001) == True, \"Test case 8 failed\"\nif __name__ == \"__main__\":\n    test_has_close_elements()"
#     if instance['attempts_made'] > 1:
#         next_attempt = "def has_close_elements(numbers, threshold):\n    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n    return False\ndef test_has_close_elements():\n    assert has_close_elements([1.0, 2.0, 3.0], 0.5) == False, \"Test case 1 failed\"\n    assert has_close_elements([1.0, 2.8, 3.0], 0.21) == True, \"Test case 2 failed\"\n    assert has_close_elements([0.1, 0.2, 0.3], 0.11) == True, \"Test case 3 failed\"\n    assert has_close_elements([10, 20, 30], 10.1) == True, \"Test case 4 failed\"\n    assert has_close_elements([100, 200, 300], 100.1) == True, \"Test case 5 failed\"\n    assert has_close_elements([], 0.5) == False, \"Test case 6 failed\"\n    assert has_close_elements([1.000001, 1.000002], 0.0000001) == False, \"Test case 7 failed\"\nif __name__ == \"__main__\":\n    test_has_close_elements()"
    next_attempt = gpt(instance['canonical_solution'], instance["prompt"], error, ae, instance["test"])
    next_instance['test'] = next_attempt
    return exec_code(next_instance)

# Generate, execute and record a unit test for each dataset line, stopping once
# SAMPLES_TO_RUN lines have been processed. One JSON object is written per line.
with open('HumanEval.jsonl') as dataset_file:
    timestamp = time.time()
    filename = f"results/{TEST_NAME}_{timestamp}.jsonl"
    # Create the output directory if needed (`os` is imported in the first code cell).
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # Mode "x" refuses to overwrite an existing results file. The handle has its
    # own name so it does not shadow the dataset file being iterated, and the
    # `with` block guarantees it is closed.
    with open(filename, "x") as results_file:
        for line in dataset_file:
            count += 1
            instance = json.loads(line)
            instance['attempts_made'] = 1
            instance['test'] = gpt(instance['canonical_solution'], instance["prompt"])
            out = exec_code(instance)
            json.dump(out, results_file)
            results_file.write('\n')
            # Flush after every sample so partial results survive an interrupted run.
            results_file.flush()
            if count > SAMPLES_TO_RUN-1:
                break
        

HumanEval/0, passes(attempt 1)
HumanEval/1, passes(attempt 1)
HumanEval/2, passes(attempt 1)
