# Experiments

In [None]:
# Name of the prompt dataset on disk; also used as the filename prefix for the
# solution and result JSON files written further down.
dataset_name = "humaneval_tuned_prompts"
# Number of generation runs (and therefore of evaluation runs).
runs = 10

In [None]:
import openai
from dotenv import load_dotenv
import os
# Load variables from a .env file so the API key can be read from the
# environment on the next line.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
# Shared OpenAI client used by ask_question_with_openai below.
client = openai.OpenAI(api_key=api_key)

def ask_question_with_openai(role_description, prompt):
    """Ask gpt-4o a question and return the text of its first choice.

    The role description and the prompt are sent together as a single
    user message (no separate system message is used).

    Args:
        role_description: Instruction text placed before the prompt.
        prompt: The task text to send.

    Returns:
        The ``content`` of the first returned choice's message.
    """
    # Layout: leading newline, role description, one empty line, prompt,
    # trailing newline.
    user_message = f"\n{role_description}\n\n{prompt}\n"
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": user_message}],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
from datasets import load_from_disk
# Load the prompt dataset that was previously saved to disk under dataset_name.
dataset = load_from_disk(dataset_name)

In [None]:
# Collect the "prompt" field of every dataset entry, in dataset order.
prompts = [entry["prompt"] for entry in dataset]

In [None]:
# Instruction text sent ahead of every problem prompt.
# NOTE(review): "signture" is a typo for "signature"; it is kept verbatim here
# because this string is part of the prompt sent to the model.
role_description = (
    "You are a Python programming expert. "
    "Please solve the following problem using Python code. "
    "Please make sure to just answer with the function with the same signture "
    "as the given one without any additional text or comments."
)

In [None]:
def generate_code_with_openai(role_description, prompt):
    """Return the model's unmodified reply for one coding prompt.

    Thin wrapper that forwards both arguments to
    ``ask_question_with_openai`` and returns its result as-is.
    """
    return ask_question_with_openai(role_description, prompt)

In [None]:
def generate_solution_dataset(role_description, dataset, generate=None):
    """Generate one solution per entry of *dataset*, in input order.

    Args:
        role_description: Instruction text passed to the generator for
            every entry.
        dataset: Iterable of prompts. Each item may be either a plain
            prompt string or a mapping with a ``"prompt"`` key, so both a
            list of prompt strings and a dataset of records are accepted.
        generate: Optional callable ``(role_description, prompt) ->
            solution``. When omitted, ``generate_code_with_openai`` is used.

    Returns:
        A list with one generated solution per entry.
    """
    if generate is None:
        # Resolved at call time so the default backend is only needed when
        # no explicit generator is supplied.
        generate = generate_code_with_openai

    solutions = []
    for entry in dataset:
        # A str entry is already the prompt; indexing it with "prompt" would
        # raise TypeError.
        prompt = entry if isinstance(entry, str) else entry["prompt"]
        solutions.append(generate(role_description, prompt))
    return solutions

## Solution dataset creation

In [None]:
import json

# One full pass over all prompts per run; each run's solutions go to their
# own JSON file, indexed by the run number.
for run in range(runs):
    solutions_path = rf"../data/{dataset_name}_{run}_solutions.json"
    generated_solutions = generate_solution_dataset(role_description, prompts)

    # Persist this run's solutions as a JSON list.
    with open(solutions_path, "w") as f:
        json.dump(generated_solutions, f, indent=4)

## Experiment evaluation

In [None]:
import json
from datasets import load_from_disk, load_dataset

dataset = load_dataset("openai_humaneval", split="test")


def _strip_code_fences(solution):
    """Return *solution* without a surrounding Markdown code fence.

    Only the language tag on the opening fence line is removed; the word
    "python" appearing anywhere else in the code is left untouched.
    """
    code = solution
    if code.lstrip().startswith('```'):
        # Keep only the text between the first pair of fences.
        code = code.split('```')[1]
        tag, newline, remainder = code.partition("\n")
        if newline and tag.strip().lower() in ("python", "python3", "py"):
            code = remainder
    return code.strip('` \n')


def _run_humaneval_task(row, solution):
    """Run one task's tests against *solution*.

    Args:
        row: Dataset record providing "prompt", "test" and "entry_point".
        solution: The generated answer text for this task.

    Returns:
        ``(test_passed, error)``: ``(True, None)`` on success, otherwise
        ``(False, str(exception))``.
    """
    try:
        # The prompt carries imports, helpers and the function signature; the
        # solution may repeat the signature, which simply redefines it.
        # Done inside the try so a malformed entry (e.g. a null in the JSON
        # file) is recorded as a failure instead of aborting the whole run.
        combined_code = row['prompt'] + "\n" + _strip_code_fences(solution)

        namespace = {}
        # SECURITY: this executes model-generated code in-process with no
        # sandbox and no timeout. Only run it in a disposable environment.
        exec(combined_code, namespace)          # Define helpers, imports, and main function
        exec(row['test'], namespace)            # Define 'check'
        namespace['candidate'] = namespace[row['entry_point']]
        namespace['check'](namespace['candidate'])  # Runs the tests
    except Exception as e:
        return False, str(e)
    return True, None


for run in range(runs):

    # Load the generated solutions of this run
    with open(rf"../data/{dataset_name}_{run}_solutions.json", "r") as f:
        loaded_solutions = json.load(f)

    results = []

    # Solutions are matched to tasks by position.
    for i, row in enumerate(dataset):
        test_passed, error = _run_humaneval_task(row, loaded_solutions[i])
        results.append({
            "task_id": row['task_id'],
            "test_passed": test_passed,
            "error": error,
        })

    # Name the file after the run. Using the inner task index here would make
    # every run write to the same file and overwrite earlier results.
    out_file = rf"../results/{dataset_name}_{run}_results.json"

    with open(out_file, "w") as f:
        json.dump(results, f, indent=2)

    print(f"Finished run {run}. Results saved to", out_file)

## Approximate Price Estimation

In [None]:
import tiktoken
import json

# Select the encoding for your model
encoding = tiktoken.encoding_for_model("gpt-4o")

# Solutions are written one file per run as
# "{dataset_name}_{run}_solutions.json"; run 0 is used as the representative
# sample for the estimate.
with open(rf"../data/{dataset_name}_0_solutions.json", "r") as f:
    loaded_solutions = json.load(f)

# All prompts / all solutions of one run, joined into one text each.
# (Named *_text so the builtin `input` is not shadowed.)
input_text = ", ".join(prompts)
output_text = ", ".join(loaded_solutions)

# Count tokens for one full pass over the dataset.
# NOTE: the role description that is sent along with every prompt is not
# included in the input count.
num_input_tokens = len(encoding.encode(input_text))
num_output_tokens = len(encoding.encode(output_text))

In [None]:
# Scale factors for the estimate: runs per dataset and number of datasets.
runs = 10
datasets = 5
# Prices for GPT-4o
price_per_million_input_tokens = 2.5
price_per_million_output_tokens = 10

# Price-weighted token count of one pass, then scaled to all runs and
# datasets and converted from per-million pricing.
weighted_input = num_input_tokens * price_per_million_input_tokens
weighted_output = num_output_tokens * price_per_million_output_tokens
estimated_cost = runs * datasets * (weighted_input + weighted_output) / 1_000_000

print(f"Estimated cost for {runs} runs with {datasets} datasets: {estimated_cost}")