In [None]:
from datasets import load_from_disk

# Load datasets
# Every entry of `datasets` names a sub-directory of `path` that holds one
# variant of the HumanEval prompts; each is loaded into `dataset[<name>]`.
path = './datasets/humaneval/'
datasets = [
    'vanilla',
    'tuned',
    'json',
    'markdown',
    'yaml'
]

# Row index of the prompt that is printed as a sanity check for each variant.
prompt_preview = 65

dataset = {}
for dataset_name in datasets:
    # The loop variable is deliberately not called `set`: that would rebind the
    # built-in `set` type for the rest of the kernel session.
    dataset[dataset_name] = load_from_disk(path + dataset_name)

    print(f"### HumanEval/{dataset_name}/{prompt_preview} ###\n{dataset[dataset_name][prompt_preview]['prompt']}\n")

In [None]:
import random

# Check completeness of all datasets: every variant must hold exactly as many
# prompts as the 'vanilla' variant, otherwise the run is aborted.
prompt_count = len(dataset['vanilla'])
for dataset_name in datasets:
    if len(dataset[dataset_name]) != prompt_count:
        raise SystemExit(f"Dataset '{dataset_name}' is incomplete with only {len(dataset[dataset_name])}/{prompt_count} prompts...")

print(f"All datasets are complete with {prompt_count}/{prompt_count} prompts.")

# Select prompts: reproducible random subset (fixed seed). The sample size is
# capped at `prompt_count` because random.sample() raises ValueError when asked
# for more items than the population holds.
random.seed(1337)
selected_prompts = random.sample(range(prompt_count), min(10, prompt_count))

# Select all: this assignment replaces the random subset chosen above.
selected_prompts = list(range(prompt_count))

print(selected_prompts)
if selected_prompts:
    # Preview the first selected prompt (skipped when nothing is selected).
    print(f"\n### HumanEval/vanilla/{selected_prompts[0]} ###\n{dataset['vanilla'][selected_prompts[0]]['prompt']}\n")

In [3]:
import openai
import os
from dotenv import load_dotenv

# load_dotenv() runs before the key is read — presumably so that a key kept in
# a local .env file is visible in the process environment (confirm against the
# python-dotenv documentation).
load_dotenv()

# The key comes from the environment, never from the notebook itself.
api_key = os.environ.get("OPENAI_API_KEY")
client = openai.OpenAI(api_key=api_key)

def ask_question_with_openai(prompt):
    """Ask the chat model to solve `prompt` and return the text of its reply.

    A single non-streaming chat-completion request is sent through the
    module-level `client`. The request consists of a fixed system message and
    a user message that embeds `prompt` after a fixed instruction.

    Args:
        prompt: Text of the programming task; it is interpolated into the
            user message as-is.

    Returns:
        The `content` of the message of the first returned choice.
    """
    system_message = {"role": "system", "content": "You are a Python programming expert."}
    user_message = {
        "role": "user",
        "content": f"Please solve the following problem. Output only the function with the signature as specified in the prompt and all necessary imports. Do not include additional texts or comments.\n\n {prompt}",
    }

    completion = client.chat.completions.create(
        model="gpt-4o",     # "gpt-4.1-nano"
        stream=False,
        messages=[system_message, user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

# Conduct Experiment / Prompt LLM

In [9]:
# Smoke test with the first selected task. Only the 'prompt' column is sent:
# passing the whole row would interpolate every column of that row (the
# evaluation code also reads 'test' and 'entry_point' from it) into the request.
print(ask_question_with_openai(dataset['vanilla'][selected_prompts[0]]['prompt']))

ChatCompletion(id='chatcmpl-Br3ucCQ8uKrKffzZIx2Uk5VydW5V4', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='```python\n    return len(set(string.lower()))\n```', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None))], created=1751986122, model='gpt-4o-2024-08-06', object='chat.completion', service_tier='default', system_fingerprint='fp_a288987b44', usage=CompletionUsage(completion_tokens=11, prompt_tokens=253, total_tokens=264, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))


In [None]:
from datetime import datetime
import os, json, random
import time

# Each run writes into its own directory named after the current timestamp.
output_dir = f"output/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
os.makedirs(f"{output_dir}/responses/", exist_ok=True)

executions = 10     # Responses generated per 'Dataset Type × Prompt ID' combination

# True  -> write random placeholder responses; no request is sent to the API.
# False -> query the model through ask_question_with_openai().
dry_run = True

total_requests = len(datasets) * len(selected_prompts)

# The loop variable is deliberately not called `set`, which would rebind the
# built-in `set` type for the rest of the kernel session.
for k, dataset_name in enumerate(datasets):
    print(f"Requesting '{dataset_name}'...")
    for j, prompt_id in enumerate(selected_prompts):
        task = dataset[dataset_name][prompt_id]
        prompt = task['prompt']
        # Counters are shown 1-based; the percentage reflects work finished so far.
        print(f">>> {dataset_name} ({k+1}/{len(datasets)}) #{prompt_id} ({j+1}/{len(selected_prompts)}) => {100*(k*len(selected_prompts) + j)/total_requests:.1f} %")

        # One JSONL file per 'Dataset Type × Prompt ID', one line per execution.
        output_file = f"{output_dir}/responses/{dataset_name}_prompt_{prompt_id}.jsonl"
        with open(output_file, "w", encoding="utf-8") as sf:
            for i in range(executions):
                start_time = time.perf_counter()

                try:
                    if dry_run:
                        response = f"Cool Solution: {random.randint(0, 133337)}"
                    else:
                        response = ask_question_with_openai(prompt)
                except Exception as exception:
                    # A failed request must not abort the run; the error text
                    # is stored in place of the response.
                    response = f"Error: {exception}"

                gen_duration = time.perf_counter() - start_time

                result = {
                    "dataset": dataset_name,
                    "task_id": task['task_id'],
                    "prompt_id": prompt_id,
                    "execution_id": i,
                    "response": response,
                    "gen_duration": gen_duration    # Time in Seconds
                }

                # Save JSON lines immediately
                sf.write(json.dumps(result) + "\n")
                sf.flush()

    print(f"Saved {executions} responses each for {len(selected_prompts)} prompts to {output_dir}\n")

# Evaluate LLM Responses

In [None]:
import os, json, time, traceback
from rouge_score import rouge_scorer
import numpy as np


# Set custom dir
output_dir = "output/finished"
# Re-define executions if Kernel had to be cleared
try:
    print(f"#Executions: {executions}")
except NameError:
    # Only a missing variable is expected here; other errors should surface.
    executions = 10


print(f"Retrieving data from '{output_dir}/responses/'")
os.makedirs(f"{output_dir}/results/", exist_ok=True)

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# Both directories are listed once up front instead of on every iteration.
response_files = os.listdir(f"{output_dir}/responses")
total_files = len(response_files)
# Built with a comprehension so the lookup does not depend on the name `set`.
finished_files = {name for name in os.listdir(f"{output_dir}/results")}

# Append mode: together with the skip below, an interrupted run can be resumed
# without losing the aggregates that were already written.
with open(f"{output_dir}/agg_results.jsonl", "a", encoding="utf-8") as af:
    for file_idx, filename in enumerate(response_files):  # For all combinations of 'Dataset Type × Prompt ID'
        if not filename.endswith(".jsonl"):
            continue
        # Hot-Fix # Skip already calculated ones...
        if filename in finished_files:
            continue

        source_file_path = f"{output_dir}/responses/{filename}"
        target_file_path = f"{output_dir}/results/{filename}"
        print(f">>> {filename.replace('.jsonl', '')} ({file_idx}/{total_files}) => {100*file_idx/total_files:.1f} %")

        tests_passed = 0
        responses = []
        # 'pass' collects the evaluation time of passing executions only; it
        # has to exist up front because it is appended to below.
        durations = {'gen': [], 'eval': [], 'pass': []}
        response_lengths = []
        execution = None

        with open(source_file_path, "r", encoding="utf-8") as sf, open(target_file_path, "w", encoding="utf-8") as tf:
            for line in sf:               # For all 'Executions' of the above
                if not line.strip():
                    continue              # Tolerate blank lines in the JSONL file
                execution = json.loads(line)

                # Clean response: drop Markdown code fences and surrounding whitespace
                response = execution['response']
                response = response.replace("```python", "").replace("```", "").strip()
                responses.append(response)  # Required for ROUGE calculations later on

                # Tests and entry point always come from the 'vanilla' variant,
                # whichever prompt variant produced the response.
                task = dataset['vanilla'][execution['prompt_id']]

                # Merge pre-defined code with LLM-completed function
                combined_code = task['prompt'] + "\n" + response

                test_cases = task['test']
                entry_point = task['entry_point']
                test_passed = False
                error_msg = None
                traceback_log = None

                start_time = time.perf_counter()

                # NOTE(security): exec() runs model-generated code inside this
                # kernel process with no sandbox and no timeout. Only run this
                # cell in a disposable environment.
                try:
                    namespace = {}
                    exec(combined_code, namespace)                      # Define main function incl. possible imports & helper functions
                    exec(test_cases, namespace)                         # Define the test cases 'check'
                    namespace['candidate'] = namespace[entry_point]     # Define the LLM response as 'candidate'
                    namespace['check'](namespace['candidate'])          # Run tests
                    test_passed = True
                except Exception as e:
                    error_msg = str(e)
                    traceback_log = traceback.format_exc()
                    test_passed = False

                eval_duration = time.perf_counter() - start_time
                durations['eval'].append(eval_duration)
                durations['gen'].append(execution['gen_duration'])
                if test_passed:
                    durations['pass'].append(eval_duration)
                    tests_passed += 1
                response_lengths.append(len(response))

                result = {
                    "dataset": execution['dataset'],
                    "task_id": execution['task_id'],
                    "prompt_id": execution['prompt_id'],
                    "execution_id": execution['execution_id'],

                    # Main Metric
                    "test_passed": test_passed,
                    "error": error_msg,
                    "traceback": traceback_log,

                    # Additional Metric
                    "response_len": len(response),

                    # Stat
                    "gen_duration": execution['gen_duration'],
                    "eval_duration": eval_duration
                }

                tf.write(json.dumps(result) + "\n")
                tf.flush()

        if execution is None:
            # An empty response file has nothing to aggregate.
            print(f"    Skipping aggregation for '{filename}': no executions found.")
            continue

        # Calculate pairwise ROUGE-L between all responses for intra-prompt stability
        rouge_scores = []
        for first in range(len(responses)):
            for second in range(first + 1, len(responses)):
                score = scorer.score(responses[first], responses[second])['rougeL'].fmeasure
                rouge_scores.append(score)

        if rouge_scores:
            avg_rougeL = np.mean(rouge_scores)
        else:
            avg_rougeL = 1.0  # Only one response → max stability

        # Aggregate results per 'Dataset Type × Prompt ID' combination
        agg_result = {
            "dataset": execution['dataset'],
            "task_id": execution['task_id'],
            "prompt_id": execution['prompt_id'],
            "executions": executions,

            # Main Metric
            "tests_passed": tests_passed,

            # Additional Metrics
            "response_len_mean": np.mean(response_lengths),
            "response_len_std": np.std(response_lengths),
            "intra_prompt_rougeL": avg_rougeL,

            # Stats
            "gen_duration_mean": np.mean(durations['gen']),
            "gen_duration_std": np.std(durations['gen']),
            "eval_duration_mean": np.mean(durations['eval']),
            "eval_duration_std": np.std(durations['eval']),
            # None when no execution passed (no durations to average)
            "pass_duration_mean": np.mean(durations['pass']) if durations['pass'] else None,
            "pass_duration_std": np.std(durations['pass']) if durations['pass'] else None
        }

        af.write(json.dumps(agg_result) + "\n")
        af.flush()

print("Evaluated the LLM responses for all datasets!")

### Merge Unaggregated Data

In [None]:
import os

# Set custom dir
output_dir = "output/finished"

# Concatenate every per-prompt results file into a single JSONL file.
with open(f"{output_dir}/all_results.jsonl", "w", encoding="utf-8") as cf:
    # os.listdir() returns names in arbitrary order; sorting makes the merged
    # file identical from run to run.
    for filename in sorted(os.listdir(f"{output_dir}/results")):  # For all combinations of 'Dataset Type × Prompt ID'
        if not filename.endswith(".jsonl"):
            continue
        source_file_path = f"{output_dir}/results/{filename}"
        with open(source_file_path, "r", encoding="utf-8") as sf:
            cf.write(sf.read())

### Recreate "agg_data" with "pass_duration"

In [None]:
import os, json
from rouge_score import rouge_scorer
import numpy as np


# Set custom dir
output_dir = "output/finished"
# Re-define executions if Kernel had to be cleared
try:
    print(executions)
except NameError:
    # Only a missing variable is expected here; other errors should surface.
    executions = 10


print(f"Retrieving data from '{output_dir}/responses/' & '{output_dir}/results/'")

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# The directory is listed once up front instead of on every iteration.
response_files = os.listdir(f"{output_dir}/responses")
total_files = len(response_files)

# "w" instead of "a": every file is recomputed on each run, so appending
# would duplicate all records whenever this cell is executed again.
with open(f"{output_dir}/agg_results_recomputed.jsonl", "w", encoding="utf-8") as af:
    for file_idx, filename in enumerate(response_files):  # For all combinations of 'Dataset Type × Prompt ID'
        if not filename.endswith(".jsonl"):
            continue

        responses_file_path = f"{output_dir}/responses/{filename}"
        results_file_path = f"{output_dir}/results/{filename}"
        if not os.path.isfile(results_file_path):
            # Nothing was evaluated for this response file yet.
            print(f"    Skipping '{filename}': no matching results file.")
            continue
        print(f">>> {filename.replace('.jsonl', '')} ({file_idx}/{total_files}) => {100*file_idx/total_files:.1f} %")

        with open(responses_file_path, "r", encoding="utf-8") as responses_fh, open(results_file_path, "r", encoding="utf-8") as results_fh:
            response_lines = [line for line in responses_fh if line.strip()]
            result_lines = [line for line in results_fh if line.strip()]

        if len(response_lines) != len(result_lines):
            # Lines are paired by position; surplus lines on either side are ignored.
            print(f"    Warning: '{filename}' has {len(response_lines)} responses but {len(result_lines)} results.")

        tests_passed = 0
        responses = []
        durations = {'gen': [], 'eval': [], 'pass': []}
        response_lengths = []
        entry = None

        for result_line, response_line in zip(result_lines, response_lines):   # For all 'Executions' of the above
            entry = json.loads(result_line)

            # Clean response: drop Markdown code fences and surrounding whitespace
            response = json.loads(response_line)['response']
            response = response.replace("```python", "").replace("```", "").strip()
            responses.append(response)  # Required for ROUGE calculations later on

            durations['eval'].append(entry['eval_duration'])
            durations['gen'].append(entry['gen_duration'])
            if entry['test_passed']:
                # Evaluation time of passing executions is tracked separately
                durations['pass'].append(entry['eval_duration'])
                tests_passed += 1

            response_lengths.append(len(response))

        if entry is None:
            # Without a single paired line there is nothing to aggregate.
            print(f"    Skipping '{filename}': no executions found.")
            continue

        # Calculate pairwise ROUGE-L between all responses for intra-prompt stability
        rouge_scores = []
        for first in range(len(responses)):
            for second in range(first + 1, len(responses)):
                score = scorer.score(responses[first], responses[second])['rougeL'].fmeasure
                rouge_scores.append(score)

        if rouge_scores:
            avg_rougeL = np.mean(rouge_scores)
        else:
            avg_rougeL = 1.0  # Only one response → max stability

        # Aggregate results per 'Dataset Type × Prompt ID' combination
        agg_result = {
            "dataset": entry['dataset'],
            "task_id": entry['task_id'],
            "prompt_id": entry['prompt_id'],
            "executions": executions,

            # Main Metric
            "tests_passed": tests_passed,

            # Additional Metrics
            "response_len_mean": np.mean(response_lengths),
            "response_len_std": np.std(response_lengths),
            "intra_prompt_rougeL": avg_rougeL,

            # Stats
            "gen_duration_mean": np.mean(durations['gen']),
            "gen_duration_std": np.std(durations['gen']),
            "eval_duration_mean": np.mean(durations['eval']),
            "eval_duration_std": np.std(durations['eval']),
            # None when no execution passed (no durations to average)
            "pass_duration_mean": np.mean(durations['pass']) if durations["pass"] else None,
            "pass_duration_std": np.std(durations['pass']) if durations["pass"] else None
        }

        af.write(json.dumps(agg_result) + "\n")
        af.flush()

print("Re-Aggregated the LLM responses for all datasets!")