### HumanEval

In [1]:
import os

# Label this notebook's runs with a benchmark-specific project name.
# NOTE(review): LANGCHAIN_PROJECT is presumably read by LangChain/LangSmith tracing
# to group traces — confirm tracing is actually enabled in this environment.
unique_id = "HumanEval"
os.environ["LANGCHAIN_PROJECT"] = f"Tracing Walkthrough - {unique_id}"

In [2]:
# from langsmith import Client

# client = Client()

In [12]:
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

from human_eval.data import read_problems, write_jsonl

import re
import os

# Load the benchmark problems once when this cell runs.
# NOTE(review): `data` is not referenced by any other visible cell;
# generate_llm_outputs calls read_problems() again on its own.
data = read_problems()

def extract_code_blocks(text):
    """
    Returns the bodies of all ```python fenced blocks found in `text`.

    Only fences opened with exactly "```python" followed by a newline are
    recognised; the closing fence must start on its own line. The result is
    a list of strings in order of appearance (empty when nothing matches).
    """
    # DOTALL lets "." span newlines so multi-line code is captured; the
    # non-greedy group stops at the first closing fence.
    fence = re.compile(r'```python\n(.*?)\n```', re.DOTALL)
    return [found.group(1) for found in fence.finditer(text)]

def generate_one_completion(base_prompt: str, task_prompt: str, llm):
    """
    Asks the model to solve one task and returns the first code block of its reply.

    `task_prompt` is substituted into `base_prompt` through its `{content}`
    placeholder, the resulting text is passed to `llm.invoke`, and the
    `.content` of the reply is scanned with `extract_code_blocks`.
    Returns an empty string when the reply contains no code block.
    """
    reply = llm.invoke(base_prompt.format(content=task_prompt))
    found_blocks = extract_code_blocks(reply.content)
    if not found_blocks:
        return ""
    return found_blocks[0]

def generate_llm_outputs(base_prompt: str, file_name: str, llm, sample_size: int = None, output_dir: str = None):
    """
    Generates completions for the HumanEval problems and optionally saves them.

    One `generate_one_completion` call per problem is submitted to a thread
    pool. A problem whose call raises is reported on stdout and left out of
    the samples; the remaining problems are still processed.

    Args:
        base_prompt: Template forwarded to `generate_one_completion` together
            with each problem's "prompt" entry.
        file_name: Base name (without extension) of the JSONL file to write.
        llm: Model object forwarded to `generate_one_completion`.
        sample_size: If truthy, only the first `sample_size` problems are
            used; otherwise all problems are used.
        output_dir: Directory that receives `<file_name>.jsonl`. When None
            (the default) nothing is written, which matches the previous
            behaviour of this function.

    Returns:
        None.
    """
    import os
    from itertools import islice

    problems = read_problems()
    if sample_size:
        tasks = list(islice(problems.items(), sample_size))
    else:
        tasks = list(problems.items())

    samples = []

    # Generate completions for all selected tasks in parallel.
    with ThreadPoolExecutor() as executor:
        futures = {
            executor.submit(generate_one_completion, base_prompt, problem["prompt"], llm): task_id
            for task_id, problem in tasks
        }

        for future in tqdm(as_completed(futures), total=len(futures)):
            task_id = futures[future]
            try:
                completion = future.result()
                samples.append(dict(task_id=task_id, completion=completion))
            except Exception as e:
                # Best effort: one failed request must not abort the whole run.
                print(f"Task {task_id} generated an exception: {e}")

    if output_dir is not None:
        # as_completed yields in finish order; restore the problem order so
        # the written file is stable across runs.
        position = {task_id: index for index, (task_id, _) in enumerate(tasks)}
        samples.sort(key=lambda sample: position[sample["task_id"]])

        os.makedirs(output_dir, exist_ok=True)
        write_jsonl(os.path.join(output_dir, f"{file_name}.jsonl"), samples)

In [13]:
from human_eval_prompts import HumanEvalPrompts

# Fetch every prompt variant used by the experiments from the project's prompt catalogue.
human_eval_prompts = HumanEvalPrompts()

# These three are each passed directly as the `base_prompt` of one generation run.
baseline_prompt = human_eval_prompts.get_baseline_prompt()
emotive_prompt = human_eval_prompts.get_emotive_prompt()
CoT_prompt = human_eval_prompts.get_CoT_prompt()
# The remaining names hold collections of prompts; the sets that are printed in
# this notebook (hierarchical_prompts_1, market_prompts_0) contain 10 entries each.
# NOTE(review): the sizes of the other sets are presumably the same — confirm.
reflection_prompts = human_eval_prompts.get_reflection_prompts()
authoritarian_prompts_0 = human_eval_prompts.get_authoritarian_prompts_0()
authoritarian_prompts_1 = human_eval_prompts.get_authoritarian_prompts_1()
market_prompts_0 = human_eval_prompts.get_market_prompts_0()
market_prompts_1 = human_eval_prompts.get_market_prompts_1()
hierarchical_prompts_0 = human_eval_prompts.get_hierarchical_prompts_0()
hierarchical_prompts_1 = human_eval_prompts.get_hierarchical_prompts_1()

In [14]:
# Chat model shared by every generation run in this notebook.
# NOTE(review): temperature=1.0 presumably means sampled (non-deterministic)
# completions, so repeated runs can score differently — confirm this is intended.
llm = ChatOpenAI(
    temperature=1.0,
    model="gpt-4o",
)

In [15]:
# Run the three single-prompt strategies back to back, each over the full problem set.
for _strategy_prompt, _strategy_file in (
    (baseline_prompt, "human_eval_baseline"),
    (emotive_prompt, "human_eval_emotive"),
    (CoT_prompt, "human_eval_CoT"),
):
    generate_llm_outputs(_strategy_prompt, _strategy_file, llm)

 99%|█████████▉| 162/164 [00:36<00:00,  4.46it/s]


KeyboardInterrupt: 

In [13]:
prompts = ["baseline", "emotive", "CoT"]
for prompt in prompts:
    output_file = f"human_eval_{prompt}"
    # Define the paths
    host_volume_path = "/Users/iwatson/Documents/Research Project/prompt-optimisation/src/outputs"
    container_volume_path = "/outputs"
    input_file_path = f"/outputs/{output_file}.jsonl"

    docker_command = f'docker run --rm -v "{host_volume_path}:{container_volume_path}" humaneval-evaluation "{input_file_path}" --k 1'

    !{docker_command}

Reading samples...
164it [00:00, 18430.08it/s]
  0%|          | 0/164 [00:00<?, ?it/s]Running test suites...
100%|██████████| 164/164 [00:03<00:00, 54.22it/s] 
  0%|          | 0/164 [00:00<?, ?it/s]Writing results to /outputs/human_eval_baseline_results.jsonl...
100%|██████████| 164/164 [00:00<00:00, 14610.26it/s]
Pass@k results: {'pass@1': np.float64(0.9207317073170732)}
Reading samples...
164it [00:00, 6927.29it/s]
  0%|          | 0/164 [00:00<?, ?it/s]Running test suites...
100%|██████████| 164/164 [00:00<00:00, 221.41it/s]
  0%|          | 0/164 [00:00<?, ?it/s]Writing results to /outputs/human_eval_emotive_results.jsonl...
100%|██████████| 164/164 [00:00<00:00, 14798.86it/s]
Pass@k results: {'pass@1': np.float64(0.8902439024390244)}
Reading samples...
164it [00:00, 7517.91it/s]
Running test suites...
100%|██████████| 164/164 [00:02<00:00, 55.57it/s] 
  0%|          | 0/164 [00:00<?, ?it/s]Writing results to /outputs/human_eval_CoT_results.jsonl...
100%|██████████| 164/164 [00:00

In [6]:
# Sanity check: how many prompt variants this set contains.
print(len(hierarchical_prompts_1))

10


In [7]:
# Select the prompt set that the generation and evaluation loops operate on.
# `name` becomes part of every output file name ("human_eval_{name}_{idx}"),
# so it must be kept in step with `prompt_set`.
name = "hierarchical_1"
prompt_set = hierarchical_prompts_1

In [8]:
# One full generation run per prompt variant; the variant index goes into the file name.
for idx, prompt in enumerate(prompt_set):
    output_name = f"human_eval_{name}_{idx}"
    generate_llm_outputs(prompt, output_name, llm)

100%|██████████| 164/164 [02:03<00:00,  1.33it/s]
100%|██████████| 164/164 [01:08<00:00,  2.40it/s]
100%|██████████| 164/164 [01:15<00:00,  2.17it/s]
100%|██████████| 164/164 [02:15<00:00,  1.21it/s]
100%|██████████| 164/164 [00:58<00:00,  2.79it/s]
100%|██████████| 164/164 [01:24<00:00,  1.93it/s]
100%|██████████| 164/164 [03:21<00:00,  1.23s/it]
100%|██████████| 164/164 [01:40<00:00,  1.62it/s]
100%|██████████| 164/164 [02:36<00:00,  1.05it/s]
100%|██████████| 164/164 [02:48<00:00,  1.03s/it]


In [10]:
# Sanity check: how many prompt variants this set contains.
print(len(market_prompts_0))

10


In [22]:
# generate_llm_outputs(authoritarian_prompts_1[5], "human_eval_{name}_{idx}".format(name='authoritarian_1', idx=5), llm)

100%|██████████| 164/164 [01:02<00:00,  2.64it/s]


In [9]:
import os

for i in range(10):
    output_file = "human_eval_{name}_{i}".format(name=name, i=i)
    # Define the paths
    host_volume_path = "/Users/iwatson/Documents/Research Project/prompt-optimisation/src/outputs"
    container_volume_path = "/outputs"
    input_file_path = f"/outputs/{output_file}.jsonl"

    # Verify the file exists on the host
    if not os.path.isfile(os.path.join(host_volume_path, f"{output_file}.jsonl")):
        raise FileNotFoundError(f"File not found: {os.path.join(host_volume_path, output_file+'jsonl')}")

    # Construct the Docker command
    docker_command = f'docker run --rm -v "{host_volume_path}:{container_volume_path}" humaneval-evaluation "{input_file_path}" --k 1'

    # Run the Docker command
    !{docker_command}


Reading samples...
164it [00:00, 19112.16it/s]
Running test suites...
100%|██████████| 164/164 [00:00<00:00, 220.68it/s]
  0%|          | 0/164 [00:00<?, ?it/s]Writing results to /outputs/human_eval_hierarchical_1_0_results.jsonl...
100%|██████████| 164/164 [00:00<00:00, 16649.30it/s]
Pass@k results: {'pass@1': np.float64(0.9085365853658537)}
Reading samples...
0it [00:00, ?it/s]Running test suites...
164it [00:00, 5851.04it/s]
100%|██████████| 164/164 [00:03<00:00, 42.41it/s] 
Writing results to /outputs/human_eval_hierarchical_1_1_results.jsonl...
100%|██████████| 164/164 [00:00<00:00, 15305.29it/s]
Pass@k results: {'pass@1': np.float64(0.9207317073170732)}
Reading samples...
0it [00:00, ?it/s]Running test suites...
164it [00:00, 9532.64it/s]
100%|██████████| 164/164 [00:03<00:00, 48.89it/s] 
  0%|          | 0/164 [00:00<?, ?it/s]Writing results to /outputs/human_eval_hierarchical_1_2_results.jsonl...
100%|██████████| 164/164 [00:00<00:00, 15829.02it/s]
Pass@k results: {'pass@1': np

In [None]:
import os
import re

# Path to the directory containing the files
directory_path = "/Users/iwatson/Documents/Research Project/prompt-optimisation/src/outputs"

# Result files that carry an extra "1_" right after "authoritarian_", i.e.
#   human_eval_authoritarian_1_<a>_<b>_results.jsonl
# are renamed to
#   human_eval_authoritarian_<a>_<b>_results.jsonl
# The dot is escaped and the whole file name must match, so only exact result
# file names are touched.
pattern_to_match = r"human_eval_authoritarian_1_(\d+)_(\d+)_results\.jsonl"
pattern_to_replace_with = r"human_eval_authoritarian_\1_\2_results.jsonl"

# List all files in the directory
files = os.listdir(directory_path)

for file_name in files:
    # Skip anything that is not exactly a mis-named result file.
    if not re.fullmatch(pattern_to_match, file_name):
        continue

    new_file_name = re.sub(pattern_to_match, pattern_to_replace_with, file_name)

    # Construct the full old and new file paths
    old_file_path = os.path.join(directory_path, file_name)
    new_file_path = os.path.join(directory_path, new_file_name)

    # os.rename can silently replace an existing destination; never clobber
    # results that are already there.
    if os.path.exists(new_file_path):
        print(f"Skipped '{file_name}': '{new_file_name}' already exists")
        continue

    # Rename the file
    os.rename(old_file_path, new_file_path)
    print(f"Renamed '{file_name}' to '{new_file_name}'")


Renamed 'human_eval_authoritarian_1_0_6_results.jsonl' to 'human_eval_authoritarian_0_6_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_2_results.jsonl' to 'human_eval_authoritarian_0_2_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_5_results.jsonl' to 'human_eval_authoritarian_0_5_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_1_results.jsonl' to 'human_eval_authoritarian_0_1_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_8_results.jsonl' to 'human_eval_authoritarian_0_8_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_9_results.jsonl' to 'human_eval_authoritarian_0_9_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_0_results.jsonl' to 'human_eval_authoritarian_0_0_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_4_results.jsonl' to 'human_eval_authoritarian_0_4_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_3_results.jsonl' to 'human_eval_authoritarian_0_3_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_7_results.jsonl' to 'human_eval_aut