### HumanEval

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to local .py files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Label this notebook's run: the benchmark name is embedded in the
# LANGCHAIN_PROJECT environment variable set for the current process.
unique_id = "HumanEval"
os.environ["LANGCHAIN_PROJECT"] = "Tracing Walkthrough - {}".format(unique_id)

In [3]:
# from langsmith import Client

# client = Client()

In [4]:
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.messages import AIMessage
from concurrent.futures import ThreadPoolExecutor, as_completed

from human_eval.data import read_problems, write_jsonl
import promptbench as pb

import re
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
from prompts import human_eval

# Model that authored the optimised prompts, and model that runs inference.
agent_model = "claude-3-5-sonnet"
infer_model = "mistralv0.3"


# Prompts that do not depend on the agent model.
baseline_prompt = human_eval.get_baseline_prompt()
emotive_prompt = human_eval.get_emotive_prompt()
CoT_prompt = human_eval.get_CoT_prompt()

# Agent model name -> sub-package of `prompts` holding its generated prompts.
AGENT_PROMPT_PACKAGES = {
    "gpt-3.5-turbo": "gpt_3_5_turbo",
    "gpt-4o-mini": "gpt_4o_mini",
    "gpt-4o": "gpt_4o",
    "claude-3-haiku": "claude_3_haiku",
    "claude-3-5-sonnet": "claude_3_5_sonnet",
}

# Fail loudly on an unknown agent model: silently skipping the load would
# leave the *_prompts variables undefined, or worse, stale from a previous
# run with a different agent model.
if agent_model not in AGENT_PROMPT_PACKAGES:
    raise ValueError(
        f"Unsupported agent_model {agent_model!r}; "
        f"expected one of {sorted(AGENT_PROMPT_PACKAGES)}"
    )

# Equivalent to `from prompts.<package> import human_eval`: a non-empty
# fromlist makes __import__ return the named package and import the
# `human_eval` submodule if it is not already an attribute.
# As in the original cell, this rebinds `human_eval` to the agent-specific module.
human_eval = __import__(
    f"prompts.{AGENT_PROMPT_PACKAGES[agent_model]}", fromlist=["human_eval"]
).human_eval

authoritarian_prompts = human_eval.get_authoritarian_prompts()
market_prompts = human_eval.get_market_prompts()
hierarchical_prompts = human_eval.get_hierarchical_prompts()

In [21]:
# Build the inference model through promptbench, using `infer_model` from the
# configuration cell, temperature 0 and up to 4096 new tokens.
# NOTE(review): the saved output of this cell is
# "ValueError: The model is not supported!", so this call failed for the
# `infer_model` value in use at the last run — confirm how `llm` is meant to
# be created for that model.
llm = pb.LLMModel(model=infer_model, max_new_tokens=4096, temperature=0)
print(llm.model_name)

ValueError: The model is not supported!

In [15]:
import asyncio
import json
import re
from tqdm import tqdm
from itertools import islice

def extract_code_blocks(text):
    """
    Extracts Python code blocks from markdown text.

    A block is the text between an opening fence tagged as Python
    (```python, ```python3 or ```py, case-insensitive, optionally followed
    by spaces/tabs) and the next line that starts with a closing ``` fence.
    Both LF and CRLF line endings are accepted around the fences.

    Args:
        text: Markdown string, typically a raw model response.

    Returns:
        A list with the body of each fenced Python block, in order of
        appearance; empty if there is none. Fences with another language
        tag, or with no tag, are ignored.
    """
    # Non-greedy body so that each block ends at its own closing fence;
    # DOTALL lets the body span multiple lines.
    fence_pattern = r'```[ \t]*(?:python3?|py)[ \t]*\r?\n(.*?)\r?\n```'
    code_blocks = re.findall(fence_pattern, text, re.DOTALL | re.IGNORECASE)
    return code_blocks

async def async_process(data, base_prompt, llm):
    """
    Query the model for one problem and keep its first extracted code block.

    Args:
        data: Mapping with at least the keys 'prompt' and 'task_id'.
        base_prompt: Template string; the problem prompt is substituted for
            its ``content`` placeholder via ``str.format``.
        llm: Callable taking the prompt text; its return value is awaited,
            so it must be awaitable.

    Returns:
        ``{"task_id": ..., "completion": ...}`` where the completion is the
        first code block found in the response, "" if none was found, or
        the literal string "Timeout" if the call exceeded 180 seconds.
    """
    filled_prompt = base_prompt.format(content=data['prompt'])

    # Each call gets its own 180 s budget.
    try:
        response = await asyncio.wait_for(llm(filled_prompt), timeout=180)
    except asyncio.TimeoutError:
        return {"task_id": data['task_id'], "completion": "Timeout"}

    # Chat-style responses carry their text in `.content`.
    response_text = response.content if isinstance(response, AIMessage) else response

    snippets = extract_code_blocks(response_text)
    if snippets:
        return {"task_id": data['task_id'], "completion": snippets[0]}
    return {"task_id": data['task_id'], "completion": ""}

async def process_data(problems, base_prompt, llm):
    """
    Run every problem through the model one at a time, in dict order.

    Each problem dict gets its key stored under 'task_id' (mutating the
    input) before being handed to ``async_process``. A tqdm progress bar
    tracks the loop.

    Returns:
        List of the per-problem result dicts, in processing order.
    """
    collected = []
    progress = tqdm(problems.items())
    for key, problem in progress:
        problem['task_id'] = key
        # Awaited inside the loop, so calls never overlap.
        collected.append(await async_process(problem, base_prompt, llm))
    return collected

async def main(base_prompt, llm, sample_size, prompt_type=None):
    """
    Generate completions for HumanEval problems and write them as JSONL.

    Args:
        base_prompt: Template string passed through to ``process_data``.
        llm: Model callable passed through to ``process_data``.
        sample_size: If truthy, only the first ``sample_size`` problems are
            used; otherwise all problems are processed.
        prompt_type: Label used in the output file name. When omitted, the
            notebook-level variable ``prompt_type`` is used, which keeps
            existing call sites working.

    Raises:
        NameError: If ``prompt_type`` is not passed and no notebook-level
            ``prompt_type`` exists.

    Side effects:
        Writes ``{agent_model}_{prompt_type}.jsonl`` under the
        ``outputs/{infer_model}/human_eval/generations`` directory,
        creating the directory if needed and overwriting an existing file.
    """
    if prompt_type is None:
        # Resolve the label before any model call, so a missing name fails
        # immediately rather than after the whole generation run.
        try:
            prompt_type = globals()["prompt_type"]
        except KeyError:
            raise NameError("name 'prompt_type' is not defined") from None

    problems = read_problems()
    if sample_size:
        problems = dict(islice(problems.items(), sample_size))

    results = await process_data(problems, base_prompt, llm)

    # Write results to a JSONL file
    output_file_path = f"/Users/iwatson/Documents/Research Project/prompt-optimisation/src/outputs/{infer_model}/human_eval/generations/{agent_model}_{prompt_type}.jsonl"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    with open(output_file_path, 'w', encoding='utf-8') as jsonl_file:
        for result in results:
            jsonl_file.write(json.dumps(result) + '\n')


In [16]:
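# NOTE: legacy thread-pool implementation, kept commented out for reference.
# Later cells still call `generate_llm_outputs`, which is only defined here;
# uncomment and run this cell before executing them, otherwise they raise NameError.
# data = read_problems()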

# def extract_code_blocks(text):
#     """
#     Extracts code blocks from markdown text.
#     """
#     # This regex matches content between triple backticks
#     code_blocks = re.findall(r'```python\n(.*?)\n```', text, re.DOTALL)
#     return code_blocks

# def generate_one_completion(base_prompt: str, task_prompt: str, llm):
#     """
#     Generates code completion for a given task prompt.
#     """
#     prompt_text = base_prompt.format(content=task_prompt)
#     completion = llm(prompt_text)
#     if isinstance(completion, AIMessage):
#         completion = completion.content
#     code_blocks = extract_code_blocks(completion)
#     return code_blocks[0] if code_blocks else ""

# def generate_llm_outputs(base_prompt: str, prompt_type, llm, sample_size: int = None):
#     """
#     Generates completions for all tasks in the human evaluation set.
#     """
#     from itertools import islice

#     # generate completions for all tasks in parallel
#     problems = read_problems()
#     if sample_size:
#         tasks = list(islice(problems.items(), sample_size))
#     else:
#         tasks = list(problems.items())

#     samples = []

#     with ThreadPoolExecutor() as executor:
#         futures = {
#             executor.submit(generate_one_completion, base_prompt, problems[task_id]["prompt"], llm): task_id
#             for task_id, _ in tasks
#         }

#         for future in tqdm(as_completed(futures), total=len(futures)):
#             task_id = futures[future]
#             try:
#                 completion = future.result()
#                 samples.append(dict(task_id=task_id, completion=completion))
#             except Exception as e:
#                 print(f"Task {task_id} generated an exception: {e}")

#     write_jsonl(f"/Users/iwatson/Documents/Research Project/prompt-optimisation/src/outputs/{infer_model}/human_eval/generations/{agent_model}_{prompt_type}.jsonl", samples)

In [17]:
# Agent-generated prompts grouped by strategy. Labels are derived from each
# group's actual length ("<strategy>_1", "<strategy>_2", ...) so that prompts
# and labels cannot drift apart when a group does not hold exactly 3 prompts.
prompt_groups = {
    "authoritarian": authoritarian_prompts,
    "market": market_prompts,
    "hierarchical": hierarchical_prompts,
}
prompts = [p for group in prompt_groups.values() for p in group]
prompt_types = [
    f"{strategy}_{i}"
    for strategy, group in prompt_groups.items()
    for i in range(1, len(group) + 1)
]
# prompts = [baseline_prompt, CoT_prompt, emotive_prompt]
# prompt_types = ["baseline", "CoT", "emotive"]

In [18]:
# Apply nest_asyncio for Jupyter notebook compatibility
import nest_asyncio
nest_asyncio.apply()

for prompt, prompt_type in zip(prompts, prompt_types):
    print(f"Generating completions for {prompt_type} prompts")
    if infer_model == "llama3.1" or infer_model == "mistral:v0.3": 
        await main(prompt, llm, sample_size=30)
    else:
        generate_llm_outputs(prompt, prompt_type, llm, sample_size=30)
    print(f"Completions for {prompt_type} prompts generated successfully")

Generating completions for authoritarian_1 prompts


  0%|          | 0/30 [00:00<?, ?it/s]

100%|██████████| 30/30 [23:04<00:00, 46.16s/it]


Completions for authoritarian_1 prompts generated successfully
Generating completions for authoritarian_2 prompts


100%|██████████| 30/30 [16:24<00:00, 32.81s/it]


Completions for authoritarian_2 prompts generated successfully
Generating completions for authoritarian_3 prompts


100%|██████████| 30/30 [08:44<00:00, 17.49s/it]


Completions for authoritarian_3 prompts generated successfully
Generating completions for market_1 prompts


100%|██████████| 30/30 [03:57<00:00,  7.92s/it]


Completions for market_1 prompts generated successfully
Generating completions for market_2 prompts


100%|██████████| 30/30 [17:22<00:00, 34.74s/it]


Completions for market_2 prompts generated successfully
Generating completions for market_3 prompts


100%|██████████| 30/30 [07:15<00:00, 14.50s/it]


Completions for market_3 prompts generated successfully
Generating completions for hierarchical_1 prompts


100%|██████████| 30/30 [02:27<00:00,  4.91s/it]


Completions for hierarchical_1 prompts generated successfully
Generating completions for hierarchical_2 prompts


100%|██████████| 30/30 [07:55<00:00, 15.84s/it]


Completions for hierarchical_2 prompts generated successfully
Generating completions for hierarchical_3 prompts


100%|██████████| 30/30 [02:32<00:00,  5.09s/it]

Completions for hierarchical_3 prompts generated successfully





In [22]:
prompts = ["baseline", 
    "emotive", 
    "CoT", 
    "claude-3-5-sonnet_authoritarian_1", 
    "claude-3-5-sonnet_authoritarian_2", 
    "claude-3-5-sonnet_authoritarian_3", 
    "claude-3-5-sonnet_market_1", 
    "claude-3-5-sonnet_market_2", 
    "claude-3-5-sonnet_market_3", 
    "claude-3-5-sonnet_hierarchical_1",
    "claude-3-5-sonnet_hierarchical_2",
    "claude-3-5-sonnet_hierarchical_3",
    "gpt-4o_authoritarian_1",
    "gpt-4o_authoritarian_2",
    "gpt-4o_authoritarian_3",
    "gpt-4o_market_1",
    "gpt-4o_market_2",
    "gpt-4o_market_3",
    "gpt-4o_hierarchical_1",
    "gpt-4o_hierarchical_2",
    "gpt-4o_hierarchical_3",
]
for prompt in prompts:
    # Define the paths
    host_volume_path = f"/Users/iwatson/Documents/Research Project/prompt-optimisation/src/outputs/{infer_model}/human_eval/"
    container_volume_path = f"/outputs/{infer_model}/human_eval/"
    input_file_path = f"/outputs/{infer_model}/human_eval/generations/{prompt}.jsonl"

    docker_command = f'docker run --rm -v "{host_volume_path}:{container_volume_path}" humaneval-evaluation "{input_file_path}" --k 1'

    !{docker_command}

Reading samples...
30it [00:00, 4166.53it/s]
Running test suites...
100%|██████████| 30/30 [00:00<00:00, 180.63it/s]
Writing results to /outputs/mistralv0.3/human_eval/generations/baseline_results.jsonl...
100%|██████████| 30/30 [00:00<00:00, 17932.04it/s]
Pass@k results: {'pass@1': np.float64(0.4666666666666667)}
Reading samples...
30it [00:00, 1691.55it/s]
Running test suites...
100%|██████████| 30/30 [00:00<00:00, 208.26it/s]
Writing results to /outputs/mistralv0.3/human_eval/generations/emotive_results.jsonl...
100%|██████████| 30/30 [00:00<00:00, 10781.35it/s]
Pass@k results: {'pass@1': np.float64(0.43333333333333335)}
Reading samples...
30it [00:00, 1716.07it/s]
Running test suites...
100%|██████████| 30/30 [00:00<00:00, 199.11it/s]
Writing results to /outputs/mistralv0.3/human_eval/generations/CoT_results.jsonl...
100%|██████████| 30/30 [00:00<00:00, 15233.55it/s]
Pass@k results: {'pass@1': np.float64(0.4666666666666667)}
0it [00:00, ?it/s]Reading samples...
30it [00:00, 862.97i

In [6]:
# Sanity check: number of prompt variants in the set.
# NOTE(review): `hierarchical_prompts_1` is not assigned in any cell visible
# here — presumably left over from an earlier kernel session; confirm before
# relying on Restart & Run All.
print(len(hierarchical_prompts_1))

10


In [7]:
# Select the prompt set to run; `name` is embedded in the output file names
# built by the generation and evaluation cells below.
# NOTE(review): `hierarchical_prompts_1` is not defined in any visible cell — confirm its origin.
name = "hierarchical_1"
prompt_set = hierarchical_prompts_1

In [8]:
# Generate completions for every prompt variant in the selected set, labelling
# each run "human_eval_<name>_<index>".
for idx, prompt in enumerate(prompt_set):
    output_label = f"human_eval_{name}_{idx}"
    generate_llm_outputs(prompt, output_label, llm)

100%|██████████| 164/164 [02:03<00:00,  1.33it/s]
100%|██████████| 164/164 [01:08<00:00,  2.40it/s]
100%|██████████| 164/164 [01:15<00:00,  2.17it/s]
100%|██████████| 164/164 [02:15<00:00,  1.21it/s]
100%|██████████| 164/164 [00:58<00:00,  2.79it/s]
100%|██████████| 164/164 [01:24<00:00,  1.93it/s]
100%|██████████| 164/164 [03:21<00:00,  1.23s/it]
100%|██████████| 164/164 [01:40<00:00,  1.62it/s]
100%|██████████| 164/164 [02:36<00:00,  1.05it/s]
100%|██████████| 164/164 [02:48<00:00,  1.03s/it]


In [10]:
# Sanity check: number of prompt variants in the set.
# NOTE(review): `market_prompts_0` is not assigned in any cell visible here —
# presumably left over from an earlier kernel session; confirm.
print(len(market_prompts_0))

10


In [22]:
# generate_llm_outputs(authoritarian_prompts_1[5], "human_eval_{name}_{idx}".format(name='authoritarian_1', idx=5), llm)

100%|██████████| 164/164 [01:02<00:00,  2.64it/s]


In [9]:
import os

for i in range(10):
    output_file = "human_eval_{name}_{i}".format(name=name, i=i)
    # Define the paths
    host_volume_path = "/Users/iwatson/Documents/Research Project/prompt-optimisation/src/outputs"
    container_volume_path = "/outputs"
    input_file_path = f"/outputs/{output_file}.jsonl"

    # Verify the file exists on the host
    if not os.path.isfile(os.path.join(host_volume_path, f"{output_file}.jsonl")):
        raise FileNotFoundError(f"File not found: {os.path.join(host_volume_path, output_file+'jsonl')}")

    # Construct the Docker command
    docker_command = f'docker run --rm -v "{host_volume_path}:{container_volume_path}" humaneval-evaluation "{input_file_path}" --k 1'

    # Run the Docker command
    !{docker_command}


Reading samples...
164it [00:00, 19112.16it/s]
Running test suites...
100%|██████████| 164/164 [00:00<00:00, 220.68it/s]
  0%|          | 0/164 [00:00<?, ?it/s]Writing results to /outputs/human_eval_hierarchical_1_0_results.jsonl...
100%|██████████| 164/164 [00:00<00:00, 16649.30it/s]
Pass@k results: {'pass@1': np.float64(0.9085365853658537)}
Reading samples...
0it [00:00, ?it/s]Running test suites...
164it [00:00, 5851.04it/s]
100%|██████████| 164/164 [00:03<00:00, 42.41it/s] 
Writing results to /outputs/human_eval_hierarchical_1_1_results.jsonl...
100%|██████████| 164/164 [00:00<00:00, 15305.29it/s]
Pass@k results: {'pass@1': np.float64(0.9207317073170732)}
Reading samples...
0it [00:00, ?it/s]Running test suites...
164it [00:00, 9532.64it/s]
100%|██████████| 164/164 [00:03<00:00, 48.89it/s] 
  0%|          | 0/164 [00:00<?, ?it/s]Writing results to /outputs/human_eval_hierarchical_1_2_results.jsonl...
100%|██████████| 164/164 [00:00<00:00, 15829.02it/s]
Pass@k results: {'pass@1': np

In [None]:
# import os
# import re

# # Path to the directory containing the files
# directory_path = "/Users/iwatson/Documents/Research Project/prompt-optimisation/src/outputs"

# # Pattern to match and replace
# # This pattern specifically looks for three numbers separated by underscores, with 'authoritarian' preceding them.
# pattern_to_match = r"human_eval_authoritarian_1_(\d+)_(\d+)_results.jsonl"
# pattern_to_replace_with = r"human_eval_authoritarian_\1_\2_results.jsonl"

# # List all files in the directory
# files = os.listdir(directory_path)

# # Loop through each file
# for file_name in files:
#     # Check if the file name matches the pattern
#     if re.match(pattern_to_match, file_name):
#         new_file_name = re.sub(pattern_to_match, pattern_to_replace_with, file_name)
        
#         # Construct the full old and new file paths
#         old_file_path = os.path.join(directory_path, file_name)
#         new_file_path = os.path.join(directory_path, new_file_name)
        
#         # Rename the file
#         os.rename(old_file_path, new_file_path)
#         print(f"Renamed '{file_name}' to '{new_file_name}'")


Renamed 'human_eval_authoritarian_1_0_6_results.jsonl' to 'human_eval_authoritarian_0_6_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_2_results.jsonl' to 'human_eval_authoritarian_0_2_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_5_results.jsonl' to 'human_eval_authoritarian_0_5_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_1_results.jsonl' to 'human_eval_authoritarian_0_1_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_8_results.jsonl' to 'human_eval_authoritarian_0_8_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_9_results.jsonl' to 'human_eval_authoritarian_0_9_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_0_results.jsonl' to 'human_eval_authoritarian_0_0_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_4_results.jsonl' to 'human_eval_authoritarian_0_4_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_3_results.jsonl' to 'human_eval_authoritarian_0_3_results.jsonl'
Renamed 'human_eval_authoritarian_1_0_7_results.jsonl' to 'human_eval_aut