In [15]:
# Reload imports
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
from tqdm import tqdm
import json

import promptbench as pb

from langchain_core.messages import AIMessage

### Available Models and Dataset

In [17]:
# print all supported datasets in promptbench
print('All supported datasets: ')
for dataset in pb.SUPPORTED_DATASETS:
    print(f'  {dataset}')

All supported datasets: 
  sst2
  cola
  qqp
  mnli
  mnli_matched
  mnli_mismatched
  qnli
  wnli
  rte
  mrpc
  mmlu
  squad_v2
  un_multi
  iwslt2017
  math
  bool_logic
  valid_parentheses
  gsm8k
  csqa
  bigbench_date
  bigbench_object_tracking
  last_letter_concat
  numersense
  qasc
  bbh
  drop
  arc-easy
  arc-challenge


In [18]:
# print all supported models in promptbench
print('All supported models: ')
for model in pb.SUPPORTED_MODELS:
    print(f'  {model}')

All supported models: 
  google/flan-t5-large
  llama2-7b
  llama2-7b-chat
  llama2-13b
  llama2-13b-chat
  llama2-70b
  llama2-70b-chat
  phi-1.5
  phi-2
  palm
  gpt-3.5-turbo
  gpt-4
  gpt-4-1106-preview
  gpt-3.5-turbo-1106
  gpt-4-0125-preview
  gpt-3.5-turbo-0125
  gpt-4-turbo
  gpt-4o
  gpt-4o-mini
  vicuna-7b
  vicuna-13b
  vicuna-13b-v1.3
  google/flan-ul2
  gemini-pro
  mistralai/Mistral-7B-v0.1
  mistralai/Mistral-7B-Instruct-v0.1
  mistralai/Mixtral-8x7B-v0.1
  mistralai/Mixtral-8x7B-Instruct-v0.1
  01-ai/Yi-6B
  01-ai/Yi-34B
  01-ai/Yi-6B-Chat
  01-ai/Yi-34B-Chat
  baichuan-inc/Baichuan2-7B-Base
  baichuan-inc/Baichuan2-13B-Base
  baichuan-inc/Baichuan2-7B-Chat
  baichuan-inc/Baichuan2-13B-Chat
  mistral:v0.3
  llama3.1


### Test Custom Prompts

In [19]:
import random
random.seed(42)

dataset_name = "gsm8k"

dataset = pb.DatasetLoader.load_dataset(dataset_name)
length = len(dataset)
sample_indices = random.sample(range(length), 30)
print(sample_indices)
# extract 50 samples from the dataset
samples = [dataset[i] for i in sample_indices]

[1309, 228, 51, 563, 501, 457, 285, 209, 1116, 178, 1209, 864, 65, 61, 191, 447, 476, 1034, 1232, 54, 1149, 407, 859, 451, 919, 1206, 569, 13, 326, 865]


In [41]:
from prompts import gsm8k, sst2

agent_model = "claude-3-5-sonnet"
infer_model = "llama3.1"

if dataset_name == "gsm8k":
    baseline_prompt = gsm8k.get_baseline_prompt()
    emotive_prompt = gsm8k.get_emotive_prompt()
    CoT_prompt = gsm8k.get_CoT_prompt()
    if agent_model == "gpt-3.5-turbo":
        from prompts.gpt_3_5_turbo import gsm8k
        authoritarian_prompts = gsm8k.get_authoritarian_prompts()
        market_prompts = gsm8k.get_market_prompts()
        hierarchical_prompts = gsm8k.get_hierarchical_prompts()
    elif agent_model == "gpt-4o-mini":
        from prompts.gpt_4o_mini import gsm8k
        authoritarian_prompts = gsm8k.get_authoritarian_prompts()
        market_prompts = gsm8k.get_market_prompts()
        hierarchical_prompts = gsm8k.get_hierarchical_prompts()
    elif agent_model == "gpt-4o":
        from prompts.gpt_4o import gsm8k
        authoritarian_prompts = gsm8k.get_authoritarian_prompts()
        market_prompts = gsm8k.get_market_prompts()
        hierarchical_prompts = gsm8k.get_hierarchical_prompts()
    elif agent_model == "claude-3-haiku":
        from prompts.claude_3_haiku import gsm8k
        authoritarian_prompts = gsm8k.get_authoritarian_prompts()
        market_prompts = gsm8k.get_market_prompts()
        hierarchical_prompts = gsm8k.get_hierarchical_prompts()
    elif agent_model == "claude-3-5-sonnet":
        from prompts.claude_3_5_sonnet import gsm8k
        authoritarian_prompts = gsm8k.get_authoritarian_prompts()
        market_prompts = gsm8k.get_market_prompts()
        hierarchical_prompts = gsm8k.get_hierarchical_prompts()

elif dataset_name == "sst2":
    baseline_prompt = sst2.get_baseline_prompt()
    emotive_prompt = sst2.get_emotive_prompt()
    CoT_prompt = sst2.get_CoT_prompt()
    if agent_model == "gpt-3.5-turbo":
        from prompts.gpt_3_5_turbo import sst2
        authoritarian_prompts = sst2.get_authoritarian_prompts()
        market_prompts = sst2.get_market_prompts()
        hierarchical_prompts = sst2.get_hierarchical_prompts()
    elif agent_model == "gpt-4o-mini":
        from prompts.gpt_4o_mini import sst2
        authoritarian_prompts = sst2.get_authoritarian_prompts()
        market_prompts = sst2.get_market_prompts()
        hierarchical_prompts = sst2.get_hierarchical_prompts()
    elif agent_model == "gpt-4o":
        from prompts.gpt_4o import sst2
        authoritarian_prompts = sst2.get_authoritarian_prompts()
        market_prompts = sst2.get_market_prompts()
        hierarchical_prompts = sst2.get_hierarchical_prompts()
    elif agent_model == "claude-3-haiku":
        from prompts.claude_3_haiku import sst2
        authoritarian_prompts = sst2.get_authoritarian_prompts()
        market_prompts = sst2.get_market_prompts()
        hierarchical_prompts = sst2.get_hierarchical_prompts()
    elif agent_model == "claude-3-5-sonnet":
        from prompts.claude_3_5_sonnet import sst2
        authoritarian_prompts = sst2.get_authoritarian_prompts()
        market_prompts = sst2.get_market_prompts()
        hierarchical_prompts = sst2.get_hierarchical_prompts()


In [36]:
llm = pb.LLMModel(model=infer_model, max_new_tokens=1024, temperature=0)
print(llm.model_name)

llama3.1


In [42]:
prompt_type = "market"

if prompt_type == "baseline":
    prompts = pb.Prompt([baseline_prompt])
    iters = 3
elif prompt_type == "emotive":
    prompts = pb.Prompt([emotive_prompt])
    iters = 3
elif prompt_type == "CoT":
    prompts = pb.Prompt([CoT_prompt])
    iters = 3
elif prompt_type == "authoritarian":
    prompts = pb.Prompt(authoritarian_prompts)
    iters = 1
elif prompt_type == "market":
    prompts = pb.Prompt(market_prompts)
    iters = 1
elif prompt_type == "hierarchical":
    prompts = pb.Prompt(hierarchical_prompts)
    iters = 1

In [43]:
# prompts = prompts[2:]
for prompt in prompts:
    print(prompt)

Solve this math problem:
{content}

Show your work clearly. Include key steps and reasoning.

Example 1 (Simple arithmetic):
Problem: A train travels 120 km in 2 hours. What is its average speed?
Work:
1. Speed = Distance ÷ Time
2. Distance = 120 km
3. Time = 2 hours
4. Speed = 120 km ÷ 2 hours = 60 km/h
Answer: ##60

Example 2 (Algebra):
Problem: If 3x + 7 = 22, what is the value of x?
Work:
1. Subtract 7 from both sides: 3x = 15
2. Divide both sides by 3: x = 5
Answer: ##5

Example 3 (Geometry - incomplete, finish it):
Problem: A rectangle has a length of 8 cm and a width of 5 cm. What is its area?
Work:
1. Area of rectangle = length × width
2. Length = 8 cm, Width = 5 cm
3. Area = 8 cm × 5 cm = 40 cm²
Answer: ##40

Example 4 (Percentages):
Problem: A shirt originally priced at $80 is on sale for 25% off. What is the sale price?
Work:
1. Calculate the discount: 25% of $80 = 0.25 × $80 = $20
2. Subtract the discount from the original price: $80 - $20 = $60
Answer: ##60

Now solve the 

In [44]:
# for idx, prompt in enumerate(prompts):
#     preds = []
#     labels = []
#     questions = []
#     for data in tqdm(samples):
#         # process input
#         input_text = pb.InputProcess.basic_format(prompt, data)
#         label = data['label']
#         raw_pred = model(input_text)
#         # process output
#         pred = pb.OutputProcess.pattern_re(raw_pred, r"##(\d+)")
#         # print(f"Pred: {pred}, Label: {label}")
#         questions.append(data['content'])
#         preds.append(pred)
#         labels.append(label)

#         output = list(zip(dataset, pred, label))

#         # Define the output file path
#         output_file_path = f"/Users/iwatson/Documents/Research Project/prompt-optimisation/src/outputs/{dataset_name}_{prompt_type[idx]}.jsonl"

#         # Save the paired list to a JSON file
#         with open(output_file_path, 'w', encoding='utf-8') as jsonl_file:
#             for pair in output:
#                 jsonl_file.write(json.dumps(pair) + '\n')
    
#     # evaluate
#     score = pb.Eval.compute_cls_accuracy(preds, labels)
#     print(f"{score:.3f}, {prompt}")

In [45]:
import nest_asyncio
import asyncio
import json
from tqdm import tqdm

# Apply the nest_asyncio patch
nest_asyncio.apply()

async def async_process(data, model, prompt):
    # Process input
    input_text = pb.InputProcess.basic_format(prompt, data)
    label = data['label']
    try:
        # Set individual timeout for each LLM call
        raw_pred = await asyncio.wait_for(model(input_text), timeout=60)
        if isinstance(raw_pred, AIMessage):
            raw_pred = raw_pred.content
    except asyncio.TimeoutError:
        return (data['content'], "Timeout", -1, label)
    
    # Process output
    if dataset_name == "gsm8k":
        pred = pb.OutputProcess.pattern_re(raw_pred, r"##(\d+| \d+)")
    elif dataset_name == "sst2":
        pred = pb.OutputProcess.pattern_re(raw_pred, r"##(positive|negative|Positive|Negative| positive| negative| Positive| Negative)")
        pred = {"positive": 1, "negative": 0}.get(pred.lower(), -1)
    
    return (data['content'], raw_pred, pred, label)

async def process_data(samples, model, prompt):
    results = []
    for data in tqdm(samples):
        result = await async_process(data, model, prompt)
        results.append(result)
    return results

async def main():
    main_dict = {
        "results": {}
    }
    mean_scores = []
    for idx, prompt in enumerate(prompts):
        prompt_scores = []
        result_dict = {
            "prompt": prompt
        }
        for i in range(1, iters + 1):
            # Process data sequentially
            results = await process_data(samples, llm, prompt)

            # Initialize lists to store results
            questions = []
            raw_responses = []
            preds = []
            labels = []

            for data_content, raw, pred, label in results:
                questions.append(data_content)
                raw_responses.append(raw)
                preds.append(pred)
                labels.append(label)

            # Create the results section for the current iteration
            results_section = {}
            for j, (content, raw, pred, label) in enumerate(zip(questions, raw_responses, preds, labels), start=1):
                # print("Prediction:", pred, "Label:", label)
                results_section[j] = {
                    "content": content,
                    "response": raw,
                    "pred": pred,
                    "label": label
                }

            # Compute the score for the current results section
            score = pb.Eval.compute_cls_accuracy(preds, labels)
            print(f"Score: {score:.3f}")
            results_section["score"] = score
            prompt_scores.append(score)

            # Add the results section to the result dictionary
            result_dict[f"results_{i}"] = results_section

        # Compute the average score for the current prompt
        prompt_mean_score = sum(prompt_scores) / len(prompt_scores)
        prompt_std_dev = (sum([(score - prompt_mean_score) ** 2 for score in prompt_scores]) / len(prompt_scores)) ** 0.5
        result_dict["prompt_mean_score"] = prompt_mean_score

        # Add the result dictionary to the main dictionary
        main_dict["results"][idx + 1] = result_dict

        # Append the average score to the avg_scores list for overall calculations
        mean_scores.append(prompt_mean_score)

        # Write the result dictionary to the JSONL file after each prompt
        if agent_model == "":
            output_file_path = f"/Users/iwatson/Documents/Research Project/prompt-optimisation/src/outputs/{infer_model}/{dataset_name}/{prompt_type}.jsonl"
        else:
            output_file_path = f"/Users/iwatson/Documents/Research Project/prompt-optimisation/src/outputs/{infer_model}/{dataset_name}/{agent_model}_{prompt_type}.jsonl"
        with open(output_file_path, 'a', encoding='utf-8') as jsonl_file:
            jsonl_file.write(json.dumps(result_dict, indent=4) + '\n')

    # Compute the overall average score, median, and standard deviation
    mean_score = sum(mean_scores) / len(mean_scores)
    median_score = sorted(mean_scores)[len(mean_scores) // 2]
    std_dev = (sum([(score - mean_score) ** 2 for score in mean_scores]) / len(mean_scores)) ** 0.5
    print(f"Mean: {mean_score:.3f}, Median: {median_score:.3f}, Std. Dev.: {std_dev:.3f}")

    main_dict["architecture_mean_score"] = mean_score
    main_dict["architecture_median_score"] = median_score
    main_dict["architecture_std_dev"] = std_dev

    # Write the final summary to the JSONL file
    with open(output_file_path, 'a', encoding='utf-8') as jsonl_file:
        jsonl_file.write(json.dumps(main_dict, indent=4) + '\n')

# Run the main function
await main()


  0%|          | 0/30 [00:00<?, ?it/s]

100%|██████████| 30/30 [04:08<00:00,  8.28s/it]


Score: 0.133


100%|██████████| 30/30 [04:58<00:00,  9.95s/it]


Score: 0.167


100%|██████████| 30/30 [07:43<00:00, 15.44s/it]

Score: 0.000
Mean: 0.100, Median: 0.133, Std. Dev.: 0.072





In [5]:
mean_scores = [0.033, 0.033, 0.067 ]

mean_score = sum(mean_scores) / len(mean_scores)
median_score = sorted(mean_scores)[len(mean_scores) // 2]
std_dev = (sum([(score - mean_score) ** 2 for score in mean_scores]) / len(mean_scores)) ** 0.5
print(f"Mean: {mean_score:.3f}, Median: {median_score:.3f}, Std. Dev.: {std_dev:.6f}")

Mean: 0.044, Median: 0.033, Std. Dev.: 0.016028


In [34]:
# from concurrent.futures import ThreadPoolExecutor, as_completed
# import json
# from tqdm import tqdm

# def process_data(data):
#     # process input
#     input_text = pb.InputProcess.basic_format(prompt, data)
#     label = data['label']
#     if data["content"] == "##":
#         return (data['content'], "Timeout", -1, label)
#     else:
#         raw_pred = llm(input_text)
#     if (infer_model == "llama3.1") or (infer_model == "mistral:v0.3"):
#         raw_pred = raw_pred.content
#     # print("Output generated")
#     # process output
#     if dataset_name == "gsm8k":
#         # pred = extract_and_map_number(raw_pred)
#         pred = pb.OutputProcess.pattern_re(raw_pred, r"##(\d+| \d+)")
#     elif dataset_name == "sst2":
#         # pred = extract_and_map_sentiment(raw_pred)
#         # regex to find extract ##positive or ##negative sentiment
#         pred = pb.OutputProcess.pattern_re(raw_pred, r"##(positive|negative|Positive|Negative| positive| negative| Positive| Negative)")
#         pred = {"positive": 1, "negative": 0}.get(pred.lower(), -1)
#     # print(f"Raw: {raw_pred}, Pred: {pred}, Label: {label}")
#     # Collect the necessary information for output
#     # print("Output processed")
#     return (data['content'], raw_pred, pred, label)

# main_dict = {
#     "results": {}
# }
# mean_scores = []
# for idx, prompt in enumerate(prompts):
#     prompt_scores = []
#     result_dict = {
#         "prompt": prompt
#     }
#     for i in range(1, iters+1):
#         # Initialize lists to store results
#         questions = []
#         raw_responses = []
#         preds = []
#         labels = []

#         # Use ThreadPoolExecutor to process data in parallel
#         with ThreadPoolExecutor() as executor:
#             # Map process_data function to each item in samples
#             future_to_data = {executor.submit(process_data, data): data for data in samples}
            
#             for future in tqdm(as_completed(future_to_data), total=len(samples)):
#                 data_content, raw, pred, label = future.result()
#                 questions.append(data_content)
#                 raw_responses.append(raw)
#                 preds.append(pred)
#                 labels.append(label)

#         # Create the results section for the current iteration
#         results_section = {}
#         for j, (content, raw, pred, label) in enumerate(zip(questions, raw_responses, preds, labels), start=1):
#             results_section[j] = {
#                 "content": content,
#                 "response": raw, 
#                 "pred": pred,
#                 "label": label
#             }

#         # Compute the score for the current results section
#         score = pb.Eval.compute_cls_accuracy(preds, labels)
#         print(f"Score: {score:.3f}")
#         results_section["score"] = score
#         prompt_scores.append(score)

#         # Add the results section to the result dictionary
#         result_dict[f"results_{i}"] = results_section

#     # Compute the average score for the current prompt
#     prompt_mean_score = sum(prompt_scores) / len(prompt_scores)
#     prompt_std_dev = (sum([(score - prompt_mean_score) ** 2 for score in prompt_scores]) / len(prompt_scores)) ** 0.5
#     result_dict["prompt_mean_score"] = prompt_mean_score

#     # Add the result dictionary to the main dictionary
#     main_dict["results"][idx + 1] = result_dict

#     # Append the average score to the avg_scores list for overall calculations
#     mean_scores.append(prompt_mean_score)

#     # Write the result dictionary to the JSONL file after each prompt
#     if agent_model == "":
#         output_file_path = f"/Users/iwatson/Documents/Research Project/prompt-optimisation/src/outputs/{infer_model}/{dataset_name}/{prompt_type}.jsonl"
#     else:
#         output_file_path = f"/Users/iwatson/Documents/Research Project/prompt-optimisation/src/outputs/{infer_model}/{dataset_name}/{agent_model}_{prompt_type}.jsonl"
#     with open(output_file_path, 'a', encoding='utf-8') as jsonl_file:
#         jsonl_file.write(json.dumps(result_dict, indent=4) + '\n')

# # Compute the overall average score, median, and standard deviation
# mean_score = sum(mean_scores) / len(mean_scores)
# median_score = sorted(mean_scores)[len(mean_scores) // 2]
# std_dev = (sum([(score - mean_score) ** 2 for score in mean_scores]) / len(mean_scores)) ** 0.5
# print(f"Mean: {mean_score:.3f}, Median: {median_score:.3f}, Std. Dev.: {std_dev:.3f}")

# main_dict["architecture_mean_score"] = mean_score
# main_dict["architecture_median_score"] = median_score
# main_dict["architecture_std_dev"] = std_dev

# # Write the final summary to the JSONL file
# with open(output_file_path, 'a', encoding='utf-8') as jsonl_file:
#     jsonl_file.write(json.dumps(main_dict, indent=4) + '\n')


### Test Pre-defined Prompts

In [33]:
# load method
# print all methods and their supported datasets
print('All supported methods: ')
print(pb.SUPPORTED_METHODS)
print('Supported datasets for each method: ')
print(pb.METHOD_SUPPORT_DATASET)

method = pb.PEMethod(method='baseline', 
                    dataset=dataset_name,
                    verbose=True,  # if True, print the detailed prompt and response
                    prompt_id = 1  # for emotion_prompt 
                    )

All supported methods: 
['CoT', 'ZSCoT', 'least_to_most', 'generated_knowledge', 'expert_prompting', 'emotion_prompt', 'baseline']
Supported datasets for each method: 
{'CoT': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking'], 'ZSCoT': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking'], 'expert_prompting': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking'], 'emotion_prompt': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking'], 'least_to_most': ['gsm8k', 'last_letter_concat'], 'generated_knowledge': ['csqa', 'numersense', 'qasc'], 'baseline': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking', 'last_letter_concat', 'numersense', 'qasc']}


In [34]:
results = method.test(dataset, 
                      model, 
                      num_samples=2,
                      )

results

  0%|          | 0/1319 [00:00<?, ?it/s]


AttributeError: 'str' object has no attribute 'convert_text_to_prompt'