In [2]:
import os
import sys

# Add the directory of promptbench to the Python path.
# The location can be overridden with the PROMPTBENCH_DIR environment variable
# so the notebook is not tied to one machine; the default is the original path.
PROMPTBENCH_DIR = os.environ.get('PROMPTBENCH_DIR', '/Users/iwatson/Repos/promptbench')

# Guard the append so re-running this cell does not add duplicate entries.
if PROMPTBENCH_DIR not in sys.path:
    sys.path.append(PROMPTBENCH_DIR)

# Now you can import promptbench by name
import promptbench as pb

  from .autonotebook import tqdm as notebook_tqdm


### Load Models and Dataset

In [3]:
# Print all supported datasets in promptbench.
# The loop variable is deliberately not named `dataset`: loop variables in a
# notebook cell live in the global namespace, so re-running this cell would
# otherwise overwrite the loaded `dataset` object with a plain string.
print('All supported datasets: ')
for supported_dataset in pb.SUPPORTED_DATASETS:
    print(f'  {supported_dataset}')

All supported datasets: 
  sst2
  cola
  qqp
  mnli
  mnli_matched
  mnli_mismatched
  qnli
  wnli
  rte
  mrpc
  mmlu
  squad_v2
  un_multi
  iwslt2017
  math
  bool_logic
  valid_parentheses
  gsm8k
  csqa
  bigbench_date
  bigbench_object_tracking
  last_letter_concat
  numersense
  qasc
  bbh
  drop
  arc-easy
  arc-challenge


In [4]:
# Print all supported models in promptbench.
# The loop variable is deliberately not named `model`: loop variables in a
# notebook cell live in the global namespace, so re-running this cell would
# otherwise overwrite the `model` object used for inference with a plain string.
print('All supported models: ')
for supported_model in pb.SUPPORTED_MODELS:
    print(f'  {supported_model}')

All supported models: 
  google/flan-t5-large
  llama2-7b
  llama2-7b-chat
  llama2-13b
  llama2-13b-chat
  llama2-70b
  llama2-70b-chat
  phi-1.5
  phi-2
  palm
  gpt-3.5-turbo
  gpt-4
  gpt-4-1106-preview
  gpt-3.5-turbo-1106
  gpt-4-0125-preview
  gpt-3.5-turbo-0125
  gpt-4-turbo
  gpt-4o
  vicuna-7b
  vicuna-13b
  vicuna-13b-v1.3
  google/flan-ul2
  gemini-pro
  mistralai/Mistral-7B-v0.1
  mistralai/Mistral-7B-Instruct-v0.1
  mistralai/Mixtral-8x7B-v0.1
  mistralai/Mixtral-8x7B-Instruct-v0.1
  01-ai/Yi-6B
  01-ai/Yi-34B
  01-ai/Yi-6B-Chat
  01-ai/Yi-34B-Chat
  baichuan-inc/Baichuan2-7B-Base
  baichuan-inc/Baichuan2-13B-Base
  baichuan-inc/Baichuan2-7B-Chat
  baichuan-inc/Baichuan2-13B-Chat


In [5]:
# Name of the benchmark used throughout the notebook (passed to the dataset
# loader and to the prompt-engineering method below); "gsm8k" is one of the
# entries printed from pb.SUPPORTED_DATASETS.
dataset_name = "gsm8k"

In [6]:
# Load the dataset selected by `dataset_name`.
dataset = pb.DatasetLoader.load_dataset(dataset_name)
# Preview the first five entries; in the displayed output each entry is a dict
# with a 'content' (question text) and a 'label' (answer string) key.
dataset[:5]

[{'content': "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
  'label': '18'},
 {'content': 'A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?',
  'label': '3'},
 {'content': 'Josh decides to try flipping a house.  He buys a house for $80,000 and then puts in $50,000 in repairs.  This increased the value of the house by 150%.  How much profit did he make?',
  'label': '70000'},
 {'content': 'James decides to run 3 sprints 3 times a week.  He runs 60 meters each sprint.  How many total meters does he run a week?',
  'label': '540'},
 {'content': "Every day, Wendi feeds each of her chickens three cups of mixed chicken feed, containing seeds, mealworms and vegetables to help keep them healthy.  She giv

In [24]:
# Build the LLM wrapper used for every inference call in this notebook.
# The model name is passed inline so `model` only ever refers to the
# pb.LLMModel instance, never to the bare name string.
model = pb.LLMModel(
    model="gpt-3.5-turbo",
    max_new_tokens=4096,
    temperature=0.00001,  # very small value, presumably to keep sampling close to deterministic
)

### Test Custom Prompts

In [25]:
# Custom prompt(s) to evaluate. The template is written as adjacent string
# literals (one per rendered line) that concatenate to a single string;
# `{content}` is the placeholder for the question text of each example.
prompts = pb.Prompt([
    (
        'Instructions:\n\n'
        'Please solve the following math word problem, ensuring you adhere to all specified guidelines:\n\n'
        '- Read the problem carefully.\n'
        '- Break down the problem into logical steps.\n'
        '- Solve each step accurately before moving to the next.\n'
        '- Ensure your final answer is correct.\n'
        '- Adhere to the word limits and guidelines provided.\n\n'
        'Guidelines:\n\n'
        '- Your response must be clear and concise.\n'
        '- Your final answer must be presented at the end in the format: ##<your answer (arabic numerals)>.\n'
        '- Do not include any spaces or units in the final answer.\n'
        '- Avoid disclosing any personal data.\n'
        '- Ensure your response does not go beyond the scope of solving the math word problem.\n\n'
        'Content Topic: Math Word Problems\n\n'
        '{content}\n\n'
        'Please output your answer at the end as ##<your answer (arabic numerals)>.'
    ),
])

In [26]:
# Custom mapping function

In [27]:
from tqdm import tqdm

# Number of leading dataset examples to evaluate. Kept small because every
# example costs one model call; raise it for a more reliable accuracy estimate.
NUM_EVAL_SAMPLES = 20

# Final-answer marker requested by the prompt: "##" followed by an integer.
# Compared with the stricter r"##(\d+)", this also tolerates whitespace after
# the marker, a leading "$", and thousands separators, so "## $70,000" is read
# as 70000 instead of being missed or truncated to "70".
ANSWER_PATTERN = r"##\s*\$?(\d[\d,]*)"

for prompt in prompts:
    preds = []
    labels = []
    for data in tqdm(dataset[:NUM_EVAL_SAMPLES]):
        # process input: fill the prompt template with this example
        input_text = pb.InputProcess.basic_format(prompt, data)
        label = data['label']
        raw_pred = model(input_text)
        # process output: pull the final answer out of the free-form response
        pred = pb.OutputProcess.pattern_re(raw_pred, ANSWER_PATTERN)
        if isinstance(pred, str):
            # Labels are plain digit strings (e.g. '70000'), so drop separators
            # before the string comparison done by the accuracy metric.
            pred = pred.replace(",", "")
        print(f"Pred: {pred}, Label: {label}")
        preds.append(pred)
        labels.append(label)

    # evaluate: accuracy of extracted answers against the labels for this prompt
    score = pb.Eval.compute_cls_accuracy(preds, labels)
    print(f"{score:.3f}, {prompt}")

  5%|▌         | 1/20 [00:03<00:58,  3.08s/it]

Pred: 18, Label: 18


 10%|█         | 2/20 [00:05<00:46,  2.61s/it]

Pred: 3, Label: 3


 15%|█▌        | 3/20 [00:10<01:01,  3.63s/it]

Pred: 195000, Label: 70000


 20%|██        | 4/20 [00:12<00:51,  3.23s/it]

Pred: 540, Label: 540


 25%|██▌       | 5/20 [00:16<00:51,  3.46s/it]

Pred: 1, Label: 20


 30%|███       | 6/20 [00:19<00:45,  3.23s/it]

Pred: 64, Label: 64


 35%|███▌      | 7/20 [00:21<00:37,  2.89s/it]

Pred: 260, Label: 260


 40%|████      | 8/20 [00:23<00:30,  2.54s/it]

Pred: 120, Label: 160


 45%|████▌     | 9/20 [00:26<00:29,  2.69s/it]

Pred: 235, Label: 45


 50%|█████     | 10/20 [00:29<00:29,  2.92s/it]

Pred: 520, Label: 460


 55%|█████▌    | 11/20 [00:31<00:23,  2.65s/it]

Pred: 366, Label: 366


 60%|██████    | 12/20 [00:34<00:19,  2.47s/it]

Pred: 694, Label: 694


 65%|██████▌   | 13/20 [00:37<00:18,  2.63s/it]

Pred: 12, Label: 13


 70%|███████   | 14/20 [00:43<00:22,  3.68s/it]

Pred: 22, Label: 18


 75%|███████▌  | 15/20 [00:45<00:17,  3.44s/it]

Pred: 60, Label: 60


 80%|████████  | 16/20 [00:48<00:12,  3.03s/it]

Pred: 125, Label: 125


 85%|████████▌ | 17/20 [00:51<00:09,  3.19s/it]

Pred: 230, Label: 230


 90%|█████████ | 18/20 [00:54<00:05,  2.96s/it]

Pred: 57500, Label: 57500


 95%|█████████▌| 19/20 [00:56<00:02,  2.78s/it]

Pred: 7, Label: 7


100%|██████████| 20/20 [00:59<00:00,  2.99s/it]

Pred: 3, Label: 6
0.600, Instructions:

Please solve the following math word problem, ensuring you adhere to all specified guidelines:

- Read the problem carefully.
- Break down the problem into logical steps.
- Solve each step accurately before moving to the next.
- Ensure your final answer is correct.
- Adhere to the word limits and guidelines provided.

Guidelines:

- Your response must be clear and concise.
- Your final answer must be presented at the end in the format: ##<your answer (arabic numerals)>.
- Do not include any spaces or units in the final answer.
- Avoid disclosing any personal data.
- Ensure your response does not go beyond the scope of solving the math word problem.

Content Topic: Math Word Problems

{content}

Please output your answer at the end as ##<your answer (arabic numerals)>.





### Test Pre-defined Prompts

In [40]:
# Show which prompt-engineering methods exist and which datasets each supports.
print('All supported methods: ', pb.SUPPORTED_METHODS, sep='\n')
print('Supported datasets for each method: ', pb.METHOD_SUPPORT_DATASET, sep='\n')

# Load the pre-defined method to evaluate on the dataset chosen earlier.
method = pb.PEMethod(
    method='emotion_prompt',
    dataset=dataset_name,
    verbose=True,  # if True, print the detailed prompt and response
    prompt_id=1,  # for emotion_prompt
)

All supported methods: 
['CoT', 'ZSCoT', 'least_to_most', 'generated_knowledge', 'expert_prompting', 'emotion_prompt', 'baseline']
Supported datasets for each method: 
{'CoT': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking'], 'ZSCoT': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking'], 'expert_prompting': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking'], 'emotion_prompt': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking'], 'least_to_most': ['gsm8k', 'last_letter_concat'], 'generated_knowledge': ['csqa', 'numersense', 'qasc'], 'baseline': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking', 'last_letter_concat', 'numersense', 'qasc']}


In [41]:
# Run the pre-defined method against the dataset with the configured model.
# num_samples=2 keeps this a quick smoke test (the captured progress output
# stops after two examples).
results = method.test(dataset, model, num_samples=2)

# Display the returned score as the cell output.
results

  0%|          | 1/1319 [00:04<1:46:57,  4.87s/it]

Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
This is very important to my career.
Please output your answer at the end as ##<your answer (arabic numerals)>

To determine how much Janet makes daily from selling fresh duck eggs at the farmers' market, we can follow these steps:

1. **Calculate the total number of eggs laid per day:** Janet’s ducks lay 16 eggs per day.
   
2. **Calculate the number of eggs used for breakfast:** Janet eats 3 eggs for breakfast every morning.

3. **Calculate the number of eggs used for baking muffins:** Janet uses 4 eggs to bake muffins every day.

4. **Calculate the number of eggs left to sell:** Subtract the eggs used for breakfast and baking from the total number of eggs laid daily.
   
   \[
   \text{Eggs left to sell} = 

  0%|          | 2/1319 [00:08<1:29:12,  4.06s/it]

A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?
This is very important to my career.
Please output your answer at the end as ##<your answer (arabic numerals)>

To solve this problem, we need to find the total number of bolts of fiber required to make one robe. Here's the step-by-step process:

1. Determine the amount of blue fiber needed. According to the problem, each robe takes 2 bolts of blue fiber.

2. Determine the amount of white fiber needed. The problem states that this is half the amount of blue fiber. Therefore, since the blue fiber required is 2 bolts, the white fiber required would be:
   \[
   \frac{2}{2} = 1 \text{ bolt of white fiber}
   \]

3. Calculate the total number of bolts by adding the bolts of blue fiber and white fiber together:
   \[
   2 \text{ bolts of blue fiber} + 1 \text{ bolt of white fiber} = 3 \text{ bolts in total}
   \]

Thus, the answer is:
##3##





1.0