In [1]:
import sys

# Add the directory of promptbench to the Python path
sys.path.append('/Users/iwatson/Repos/promptbench')

# Now you can import promptbench by name
import promptbench as pb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# print all supported datasets in promptbench
print('All supported datasets: ')
for dataset in pb.SUPPORTED_DATASETS:
    print(f'  {dataset}')

All supported datasets: 
  sst2
  cola
  qqp
  mnli
  mnli_matched
  mnli_mismatched
  qnli
  wnli
  rte
  mrpc
  mmlu
  squad_v2
  un_multi
  iwslt2017
  math
  bool_logic
  valid_parentheses
  gsm8k
  csqa
  bigbench_date
  bigbench_object_tracking
  last_letter_concat
  numersense
  qasc
  bbh
  drop
  arc-easy
  arc-challenge


In [3]:
# print all supported models in promptbench
print('All supported models: ')
for model in pb.SUPPORTED_MODELS:
    print(f'  {model}')

All supported models: 
  google/flan-t5-large
  llama2-7b
  llama2-7b-chat
  llama2-13b
  llama2-13b-chat
  llama2-70b
  llama2-70b-chat
  phi-1.5
  phi-2
  palm
  gpt-3.5-turbo
  gpt-4
  gpt-4-1106-preview
  gpt-3.5-turbo-1106
  gpt-4-0125-preview
  gpt-3.5-turbo-0125
  gpt-4-turbo
  gpt-4o
  vicuna-7b
  vicuna-13b
  vicuna-13b-v1.3
  google/flan-ul2
  gemini-pro
  mistralai/Mistral-7B-v0.1
  mistralai/Mistral-7B-Instruct-v0.1
  mistralai/Mixtral-8x7B-v0.1
  mistralai/Mixtral-8x7B-Instruct-v0.1
  01-ai/Yi-6B
  01-ai/Yi-34B
  01-ai/Yi-6B-Chat
  01-ai/Yi-34B-Chat
  baichuan-inc/Baichuan2-7B-Base
  baichuan-inc/Baichuan2-13B-Base
  baichuan-inc/Baichuan2-7B-Chat
  baichuan-inc/Baichuan2-13B-Chat


In [13]:
dataset = pb.DatasetLoader.load_dataset("gsm8k")
dataset[:5]

Downloading readme: 100%|██████████| 7.94k/7.94k [00:00<00:00, 33.2MB/s]
Downloading data: 100%|██████████| 2.31M/2.31M [00:00<00:00, 3.24MB/s]
Downloading data: 100%|██████████| 419k/419k [00:00<00:00, 1.47MB/s]
Generating train split: 100%|██████████| 7473/7473 [00:00<00:00, 331500.48 examples/s]
Generating test split: 100%|██████████| 1319/1319 [00:00<00:00, 469235.54 examples/s]


[{'content': "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
  'label': '18'},
 {'content': 'A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?',
  'label': '3'},
 {'content': 'Josh decides to try flipping a house.  He buys a house for $80,000 and then puts in $50,000 in repairs.  This increased the value of the house by 150%.  How much profit did he make?',
  'label': '70000'},
 {'content': 'James decides to run 3 sprints 3 times a week.  He runs 60 meters each sprint.  How many total meters does he run a week?',
  'label': '540'},
 {'content': "Every day, Wendi feeds each of her chickens three cups of mixed chicken feed, containing seeds, mealworms and vegetables to help keep them healthy.  She giv

In [21]:
model = "gpt-4o"

In [22]:
model = pb.LLMModel(model=model, max_new_tokens=10, temperature=0.0)

In [23]:
prompts = pb.Prompt([
    """Solve the following grade school maths problem: {content}""",
    """'Solve the following grade school maths problem: {content}. Use only basic arithmetic operations (addition, subtraction, multiplication, or division). Ensure the problem is suitable for the specified grade level and can be solved mentally or with minimal written work. Provide the numerical answer only, without units, explanations, or additional formatting."""
])

In [24]:
def proj_func(pred):
    mapping = {
        "correct": 1,
        "incorrect": 0
    }
    return mapping.get(pred, -1)

In [25]:
from tqdm import tqdm
for prompt in prompts:
    preds = []
    labels = []
    for data in tqdm(dataset[:10]):
        # process input
        input_text = pb.InputProcess.basic_format(prompt, data)
        label = data['label']
        raw_pred = model(input_text)
        # process output
        pred = pb.OutputProcess.cls(raw_pred)
        preds.append(pred)
        labels.append(label)
    
    # evaluate
    score = pb.Eval.compute_cls_accuracy(preds, labels)
    print(f"{score:.3f}, {prompt}")

100%|██████████| 10/10 [00:11<00:00,  1.17s/it]


0.000, Solve the following grade school maths problem: {content}


100%|██████████| 10/10 [00:07<00:00,  1.33it/s]

0.700, 'Solve the following grade school maths problem: {content}. Use only basic arithmetic operations (addition, subtraction, multiplication, or division). Ensure the problem is suitable for the specified grade level and can be solved mentally or with minimal written work. Provide the numerical answer only, without units, explanations, or additional formatting.



