In [1]:
import os
import sys

# Location of the local promptbench checkout. Set the PROMPTBENCH_PATH
# environment variable to point at your own clone; the fallback is the path
# this notebook was originally written against.
PROMPTBENCH_PATH = os.environ.get('PROMPTBENCH_PATH', '/Users/iwatson/Repos/promptbench')

# Add the directory of promptbench to the Python path. The membership check
# keeps sys.path from accumulating duplicate entries when this cell is re-run.
if PROMPTBENCH_PATH not in sys.path:
    sys.path.append(PROMPTBENCH_PATH)

# Now you can import promptbench by name
import promptbench as pb

  from .autonotebook import tqdm as notebook_tqdm


### Load Models and Dataset

In [2]:
# print all supported datasets in promptbench
print('All supported datasets: ')
# The loop variable is intentionally NOT named `dataset`: a top-level loop
# variable stays in the notebook namespace, and `dataset` is the name bound to
# the loaded dataset later in this notebook. Re-running this cell out of order
# would otherwise replace that object with a plain string.
for supported_dataset in pb.SUPPORTED_DATASETS:
    print(f'  {supported_dataset}')

All supported datasets: 
  sst2
  cola
  qqp
  mnli
  mnli_matched
  mnli_mismatched
  qnli
  wnli
  rte
  mrpc
  mmlu
  squad_v2
  un_multi
  iwslt2017
  math
  bool_logic
  valid_parentheses
  gsm8k
  csqa
  bigbench_date
  bigbench_object_tracking
  last_letter_concat
  numersense
  qasc
  bbh
  drop
  arc-easy
  arc-challenge


In [3]:
# print all supported models in promptbench
print('All supported models: ')
# The loop variable is intentionally NOT named `model`: a top-level loop
# variable stays in the notebook namespace, and `model` is the name bound to
# the LLMModel instance later in this notebook. Re-running this cell out of
# order would otherwise replace that callable with a plain string.
for supported_model in pb.SUPPORTED_MODELS:
    print(f'  {supported_model}')

All supported models: 
  google/flan-t5-large
  llama2-7b
  llama2-7b-chat
  llama2-13b
  llama2-13b-chat
  llama2-70b
  llama2-70b-chat
  phi-1.5
  phi-2
  palm
  gpt-3.5-turbo
  gpt-4
  gpt-4-1106-preview
  gpt-3.5-turbo-1106
  gpt-4-0125-preview
  gpt-3.5-turbo-0125
  gpt-4-turbo
  gpt-4o
  vicuna-7b
  vicuna-13b
  vicuna-13b-v1.3
  google/flan-ul2
  gemini-pro
  mistralai/Mistral-7B-v0.1
  mistralai/Mistral-7B-Instruct-v0.1
  mistralai/Mixtral-8x7B-v0.1
  mistralai/Mixtral-8x7B-Instruct-v0.1
  01-ai/Yi-6B
  01-ai/Yi-34B
  01-ai/Yi-6B-Chat
  01-ai/Yi-34B-Chat
  baichuan-inc/Baichuan2-7B-Base
  baichuan-inc/Baichuan2-13B-Base
  baichuan-inc/Baichuan2-7B-Chat
  baichuan-inc/Baichuan2-13B-Chat


In [10]:
# Key of the dataset used for the rest of the notebook. It is one of the entries
# printed from pb.SUPPORTED_DATASETS, and is passed both to the dataset loader
# and to pb.PEMethod further down.
dataset_name = "bigbench_object_tracking"

In [13]:
# Load the dataset selected by `dataset_name` through promptbench's loader.
dataset = pb.DatasetLoader.load_dataset(dataset_name)
# Preview the first 20 records (the same slice the custom-prompt evaluation
# below iterates over). In the saved output each record is a dict with a
# 'content' question string and a 'label' answer letter.
dataset[:20]

[{'content': 'Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a orange ball, Bob has a white ball, and Claire has a blue ball. \n\nAs the game progresses, pairs of players trade balls. First, Alice and Bob swap balls. Then, Bob and Claire swap balls. Finally, Alice and Bob swap balls. At the end of the game, Alice has the \nWhich choice is true ? Answer Choices: (A) orange ball. (B) white ball. (C) blue ball.',
  'label': 'C'},
 {'content': 'Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a orange ball, Bob has a white ball, and Claire has a blue ball. \n\nAs the game progresses, pairs of players trade balls. First, Alice and Bob swap balls. Then, Bob and Claire swap balls. Finally, Alice and Bob swap balls. At the end of the game, Bob has the \nWhich choice is true ? Answer Choices: (A) orange ball. (B) white ball. (C) blue ball.',
  'label': 'B'},
 {'content': 'Al

In [14]:
# Identifier of the model to evaluate; it is one of the entries printed from
# pb.SUPPORTED_MODELS. Kept under its own name so that `model` only ever refers
# to the LLMModel object (previously the same name held the string first and
# the model object second).
model_name = "gpt-3.5-turbo"

# Wrap the model in promptbench's LLMModel. The resulting object is called
# directly on prompt text in the evaluation cells below.
model = pb.LLMModel(model=model_name, max_new_tokens=4096, temperature=0.0)

### Test Custom Prompts

In [21]:
# Five alternative prompt templates for the object-tracking task. Every template
# contains a {content} placeholder (presumably filled from each record's
# 'content' field by pb.InputProcess.basic_format -- confirm against promptbench)
# and asks the model to finish with its answer written as ##<letter>, which the
# evaluation loop below extracts with the regex r"##([A-C])".
prompts = pb.Prompt([
# Variant 1: question first, then a single-sentence format instruction.
'{content}. Please output your answer at the end as ##<your answer (among A through C)> .',
# Variant 2: task framing, question, then a format instruction that forbids
# spaces after the ##.
"""Please solve the following object tracking problem:
{content}
Provide your answer at the end exactly as ##<answer (among A through C)>, with no spaces between the ## and the answer.""",
# Variant 3: task framing and format instruction first, question last.
"Please solve the following object tracking problem and provide the answer at the end exactly as ##<your answer (among A through C)> with no spaces between the ## and your answer. {content}",
# Variant 4: long form with a motivation sentence, numbered steps, and
# correct/incorrect formatting examples.
"""Object tracking is essential for understanding movement and interactions in various contexts.

Please solve the following object tracking problem:

{content}

To ensure your answer is correctly formatted, follow these actions:
1. Analyze and solve the given object tracking problem.
2. Write your answer at the end of the solution exactly in the format: ##<answer (among A through C)>.
   - Ensure the format includes double hashes (##) immediately followed by the answer (A, B, or C).
   - This formatting is crucial for the system to recognize your answer.
3. Verify there are no spaces between the ## and the answer.

Examples:
Correct Format:
- If the correct answer is B, you should write it exactly as: ##B

Incorrect Format:
- If you write the answer as ## B or #B#, it will not be recognized correctly.""",
# Variant 5: question inline with the task framing, followed by the format
# instruction and an emphasised note about spacing.
"""Solve the following object tracking problem: {content}

Please output your answer at the end exactly as ##<answer (among A through C)>.
**Note:** Ensure there are no spaces between the ## and the answer. This formatting is crucial for the correct evaluation of responses.""",
])

In [22]:
# Custom mapping function

In [23]:
from tqdm import tqdm

# Number of leading dataset records scored for each prompt template.
NUM_EVAL_SAMPLES = 20
# Regex capturing the answer letter that the prompts ask the model to emit
# as ##<letter>.
ANSWER_PATTERN = r"##([A-C])"

# The evaluation subset is the same for every prompt, so slice it once.
eval_samples = dataset[:NUM_EVAL_SAMPLES]

# (score, prompt) pairs, kept so the variants can be compared or sorted after
# the run instead of only being printed.
prompt_scores = []

for prompt in prompts:
    preds = []
    labels = []
    for data in tqdm(eval_samples):
        # process input: fill the prompt template with this record
        input_text = pb.InputProcess.basic_format(prompt, data)
        raw_pred = model(input_text)
        # process output: extract the answer letter from the raw model response
        preds.append(pb.OutputProcess.pattern_re(raw_pred, ANSWER_PATTERN))
        labels.append(data['label'])

    # evaluate: classification accuracy of extracted answers against labels
    score = pb.Eval.compute_cls_accuracy(preds, labels)
    prompt_scores.append((score, prompt))
    print(f"{score:.3f}, {prompt}")

100%|██████████| 20/20 [00:18<00:00,  1.08it/s]


0.150, {content}. Please output your answer at the end as ##<your answer (among A through C)> .


100%|██████████| 20/20 [00:53<00:00,  2.67s/it]


0.600, Please solve the following object tracking problem:
{content}
Provide your answer at the end exactly as ##<answer (among A through C)>, with no spaces between the ## and the answer.


100%|██████████| 20/20 [00:49<00:00,  2.47s/it]


0.850, Please solve the following object tracking problem and provide the answer at the end exactly as ##<your answer (among A through C)> with no spaces between the ## and your answer. {content}


100%|██████████| 20/20 [01:01<00:00,  3.08s/it]


0.500, Object tracking is essential for understanding movement and interactions in various contexts.

Please solve the following object tracking problem:

{content}

To ensure your answer is correctly formatted, follow these actions:
1. Analyze and solve the given object tracking problem.
2. Write your answer at the end of the solution exactly in the format: ##<answer (among A through C)>.
   - Ensure the format includes double hashes (##) immediately followed by the answer (A, B, or C).
   - This formatting is crucial for the system to recognize your answer.
3. Verify there are no spaces between the ## and the answer.

Examples:
Correct Format:
- If the correct answer is B, you should write it exactly as: ##B

Incorrect Format:
- If you write the answer as ## B or #B#, it will not be recognized correctly.


100%|██████████| 20/20 [00:37<00:00,  1.90s/it]

0.300, Solve the following object tracking problem: {content}

Please output your answer at the end exactly as ##<answer (among A through C)>.
**Note:** Ensure there are no spaces between the ## and the answer. This formatting is crucial for the correct evaluation of responses.





### Test Pre-defined Prompts

In [15]:
# List the prompt-engineering methods promptbench provides, followed by the
# datasets each method supports. Each print emits the heading and the value on
# separate lines.
print('All supported methods: ', pb.SUPPORTED_METHODS, sep='\n')
print('Supported datasets for each method: ', pb.METHOD_SUPPORT_DATASET, sep='\n')

# Instantiate the emotion_prompt method for the dataset selected earlier via
# `dataset_name`.
method = pb.PEMethod(
    method='emotion_prompt',
    dataset=dataset_name,
    verbose=True,  # if True, print the detailed prompt and response
    prompt_id=1,  # for emotion_prompt
)

All supported methods: 
['CoT', 'ZSCoT', 'least_to_most', 'generated_knowledge', 'expert_prompting', 'emotion_prompt', 'baseline']
Supported datasets for each method: 
{'CoT': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking'], 'ZSCoT': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking'], 'expert_prompting': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking'], 'emotion_prompt': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking'], 'least_to_most': ['gsm8k', 'last_letter_concat'], 'generated_knowledge': ['csqa', 'numersense', 'qasc'], 'baseline': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking', 'last_letter_concat', 'numersense', 'qasc']}


In [16]:
# Run the pre-defined method against the loaded dataset with the configured
# model. num_samples=2 presumably limits the run to two samples (the saved
# output shows two prompt/response pairs) -- confirm against promptbench.
results = method.test(dataset, model, num_samples=2)

# Bare last expression so the notebook displays the returned value
# (0.5 in the saved output).
results

  0%|          | 1/750 [00:01<14:19,  1.15s/it]

Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a orange ball, Bob has a white ball, and Claire has a blue ball. 

As the game progresses, pairs of players trade balls. First, Alice and Bob swap balls. Then, Bob and Claire swap balls. Finally, Alice and Bob swap balls. At the end of the game, Alice has the 
Which choice is true ? Answer Choices: (A) orange ball. (B) white ball. (C) blue ball.
This is very important to my career.
Please output your answer at the end as ##<your answer (among A through C)>

At the end of the game, Alice will have the white ball. 

##B


  0%|          | 2/750 [00:03<21:50,  1.75s/it]

Alice, Bob, and Claire are playing a game. At the start of the game, they are each holding a ball: Alice has a orange ball, Bob has a white ball, and Claire has a blue ball. 

As the game progresses, pairs of players trade balls. First, Alice and Bob swap balls. Then, Bob and Claire swap balls. Finally, Alice and Bob swap balls. At the end of the game, Bob has the 
Which choice is true ? Answer Choices: (A) orange ball. (B) white ball. (C) blue ball.
This is very important to my career.
Please output your answer at the end as ##<your answer (among A through C)>

Let's track the ball swaps:

1. Alice has the orange ball, Bob has the white ball, and Claire has the blue ball.
2. Alice and Bob swap balls: Alice has the white ball, Bob has the orange ball, and Claire has the blue ball.
3. Bob and Claire swap balls: Alice has the white ball, Bob has the blue ball, and Claire has the orange ball.
4. Alice and Bob swap balls again: Alice has the blue ball, Bob has the white ball, and Claire ha




0.5