In [1]:
import json
import re
import statistics

from tqdm import tqdm

import promptbench as pb

  from .autonotebook import tqdm as notebook_tqdm


### Load Models and Dataset

In [2]:
# Print every dataset identifier exposed by pb.SUPPORTED_DATASETS.
# The loop variable is deliberately not called `dataset`: that name holds the
# loaded dataset in a later cell, and re-running this cell out of order would
# otherwise overwrite it with a plain string.
print('All supported datasets: ')
for supported_dataset in pb.SUPPORTED_DATASETS:
    print(f'  {supported_dataset}')

All supported datasets: 
  sst2
  cola
  qqp
  mnli
  mnli_matched
  mnli_mismatched
  qnli
  wnli
  rte
  mrpc
  mmlu
  squad_v2
  un_multi
  iwslt2017
  math
  bool_logic
  valid_parentheses
  gsm8k
  csqa
  bigbench_date
  bigbench_object_tracking
  last_letter_concat
  numersense
  qasc
  bbh
  drop
  arc-easy
  arc-challenge


In [3]:
# Print every model identifier exposed by pb.SUPPORTED_MODELS.
# The loop variable is deliberately not called `model`: that name holds the
# instantiated LLM in a later cell, and re-running this cell out of order would
# otherwise overwrite it with a plain string.
print('All supported models: ')
for supported_model in pb.SUPPORTED_MODELS:
    print(f'  {supported_model}')

All supported models: 
  google/flan-t5-large
  llama2-7b
  llama2-7b-chat
  llama2-13b
  llama2-13b-chat
  llama2-70b
  llama2-70b-chat
  phi-1.5
  phi-2
  palm
  gpt-3.5-turbo
  gpt-4
  gpt-4-1106-preview
  gpt-3.5-turbo-1106
  gpt-4-0125-preview
  gpt-3.5-turbo-0125
  gpt-4-turbo
  gpt-4o
  vicuna-7b
  vicuna-13b
  vicuna-13b-v1.3
  google/flan-ul2
  gemini-pro
  mistralai/Mistral-7B-v0.1
  mistralai/Mistral-7B-Instruct-v0.1
  mistralai/Mixtral-8x7B-v0.1
  mistralai/Mixtral-8x7B-Instruct-v0.1
  01-ai/Yi-6B
  01-ai/Yi-34B
  01-ai/Yi-6B-Chat
  01-ai/Yi-34B-Chat
  baichuan-inc/Baichuan2-7B-Base
  baichuan-inc/Baichuan2-13B-Base
  baichuan-inc/Baichuan2-7B-Chat
  baichuan-inc/Baichuan2-13B-Chat


In [4]:
# Dataset used throughout the notebook; `dataset_name` is also reused later to
# build output file names and to configure the pre-defined prompting method.
dataset_name = "sst2"
dataset = pb.DatasetLoader.load_dataset(dataset_name)

In [5]:
# Peek at the first five records to check the record layout.
print(dataset[:5])

[{'content': "it 's a charming and often affecting journey . ", 'label': 1}, {'content': 'unflinchingly bleak and desperate ', 'label': 0}, {'content': 'allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . ', 'label': 1}, {'content': "the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales . ", 'label': 1}, {'content': "it 's slow -- very , very slow . ", 'label': 0}]


In [6]:
import random

# Sampling configuration: a fixed seed so the same subset is drawn on every run.
SEED = 42
NUM_SAMPLES = 30

random.seed(SEED)

length = len(dataset)
# Cap the request at the dataset size; random.sample raises ValueError when asked
# for more items than the population contains.
sample_indices = random.sample(range(length), min(NUM_SAMPLES, length))
print(sample_indices)
# extract NUM_SAMPLES samples from the dataset
samples = [dataset[i] for i in sample_indices]

[654, 114, 25, 759, 281, 250, 228, 142, 754, 104, 692, 758, 558, 89, 604, 432, 32, 30, 95, 223, 238, 517, 616, 27, 574, 203, 733, 665, 718, 429]


In [7]:
# class LLMModel():
#     def __init__(self, model: str, max_new_tokens: int=20, temperature: float=0.0, device: str="cuda", dtype: str="auto", model_dir: str=None, system_prompt: str=None, api_key:str =None, **kwargs):
#         self.model_name = model
#         self.model = self._create_model(max_new_tokens, temperature, device, dtype, model_dir, system_prompt, api_key, **kwargs)

#     def _create_model(self, max_new_tokens, temperature, device, dtype, model_dir, system_prompt, api_key, **kwargs):
#         """Uses ChatOllama to create a model and returns it."""

#         llm = 

In [8]:
# Model under evaluation; `model` is called directly on formatted prompts later on.
# NOTE(review): temperature=1.0 presumably enables stochastic sampling, so accuracy
# may differ between runs — confirm this is intended for an evaluation.
model_name = "gpt-4o"
model = pb.LLMModel(model=model_name, max_new_tokens=4096, temperature=1.0)

### Test Custom Prompts

In [9]:
from gsm8k_prompts import GSM8KPrompts
from sst2_prompts import SST2Prompts

# Prompt variants for GSM8K, obtained from the project-local GSM8KPrompts helper.
gsm8k_prompts = GSM8KPrompts()

gsm8k_baseline_prompt = gsm8k_prompts.get_baseline_prompt()
gsm8k_emotive_prompt = gsm8k_prompts.get_emotive_prompt()
# Backward-compatible alias: the variable was originally misspelled `gmk9k_...`.
gmk9k_emotive_prompt = gsm8k_emotive_prompt
gsm8k_CoT_prompt = gsm8k_prompts.get_zero_shot_CoT_prompt()
# reflection_prompts = gsm8k_prompts.get_reflection_prompts()
gsm8k_authoritarian_prompts_0 = gsm8k_prompts.get_authoritarian_prompts_0()
gsm8k_authoritarian_prompts_1 = gsm8k_prompts.get_authoritarian_prompts_1()
gsm8k_market_prompts_0 = gsm8k_prompts.get_market_prompts_0()
gsm8k_market_prompts_1 = gsm8k_prompts.get_market_prompts_1()
gsm8k_hierarchical_prompts_0 = gsm8k_prompts.get_hierarchical_prompts_0()
gsm8k_hierarchical_prompts_1 = gsm8k_prompts.get_hierarchical_prompts_1()

# Prompt variants for SST-2, obtained from the project-local SST2Prompts helper.
# One variable per getter so a later cell can pick the set to evaluate.
sst2_prompts = SST2Prompts()

sst2_baseline_prompt = sst2_prompts.get_baseline_prompt()
sst2_emotive_prompt = sst2_prompts.get_emotive_prompt()
sst2_CoT_prompt = sst2_prompts.get_zero_shot_CoT_prompt()
# reflection_prompts = sst2_prompts.get_reflection_prompts()
sst2_authoritarian_prompts_0 = sst2_prompts.get_authoritarian_prompts_0()
sst2_authoritarian_prompts_1 = sst2_prompts.get_authoritarian_prompts_1()
sst2_market_prompts_0 = sst2_prompts.get_market_prompts_0()
sst2_market_prompts_1 = sst2_prompts.get_market_prompts_1()
sst2_hierarchical_prompts_0 = sst2_prompts.get_hierarchical_prompts_0()
sst2_hierarchical_prompts_1 = sst2_prompts.get_hierarchical_prompts_1()

In [27]:
# Prompt set under test; wrapped in pb.Prompt and iterated one prompt at a time below.
# Swap the argument to evaluate a different prompt family.
prompts = pb.Prompt(sst2_hierarchical_prompts_1)

In [28]:
# Show each prompt template that is about to be evaluated.
for prompt in prompts:
    print(prompt)

Classify the sentiment of the following text as either positive or negative, paying attention to emotive language and the overall emotional tone. Output your answer clearly as ##positive or ##negative without any spaces.

{content}
Classify the sentiment of the following text as either positive or negative: {content}
Please output your answer at the end as ##<your answer (No format restrictions)>
Please classify the sentiment of the following text as either positive or negative by breaking it down into aspects. Identify key aspects within the text, assess the sentiment of each aspect considering the emotional impact, emotive language, and lexical choices, and then determine the overall sentiment based on how these aspects contribute to the general tone. Here is an example of breaking down into aspects, evaluating emotional impact, and assessing:

Example:
Text: "The device has an excellent display, but the battery life is disappointing."
- Aspect 1: Display - Sentiment: Positive (excel

In [29]:
# File-name tags, looked up by prompt position (prompt_type[idx]) when results are
# written; keep this list at least as long as `prompts` and update it whenever the
# selected prompt family changes.
prompt_type = [
    "hierarchical_1_0",
    "hierarchical_1_1",
    "hierarchical_1_2",
    "hierarchical_1_3",
    "hierarchical_1_4",
]

In [30]:
# for idx, prompt in enumerate(prompts):
#     preds = []
#     labels = []
#     questions = []
#     for data in tqdm(samples):
#         # process input
#         input_text = pb.InputProcess.basic_format(prompt, data)
#         label = data['label']
#         raw_pred = model(input_text)
#         # process output
#         pred = pb.OutputProcess.pattern_re(raw_pred, r"##(\d+)")
#         # print(f"Pred: {pred}, Label: {label}")
#         questions.append(data['content'])
#         preds.append(pred)
#         labels.append(label)

#         output = list(zip(dataset, pred, label))

#         # Define the output file path
#         output_file_path = f"/Users/iwatson/Documents/Research Project/prompt-optimisation/src/outputs/{dataset_name}_{prompt_type[idx]}.jsonl"

#         # Save the paired list to a JSON file
#         with open(output_file_path, 'w', encoding='utf-8') as jsonl_file:
#             for pair in output:
#                 jsonl_file.write(json.dumps(pair) + '\n')
    
#     # evaluate
#     score = pb.Eval.compute_cls_accuracy(preds, labels)
#     print(f"{score:.3f}, {prompt}")

In [31]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
from tqdm import tqdm

def extract_and_map_sentiment(text):
    """Map the model's ``##<answer>`` marker to a numeric sentiment label.

    The response is scanned for ``##positive`` / ``##negative`` markers
    (case-insensitive). Non-word characters between ``##`` and the answer
    (spaces, ``*``, ``<`` ...) and trailing punctuation are tolerated, so
    ``## Positive``, ``##positive.`` and ``##**negative**`` are all recognised.
    When several markers are present the last one wins, because the prompts ask
    for the answer at the end of the response; earlier ``##`` sequences such as
    markdown headings or an echoed instruction are therefore ignored.

    Args:
        text: Raw model output.

    Returns:
        1 for positive, 0 for negative, and -1 when no valid marker is found
        or ``text`` is not a string.
    """
    if not isinstance(text, str):
        return -1

    answers = re.findall(r'##\W*(positive|negative)\b', text, flags=re.IGNORECASE)
    if not answers:
        return -1

    return 1 if answers[-1].lower() == 'positive' else 0

def process_data(data):
    """Run the model on a single sample and score its answer.

    Reads the notebook-level `prompt`, `model` and `pb` names, so it must be
    called while `prompt` is bound to the template being evaluated.

    Args:
        data: One dataset record providing the 'content' and 'label' keys.

    Returns:
        A (content, prediction, label) tuple, where prediction is the value
        produced by extract_and_map_sentiment for the model's raw response.
    """
    formatted_input = pb.InputProcess.basic_format(prompt, data)
    expected = data['label']
    prediction = extract_and_map_sentiment(model(formatted_input))
    return (data['content'], prediction, expected)

# NOTE(review): absolute, machine-specific output location kept from the original
# notebook; point this at a project-relative directory before sharing.
output_dir = "/Users/iwatson/Documents/Research Project/prompt-optimisation/src/outputs"

scores = []
for idx, prompt in enumerate(prompts):
    # Per-prompt results, appended in completion order; the three lists stay
    # aligned with each other because they are filled from the same future.
    questions = []
    preds = []
    labels = []

    # Query the model for all samples concurrently. The `with` block waits for
    # every worker before the loop moves on, so `prompt` is not rebound while
    # process_data is still reading it.
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_data = {executor.submit(process_data, data): data for data in samples}

        for future in tqdm(as_completed(future_to_data), total=len(future_to_data)):
            data_content, pred, label = future.result()
            questions.append(data_content)
            preds.append(pred)
            labels.append(label)

    # Pair every prediction with the text it was actually made for. Zipping
    # against `dataset` would attach the first len(preds) dataset rows instead
    # of the sampled (and completion-ordered) inputs.
    output = list(zip(questions, preds, labels))

    # One JSON Lines file per prompt, tagged with the dataset and prompt type.
    output_file_path = f"{output_dir}/{dataset_name}_{prompt_type[idx]}.jsonl"

    with open(output_file_path, 'w', encoding='utf-8') as jsonl_file:
        for record in output:
            jsonl_file.write(json.dumps(record) + '\n')

    score = pb.Eval.compute_cls_accuracy(preds, labels)
    scores.append(score)
    print(f"{score:.3f}")

# Summary across prompts. statistics.median averages the two middle values for an
# even number of scores; pstdev is the population standard deviation.
if scores:
    mean_score = statistics.mean(scores)
    median_score = statistics.median(scores)
    std_dev = statistics.pstdev(scores)
    print(f"Mean: {mean_score:.3f}, Median: {median_score:.3f}, Std. Dev.: {std_dev:.3f}")
else:
    print("No prompts were evaluated; no summary statistics to report.")


100%|██████████| 30/30 [00:02<00:00, 12.04it/s]


0.967


100%|██████████| 30/30 [00:05<00:00,  5.00it/s]


0.967


100%|██████████| 30/30 [00:12<00:00,  2.38it/s]


0.900


100%|██████████| 30/30 [00:48<00:00,  1.62s/it]


0.633


100%|██████████| 30/30 [00:02<00:00, 11.62it/s]

0.933
Mean: 0.880, Median: 0.933, Std. Dev.: 0.126





### Test Pre-defined Prompts

In [15]:
# load method
# print all methods and their supported datasets
print('All supported methods: ')
print(pb.SUPPORTED_METHODS)
print('Supported datasets for each method: ')
print(pb.METHOD_SUPPORT_DATASET)

method_name = 'baseline'

# Surface a dataset/method mismatch up front instead of letting it show up only
# as a meaningless score when the method is tested.
supported_datasets = pb.METHOD_SUPPORT_DATASET.get(method_name, [])
if dataset_name not in supported_datasets:
    print(
        f"WARNING: method '{method_name}' does not list dataset '{dataset_name}' "
        f"as supported (supported: {supported_datasets}); results may not be meaningful."
    )

method = pb.PEMethod(method=method_name,
                    dataset=dataset_name,
                    verbose=True,  # if True, print the detailed prompt and response
                    prompt_id = 1  # for emotion_prompt
                    )

All supported methods: 
['CoT', 'ZSCoT', 'least_to_most', 'generated_knowledge', 'expert_prompting', 'emotion_prompt', 'baseline']
Supported datasets for each method: 
{'CoT': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking'], 'ZSCoT': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking'], 'expert_prompting': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking'], 'emotion_prompt': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking'], 'least_to_most': ['gsm8k', 'last_letter_concat'], 'generated_knowledge': ['csqa', 'numersense', 'qasc'], 'baseline': ['gsm8k', 'csqa', 'bigbench_date', 'bigbench_object_tracking', 'last_letter_concat', 'numersense', 'qasc']}


In [17]:
# Quick smoke test: run the pre-defined method with num_samples=2, then display
# the value returned by method.test as the cell output.
results = method.test(dataset, 
                      model, 
                      num_samples=2,
                      )

results

  0%|          | 2/872 [00:07<51:43,  3.57s/it]


0.0