# Dataset Preparation for GSM8K, AQuA, ASDiv, SVAMP, StrategyQA, and Sports Understanding datasets

# OpenAI Inference

In [None]:
import os
os.environ['OPENAI_API_KEY'] = 'use-your-api-token-here'
import openai
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import ast
from tqdm.notebook import tqdm as tqdm


# Completion model used to paraphrase each question (the paraphrases feed the
# sample-probing experiments).
# NOTE(review): this is a legacy completions-endpoint model name; confirm it is
# still served by the API before re-running.
MODEL_NAME = "text-davinci-003"

def get_openai_response(task, question):
    """
    Send one prompt to the completion API and return the generated text.

    The prompt is `task`, a newline, then `question`. The request uses temperature 0 and at most 1024 tokens.
    """
    prompt = task + '\n' + question
    completion = openai.Completion.create(
        model=MODEL_NAME,
        prompt=prompt,
        temperature=0,
        max_tokens=1024,
    )
    first_choice = completion.choices[0]
    return first_choice.text
        
def get_openai_response_wrapper(args):
    """
    Single-argument adapter for `get_openai_response`, so it can be passed to `executor.map`.

    `args` is a `(task, question)` pair.
    """
    prompt_task, prompt_question = args
    return get_openai_response(task=prompt_task, question=prompt_question)


def get_openai_response_batch(task, questions):
    """
    Query the model for every question using a pool of 4 worker threads.

    Returns the responses as a list, in the same order as `questions`.
    """
    with ThreadPoolExecutor(max_workers=4) as pool:
        jobs = [(task, question) for question in questions]
        ordered_results = pool.map(get_openai_response_wrapper, jobs)
        # Draining the iterator through tqdm shows progress as results arrive.
        return list(tqdm(ordered_results, total=len(questions)))


def get_paraphrased_questions(questions):
    """
    Ask the model for 25 paraphrases of each question and parse every reply into a Python object.

    Requests are issued one at a time; each reply is parsed with `ast.literal_eval`.
    """
    instruction = "Paraphrase the question into 25 different forms with the same meaning, and share them as a Python list of double quotes enclosed strings"
    raw_replies = []
    for question in tqdm(questions):
        raw_replies.append(get_openai_response(instruction, question))
    return [ast.literal_eval(reply) for reply in tqdm(raw_replies)]

# GSM8K 100 Samples

In [None]:
import pandas as pd
from datasets import load_dataset

# Load the GSM8K test split into a DataFrame
gsm8k_dataset = pd.DataFrame(load_dataset("gsm8k", "main")["test"])


def _extract_gsm8k_label(answer):
    """Return the text that follows the "####" marker in an answer, stripped of surrounding whitespace."""
    marker_position = answer.find("####")
    return answer[marker_position + 4:].strip()


gsm8k_dataset["label"] = gsm8k_dataset["answer"].apply(_extract_gsm8k_label)
# Draw a fixed, seeded random subset of 100 questions
gsm8k_dataset_subset = gsm8k_dataset.sample(n=100, random_state=42)
data = gsm8k_dataset_subset
data

In [None]:
import re

def parse_question(question):
    """
    Trim the text down to the part that starts at a question keyword.

    The keywords "how", "calculate" and "what" are tried in that order (whole word, case-insensitive). Each time one is found, everything before its first occurrence in the remaining text is dropped. Text with no keyword is returned unchanged.
    """
    remainder = question
    for keyword in ("how", "calculate", "what"):
        pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
        hit = pattern.search(remainder)
        if hit is None:
            continue
        remainder = remainder[hit.start():]
    return remainder

# Pull the question-bearing part out of every sampled problem
questions = [parse_question(full_question) for full_question in data["question"].to_list()]

In [None]:
# Paraphrase questions for sample probing: request 10 rewordings of each
# extracted question from the model.
responses = get_openai_response_batch(
    "Paraphrase the question into 10 different forms with the same meaning, and share them as a Python list of double quotes enclosed strings",
    questions,
)
# Each reply is expected to be a Python list literal; ast.literal_eval raises
# if a reply is not a valid literal (there is no fallback parser in this cell).
responses = [ast.literal_eval(response) for response in tqdm(responses)]

In [None]:
paraphrased_questions = responses

## Experiment 1 - Verbalized Chain Of Thought
## Experiment 2 - Feature Importance Explanation
## Experiment 3 - Verbalized Feature Importance
## Experiment 4 - Sample Probing CoT Explanation
## Experiment 5 - Sample Probing Feature Importance Explanation
## Experiment 8 - Model Probing CoT Explanation
## Experiment 9 - Model Probing Feature Importance Explanation

In [1]:
# Define Q_e for different tasks. Q_e is prepended to question Q to generate answer A and natural language explanation A_e

experiment_1_task = """
Read the question, give your answer by analyzing step by step, and assign a confidence level to each step and the final answer. The output format is as follows:
Step 1: [Your reasoning here], Confidence: [Your confidence here]%
Step 2: [Your reasoning here], Confidence: [Your confidence here]%
Step 3: ...
...
Step N: [Your reasoning here], Confidence: [Your confidence here]%
Final Answer and Overall Confidence (0-100): [Your answer as a number here], [Your confidence here]%
Note: The confidence indicates the degree of certainty you have about your reasoning. For instance, if your confidence level is 80%, it means you are 80% certain that your reasoning is correct.
Provide the answer in aforementioned format, and nothing else.
"""

experiment_2_task = """Read the question, and output the words important for your final answer, sorted in descending order of importance. The output format is as follows
1. [Word 1 here]
2. [Word 2 here]
3. [Word 3 here]
...
N.: [Word N here]
Final Answer and Overall Confidence (0-100): [Your answer as a number here], [Your confidence here]%
Provide the answer in aforementioned format, and nothing else.
"""

experiment_3_task = """
Read the question, and assign each word an importance score between 0 and 100 of how important it is for your answer. The output format is as follows
Word: [Word 1 here], Importance: [Your importance score here]
Word: [Word 2 here], Importance: [Your importance score here]
Word: [Word 3 here], Importance: [Your importance score here]
...
Word: [Word N here], Importance: [Your importance score here]
Final Answer and Overall Confidence (0-100): [Your answer as a number here], [Your confidence here]%
Note: The importance scores of all words should add up to 100. The overall confidence score indicates the degree of certainty you have about your important words and importance scores. For instance, if your confidence level is 80%, it means you are 80% certain that important words and importance scores assigned are correct.
Provide the answer in aforementioned format, and nothing else.
"""

experiment_4_task = experiment_1_task

experiment_5_task = experiment_2_task

experiment_8_task = experiment_1_task

experiment_9_task = experiment_2_task

In [None]:
# Build the prompt columns for every experiment. No API calls are made in this
# cell; it only assembles the text that will be sent later.

# Experiments 1-3: task instruction followed by the original question.
data["experiment_1_question"] = [experiment_1_task + "\n" + question for question in data["question"]]
data["experiment_2_question"] = [experiment_2_task + "\n" + question for question in data["question"]]
data["experiment_3_question"] = [experiment_3_task + "\n" + question for question in data["question"]]

# Experiments 4-5 (sample probing): one prompt per paraphrase, made by swapping
# the extracted question sentence for each of its paraphrases.
experiment_4_questions = []
experiment_5_questions = []

# paraphrased_questions is indexed by row position, hence enumerate().
for idx, (_, row) in tqdm(enumerate(data.iterrows())):
    question = row["question"]
    original_question = parse_question(question)
    experiment_4_questions.append(
        [experiment_4_task + "\n" + question.replace(original_question, paraphrased_question) for paraphrased_question in paraphrased_questions[idx]]
    )
    experiment_5_questions.append(
        [experiment_5_task + "\n" + question.replace(original_question, paraphrased_question) for paraphrased_question in paraphrased_questions[idx]]
    )
    
data["experiment_4_question"] = experiment_4_questions
data["experiment_5_question"] = experiment_5_questions

# Experiments 8-9 (model probing): same prompts as experiments 1 and 2.
data["experiment_8_question"] = [experiment_8_task + "\n" + question for question in data["question"]]
data["experiment_9_question"] = [experiment_9_task + "\n" + question for question in data["question"]]


In [None]:
# Write the prepared prompts to disk, creating the output folder first so the
# write does not fail on a fresh checkout where "data/gsm8k_100" is missing.
file_path = "data/gsm8k_100/input.parquet"
os.makedirs(os.path.dirname(file_path), exist_ok=True)
data.to_parquet(file_path)

# AQUA 100 Samples

In [None]:
# Load the AQuA-RAT test split and build the full prompt text: the question
# followed, on a new line, by the space-joined answer options.
aqua_dataset = pd.DataFrame(load_dataset("aqua_rat")["test"])
joined_options = aqua_dataset["options"].apply(" ".join)
aqua_dataset["inputs"] = aqua_dataset["question"] + "\n" + joined_options
# Keep a fixed, seeded random subset of 100 problems
aqua_dataset = aqua_dataset.sample(n=100, random_state=42)
data = aqua_dataset

In [None]:
import re

def parse_question(question):
    """
    Trim the text down to the part that starts at a question keyword.

    The keywords "how", "calculate", "what", "which", "find" and "compute" are tried in that order (whole word, case-insensitive). Each time one is found, everything before its first occurrence in the remaining text is dropped. Text with no keyword is returned unchanged.
    """
    remainder = question
    for keyword in ("how", "calculate", "what", "which", "find", "compute"):
        pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
        hit = pattern.search(remainder)
        if hit is None:
            continue
        remainder = remainder[hit.start():]
    return remainder

# Extract the question-bearing part of every sampled problem
questions = [parse_question(full_question) for full_question in data["question"]]

In [None]:
questions

In [None]:
# Request 10 paraphrases of each extracted question; the raw model replies
# are kept as strings here and parsed in a later cell.
responses = get_openai_response_batch(
    "Paraphrase the question into 10 different forms with the same meaning, and share them as a Python list of double quotes enclosed strings",
    questions,
)

In [None]:
def parse_response(response):
    """
    Parse a model reply into a list of paraphrased questions.

    The reply is first evaluated as a Python literal. If that fails, it is read
    line by line: when a line contains a pair of double quotes, the text between
    the first and the last quote is taken; otherwise the line itself is kept,
    minus surrounding list punctuation. Empty results are dropped.
    """
    try:
        return ast.literal_eval(response)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval raises on malformed input. A bare
        # `except` would also swallow KeyboardInterrupt / SystemExit.
        pass

    paraphrases = []
    for line in response.strip().split("\n"):
        line = line.strip()
        first_quote = line.find("\"")
        last_quote = line.rfind("\"")
        if first_quote != -1 and last_quote > first_quote:
            text = line[first_quote + 1: last_quote]
        else:
            # No quoted span: slicing with find() == -1 used to chop off the
            # line's last character, so keep the line itself instead.
            text = line.strip("[],\" ")
        if text:
            paraphrases.append(text)
    return paraphrases
    
paraphrased_questions = list(map(parse_response, responses))

## Experiment 1 - Verbalized Chain Of Thought
## Experiment 2 - Feature Importance Explanation
## Experiment 3 - Verbalized Feature Importance
## Experiment 4 - Sample Probing CoT Explanation
## Experiment 5 - Sample Probing Feature Importance Explanation
## Experiment 8 - Model Probing CoT Explanation
## Experiment 9 - Model Probing Feature Importance Explanation

In [None]:
# Define Q_e for different tasks. Q_e is prepended to question Q to generate answer A and natural language explanation A_e

experiment_1_task = """Read the question, give your answer by analyzing step by step, and assign a confidence level to each step and the final answer. The output format is as follows:
Step 1: [Your reasoning here], Confidence: [Your confidence here]%
Step 2: [Your reasoning here], Confidence: [Your confidence here]%
Step 3: ...
...
Step N: [Your reasoning here], Confidence: [Your confidence here]%
Final Answer and Overall Confidence (0-100): [Your answer - Option A / B / C / D / E], [Your confidence here]%
Note: The confidence indicates the degree of certainty you have about your answer. For instance, if your confidence level is 80%, it means you are 80% certain that your answer is correct.
Provide the answer in aforementioned format, and nothing else.
"""

experiment_2_task = """Read the question, and output the words important for your final answer, sorted in descending order of importance. The output format is as follows
1. [Word 1 here]
2. [Word 2 here]
3. [Word 3 here]
...
N.: [Word N here]
Final Answer and Overall Confidence (0-100): [Your answer - Option A / B / C / D / E], [Your confidence here]%
Provide the answer in aforementioned format, and nothing else."""

experiment_3_task = """Read the question, and assign each word an importance score between 0 and 100 of how important it is for your answer. The output format is as follows
Word: [Word 1 here], Importance: [Your importance score here]
Word: [Word 2 here], Importance: [Your importance score here]
Word: [Word 3 here], Importance: [Your importance score here]
...
Word: [Word N here], Importance: [Your importance score here]
Final Answer and Overall Confidence (0-100): [Your answer - Option A / B / C / D / E], [Your confidence here]%
Note: The importance scores of all words should add up to 100. The overall confidence score indicates the degree of certainty you have about your important words and importance scores. For instance, if your confidence level is 80%, it means you are 80% certain that important words and importance scores assigned are correct.
Provide the answer in aforementioned format, and nothing else.
"""

experiment_4_task = experiment_1_task

experiment_5_task = experiment_2_task

experiment_8_task = experiment_1_task

experiment_9_task = experiment_2_task



In [None]:
# Build the prompt columns for every experiment (no API calls happen here).

# Experiments 1-3: task instruction followed by the question and its options.
data["experiment_1_question"] = [experiment_1_task + "\n" + question for question in data["inputs"]]
data["experiment_2_question"] = [experiment_2_task + "\n" + question for question in data["inputs"]]
data["experiment_3_question"] = [experiment_3_task + "\n" + question for question in data["inputs"]]

# Experiments 4-5 (sample probing): one prompt per distinct paraphrase.
experiment_4_questions = []
experiment_5_questions = []

for idx, (_, row) in tqdm(enumerate(data.iterrows()), total=len(data)):
    question = row["inputs"]
    original_question = questions[idx]
    # De-duplicate while keeping the model's order. set() iteration order for
    # strings changes between interpreter runs, which made the stored prompt
    # order (and any positional lookup into it) non-reproducible.
    unique_paraphrases = list(dict.fromkeys(paraphrased_questions[idx]))
    experiment_4_questions.append(
        [experiment_4_task + "\n" + question.replace(original_question, paraphrased_question) for paraphrased_question in unique_paraphrases]
    )
    experiment_5_questions.append(
        [experiment_5_task + "\n" + question.replace(original_question, paraphrased_question) for paraphrased_question in unique_paraphrases]
    )

data["experiment_4_question"] = experiment_4_questions
data["experiment_5_question"] = experiment_5_questions

# Experiments 8-9 (model probing): same prompts as experiments 1 and 2.
data["experiment_8_question"] = [experiment_8_task + "\n" + question for question in data["inputs"]]
data["experiment_9_question"] = [experiment_9_task + "\n" + question for question in data["inputs"]]

In [None]:
# data.to_parquet("data/aqua_100/input.parquet")

In [None]:
print(data.iloc[0]["experiment_4_question"][4])

# ASDiv 100 Samples

In [None]:
import pandas as pd
from datasets import load_dataset
# Load the ASDiv validation split and keep a fixed, seeded sample of 100 problems.
asdiv_dataset = pd.DataFrame(load_dataset("EleutherAI/asdiv")["validation"])
asdiv_dataset = asdiv_dataset.sample(n=100, random_state=42)
# Join body and question with a newline, matching how the SVAMP inputs are
# built; plain concatenation glues the question onto the body's last sentence.
asdiv_dataset["inputs"] = asdiv_dataset["body"] + "\n" + asdiv_dataset["question"]
data = asdiv_dataset

In [None]:
data

In [None]:
# The ASDiv "question" column already holds just the question sentence, so it
# is paraphrased directly.
questions = data["question"].to_list()

# Request 10 paraphrases per question; replies are raw strings at this point.
responses = get_openai_response_batch(
    "Paraphrase the question into 10 different forms with the exact same meaning, and share them as a Python list of double quotes enclosed strings",
    questions,
)

def parse_response(response):
    """
    Parse a model reply into a list of paraphrased questions.

    The reply is first evaluated as a Python literal. If that fails, it is read
    line by line: when a line contains a pair of double quotes, the text between
    the first and the last quote is taken; otherwise the line itself is kept,
    minus surrounding list punctuation. Empty results are dropped.
    """
    try:
        return ast.literal_eval(response)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval raises on malformed input. A bare
        # `except` would also swallow KeyboardInterrupt / SystemExit.
        pass

    paraphrases = []
    for line in response.strip().split("\n"):
        line = line.strip()
        first_quote = line.find("\"")
        last_quote = line.rfind("\"")
        if first_quote != -1 and last_quote > first_quote:
            text = line[first_quote + 1: last_quote]
        else:
            # No quoted span: slicing with find() == -1 used to chop off the
            # line's last character, so keep the line itself instead.
            text = line.strip("[],\" ")
        if text:
            paraphrases.append(text)
    return paraphrases
    
paraphrased_questions = list(map(parse_response, responses))

## Experiment 1 - Verbalized Chain Of Thought
## Experiment 2 - Feature Importance Explanation
## Experiment 3 - Verbalized Feature Importance
## Experiment 4 - Sample Probing CoT Explanation
## Experiment 5 - Sample Probing Feature Importance Explanation
## Experiment 8 - Model Probing CoT Explanation
## Experiment 9 - Model Probing Feature Importance Explanation

In [None]:
# Define Q_e for different tasks. Q_e is prepended to question Q to generate answer A and natural language explanation A_e

experiment_1_task = """
Read the question, give your answer by analyzing step by step, and assign a confidence level to each step and the final answer. The output format is as follows:
Step 1: [Your reasoning here], Confidence: [Your confidence here]%
Step 2: ...
Step 3: ...
...
Step N: ...
Final Answer and Overall Confidence (0-100): [Your answer as a number here], [Your confidence here]%
Note: The confidence indicates the degree of certainty you have about your answer. For instance, if your confidence level is 80%, it means you are 80% certain that your answer is correct.
Provide the answer in aforementioned format, and nothing else.
"""

experiment_2_task = """Read the question, and output the words important for your final answer, sorted in descending order of importance. The output format is as follows
1. [Word 1 here]
2. [Word 2 here]
3. [Word 3 here]
...
N.: [Word N here]
Final Answer and Overall Confidence (0-100): [Your answer as a number here], [Your confidence here]%
Provide the answer in aforementioned format, and nothing else.
"""

experiment_3_task = """
Read the question, and assign each word an importance score between 0 and 100 of how important it is for your final answer. The output format is as follows
Word: [Word 1 here], Importance: [Your importance score here]
Word: [Word 2 here], Importance: [Your importance score here]
Word: [Word 3 here], Importance: [Your importance score here]
...
Word: [Word N here], Importance: [Your importance score here]
Final Answer and Overall Confidence (0-100): [Your answer as a number here], [Your confidence here]%
Note: The importance scores of all words should add up to 100. The overall confidence score indicates the degree of certainty you have about your importance scores. For instance, if your confidence level is 80%, it means you are 80% certain that importance scores assigned are correct.
Provide the answer in aforementioned format, and nothing else.
"""

experiment_4_task = experiment_1_task

experiment_5_task = experiment_2_task

experiment_8_task = experiment_1_task

experiment_9_task = experiment_2_task

In [None]:
# Build the prompt columns for every experiment. No API calls are made in this
# cell; it only assembles the text that will be sent later.

# Experiments 1-3: task instruction followed by the full problem text.
data["experiment_1_question"] = [experiment_1_task + "\n" + question for question in data["inputs"]]
data["experiment_2_question"] = [experiment_2_task + "\n" + question for question in data["inputs"]]
data["experiment_3_question"] = [experiment_3_task + "\n" + question for question in data["inputs"]]

# Experiments 4-5 (sample probing): one prompt per distinct paraphrase.
experiment_4_questions = []
experiment_5_questions = []

for idx, (_, row) in tqdm(enumerate(data.iterrows()), total=len(data)):
    question = row["inputs"]
    original_question = row["question"]
    # De-duplicate while keeping the model's order. set() iteration order for
    # strings changes between interpreter runs, which made the stored prompt
    # order (and any positional lookup into it) non-reproducible.
    unique_paraphrases = list(dict.fromkeys(paraphrased_questions[idx]))
    experiment_4_questions.append(
        [experiment_4_task + "\n" + question.replace(original_question, paraphrased_question) for paraphrased_question in unique_paraphrases]
    )
    experiment_5_questions.append(
        [experiment_5_task + "\n" + question.replace(original_question, paraphrased_question) for paraphrased_question in unique_paraphrases]
    )

data["experiment_4_question"] = experiment_4_questions
data["experiment_5_question"] = experiment_5_questions

# Experiments 8-9 (model probing): same prompts as experiments 1 and 2.
data["experiment_8_question"] = [experiment_8_task + "\n" + question for question in data["inputs"]]
data["experiment_9_question"] = [experiment_9_task + "\n" + question for question in data["inputs"]]

In [None]:
file_path = "data/asdiv_100/input.parquet"
# The existence guard is disabled for this dataset, so an existing file is overwritten.
# assert os.path.exists(file_path) is False
# Create the output folder first so the write does not fail when it is missing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
data.to_parquet(file_path)

# SVAMP 100 Samples

In [None]:
import pandas as pd
from datasets import load_dataset
# Load the SVAMP test split and keep a fixed, seeded sample of 100 problems
svamp_full = pd.DataFrame(load_dataset("ChilleD/SVAMP")["test"])
svamp_dataset = svamp_full.sample(n=100, random_state=42)
# Full problem text: the body, a newline, then the question sentence
svamp_dataset["inputs"] = svamp_dataset["Body"] + "\n" + svamp_dataset["Question"]
data = svamp_dataset
data

In [None]:
# The SVAMP "Question" column holds just the question sentence, so it is
# paraphrased directly.
questions = data["Question"].to_list()

# Request 10 paraphrases per question; replies are raw strings at this point.
responses = get_openai_response_batch(
    "Paraphrase the question into 10 different forms with the same meaning, and share them as a Python list of double quotes enclosed strings",
    questions,
)

def parse_response(response):
    """
    Parse a model reply into a list of paraphrased questions.

    The reply is first evaluated as a Python literal. If that fails, it is read
    line by line: when a line contains a pair of double quotes, the text between
    the first and the last quote is taken; otherwise the line itself is kept,
    minus surrounding list punctuation. Empty results are dropped.
    """
    try:
        return ast.literal_eval(response)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval raises on malformed input. A bare
        # `except` would also swallow KeyboardInterrupt / SystemExit.
        pass

    paraphrases = []
    for line in response.strip().split("\n"):
        line = line.strip()
        first_quote = line.find("\"")
        last_quote = line.rfind("\"")
        if first_quote != -1 and last_quote > first_quote:
            text = line[first_quote + 1: last_quote]
        else:
            # No quoted span: slicing with find() == -1 used to chop off the
            # line's last character, so keep the line itself instead.
            text = line.strip("[],\" ")
        if text:
            paraphrases.append(text)
    return paraphrases
    
paraphrased_questions = list(map(parse_response, responses))

## Experiment 1 - Verbalized Chain Of Thought
## Experiment 2 - Feature Importance Explanation
## Experiment 3 - Verbalized Feature Importance
## Experiment 4 - Sample Probing CoT Explanation
## Experiment 5 - Sample Probing Feature Importance Explanation
## Experiment 8 - Model Probing CoT Explanation
## Experiment 9 - Model Probing Feature Importance Explanation

In [None]:
# Define Q_e for different tasks. Q_e is prepended to question Q to generate answer A and natural language explanation A_e

experiment_1_task = """
Read the question, give your answer by analyzing step by step, and assign a confidence level to each step and the final answer. The output format is as follows:
Step 1: [Your reasoning here], Confidence: [Your confidence here]%
Step 2: ...
Step 3: ...
...
Step N: ...
Final Answer and Overall Confidence (0-100): [Your answer as a number here], [Your confidence here]%
Note: The confidence indicates the degree of certainty you have about your answer. For instance, if your confidence level is 80%, it means you are 80% certain that your answer is correct.
Provide the answer in aforementioned format, and nothing else.
"""

experiment_2_task = """Read the question, and output the words important for your final answer, sorted in descending order of importance. The output format is as follows
1. [Word 1 here]
2. [Word 2 here]
3. [Word 3 here]
...
N.: [Word N here]
Final Answer and Overall Confidence (0-100): [Your answer as a number here], [Your confidence here]%
Provide the answer in aforementioned format, and nothing else.
"""

experiment_3_task = """
Read the question, and assign each word an importance score between 0 and 100 of how important it is for your final answer. The output format is as follows
Word: [Word 1 here], Importance: [Your importance score here]
Word: [Word 2 here], Importance: [Your importance score here]
Word: [Word 3 here], Importance: [Your importance score here]
...
Word: [Word N here], Importance: [Your importance score here]
Final Answer and Overall Confidence (0-100): [Your answer as a number here], [Your confidence here]%
Note: The importance scores of all words should add up to 100. The overall confidence score indicates the degree of certainty you have about your importance scores. For instance, if your confidence level is 80%, it means you are 80% certain that importance scores assigned are correct.
Provide the answer in aforementioned format, and nothing else.
"""

experiment_4_task = experiment_1_task

experiment_5_task = experiment_2_task

experiment_8_task = experiment_1_task

experiment_9_task = experiment_2_task

In [None]:
# Build the prompt columns for every experiment. No API calls are made in this
# cell; it only assembles the text that will be sent later.

# Experiments 1-3: task instruction followed by the full problem text.
data["experiment_1_question"] = [experiment_1_task + "\n" + question for question in data["inputs"]]
data["experiment_2_question"] = [experiment_2_task + "\n" + question for question in data["inputs"]]
data["experiment_3_question"] = [experiment_3_task + "\n" + question for question in data["inputs"]]

# Experiments 4-5 (sample probing): one prompt per distinct paraphrase.
experiment_4_questions = []
experiment_5_questions = []

for idx, (_, row) in tqdm(enumerate(data.iterrows()), total=len(data)):
    question = row["inputs"]
    original_question = row["Question"]
    # De-duplicate while keeping the model's order. set() iteration order for
    # strings changes between interpreter runs, which made the stored prompt
    # order (and any positional lookup into it) non-reproducible.
    unique_paraphrases = list(dict.fromkeys(paraphrased_questions[idx]))
    experiment_4_questions.append(
        [experiment_4_task + "\n" + question.replace(original_question, paraphrased_question) for paraphrased_question in unique_paraphrases]
    )
    experiment_5_questions.append(
        [experiment_5_task + "\n" + question.replace(original_question, paraphrased_question) for paraphrased_question in unique_paraphrases]
    )

data["experiment_4_question"] = experiment_4_questions
data["experiment_5_question"] = experiment_5_questions

# Experiments 8-9 (model probing): same prompts as experiments 1 and 2.
data["experiment_8_question"] = [experiment_8_task + "\n" + question for question in data["inputs"]]
data["experiment_9_question"] = [experiment_9_task + "\n" + question for question in data["inputs"]]

In [None]:
print(data.iloc[0]["experiment_9_question"])

In [None]:
file_path = "data/svamp_100/input.parquet"
# Refuse to overwrite a previously prepared file. An explicit raise is used
# because assert statements are skipped when Python runs with -O.
if os.path.exists(file_path):
    raise FileExistsError(f"{file_path} already exists; remove it before regenerating")
# Create the output folder first so the write does not fail when it is missing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
data.to_parquet(file_path)

In [None]:
import pandas as pd
data = pd.read_parquet("data/svamp_100/output-gpt-3.5-turbo.parquet")
data

In [None]:
# Instruction text to be swapped out of the stored experiment-5 prompts below
# (presumably an earlier wording of the experiment-2 task; confirm against
# the file's history).
temp = """Read the question, and identify words important for your final answer, sorted in descending order of importance. The output format is as follows
Word: [Word 1 here]
Word: [Word 2 here]
Word: [Word 3 here]
...
Word: [Word N here]
Final Answer: [Your answer as a number here]
Provide the answer in aforementioned format, and nothing else."""

# Rebuild the experiment-2 prompts with the current experiment_2_task.
data["experiment_2_question"] = [experiment_2_task + "\n" + question for question in data["inputs"]]

# Replace the old instruction with the current one inside every stored
# experiment-5 (paraphrase) prompt.
data["experiment_5_question"] = data["experiment_5_question"].apply(
    lambda text_list: [text.replace(temp, experiment_2_task) for text in text_list]
)

# Rebuild the experiment-9 prompts with the current experiment_9_task.
data["experiment_9_question"] = [experiment_9_task + "\n" + question for question in data["inputs"]]

# NOTE: this overwrites the model-output file in place.
data.to_parquet("data/svamp_100/output-gpt-3.5-turbo.parquet")


In [None]:
print(data.iloc[89]["experiment_5_question"][7])

# Sports Understanding

In [None]:
import pandas as pd
from datasets import load_dataset

# Load the BIG-bench sports_understanding validation split and keep a fixed,
# seeded random sample of 100 rows.
sports_dataset = pd.DataFrame(load_dataset("tasksource/bigbench",'sports_understanding')["validation"])
data = sports_dataset.sample(n=100, random_state=42)
# Bare expression so the notebook displays the sampled frame.
data

## Experiment 1 - Verbalized Chain Of Thought
## Experiment 2 - Feature Importance Explanation
## Experiment 3 - Verbalized Feature Importance
## Experiment 4 - Sample Probing CoT Explanation
## Experiment 5 - Sample Probing Feature Importance Explanation
## Experiment 8 - Model Probing CoT Explanation
## Experiment 9 - Model Probing Feature Importance Explanation

In [None]:
# Define Q_e for different tasks. Q_e is prepended to question Q to generate answer A and natural language explanation A_e

experiment_1_task = """
Read the question, give your answer by analyzing step by step, and assign a confidence level to each step and the final answer. The output format is as follows:
Step 1: [Your reasoning here], Confidence: [Your confidence here]%
Step 2: ...
Step 3: ...
...
Step N: ...
Final Answer and Overall Confidence (0-100): [Your answer plausible / implausible here], [Your confidence here]%
Note: The confidence indicates the degree of certainty you have about your answer. For instance, if your confidence level is 80%, it means you are 80% certain that your answer is correct.
Provide the answer in aforementioned format, and nothing else.
"""

experiment_2_task = """Read the question, and output the words important for your final answer, sorted in descending order of importance. The output format is as follows
1. [Word 1 here]
2. [Word 2 here]
3. [Word 3 here]
...
N.: [Word N here]
Final Answer and Overall Confidence (0-100): [Your answer plausible / implausible here], [Your confidence here]%
Provide the answer in aforementioned format, and nothing else.
"""

experiment_3_task = """
Read the question, and assign each word an importance score between 0 and 100 of how important it is for your final answer. The output format is as follows
Word: [Word 1 here], Importance: [Your importance score here]
Word: [Word 2 here], Importance: [Your importance score here]
Word: [Word 3 here], Importance: [Your importance score here]
...
Word: [Word N here], Importance: [Your importance score here]
Final Answer and Overall Confidence (0-100): [Your answer plausible / implausible here], [Your confidence here]%
Note: The importance scores of all words should add up to 100. The overall confidence score indicates the degree of certainty you have about your importance scores. For instance, if your confidence level is 80%, it means you are 80% certain that importance scores assigned are correct.
Provide the answer in aforementioned format, and nothing else.
"""

experiment_4_task = experiment_1_task

experiment_5_task = experiment_2_task

experiment_8_task = experiment_1_task

experiment_9_task = experiment_2_task

In [None]:
def parse_question(ques):
    """
    Return the second line of the stripped input with its first 11 characters removed. The result is the text that gets paraphrased for the sample probing experiment.
    """
    lines = ques.strip().split("\n")
    second_line = lines[1]
    return second_line[11:]

# Extract the sentence to paraphrase from every sampled input.
questions = data["inputs"].apply(parse_question).to_list()

# Request 10 paraphrases per sentence; replies are raw strings at this point.
responses = get_openai_response_batch(
    "Paraphrase the question into 10 different forms with the same meaning, and share them as a Python list of double quotes enclosed strings",
    questions,
)

def parse_response(response):
    """
    Parse a model reply into a list of paraphrased questions.

    The reply is first evaluated as a Python literal. If that fails, it is read
    line by line: when a line contains a pair of double quotes, the text between
    the first and the last quote is taken; otherwise the line itself is kept,
    minus surrounding list punctuation. Empty results are dropped.
    """
    try:
        return ast.literal_eval(response)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval raises on malformed input. A bare
        # `except` would also swallow KeyboardInterrupt / SystemExit.
        pass

    paraphrases = []
    for line in response.strip().split("\n"):
        line = line.strip()
        first_quote = line.find("\"")
        last_quote = line.rfind("\"")
        if first_quote != -1 and last_quote > first_quote:
            text = line[first_quote + 1: last_quote]
        else:
            # No quoted span: slicing with find() == -1 used to chop off the
            # line's last character, so keep the line itself instead.
            text = line.strip("[],\" ")
        if text:
            paraphrases.append(text)
    return paraphrases
    
paraphrased_questions = list(map(parse_response, responses))

In [None]:
data["paraphrased_questions"] = paraphrased_questions

In [None]:
# Build the prompt columns for every experiment. No API calls are made in this
# cell; it only assembles the text that will be sent later.

# Experiments 1-3: task instruction followed by the full input text.
data["experiment_1_question"] = [experiment_1_task + "\n" + question for question in data["inputs"]]
data["experiment_2_question"] = [experiment_2_task + "\n" + question for question in data["inputs"]]
data["experiment_3_question"] = [experiment_3_task + "\n" + question for question in data["inputs"]]

# Experiments 4-5 (sample probing): one prompt per distinct paraphrase.
experiment_4_questions = []
experiment_5_questions = []

for idx, (_, row) in tqdm(enumerate(data.iterrows()), total=len(data)):
    question = row["inputs"]
    original_question = parse_question(question)
    # De-duplicate while keeping the model's order. set() iteration order for
    # strings changes between interpreter runs, which made the stored prompt
    # order (and any positional lookup into it) non-reproducible.
    unique_paraphrases = list(dict.fromkeys(paraphrased_questions[idx]))
    experiment_4_questions.append(
        [experiment_4_task + "\n" + question.replace(original_question, paraphrased_question) for paraphrased_question in unique_paraphrases]
    )
    experiment_5_questions.append(
        [experiment_5_task + "\n" + question.replace(original_question, paraphrased_question) for paraphrased_question in unique_paraphrases]
    )

data["experiment_4_question"] = experiment_4_questions
data["experiment_5_question"] = experiment_5_questions

# Experiments 8-9 (model probing): same prompts as experiments 1 and 2.
data["experiment_8_question"] = [experiment_8_task + "\n" + question for question in data["inputs"]]
data["experiment_9_question"] = [experiment_9_task + "\n" + question for question in data["inputs"]]

In [None]:
file_path = "data/sportsunderstanding_100/input.parquet"
# Refuse to overwrite a previously prepared file. An explicit raise is used
# because assert statements are skipped when Python runs with -O.
if os.path.exists(file_path):
    raise FileExistsError(f"{file_path} already exists; remove it before regenerating")
# Create the output folder first so the write does not fail when it is missing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
data.to_parquet(file_path)

# Strategy QA

In [None]:
from datasets import load_dataset

# Load the StrategyQA test split and keep a fixed, seeded random sample of
# 100 rows.
strategyqa_data = pd.DataFrame(load_dataset("ChilleD/StrategyQA")["test"])
data = strategyqa_data.sample(n=100, random_state=42)
# Bare expression so the notebook displays the sampled frame.
data

## Experiment 1 - Verbalized Chain Of Thought
## Experiment 2 - Feature Importance Explanation
## Experiment 3 - Verbalized Feature Importance
## Experiment 4 - Sample Probing CoT Explanation
## Experiment 5 - Sample Probing Feature Importance Explanation
## Experiment 8 - Model Probing CoT Explanation
## Experiment 9 - Model Probing Feature Importance Explanation

In [None]:
# Define Q_e for different tasks. Q_e is prepended to question Q to generate answer A and natural language explanation A_e

experiment_1_task = """
Read the question, give your answer by analyzing step by step, and assign a confidence level to each step and the final answer. The output format is as follows:
Step 1: [Your reasoning here], Confidence: [Your confidence here]%
Step 2: ...
Step 3: ...
...
Step N: ...
Final Answer and Overall Confidence (0-100): [Your answer Yes/No here], [Your confidence here]%
Note: The confidence indicates the degree of certainty you have about your answer. For instance, if your confidence level is 80%, it means you are 80% certain that your reasoning is correct.
Provide the answer in aforementioned format, and nothing else.
"""

experiment_2_task = """Read the question, and output the words important for your final answer, sorted in descending order of importance. The output format is as follows
1. [Word 1 here]
2. [Word 2 here]
3. [Word 3 here]
...
N.: [Word N here]
Final Answer and Overall Confidence (0-100): [Your answer Yes/No here], [Your confidence here]%
Provide the answer in aforementioned format, and nothing else.
"""

experiment_3_task = """
Read the question, and assign each word an importance score between 0 and 100 of how important it is for your final answer. The output format is as follows
Word: [Word 1 here], Importance: [Your importance score here]
Word: [Word 2 here], Importance: [Your importance score here]
Word: [Word 3 here], Importance: [Your importance score here]
...
Word: [Word N here], Importance: [Your importance score here]
Final Answer and Overall Confidence (0-100): [Your answer Yes/No here], [Your confidence here]%
Note: The importance scores of all words should add up to 100. The overall confidence score indicates the degree of certainty you have about your importance scores. For instance, if your confidence level is 80%, it means you are 80% certain that importance scores assigned are correct.
Provide the answer in aforementioned format, and nothing else.
"""

experiment_4_task = experiment_1_task

experiment_5_task = experiment_2_task

experiment_8_task = experiment_1_task

experiment_9_task = experiment_2_task

In [None]:
# StrategyQA rows are single questions, so the whole question is paraphrased.
questions = data["question"].to_list()

# Request 10 paraphrases per question; replies are raw strings at this point.
responses = get_openai_response_batch(
    "Paraphrase the question into 10 different forms with the same meaning, and share them as a Python list of double quotes enclosed strings",
    questions,
)

def parse_response(response):
    """
    Parse a model reply into a list of paraphrased questions.

    The reply is first evaluated as a Python literal. If that fails, it is read
    line by line: when a line contains a pair of double quotes, the text between
    the first and the last quote is taken; otherwise the line itself is kept,
    minus surrounding list punctuation. Empty results are dropped.
    """
    try:
        return ast.literal_eval(response)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval raises on malformed input. A bare
        # `except` would also swallow KeyboardInterrupt / SystemExit.
        pass

    paraphrases = []
    for line in response.strip().split("\n"):
        line = line.strip()
        first_quote = line.find("\"")
        last_quote = line.rfind("\"")
        if first_quote != -1 and last_quote > first_quote:
            text = line[first_quote + 1: last_quote]
        else:
            # No quoted span: slicing with find() == -1 used to chop off the
            # line's last character, so keep the line itself instead.
            text = line.strip("[],\" ")
        if text:
            paraphrases.append(text)
    return paraphrases
    
paraphrased_questions = list(map(parse_response, responses))

In [None]:
data["paraphrased_questions"] = paraphrased_questions
data["inputs"] = data["question"]

In [None]:
# Build the prompt columns for every experiment. No API calls are made in this
# cell; it only assembles the text that will be sent later.

# Experiments 1-3: task instruction followed by the question.
data["experiment_1_question"] = [experiment_1_task + "\n" + question for question in data["inputs"]]
data["experiment_2_question"] = [experiment_2_task + "\n" + question for question in data["inputs"]]
data["experiment_3_question"] = [experiment_3_task + "\n" + question for question in data["inputs"]]

# Experiments 4-5 (sample probing): one prompt per distinct paraphrase. The
# whole question is the text being replaced for this dataset.
experiment_4_questions = []
experiment_5_questions = []

for idx, (_, row) in tqdm(enumerate(data.iterrows()), total=len(data)):
    question = row["question"]
    original_question = row["question"]
    # De-duplicate while keeping the model's order. set() iteration order for
    # strings changes between interpreter runs, which made the stored prompt
    # order (and any positional lookup into it) non-reproducible.
    unique_paraphrases = list(dict.fromkeys(paraphrased_questions[idx]))
    experiment_4_questions.append(
        [experiment_4_task + "\n" + question.replace(original_question, paraphrased_question) for paraphrased_question in unique_paraphrases]
    )
    experiment_5_questions.append(
        [experiment_5_task + "\n" + question.replace(original_question, paraphrased_question) for paraphrased_question in unique_paraphrases]
    )

data["experiment_4_question"] = experiment_4_questions
data["experiment_5_question"] = experiment_5_questions

# Experiments 8-9 (model probing): same prompts as experiments 1 and 2.
data["experiment_8_question"] = [experiment_8_task + "\n" + question for question in data["inputs"]]
data["experiment_9_question"] = [experiment_9_task + "\n" + question for question in data["inputs"]]

In [None]:
file_path = "data/strategyqa_100/input.parquet"
# Refuse to overwrite a previously prepared file. An explicit raise is used
# because assert statements are skipped when Python runs with -O.
if os.path.exists(file_path):
    raise FileExistsError(f"{file_path} already exists; remove it before regenerating")
# Create the output folder first so the write does not fail when it is missing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
data.to_parquet(file_path)