In [1]:
%pip install datasets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset

dataset = load_dataset("selfrag/selfrag_train_data")

In [3]:
dataset = dataset['train']

In [4]:
dataset

Dataset({
    features: ['instruction', 'output', 'input', 'id', 'dataset_name'],
    num_rows: 145619
})

## Analyze Data

In [4]:
token = "[No Retrieval]"
no_retrive = [output.count(token) for output in dataset['output'] if output.count(token) > 0]
print("Records:", len(no_retrive))
print("Sum:", sum(no_retrive))

Records: 100444
Sum: 231873


In [5]:
token = "[Retrieval]"
retrive = [output.count(token) for output in dataset['output'] if output.count(token) > 0]
print("Records:", len(retrive))
print("Sum:", sum(retrive))

Records: 74219
Sum: 204893


In [6]:
token = "[Utility:5]"
ult_5 = [output.count(token) for output in dataset['output'] if output.count(token) > 0]
print("Records:", len(ult_5))
print("Sum:", sum(ult_5))

Records: 122599
Sum: 122599


In [7]:
token = "[Utility:4]"
ult_4 = [output.count(token) for output in dataset['output'] if output.count(token) > 0]
print("Records:", len(ult_4))
print("Sum:", sum(ult_4))

Records: 13050
Sum: 13050


In [8]:
token = "[Utility:3]"
ult_3 = [output.count(token) for output in dataset['output'] if output.count(token) > 0]
print("Records:", len(ult_3))
print("Sum:", sum(ult_3))

Records: 109
Sum: 109


In [9]:
token = "[Utility:2]"
ult_2 = [output.count(token) for output in dataset['output'] if output.count(token) > 0]
print("Records:", len(ult_2))
print("Sum:", sum(ult_2))

Records: 6575
Sum: 6575


In [10]:
token = "[Utility:1]"
ult_1 = [output.count(token) for output in dataset['output'] if output.count(token) > 0]
print("Records:", len(ult_1))
print("Sum:", sum(ult_1))

Records: 3286
Sum: 3286


## Generating Questions

In [5]:
control_tokens = [
    "[Fully supported]",
    "[Partially supported]",
    "[No support / Contradictory]",
    "[No Retrieval]",
    "[Retrieval]",
    "[Irrelevant]",
    "[Relevant]",
    "<paragraph>",
    "</paragraph>",
    "[Utility:1]",
    "[Utility:2]",
    "[Utility:3]",
    "[Utility:4]",
    "[Utility:5]",
    "[Continue to Use Evidence]",
    "[Completeness: Complete]",
    "[Completeness: Partially Complete]",
    "[Completeness: Incomplete]",
]

relevant_tokens = ["Relevant", "Irrelevant"]
support_tokens = [
    "Fully supported",
    "Partially supported",
    "No support / Contradictory",
]
utility_tokens = [
    "Utility:5",
    "Utility:4",
    "Utility:3",
    "Utility:2",
    "Utility:1",
]
retrieval_tokens = ["No Retrieval", "Retrieval"]

In [17]:
import re
def postprocess_answer_option_conditioned(answer, filter_paragraph=True):

    if filter_paragraph:
        answer = re.sub(r"<paragraph>.*?</paragraph>", "", answer, flags=re.DOTALL)
    for token in control_tokens:
        answer = answer.replace(token, "")

    if "</s>" in answer:
        answer = answer.replace("</s>", "")

    if "<|endoftext|>" in answer:
        answer = answer.replace("<|endoftext|>", "")

    return answer

In [32]:
complete_samples = []
partially_complete_samples = []
incomplete_samples = []
retrieve_samples = []
no_retrieve_samples = []
relevant_samples = []
irrelevant_samples = []
supported_samples = []
partially_supported_samples = []
not_supported_samples = []
processed_samples = []

for question, input, output in zip(dataset['instruction'], dataset['input'], dataset['output']):
  output = output.replace("[Utility:5]", "[Completeness: Complete]")
  output = output.replace("[Utility:4]", "[Completeness: Partially Complete]")
  output = output.replace("[Utility:3]", "[Completeness: Incomplete]")
  output = output.replace("[Utility:2]", "[Completeness: Incomplete]")
  output = output.replace("[Utility:1]", "[Completeness: Incomplete]")

  row_data = {"instruction": question, "input": input, "output": output}

  # Completeness Task
  if("[Completeness: Complete]" in output):
    complete_samples.append(output)
    row_data["completeness"] = [{"answer": postprocess_answer_option_conditioned(output),"label": "Complete"}]
  if("[Completeness: Partially Complete]" in output):
    partially_complete_samples.append(output)
    row_data["completeness"] = [{"answer": postprocess_answer_option_conditioned(output),"label": "Partially Complete"}]
  if("[Completeness: Incomplete]" in output):
    incomplete_samples.append(output)
    row_data["completeness"] = [{"answer": postprocess_answer_option_conditioned(output),"label": "Incomplete"}]

  # Relevancy Task
  data = re.findall(f"(<paragraph>.*?\[({'|'.join(relevant_tokens)})\])", output, flags=re.DOTALL) # Get list of the following tuple: (paragraph, relevancy)
  data = {postprocess_answer_option_conditioned(sample[0], False):sample[1] for sample in data}

  row_data["relevancy"] = [{"paragraph": paragraph, "label": data[paragraph]} for paragraph in data]

  # Is Supported Task

  support_task_data = []
  data = re.findall(f"(<paragraph>.*?\[({'|'.join(support_tokens)})\])", output, flags=re.DOTALL) # Get list of the following tuple: (paragraph + answer, is supported)

  for sample in data:
    inputs = sample[0].split("</paragraph>")
    label = sample[1]

    paragraph = postprocess_answer_option_conditioned(inputs[0], False)
    answer = postprocess_answer_option_conditioned(inputs[1])

    support_task_data.append({"paragraph": paragraph, "answer": answer, "label": label})
  
  row_data["support"] = support_task_data
  
  processed_samples.append(row_data)

## Generate Reasoning for Labels

In [39]:
%pip install --upgrade huggingface_hub

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting huggingface_hub
  Downloading huggingface_hub-0.22.2-py3-none-any.whl.metadata (12 kB)
Downloading huggingface_hub-0.22.2-py3-none-any.whl (388 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.20.3
    Uninstalling huggingface-hub-0.20.3:
      Successfully uninstalled huggingface-hub-0.20.3
Successfully installed huggingface_hub-0.22.2
[0mNote: you may need to restart the kernel to use updated packages.


In [41]:
%pip install ipywidgets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting ipywidgets
  Downloading ipywidgets-8.1.2-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.10 (from ipywidgets)
  Downloading widgetsnbextension-4.0.10-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.10 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.10-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.2-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.4/139.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jupyterlab_widgets-3.0.10-py3-none-any.whl (215 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.0/215.0 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading widgetsnbextension-4.0.10-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m113.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collec

In [None]:
from huggingface_hub import login
hf_token = "[YOUR_HF_TOKEN]"
login(token = hf_token)

In [17]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [18]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from vllm import LLM, SamplingParams
import torch

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left", token=hf_token)
model = LLM(
    model_id,
    download_dir="/gscratch/h2lab/akari/model_cache",
    gpu_memory_utilization=0.90,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 04-29 22:26:10 llm_engine.py:73] Initializing an LLM engine with config: model='meta-llama/Meta-Llama-3-8B-Instruct', tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir='/gscratch/h2lab/akari/model_cache', load_format=auto, tensor_parallel_size=1, quantization=None, enforce_eager=False, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 04-29 22:26:15 llm_engine.py:223] # GPU blocks: 8025, # CPU blocks: 2048
INFO 04-29 22:26:18 model_runner.py:394] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 04-29 22:26:27 model_runner.py:437] Graph capturing finished in 9 secs.


In [38]:

def completeness_prompt(instruction, answer, label):
    return f'''You will be provided an Instruction, an Answer, and an evaluation on the Completeness of the answer. You will explain the reason why the Answer was given the provided Completeness evaluation.
Instruction:
{instruction}
Answer:
{answer}
Completeness:
{label}
Reason:
'''

In [38]:
messages = [
    {"role": "system", "content": "You a helpful and honest assistant."},
    {"role": "user", "content": completeness_prompt(sample['instruction'], sample['completeness'][0]['answer'], sample['completeness'][0]['label'])},
]


prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False, 
    add_generation_prompt=True,
)
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You a helpful and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

You will be provided an Instruciton, an Answer, and an evaluation on the Completeness of the answer. You will explain the reason why the Answer was given the provided Completeness evaluation.
Instructiopn:
Create a regex expression to remove all characters in a string after the last dot in the string. For example, if I have a pattern of "X.Y.4" I want the number 4 and the dot to be removed.
Answer:
One possible regex expression to remove all characters in a string after the last dot in the string is:

`\.[^.]*$`

This expression matches a dot (`\.`) followed by zero or more characters that are not dots (`[^.]*`) until the end of the string (`$`).For example, if you have a pattern of “X.Y.4”, this expression will match “.4” and replace it with “”, resulting in “X.Y”.

Here is some code in Python that demonstrates how to use this expression:


In [24]:
terminators = [
    tokenizer.eos_token,
    "<|eot_id|>"
]

In [25]:
sampling_params = SamplingParams(
    temperature=0.6, 
    top_p=0.9, 
    max_tokens=258, 
    skip_special_tokens=False,
    stop=terminators
)

In [44]:
preds = model.generate(prompts=[prompt], sampling_params=sampling_params)
print(f"Model prediction: {preds[0].outputs[0].text}")

Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.09s/it]

Model prediction: The Completeness evaluation is "Complete" because the answer provides a thorough explanation of the regex expression and how it works. The answer explains the purpose of each part of the expression, including the dot (`\.`), the zero or more characters that are not dots (`[^.]*`), and the end of the string (`$`). This provides a clear understanding of how the expression will match and remove characters in the input string.

Additionally, the answer provides a code snippet in Python that demonstrates how to use the regex expression with the `re.sub` function to remove the characters after the last dot in the string. This code snippet is concise and easy to understand, making it a complete answer to the problem.





In [45]:
completeness_prompts_batch = []
for sample in processed_samples:
    for data in sample['completeness']:
        messages = [
            {"role": "system", "content": "You a helpful and honest assistant."},
            {"role": "user", "content": completeness_prompt(sample['instruction'], data['answer'], data['label'])},
        ]

        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False, 
            add_generation_prompt=True,
        )

        completeness_prompts_batch.append(prompt)    

In [46]:
completeness_prompts_batch[0]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou a helpful and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nYou will be provided an Instruciton, an Answer, and an evaluation on the Completeness of the answer. You will explain the reason why the Answer was given the provided Completeness evaluation.\nInstructiopn:\nIn this task, you are given a context paragraph of the tweet and question. Your task is to generate right answer of given question based on given context tweet paragraph.\n\nExample input: Context: Our prayers are with the students, educators & families at Independence High School & all the first responders on the scene. #PatriotPride— Doug Ducey (@dougducey) February 12, 2016 Question: at which school were first responders on the scene for?\nExample output: independence high school\nExample explanation: From the context tweet, we can see that independence high school is the right answer.\nQ: Context: BREAKING: ATF sending addition

In [47]:
preds = model.generate(prompts=completeness_prompts_batch, sampling_params=sampling_params)
print(f"Model prediction: {preds[0].outputs[0].text}")

Processed prompts: 100%|██████████| 145619/145619 [2:21:42<00:00, 17.13it/s]  

Model prediction: I'm ready to assist!

The answer is: portland

The completeness evaluation is: Complete

Reason: The context tweet clearly states that the agents are being sent from Portland, along with a K9 team, to the UCCShooting tragedy. This information directly answers the question "where are the agents from?", making the answer complete.





In [48]:
import json
preds = [pred.outputs[0].text for pred in preds]

with open('completeness_reasoning.json', 'w') as f:
    json.dump(preds, f)

In [49]:
i = 0
for sample in processed_samples:
    for data in sample['completeness']:
        data['reason'] = preds[i]
        i += 1

with open('data_checkpoint.json', 'w') as f:
    json.dump(processed_samples, f)

In [6]:
# Load Checkpoint
import json

with open('data_checkpoint.json') as f:
    processed_samples = json.load(f)

print(len(processed_samples))

145619


In [8]:
dataset

Dataset({
    features: ['instruction', 'output', 'input', 'id', 'dataset_name'],
    num_rows: 145619
})

In [10]:
# Removing Alpaca Data
reduced_samples = []

for sample, dataset_name in zip(processed_samples, dataset['dataset_name']):
    if(dataset_name != "gpt4_alpaca"):
        reduced_samples.append(sample)

In [14]:
# Reducing Data
import random

reduced_samples_length = 50000

reduced_sample_indexes = random.sample(range(len(reduced_samples)), reduced_samples_length)
reduced_samples = [reduced_samples[i] for i in reduced_sample_indexes]


In [7]:
def relevancy_prompt(instruction, paragraph, label):
    return f'''You will be provided an Instruction, a Paragraph, and an evaluation on the Relevancy of the paragraph with respect to the question. You will explain the reason why the Paragraph was given the provided Relevancy evaluation.
Instruction:
{instruction}
Paragraph:
{paragraph}
Relevancy:
{label}
Reason:
'''

In [19]:
relevancy_prompts_batch = []

for sample in reduced_samples:
    for data in sample['relevancy']:
        messages = [
            {"role": "system", "content": "You a helpful and honest assistant."},
            {"role": "user", "content": relevancy_prompt(sample['instruction'], data['paragraph'], data['label'])},
        ]

        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False, 
            add_generation_prompt=True,
        )

        relevancy_prompts_batch.append(prompt)

In [21]:
relevancy_prompts_batch[0]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou a helpful and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nYou will be provided an Instruciton, a Paragraph, and an evaluation on the Relevancy of the paragraph with respect to the question. You will explain the reason why the Paragraph was given the provided Relevancy evaluation.\nInstructiopn:\nWhy do many people believe that ivermectin can prevent or treat Covid-19?\nParagraph:\nIvermectin\nin humans in the treatment of onchocerciasis (river blindness), but is also effective against other worm infestations (such as strongyloidiasis, ascariasis, trichuriasis, filariasis and enterobiasis), and some epidermal parasitic skin diseases, including scabies. Ivermectin is currently being used to help eliminate river blindness (onchocerciasis) in the Americas, and to stop transmission of lymphatic filariasis and onchocerciasis around the world in programs sponsored by the Carter Center using ivermect

In [26]:
preds = model.generate(prompts=relevancy_prompts_batch, sampling_params=sampling_params)
print(f"Model prediction: {preds[0].outputs[0].text}")

Processed prompts: 100%|██████████| 47738/47738 [48:11<00:00, 16.51it/s]  

Model prediction: The paragraph is considered "Relevant" to the instruction "Why do many people believe that ivermectin can prevent or treat Covid-19?" because it provides information about the uses of ivermectin, a drug that is being researched and discussed for its potential to prevent or treat Covid-19. Although the paragraph does not directly mention Covid-19, it highlights the drug's effectiveness against various parasitic infections and diseases, which is the foundation of the claims being made about its potential use against Covid-19. The information in the paragraph provides context and background knowledge about ivermectin's properties and uses, which are relevant to understanding why people may believe it can be used to prevent or treat Covid-19.





In [27]:
import json
preds = [pred.outputs[0].text for pred in preds]

with open('relevancy_reasoning.json', 'w') as f:
    json.dump(preds, f)

In [28]:
i = 0
for sample in reduced_samples:
    for data in sample['relevancy']:
        data['reason'] = preds[i]
        i += 1

with open('data_checkpoint.json', 'w') as f:
    json.dump(reduced_samples, f)

print(i)

47738


In [29]:
def support_prompt(paragraph, answer, label):
    return f'''You will be provided an Paragraph, an Answer, and an evaluation on whether the Answer is supported by the Paragraph. You will explain the reason why the Answer was given the provided Is Supported evaluation.
Paragraph:
{paragraph}
Answer:
{answer}
Is Supported:
{label}
Reason:
'''

In [32]:
support_prompts_batch = []

for sample in reduced_samples:
    for data in sample['support']:
        messages = [
            {"role": "system", "content": "You a helpful and honest assistant."},
            {"role": "user", "content": support_prompt(data['paragraph'], data['answer'], data['label'])},
        ]

        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False, 
            add_generation_prompt=True,
        )
        support_prompts_batch.append(prompt)    

In [33]:
support_prompts_batch[0]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou a helpful and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nYou will be provided an Paragraph, an Answer, and an evaluation on whether the Answer is supported by the Paragraph. You will explain the reason why the Answer was given the provided Is Supported evaluation.\nParagraph:\nIvermectin\nin humans in the treatment of onchocerciasis (river blindness), but is also effective against other worm infestations (such as strongyloidiasis, ascariasis, trichuriasis, filariasis and enterobiasis), and some epidermal parasitic skin diseases, including scabies. Ivermectin is currently being used to help eliminate river blindness (onchocerciasis) in the Americas, and to stop transmission of lymphatic filariasis and onchocerciasis around the world in programs sponsored by the Carter Center using ivermectin donated by Merck. The disease is common in 30 African countries, six Latin American countries, and Yem

In [34]:
preds = model.generate(prompts=support_prompts_batch, sampling_params=sampling_params)
print(f"Model prediction: {preds[0].outputs[0].text}")

Processed prompts: 100%|██████████| 46471/46471 [42:23<00:00, 18.27it/s]  

Model prediction: The Answer is evaluated as "No support / Contradictory".

The reason is that the Paragraph does not mention anything about ivermectin being used to prevent or treat Covid-19. The paragraph only discusses the use of ivermectin in the treatment of various worm infestations and parasitic skin diseases, as well as its use in eliminating river blindness and stopping the transmission of lymphatic filariasis and onchocerciasis. There is no mention of ivermectin being used to prevent or treat Covid-19, which is the topic of the Answer. Therefore, the Answer is not supported by the provided paragraph.





In [35]:
import json
preds = [pred.outputs[0].text for pred in preds]

with open('support_reasoning.json', 'w') as f:
    json.dump(preds, f)

In [36]:
i = 0
for sample in reduced_samples:
    for data in sample['support']:
        data['reason'] = preds[i]
        i += 1

with open('data_checkpoint.json', 'w') as f:
    json.dump(reduced_samples, f)

print(i)

46471


## Generate Instructions

In [52]:
completeness_instruction = "You will be provided an Instruciton, an Answer, and an evaluation on the Completeness of the answer. You will explain the reason why the Answer was given the provided Completeness evaluation."

relevancy_instruction = "You will be provided an Instruciton, a Paragraph, and an evaluation on the Relevancy of the paragraph with respect to the question. You will explain the reason why the Paragraph was given the provided Relevancy evaluation."

support_instruction = "You will be provided an Paragraph, an Answer, and an evaluation on whether the Answer is supported by the Paragraph. You will explain the reason why the Answer was given the provided Is Supported evaluation."

### Genrate Completeness Instrucitons

In [80]:
messages = [
    {"role": "system", "content": "You a helpful and honest assistant."},
    {"role": "user", "content": f"Write 20 different ways of asking the following instruction:\n\n{completeness_instruction}"},
]

prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False, 
    add_generation_prompt=True,
)

In [81]:
sampling_params = SamplingParams(
    temperature=0.6, 
    top_p=0.9, 
    max_tokens=4028, 
    skip_special_tokens=False,
    stop=terminators
)

In [82]:
preds = model.generate(prompts=[prompt], sampling_params=sampling_params)
print(f"Model prediction: {preds[0].outputs[0].text}")

Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.47s/it]

Model prediction: Here are 20 different ways of asking the instruction:

1. Please provide a task, response, and assessment of the response's thoroughness. Explain why the response was given the assigned completeness score.
2. You will be given a task, answer, and evaluation of the answer's completeness. Please explain the reasoning behind the completeness assessment.
3. For each task, you will receive an answer, a completeness evaluation, and a justification for the evaluation.
4. Please provide a detailed explanation for each answer, including the completeness score and the reasoning behind it.
5. You will be asked to review tasks with answers, completeness evaluations, and explanations for each evaluation.
6. Provide a breakdown of each answer, including the completeness score and a justification for the evaluation.
7. For each response, please explain the completeness score and the reasoning behind it.
8. You will receive answers, completeness evaluations, and explanations for each




In [83]:
import re
questions_v1 = [re.sub(r"\d+\. ", "", line) for line in preds[0].outputs[0].text.splitlines() if re.match(r"^\d+", line)]
questions_v1

["Please provide a task, response, and assessment of the response's thoroughness. Explain why the response was given the assigned completeness score.",
 "You will be given a task, answer, and evaluation of the answer's completeness. Please explain the reasoning behind the completeness assessment.",
 'For each task, you will receive an answer, a completeness evaluation, and a justification for the evaluation.',
 'Please provide a detailed explanation for each answer, including the completeness score and the reasoning behind it.',
 'You will be asked to review tasks with answers, completeness evaluations, and explanations for each evaluation.',
 'Provide a breakdown of each answer, including the completeness score and a justification for the evaluation.',
 'For each response, please explain the completeness score and the reasoning behind it.',
 'You will receive answers, completeness evaluations, and explanations for each evaluation. Please review and provide feedback.',
 'Please review 

In [84]:
sampling_params = SamplingParams(
    temperature=1, 
    top_p=0.9, 
    max_tokens=4028, 
    skip_special_tokens=False,
    stop=terminators
)

preds = model.generate(prompts=[prompt], sampling_params=sampling_params)
print(f"Model prediction: {preds[0].outputs[0].text}")

Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.55s/it]

Model prediction: Here are 20 different ways to ask the instruction:

1. Can you provide a breakdown of how you evaluated the completeness of the answer and the reasoning behind it?
2. How do you determine the completeness of an answer, and can you explain why you gave it the score you did?
3. Can you walk me through your thought process when evaluating the completeness of an answer and provide the reasoning behind your decision?
4. What criteria do you use to assess the completeness of an answer, and how do you apply them to this particular response?
5. Can you explain why you gave the answer a completeness score of [X] and provide evidence to support your evaluation?
6. How do you ensure that your evaluation of completeness is accurate and unbiased, and can you provide an example of this process in action?
7. Can you provide a step-by-step guide on how you evaluate the completeness of an answer, including the reasoning behind each step?
8. What are the key factors you consider when e




In [85]:
import re
questions_v2 = [re.sub(r"\d+\. ", "", line) for line in preds[0].outputs[0].text.splitlines() if re.match(r"^\d+", line)]
questions_v2

['Can you provide a breakdown of how you evaluated the completeness of the answer and the reasoning behind it?',
 'How do you determine the completeness of an answer, and can you explain why you gave it the score you did?',
 'Can you walk me through your thought process when evaluating the completeness of an answer and provide the reasoning behind your decision?',
 'What criteria do you use to assess the completeness of an answer, and how do you apply them to this particular response?',
 'Can you explain why you gave the answer a completeness score of [X] and provide evidence to support your evaluation?',
 'How do you ensure that your evaluation of completeness is accurate and unbiased, and can you provide an example of this process in action?',
 'Can you provide a step-by-step guide on how you evaluate the completeness of an answer, including the reasoning behind each step?',
 'What are the key factors you consider when evaluating the completeness of an answer, and how do you weigh th

In [86]:
random_quesiton_indexes = random.sample(range(20), 10)
print(random_quesiton_indexes)

completeness_instructions = [questions_v2[i] for i in random_quesiton_indexes] + [questions_v1[i] for i in random_quesiton_indexes]
completeness_instructions

[17, 3, 11, 13, 10, 9, 15, 18, 8, 7]


['What are some strategies you use to encourage answers that are more complete and comprehensive?',
 'What criteria do you use to assess the completeness of an answer, and how do you apply them to this particular response?',
 'What are some common pitfalls to avoid when evaluating the completeness of an answer, and how do you ensure you avoid them?',
 'How do you distinguish between an answer that is simply incomplete and one that is inadequate or poorly constructed?',
 'Can you explain how your evaluation of completeness takes into account the specific context of the question and the answer?',
 'How do you handle situations where an answer may not be 100% complete but still provides valuable information?',
 "How do you use your knowledge of the topic and the question being asked to inform your evaluation of the answer's completeness?",
 "Can you explain how your evaluation of completeness is influenced by the answer's relevance to the question, and how you weigh this factor?",
 'Can y

### Generate Relevancy Instructions

In [87]:
messages = [
    {"role": "system", "content": "You a helpful and honest assistant."},
    {"role": "user", "content": f"Write 20 different ways of asking the following instruction:\n\n{relevancy_instruction}"},
]

prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False, 
    add_generation_prompt=True,
)


In [88]:

sampling_params = SamplingParams(
    temperature=0.6, 
    top_p=0.9, 
    max_tokens=4028, 
    skip_special_tokens=False,
    stop=terminators
)

preds = model.generate(prompts=[prompt], sampling_params=sampling_params)
print(f"Model prediction: {preds[0].outputs[0].text}")

Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.26s/it]

Model prediction: Here are 20 different ways of asking the instruction:

1. Please review the provided paragraph and evaluate its relevance to the given instruction. Explain your reasoning.
2. Assess the paragraph's relevance to the question and provide a justification for your answer.
3. Determine the degree of relevance between the paragraph and the instruction, and explain your thought process.
4. Evaluate the paragraph's connection to the instruction and provide a brief explanation.
5. Provide a relevance score for the paragraph and justify your decision.
6. Review the paragraph and explain how well it addresses the instruction.
7. Assess the paragraph's relevance to the instruction and provide a written justification.
8. Determine the paragraph's relevance to the question and provide a clear explanation.
9. Evaluate the paragraph's connection to the instruction and provide a concise explanation.
10. Provide a rating for the paragraph's relevance to the instruction and justify your




In [89]:
import re
questions_v1 = [re.sub(r"\d+\. ", "", line) for line in preds[0].outputs[0].text.splitlines() if re.match(r"^\d+", line)]
questions_v1

['Please review the provided paragraph and evaluate its relevance to the given instruction. Explain your reasoning.',
 "Assess the paragraph's relevance to the question and provide a justification for your answer.",
 'Determine the degree of relevance between the paragraph and the instruction, and explain your thought process.',
 "Evaluate the paragraph's connection to the instruction and provide a brief explanation.",
 'Provide a relevance score for the paragraph and justify your decision.',
 'Review the paragraph and explain how well it addresses the instruction.',
 "Assess the paragraph's relevance to the instruction and provide a written justification.",
 "Determine the paragraph's relevance to the question and provide a clear explanation.",
 "Evaluate the paragraph's connection to the instruction and provide a concise explanation.",
 "Provide a rating for the paragraph's relevance to the instruction and justify your choice.",
 'Review the paragraph and explain how it relates to th

In [90]:
sampling_params = SamplingParams(
    temperature=1, 
    top_p=0.9, 
    max_tokens=4028, 
    skip_special_tokens=False,
    stop=terminators
)

preds = model.generate(prompts=[prompt], sampling_params=sampling_params)
print(f"Model prediction: {preds[0].outputs[0].text}")

Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.87s/it]

Model prediction: Here are 20 different ways of asking the instruction:

1. Please review the given instruction, paragraph, and relevancy evaluation, and provide a clear explanation for why the paragraph received its assigned score.
2. Explain the reasoning behind the relevancy evaluation given to the provided paragraph.
3. Write a justification for the relevancy rating assigned to the paragraph.
4. Provide a detailed explanation for why the paragraph is or is not relevant to the given instruction.
5. Elucidate the thought process behind the relevancy evaluation of the paragraph.
6. Discuss the connection between the paragraph and the instruction, highlighting the reasons for the given relevancy score.
7. Interpret the relevancy evaluation assigned to the paragraph and provide supporting evidence from the text.
8. Explain how the paragraph relates to the instruction and justify the assigned relevancy score.
9. Provide a breakdown of the reasons behind the relevancy rating, using the pa




In [91]:
import re
questions_v2 = [re.sub(r"\d+\. ", "", line) for line in preds[0].outputs[0].text.splitlines() if re.match(r"^\d+", line)]
questions_v2

['Please review the given instruction, paragraph, and relevancy evaluation, and provide a clear explanation for why the paragraph received its assigned score.',
 'Explain the reasoning behind the relevancy evaluation given to the provided paragraph.',
 'Write a justification for the relevancy rating assigned to the paragraph.',
 'Provide a detailed explanation for why the paragraph is or is not relevant to the given instruction.',
 'Elucidate the thought process behind the relevancy evaluation of the paragraph.',
 'Discuss the connection between the paragraph and the instruction, highlighting the reasons for the given relevancy score.',
 'Interpret the relevancy evaluation assigned to the paragraph and provide supporting evidence from the text.',
 'Explain how the paragraph relates to the instruction and justify the assigned relevancy score.',
 'Provide a breakdown of the reasons behind the relevancy rating, using the paragraph and instruction as evidence.',
 'Write a summary of the re

In [92]:
random_quesiton_indexes = random.sample(range(20), 10)
print(random_quesiton_indexes)

relevancy_instructions = [questions_v2[i] for i in random_quesiton_indexes] + [questions_v1[i] for i in random_quesiton_indexes]
relevancy_instructions

[14, 16, 8, 1, 0, 18, 11, 6, 13, 10]


['Explain the relationship between the paragraph and instruction, highlighting areas of agreement and disagreement.',
 'Provide an explanation for the relevancy rating, using specific examples from the paragraph and instruction.',
 'Provide a breakdown of the reasons behind the relevancy rating, using the paragraph and instruction as evidence.',
 'Explain the reasoning behind the relevancy evaluation given to the provided paragraph.',
 'Please review the given instruction, paragraph, and relevancy evaluation, and provide a clear explanation for why the paragraph received its assigned score.',
 'Provide a clear and concise explanation of the thought process behind the relevancy evaluation.',
 "Discuss the paragraph's connection to the instruction, highlighting areas of relevance and irrelevance.",
 'Interpret the relevancy evaluation assigned to the paragraph and provide supporting evidence from the text.',
 'Elucidate the relevance of the paragraph to the instruction, highlighting the 

### Generate Support Instructions

In [93]:
messages = [
    {"role": "system", "content": "You a helpful and honest assistant."},
    {"role": "user", "content": f"Write 20 different ways of asking the following instruction:\n\n{support_instruction}"},
]

prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False, 
    add_generation_prompt=True,
)

In [94]:
sampling_params = SamplingParams(
    temperature=0.6, 
    top_p=0.9, 
    max_tokens=4028, 
    skip_special_tokens=False,
    stop=terminators
)

preds = model.generate(prompts=[prompt], sampling_params=sampling_params)
print(f"Model prediction: {preds[0].outputs[0].text}")

Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.22s/it]

Model prediction: Here are 20 different ways of asking the instruction:

1. Analyze the given paragraph and answer, determining if the answer is supported by the provided text.
2. Evaluate the answer provided and explain whether it is supported by the given paragraph.
3. Determine the validity of the answer based on the paragraph and explain the reasoning.
4. Assess the answer and provide evidence from the paragraph to support or refute the evaluation.
5. Examine the paragraph and answer, and indicate whether the answer is supported by the text.
6. Provide a justification for the evaluation of the answer based on the paragraph.
7. Evaluate the answer and explain whether it is supported by the paragraph, citing specific evidence.
8. Determine if the answer is supported by the paragraph, and provide a brief explanation.
9. Analyze the paragraph and answer, and provide a conclusion on whether the answer is supported.
10. Provide a written explanation of why the answer was given the "Is Su




In [95]:
import re
questions_v1 = [re.sub(r"\d+\. ", "", line) for line in preds[0].outputs[0].text.splitlines() if re.match(r"^\d+", line)]
questions_v1

['Analyze the given paragraph and answer, determining if the answer is supported by the provided text.',
 'Evaluate the answer provided and explain whether it is supported by the given paragraph.',
 'Determine the validity of the answer based on the paragraph and explain the reasoning.',
 'Assess the answer and provide evidence from the paragraph to support or refute the evaluation.',
 'Examine the paragraph and answer, and indicate whether the answer is supported by the text.',
 'Provide a justification for the evaluation of the answer based on the paragraph.',
 'Evaluate the answer and explain whether it is supported by the paragraph, citing specific evidence.',
 'Determine if the answer is supported by the paragraph, and provide a brief explanation.',
 'Analyze the paragraph and answer, and provide a conclusion on whether the answer is supported.',
 'Provide a written explanation of why the answer was given the "Is Supported" evaluation.',
 'Determine whether the answer is supported

In [96]:
sampling_params = SamplingParams(
    temperature=1, 
    top_p=0.9, 
    max_tokens=4028, 
    skip_special_tokens=False,
    stop=terminators
)

preds = model.generate(prompts=[prompt], sampling_params=sampling_params)
print(f"Model prediction: {preds[0].outputs[0].text}")

Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.76s/it]

Model prediction: Here are 20 different ways to ask the instruction:

1. Please review the given paragraph and answer, and then indicate whether the answer is supported by the paragraph and explain your reasoning.
2. Evaluate the relationship between the paragraph and the answer, and determine if the answer is supported by the text.
3. Assess the answer's alignment with the provided paragraph, and provide a justification for your evaluation.
4. Analyze the answer and paragraph to determine if they are logically connected, and explain the outcome.
5. Check if the answer is backed up by the paragraph, and provide a clear explanation for your assessment.
6. Investigate the correspondence between the paragraph and answer, and determine if the answer is supported.
7. Evaluate the paragraph and answer to determine if the answer is supported by the paragraph, and provide a justification for your evaluation.
8. Determine if the answer is supported by the paragraph, and explain the reasoning be




In [97]:
import re
questions_v2 = [re.sub(r"\d+\. ", "", line) for line in preds[0].outputs[0].text.splitlines() if re.match(r"^\d+", line)]
questions_v2

['Please review the given paragraph and answer, and then indicate whether the answer is supported by the paragraph and explain your reasoning.',
 'Evaluate the relationship between the paragraph and the answer, and determine if the answer is supported by the text.',
 "Assess the answer's alignment with the provided paragraph, and provide a justification for your evaluation.",
 'Analyze the answer and paragraph to determine if they are logically connected, and explain the outcome.',
 'Check if the answer is backed up by the paragraph, and provide a clear explanation for your assessment.',
 'Investigate the correspondence between the paragraph and answer, and determine if the answer is supported.',
 'Evaluate the paragraph and answer to determine if the answer is supported by the paragraph, and provide a justification for your evaluation.',
 'Determine if the answer is supported by the paragraph, and explain the reasoning behind your conclusion.',
 'Review the paragraph and answer to ass

In [98]:
random_quesiton_indexes = random.sample(range(20), 10)
print(random_quesiton_indexes)

support_instructions = [questions_v2[i] for i in random_quesiton_indexes] + [questions_v1[i] for i in random_quesiton_indexes]
support_instructions

[0, 15, 10, 7, 17, 11, 13, 1, 3, 9]


['Please review the given paragraph and answer, and then indicate whether the answer is supported by the paragraph and explain your reasoning.',
 "Evaluate the answer's relation to the paragraph, and determine if the answer is supported.",
 'Analyze the answer and paragraph to determine if they are causally linked, and explain the outcome.',
 'Determine if the answer is supported by the paragraph, and explain the reasoning behind your conclusion.',
 'Determine if the answer is logically deduced from the paragraph, and explain your conclusion.',
 'Determine if the answer is backed up by the paragraph, and provide a clear explanation for your assessment.',
 "Review the paragraph and answer to determine if the answer is supported by the paragraph's content.",
 'Evaluate the relationship between the paragraph and the answer, and determine if the answer is supported by the text.',
 'Analyze the answer and paragraph to determine if they are logically connected, and explain the outcome.',
 'E

In [102]:
# Adding Root Instruction

completeness_instructions.append(completeness_instruction)
relevancy_instructions.append(relevancy_instruction)
support_instructions.append(support_instruction)

## Distribute Quesitons

In [105]:
num_instrucitons = len(completeness_instructions) # All Instruciton Types Have Same Length
for sample in reduced_samples:
    for data in sample['completeness']:
        rand_index = random.sample(range(num_instrucitons), 1)[0]
        data['instruction'] = completeness_instructions[rand_index]

    for data in sample['relevancy']:
        rand_index = random.sample(range(num_instrucitons), 1)[0]
        data['instruction'] = relevancy_instructions[rand_index]
    
    for data in sample['support']:
        rand_index = random.sample(range(num_instrucitons), 1)[0]
        data['instruction'] = support_instructions[rand_index]

In [113]:
reduced_samples[0]['relevancy'][0]

{'paragraph': 'Ivermectin\nin humans in the treatment of onchocerciasis (river blindness), but is also effective against other worm infestations (such as strongyloidiasis, ascariasis, trichuriasis, filariasis and enterobiasis), and some epidermal parasitic skin diseases, including scabies. Ivermectin is currently being used to help eliminate river blindness (onchocerciasis) in the Americas, and to stop transmission of lymphatic filariasis and onchocerciasis around the world in programs sponsored by the Carter Center using ivermectin donated by Merck. The disease is common in 30 African countries, six Latin American countries, and Yemen. The drug rapidly kills microfilariae, but not the adult worms. A single oral dose of',
 'label': 'Relevant',
 'reason': 'The paragraph is considered "Relevant" to the instruction "Why do many people believe that ivermectin can prevent or treat Covid-19?" because it provides information about the uses of ivermectin, a drug that is being researched and di

## Create Dataset

In [114]:
def completeness_text_prompt(prompt_instruciton, instruction, answer, label, reason):
    return f'''{prompt_instruciton}
Instruction:
{instruction}
Answer:
{answer}
Reason:
{reason}
Completeness:
{label}'''

def relevancy_text_prompt(prompt_instruciton, instruction, paragraph, label, reason):
    return f'''{prompt_instruciton}
Instruction:
{instruction}
Paragraph:
{paragraph}
Reason:
{reason}
Relevancy:
{label}'''

def support_text_prompt(prompt_instruciton, paragraph, answer, label, reason):
    return f'''{prompt_instruciton}
Paragraph:
{paragraph}
Answer:
{answer}
Reason:
{reason}
Is Supported:
{label}'''

In [115]:
def completeness_input_prompt(instruction, answer):
    return f'''Instruction:
{instruction}
Answer:
{answer}'''

def relevancy_input_prompt(instruction, paragraph):
    return f'''Instruction:
{instruction}
Paragraph:
{paragraph}'''

def support_input_prompt(paragraph, answer):
    return f'''Paragraph:
{paragraph}
Answer:
{answer}'''

In [116]:
def completeness_output_prompt(label, reason):
    return f'''Reason:
{reason}
Completeness:
{label}'''

def relevancy_output_prompt(label, reason):
    return f'''Reason:
{reason}
Relevancy:
{label}'''

def support_output_prompt(label, reason):
    return f'''Reason:
{reason}
Is Supported:
{label}'''

In [119]:
new_dataset = []

for sample in reduced_samples:
    for data in sample['completeness']:
        data['text'] = completeness_text_prompt(data['instruction'], sample['instruction'], data['answer'], data['label'], data['reason'])
        new_dataset.append({
            "instruction": data['instruction'],
            "inputs": completeness_input_prompt(sample['instruction'], data['answer']),
            "output": completeness_output_prompt(data['reason'], data['label']),
            "text": data['text']
        })

    for data in sample['relevancy']:
        data['text'] = relevancy_text_prompt(data['instruction'], sample['instruction'], data['paragraph'], data['label'], data['reason'])
        new_dataset.append({
            "instruction": data['instruction'],
            "inputs": relevancy_input_prompt(sample['instruction'], data['paragraph']),
            "output": relevancy_output_prompt(data['reason'], data['label']),
            "text": data['text']
        })
    
    for data in sample['support']:
        data['text'] = support_text_prompt(data['instruction'], data['paragraph'], data['answer'], data['label'], data['reason'])
        new_dataset.append({
            "instruction": data['instruction'],
            "inputs": support_input_prompt(data['paragraph'], data['answer']),
            "output": support_output_prompt(data['reason'], data['label']),
            "text": data['text']
        })
        
print(new_dataset[0])

{'instruction': 'You will be provided an Instruciton, an Answer, and an evaluation on the Completeness of the answer. You will explain the reason why the Answer was given the provided Completeness evaluation.', 'inputs': "Instruction:\nWhy do many people believe that ivermectin can prevent or treat Covid-19?\nAnswer:\nThe belief that ivermectin can prevent or treat Covid-19 is based on a combination of anecdotal reports, observational studies, and laboratory experiments, but the evidence supporting its use is limited and inconclusive.\n\nIvermectin is an anti-parasitic drug that has been used for decades to treat conditions such as river blindness, scabies, and lice infestations.In recent years, there have been some studies suggesting that it may also have antiviral properties, but most of these studies have been conducted in vitro or in animal models, and the evidence is not strong enough to support its use as a treatment for Covid-19 in humans.\n\nHowever, most health authorities, in

In [120]:
# Save Dataset
with open('rag_reasoning_dataset.json', 'w') as f:
    json.dump(new_dataset, f)