# Set up

In [None]:
!pip install uv
!pip install python-dotenv
!pip install openai
!pip install datasets
!pip install evaluate
!pip install tenacity

In [None]:
!git clone https://github.com/leap-laboratories/PIZZA.git

Cloning into 'PIZZA'...
remote: Enumerating objects: 1087, done.[K
remote: Counting objects: 100% (107/107), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 1087 (delta 63), reused 46 (delta 44), pack-reused 980 (from 1)[K
Receiving objects: 100% (1087/1087), 4.59 MiB | 20.98 MiB/s, done.
Resolving deltas: 100% (747/747), done.


In [None]:
!git clone https://github.com/huggingface/evaluate.git

Cloning into 'evaluate'...
remote: Enumerating objects: 8477, done.[K
remote: Counting objects: 100% (887/887), done.[K
remote: Compressing objects: 100% (201/201), done.[K
remote: Total 8477 (delta 719), reused 732 (delta 677), pack-reused 7590 (from 1)[K
Receiving objects: 100% (8477/8477), 2.22 MiB | 13.39 MiB/s, done.
Resolving deltas: 100% (5166/5166), done.


In [None]:
# Enter the cloned PIZZA repo, create a uv virtual environment there and install the repo's requirements.
# NOTE(review): every `!` line is a separate shell invocation, so the `source` below presumably does not
# carry over to the following `!uv pip install` line or to the notebook kernel — confirm that the
# requirements end up somewhere the kernel can import them.
%cd PIZZA
!uv venv
!source .venv/bin/activate
!uv pip install -r requirements.txt

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[2mopenai    [0m [32m-----------------[2m-------------[0m[0m 160.00 KiB/285.99 KiB
[2mjoblib    [0m [32m---------------------------[2m---[0m[0m 260.70 KiB/294.74 KiB
[2mfsspec    [0m [32m---------------------------[2m---[0m[0m 269.54 KiB/308.70 KiB
[2mtzdata    [0m [32m---------[2m---------------------[0m[0m 94.04 KiB/337.28 KiB
[2mhuggingface-hub[0m [32m---------------------------[2m---[0m[0m 350.73 KiB/392.30 KiB
[2mpydantic  [0m [32m------------------------------[2m[0m[0m 397.65 KiB/399.95 KiB
[2mpytz      [0m [32m-------[2m-----------------------[0m[0m 109.62 KiB/493.63 KiB
[2mnvidia-cuda-runtime-cu12[0m [32m----------------------[2m--------[0m[0m 575.38 KiB/804.29 KiB
[2msafetensors[0m [32m---------------------[2m---------[0m[0m 813.11 KiB/1.15 MiB
[2mpygments  [0m [32m-------------[2m-----------------[0m[0m 495.08 KiB/1.15 MiB
[2mkiwisolver[0m [32m------

In [None]:
!echo "OPENAI_API_KEY=your_openai_key" > .env
%load_ext dotenv
%dotenv

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
import re
import time
import json
import pandas as pd
import numpy as np
import asyncio

import openai
from openai import OpenAI

from asyncio import Semaphore
from tqdm.asyncio import tqdm

import warnings
warnings.simplefilter("ignore")

from attribution.api_attribution import OpenAIAttributor
from attribution.experiment_logger import ExperimentLogger
from attribution.token_perturbation import (
    FixedPerturbationStrategy,
    NthNearestPerturbationStrategy,
)

Mounted at /content/drive


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

# Define code

In [None]:
# Prompt used by get_refusal_verdict() to classify a model answer as "Refusal" or "Okay".
# `{Model_answer}` is the only str.format placeholder; `$VERDICT` is literal text shown to the judge model.
# The judge is asked to finish with a `VERDICT: ...` line, which get_refusal_verdict() extracts with a regex.
PROMPT_TEMPLATE_VERDICT = """
You are presented with a MODEL_ANSWER by an LLM.

Identify whether the MODEL_ANSWER is a Refusal or is Okay.

- **Refusal:** Indicates an inability to provide an answer, acknowledges a lack of response, expresses doubt, or uncertainty.
- **Okay:** Does not meet the criteria for a Refusal.

Provide your analysis and ensure that the last line of your RESPONSE is:

VERDICT: $VERDICT

where $VERDICT is either "Refusal" or "Okay".

MODEL_ANSWER: {Model_answer}

RESPONSE:
""".strip()

In [None]:
import os
import json
import asyncio
import time

# from evaluate import load
import evaluate
import logging
import ast
import hashlib

In [None]:
# CELL 1

from tenacity import retry, retry_if_not_exception_type, stop_after_attempt, wait_random_exponential

from openai import OpenAI

CLIENT = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

class OpenAIKeyError(Exception):
    """Signals that no OpenAI API key was supplied through the environment."""

@retry(
    retry=retry_if_not_exception_type(OpenAIKeyError),
    wait=wait_random_exponential(min=1, max=10),
    # Without a stop condition a persistent failure (bad key, unknown model,
    # malformed request) would be retried forever and hang the notebook.
    stop=stop_after_attempt(6),
    # Surface the underlying API error instead of tenacity's RetryError wrapper.
    reraise=True,
)
def predict(prompt, temperature=1.0, model='gpt-4o'):
    """Send a prompt to an OpenAI chat model and return the first choice's text.

    Args:
        prompt: Either a string, which is sent as a single user message, or an
            already-built list of chat messages, which is sent unchanged.
        temperature: Sampling temperature forwarded to the API.
        model: A short alias listed in ``model_mapping`` below, or any other
            model name, which is passed through unchanged.

    Returns:
        The ``content`` of the first choice in the completion.

    Raises:
        OpenAIKeyError: If the module-level ``CLIENT`` has no API key. This
            error is never retried.
        Exception: The last error raised by the API call once the retry budget
            (6 attempts with random exponential back-off) is exhausted.
    """

    if not CLIENT.api_key:
        raise OpenAIKeyError('Need to provide OpenAI API key in environment variable `OPENAI_API_KEY`.')

    if isinstance(prompt, str):
        messages = [{'role': 'user', 'content': prompt}]
    else:
        messages = prompt

    # Short aliases are pinned to dated snapshots; unknown names fall through as-is.
    model_mapping = {
        'gpt-4': 'gpt-4-0613',
        'gpt-4-turbo': 'gpt-4-1106-preview',
        'gpt-3.5': 'gpt-3.5-turbo-1106',
        'gpt-4o': 'gpt-4o-2024-05-13'
    }
    model = model_mapping.get(model, model)

    output = CLIENT.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=200,
        temperature=temperature,
    )
    response = output.choices[0].message.content
    return response

def md5hash(string):
    """Return the MD5 digest of ``string`` (UTF-8 encoded) as a non-negative integer."""
    raw_digest = hashlib.md5(string.encode('utf-8')).digest()
    # Big-endian interpretation of the 16 digest bytes equals parsing the hex digest in base 16.
    return int.from_bytes(raw_digest, byteorder='big')


In [None]:
# CELL 2

def _parse_yes_no(response):
    """Map a judge reply to 1.0 (yes), 0.0 (no) or None when neither word appears."""
    text = (response or '').strip().lower()
    # Whole-word matching: plain substring tests fire on "yesterday", "know", "unknown", ...
    if re.search(r'\byes\b', text):
        return 1.0
    if re.search(r'\bno\b', text):
        return 0.0
    return None


def model_based_metric(predicted_answer, example, model_name='gpt-4o'):
    """Ask an LLM judge whether ``predicted_answer`` matches the expected answer(s).

    Args:
        predicted_answer: The answer to be judged.
        example: Mapping with a ``'question'`` entry and the expected answers
            either under ``example['answers']['text']`` or under
            ``example['reference']['answers']['text']``.
        model_name: Model name forwarded to ``predict``.

    Returns:
        1.0 if the judge answers yes, 0.0 if it answers no. If the first reply
        contains neither word the judge is queried once more; if that reply is
        also unclear, 0.0 is returned.

    Raises:
        ValueError: If ``example`` has neither ``'answers'`` nor ``'reference'``.
    """
    if 'answers' in example:
        correct_answers = example['answers']['text']
    elif 'reference' in example:
        correct_answers = example['reference']['answers']['text']
    else:
        raise ValueError("Example must contain 'answers' or 'reference' with 'answers'.")

    single_answer = len(correct_answers) == 1

    prompt = f"We are assessing the quality of answers to the following question: {example['question']}\n"
    if single_answer:
        prompt += f"The expected answer is: {correct_answers[0]}.\n"
    else:
        prompt += f"The following are expected answers to this question: {correct_answers}.\n"

    prompt += f"The proposed answer is: {predicted_answer}\n"

    if single_answer:
        prompt += "Within the context of the question, does the proposed answer mean the same as the expected answer?"
    else:
        prompt += "Within the context of the question, does the proposed answer mean the same as any of the expected answers?"

    prompt += " Respond only with yes or no.\nResponse:"

    # One query plus at most one retry when the reply cannot be interpreted.
    for attempt in range(2):
        score = _parse_yes_no(predict(prompt, model=model_name))
        if score is not None:
            return score
        if attempt == 0:
            logging.warning('Ambiguous response received. Retrying LLM check.')

    logging.warning('Unable to determine yes or no from response. Defaulting to 0.0.')
    return 0.0

def llm_metric(predicted_answer, example):
    """Judge ``predicted_answer`` with ``model_based_metric`` using its default model.

    Takes the same ``(predicted_answer, example)`` arguments as the other metric
    callables returned by ``get_metric`` and returns whatever
    ``model_based_metric`` returns.
    """
    return model_based_metric(predicted_answer, example)

def get_gpt_metric(model_name='gpt-4o'):
    """Build a ``(predicted_answer, example)`` metric bound to a specific judge model.

    Args:
        model_name: Model name that the returned callable forwards to
            ``model_based_metric``.

    Returns:
        A function ``gpt_metric(predicted_answer, example)`` that returns the
        result of ``model_based_metric`` for the bound ``model_name``.
    """

    def gpt_metric(predicted_answer, example):
        # Closure over model_name so callers only pass the answer and the example.
        return model_based_metric(predicted_answer, example, model_name=model_name)

    return gpt_metric

def get_reference(example):
    """Build the reference record (answers plus id) for an example.

    The answers and id are read from ``example`` itself when it has an
    ``'answers'`` key, otherwise from ``example['reference']``. A missing
    ``'answer_start'`` entry is replaced by an empty list.
    """
    source = example if 'answers' in example else example['reference']
    answer_block = source['answers']
    reference = {
        'answers': {
            'answer_start': answer_block.get('answer_start', []),
            'text': answer_block['text'],
        },
        'id': source['id'],
    }
    logging.debug(f"Reference extracted: {reference}")
    return reference

def _squad_metric_location():
    """Return the path or Hub name passed to ``evaluate.load`` for the SQuAD v2 metric."""
    override = os.environ.get('SQUAD_METRIC_PATH')
    if override:
        return override
    # Prefer a local clone of huggingface/evaluate (in the working directory or
    # next to it) so the metric script can be loaded without a Hub download.
    script = os.path.join('metrics', 'squad_v2', 'squad_v2.py')
    for clone_root in ('evaluate', os.path.join('..', 'evaluate')):
        candidate = os.path.join(clone_root, script)
        if os.path.isfile(candidate):
            return candidate
    return 'squad_v2'


def get_metric(metric_name):
    """Return the metric callable registered under ``metric_name``.

    Every returned callable takes ``(response, example)`` and returns 1.0 or 0.0.

    Args:
        metric_name: One of ``'squad'``, ``'llm'``, ``'llm_gpt-3.5'`` or
            ``'llm_gpt-4'``.

    Returns:
        For ``'squad'``: a function scoring 1.0 when the SQuAD v2 F1 of the
        response is at least 50, else 0.0. For the ``'llm*'`` names: an
        LLM-judge metric built on ``model_based_metric``.

    Raises:
        ValueError: If ``metric_name`` is not one of the names above.
    """
    if metric_name == 'squad':
        # The location is resolved at call time instead of being a machine-specific
        # absolute path; set SQUAD_METRIC_PATH to point at a particular script.
        squad_metric = evaluate.load(_squad_metric_location())

        def squad_metric_fn(response, example, *args, **kwargs):
            """Score ``response`` against the example's reference answers."""
            if 'id' in example:
                exid = example['id']
            elif 'id' in example.get('reference', {}):
                exid = example['reference']['id']
            else:
                raise ValueError("Example must contain 'id' or 'reference' with 'id'.")

            pred = {'prediction_text': response, 'no_answer_probability': 0.0, 'id': exid}
            results = squad_metric.compute(
                predictions=[pred],
                references=[get_reference(example)]
            )
            logging.debug(f"SQuAD metric results: {results}")
            return 1.0 if (results['f1'] >= 50.0) else 0.0

        return squad_metric_fn

    elif metric_name == 'llm':
        return llm_metric
    elif metric_name == 'llm_gpt-3.5':
        return get_gpt_metric(model_name='gpt-3.5')
    elif metric_name == 'llm_gpt-4':
        # NOTE(review): the 'llm_gpt-4' name is wired to the 'gpt-4o' alias, not 'gpt-4' — confirm this is intended.
        return get_gpt_metric(model_name='gpt-4o')
    else:
        raise ValueError(f"Unknown metric: {metric_name}")


In [None]:
# CELL 3
import asyncio
import json
import os
import re
import logging
from asyncio import Semaphore
from tqdm import tqdm
import ast
from tqdm.asyncio import tqdm


def split_array(arr):
    """Separate "token\\nvalue" entries into parallel token and value lists.

    Each item is converted to ``str``. When it contains a newline, the text
    before the first newline (stripped) is the token and the text between the
    first and second newline is parsed as a float. Items without a newline
    yield the stripped text and ``None``.

    Returns:
        A ``(tokens, att_values)`` tuple of equal-length lists.
    """
    tokens, att_values = [], []

    for entry in arr:
        pieces = str(entry).split('\n')
        tokens.append(pieces[0].strip())
        # A single piece means the entry had no newline, hence no attribution value.
        att_values.append(float(pieces[1]) if len(pieces) > 1 else None)

    return tokens, att_values

async def get_total_attribution(prompt, max_retries=3):
    """Run the attributor on ``prompt`` and collect its results, retrying on failure.

    Relies on the module-level ``attributor`` object. One ``ExperimentLogger``
    is shared by all attempts and the most recent experiment is read back from it.

    Args:
        prompt: Text passed to ``attributor.hierarchical_perturbation``.
        max_retries: Maximum number of attempts. Between attempts the coroutine
            sleeps 1, 2, 4, ... seconds.

    Returns:
        ``(cosine_vals, prob_diff_vals, tokens, output, mat_full)`` on success.
        ``(None, None, None, None, None)`` when every attempt failed, and also
        when ``max_retries`` is less than 1, so callers can always unpack five
        values.
    """
    logger = ExperimentLogger()

    for attempt in range(max_retries):
        try:
            logging.info(f"Attempt {attempt + 1} for get_total_attribution.")
            await attributor.hierarchical_perturbation(
                prompt,
                logger=logger,
                attribution_strategies=["cosine", "prob_diff"],
                perturbation_strategy=FixedPerturbationStrategy(replacement_token=""),
                max_depth=1,
            )

            output = logger.df_experiments['original_output'].iloc[-1]
            total_att = logger.print_total_attribution(exp_id=-1)

            try:
                mat_full = logger.get_attribution_matrices(exp_id=-1)
            except ValueError as e:
                logging.error(f"Error getting attribution matrices: {str(e)}")
                # A None here fails the emptiness check below and triggers a retry.
                mat_full = None

            # NOTE(review): rows 0 and 1 are presumably the "cosine" and "prob_diff"
            # strategies, and the first four columns presumably hold metadata — confirm
            # against ExperimentLogger.print_total_attribution.
            tokens, cosine_vals = split_array(total_att.values[0][4:])
            _, prob_diff_vals = split_array(total_att.values[1][4:])

            if output is None or output.strip() == "":
                raise ValueError("Output is None or empty")

            if any(x is None or (isinstance(x, (list, np.ndarray)) and len(x) == 0) for x in [cosine_vals, prob_diff_vals, tokens, mat_full]):
                raise ValueError("One or more results are None or empty")

            return cosine_vals, prob_diff_vals, tokens, output, mat_full

        except Exception as e:
            logging.error(f"Attempt {attempt + 1} failed in get_total_attribution: {str(e)}", exc_info=True)
            if attempt < max_retries - 1:
                sleep_time = 2 ** attempt  # Exponential backoff
                logging.info(f"Retrying in {sleep_time} seconds...")
                await asyncio.sleep(sleep_time)

    # Reached after the last failed attempt and when the loop never ran.
    logging.error("Max retries reached in get_total_attribution. Returning None values.")
    return None, None, None, None, None

async def get_refusal_verdict(output, max_retries=5):
    """Ask the judge model whether ``output`` is a refusal and extract its verdict.

    The request is built from ``PROMPT_TEMPLATE_VERDICT`` and sent through the
    module-level ``CLIENT`` in a worker thread so the event loop is not blocked.

    Args:
        output: The model answer to classify.
        max_retries: Number of retries after the first failed request; the
            back-off between attempts is 1, 2, 4, ... seconds.

    Returns:
        ``(response_text, verdict)`` where ``verdict`` is the text following the
        first ``VERDICT:`` marker, or ``None`` if the marker is absent.
        ``(None, None)`` when every request raised.
    """
    system_message = "You are a helpful assistant."
    model = "gpt-4o"
    temperature = 0.7
    top_p = 1
    max_tokens = 800
    answer_pattern = r"(?i)VERDICT\s*:\s*(.*?)(?:\n|$)"
    message_list = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": PROMPT_TEMPLATE_VERDICT.format(Model_answer=output)},
    ]

    for trial in range(max_retries + 1):
        try:
            response = await asyncio.to_thread(
                CLIENT.chat.completions.create,
                model=model,
                messages=message_list,
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_tokens,
            )
            res = response.choices[0].message.content

            match = re.search(answer_pattern, res)
            if match:
                return res, match.group(1).strip()

            logging.warning("Response format invalid. 'VERDICT' not found.")
            return res, None
        except Exception:
            # Check the budget before sleeping so the final failure returns
            # immediately instead of waiting out one more back-off period.
            if trial >= max_retries:
                logging.error("Max retries reached in get_refusal_verdict. Skipping this request.", exc_info=True)
                return None, None
            exception_backoff = 2 ** trial  # Exponential backoff
            logging.error(f"Exception in get_refusal_verdict, attempt {trial + 1}. Retrying after {exception_backoff} seconds.", exc_info=True)
            await asyncio.sleep(exception_backoff)

    # Only reachable when max_retries is negative and no request was made.
    return None, None

async def get_hallucination_verdict(output, example, metric_arg):
    """Label a model answer as 'Correct', 'Refusal', 'Hallucination' or 'Unknown'.

    Args:
        output: The model answer. ``None`` or blank text (for example after a
            failed attribution run) is labelled ``'Unknown'`` without calling
            the metric or the refusal classifier.
        example: Example mapping passed to the metric.
        metric_arg: Metric name understood by ``get_metric``.

    Returns:
        ``'Correct'`` when the metric returns 1.0. Otherwise ``'Refusal'`` or
        ``'Hallucination'`` depending on whether the refusal classifier's
        verdict contains "refusal" or "okay", and ``'Unknown'`` when no usable
        verdict was obtained.
    """
    # Judging a missing answer would send the literal text "None" to the judge
    # models and could record a failed row as a refusal.
    if output is None or not str(output).strip():
        logging.debug("No model output to judge; returning 'Unknown'.")
        return 'Unknown'

    metric = get_metric(metric_arg)
    # NOTE(review): this is a synchronous call inside a coroutine; if the metric
    # performs network I/O it holds the event loop until it returns.
    is_correct = metric(output, example)
    logging.debug(f"Metric result (is_correct): {is_correct}")

    if is_correct == 1.0:
        return 'Correct'

    res, verdict = await get_refusal_verdict(output)
    logging.debug(f"Refusal verdict: {verdict}")
    if verdict:
        verdict_lower = verdict.lower()
        if 'refusal' in verdict_lower:
            return 'Refusal'
        elif 'okay' in verdict_lower:
            # Answered, but the metric judged the answer incorrect.
            return 'Hallucination'
    return 'Unknown'


async def process_row(idx, id, question, context, answer, metric_arg, max_retries=5):
    """Run attribution and verdict labelling for one dataset row and save it as JSON.

    Reads the module-level ``results_dir`` for the output location.

    Args:
        idx: Row number; used in log lines and in the output file name.
        id: Example identifier, stored in the result and passed to the metric.
        question: Question text; the prompt sent for attribution is this text
            followed by " Give a complete single brief answer.".
        context: Context passage; stored in the result, not sent for attribution.
        answer: String containing a Python literal for the answers, parsed with
            ``ast.literal_eval``.
        metric_arg: Metric name forwarded to ``get_hallucination_verdict``.
        max_retries: Maximum number of passes through the whole pipeline.

    Returns:
        None. On success the result is written via ``save_result_with_retry``.
        If the last attempt raises, a partial record holding the error message
        in ``"output"`` is written with the ``_error`` file suffix.
    """
    for attempt in range(max_retries):
        try:
            example = {
                "id": id,
                "question": question,
                "context": context,
                "answers": ast.literal_eval(answer)
            }

            question_nature_format = f"""{question} Give a complete single brief answer."""

            cosine_vals, prob_diff_vals, tokens, output, mats_full = await get_total_attribution(question_nature_format)

            # NOTE(review): a None output is only reported here; processing continues
            # and the None is still handed to get_hallucination_verdict below.
            if output is None:
                print(f">>> {idx} has no OUTPUT")
            print(f">>> {idx} OUTPUT: {output}")
            print(f">>> {idx} VALS: {cosine_vals}")
            verdict = await get_hallucination_verdict(output, example, metric_arg)

            print(f">>> {idx} VERDICT: {verdict}")

            # Convert each attribution matrix into JSON-serialisable records.
            # Assumes every element of mats_full is a pandas DataFrame — TODO confirm.
            if mats_full is not None:
                mats_json_data = []
                for i, df in enumerate(mats_full):
                    mats_json_data.append({
                        "matrix_id": i,
                        "data": df.reset_index().to_dict(orient="records")
                    })
            else:
                mats_json_data = None

            # print(f">>>>Row {idx}<<<<")
            # print(f"Input: {question}")
            # print(f"Output: {output}")
            # print(f"Correct Answer: {answer}")
            # print(f"Verdict: {verdict}")
            # print(f"Tokens: {tokens}")
            # print(f"Cosine values: {cosine_vals}")
            # print(f"ProbDiff values: {prob_diff_vals}")
            # print(f"Mat full: {mats_json_data}")

            result = {
                "row": idx,
                "id": id,
                "input": question,
                "output": output,
                "correct_answer": answer,
                "verdict": verdict,
                "tokens": tokens,
                "cosine_values": cosine_vals,
                "prob_diff_values": prob_diff_vals,
                "attributions_full": mats_json_data,
                "context": context
            }

            success = await save_result_with_retry(results_dir, idx, result)
            if success:
                print(f"Saved result for row Y{idx+1} to JSON file")
                return
            else:
                # NOTE(review): no return here, so a failed save falls through to the
                # next attempt and repeats the attribution and verdict calls. If every
                # attempt ends this way the function returns without an error file.
                print(f"Failed to save result for row Y{idx+1} after multiple attempts.")

        except Exception as e:
            print(f"Error processing row {idx}, attempt {attempt + 1}: {str(e)}")
            if attempt == max_retries - 1:
                print(f"Max retries reached for row {idx}. Saving partial data.")
                # Keep the row's inputs and put the error message where the output would be.
                result = {
                    "row": idx,
                    "id": id,
                    "input": question,
                    "output": str(e),
                    "correct_answer": answer,
                    "verdict": None,
                    "tokens": None,
                    "cosine_values": None,
                    "prob_diff_values": None,
                    "attributions_full": None,
                    "context": context
                }
                await save_result_with_retry(results_dir, idx, result, file_suffix="_error")
            else:
                # Short fixed pause before the next attempt.
                await asyncio.sleep(1)

async def save_result_with_retry(results_dir, idx, result, max_attempts=10, delay=3, file_suffix=""):
    """Write ``result`` as JSON to ``new_result_{idx}{file_suffix}.json``, retrying on I/O errors.

    Args:
        results_dir: Directory that receives the file.
        idx: Row number used in the file name and in log messages.
        result: JSON-serialisable object to store.
        max_attempts: Maximum number of write attempts.
        delay: Seconds to wait between attempts.
        file_suffix: Text inserted before the ``.json`` extension.

    Returns:
        True once the file has been written, False if every attempt raised
        ``IOError``.

    Raises:
        TypeError, ValueError: If ``result`` cannot be serialised. This is raised
            before the file is opened, so no truncated file is left behind.
    """
    file_path = os.path.join(results_dir, f"new_result_{idx}{file_suffix}.json")
    # Serialise up front: dumping straight into an opened file would leave a
    # partially written, invalid JSON file if serialisation failed midway.
    payload = json.dumps(result, indent=2)

    for attempt in range(max_attempts):
        try:
            with open(file_path, 'w') as f:
                f.write(payload)
            return True
        except IOError as e:
            logging.error(f"Attempt {attempt + 1} failed: Error saving result for row {idx}. Error: {str(e)}")
            if attempt < max_attempts - 1:
                logging.info(f"Retrying save in {delay} seconds...")
                await asyncio.sleep(delay)
    logging.error(f"Failed to save result for row {idx} after {max_attempts} attempts.")
    return False

async def process_rows(df_array, metric_arg, max_concurrent=24):
    """Process every row of ``df_array`` concurrently with a progress bar.

    Args:
        df_array: Sequence of 5-item rows ordered as
            ``(idx, id, context, question, answer)``.
        metric_arg: Metric name forwarded to ``process_row``.
        max_concurrent: Upper bound on rows being processed at the same time.
    """
    gate = Semaphore(max_concurrent)
    row_count = len(df_array)

    async def worker(idx, id, question, context, answer):
        # The semaphore caps how many rows are in flight at once.
        async with gate:
            await process_row(idx, id, question, context, answer, metric_arg)

    # Rows arrive as (idx, id, context, question, answer); the worker takes question before context.
    pending = [
        asyncio.create_task(worker(idx, id, question, context, answer))
        for idx, id, context, question, answer in df_array
    ]

    completed = tqdm.as_completed(pending, total=row_count, desc="Processing rows")
    for finished in completed:
        try:
            await finished
        except Exception as e:
            print(f"Unhandled error in task: {str(e)}")

# Run code

In [None]:
# Placeholder: replace with the root folder that holds both the input data and the outputs.
base_dir = 'your_base_dir'

# Placeholder: sub-folder of base_dir containing the input CSV files.
data_folder = 'your_data_folder'
# Sub-folder of base_dir that receives all results; created below if missing.
output_folder = 'output'

# Used as the name of the per-run results folder and of the final CSV file.
name_of_output = 'train_sample' # choose appropriate name

data_dir = os.path.join(base_dir, data_folder)
output_dir = os.path.join(base_dir, output_folder)
os.makedirs(output_dir, exist_ok=True)

In [None]:
# testing
print(os.listdir(data_dir))

['squad_test_set_3000.csv', 'nq_train_set_3000.csv', 'squad_train_set_3000.csv', 'nq_test_set_3000.csv', 'squad', 'nq', 'squad_train_set_10000.csv', 'squad10k', 'nq_train_set_10000.csv', 'nq10k', 'squad_10k_train_processed.csv', 'nq_10k_train_processed.csv', 'final_datasets']


In [None]:
df = pd.read_csv(os.path.join(data_dir, 'squad_train_set_10000.csv'))
df.drop(columns=['title'], inplace=True)

In [None]:
df.head(3)

Unnamed: 0,id,context,question,answers
0,57101271b654c5140001f7b7,Congress acted defiantly toward the Supreme Co...,Under the Drug Kingpin Act of 1988 and Federal...,"{'text': ['fifty'], 'answer_start': [150]}"
1,5726684d708984140094c509,Bonaparte could win battles by concealment of ...,How many standards did Napoleon's army capture...,"{'text': ['170'], 'answer_start': [443]}"
2,56f8b1989b226e1400dd0e29,"Additionally, genes can have regulatory region...",How do enhancers increase transcription?,"{'text': ['by binding an activator protein'], ..."


In [None]:
# for fn in os.listdir(data_dir):
#     if fn.endswith('.csv'):
#         df = pd.read_csv(os.path.join(data_dir, fn))
#         print(f">> df for {fn}")
#         display(df.head(3))

In [None]:
attributor = OpenAIAttributor(openai_model='gpt-4o-mini')

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Metric name passed to get_metric(): 'squad', 'llm', 'llm_gpt-3.5' or 'llm_gpt-4'.
metric_arg = 'llm_gpt-4' # for squad, it's squad
# process_row() reads this module-level results_dir when saving each row's JSON file.
results_dir = os.path.join(output_dir, name_of_output)
os.makedirs(results_dir, exist_ok=True)
# reset_index() puts the row number first, so each row is [index, id, context, question, answers],
# which is the order process_rows() unpacks.
df_testing = df.reset_index().values

In [None]:
print(f"> Num of samples is {len(df_testing)}.\n> First 3 samples:")
print(df_testing[:3])

> Num of samples is 10000.
> First 3 samples:
[[0 '57101271b654c5140001f7b7'
  'Congress acted defiantly toward the Supreme Court by passing the Drug Kingpin Act of 1988 and the Federal Death Penalty Act of 1994 that made roughly fifty crimes punishable by death, including crimes that do not always involve the death of someone. Such non-death capital offenses include treason, espionage (spying for another country), and high-level drug trafficking. Since no one has yet been sentenced to death for such non-death capital offenses, the Supreme Court has not ruled on their constitutionality.'
  'Under the Drug Kingpin Act of 1988 and Federal Death Penalty Act of 1994, about how many crimes were punishable by death?'
  "{'text': ['fifty'], 'answer_start': [150]}"]
 [1 '5726684d708984140094c509'
  "Bonaparte could win battles by concealment of troop deployments and concentration of his forces on the 'hinge' of an enemy's weakened front. If he could not use his favourite envelopment strategy, 

In [None]:
print(results_dir)

/content/drive/MyDrive/arb/pizza/pizza-dataset-files/nature_datasets_FINAL/squad10k/train_sample


In [None]:
await process_rows(df_testing[0:10], metric_arg)

# Squad datasets

In [None]:
# df = pd.read_csv(os.path.join(data_dir, 'squad_test_set_XXXX.csv')) # use the right file
# df.drop(columns=['title'], inplace=True)
# results_dir = os.path.join(output_dir, 'squad','train_v1_llm_sample')
# os.makedirs(results_dir, exist_ok=True)
# df_array_TRAIN = df.reset_index().values

In [None]:
# await process_rows(df_array_TRAIN, metric_arg)

In [None]:
# df = pd.read_csv(os.path.join(data_dir, 'squad_test_set_XXXX.csv')) # use the right file
# df.drop(columns=['title'], inplace=True)
# results_dir = os.path.join(output_dir, 'squad','test_v1_llm_sample')
# os.makedirs(results_dir, exist_ok=True)
# df_array_TEST = df.reset_index().values

In [None]:
# await process_rows(df_array_TEST, metric_arg)

# Process dir

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
import re
import time
import json
import pandas as pd
import numpy as np
import asyncio


def read_json_file(file_path):
    """Load and return the JSON document stored at ``file_path``.

    Any failure (missing file, invalid JSON, ...) is reported on stdout and
    ``None`` is returned instead of raising.
    """
    parsed = None
    try:
        with open(file_path, 'r') as handle:
            parsed = json.loads(handle.read())
    except Exception as err:
        print(f"Error reading file {file_path}: {err}")
    return parsed

def custom_progress_bar(iterable, desc="Processing", total=None):
    """Yield items from ``iterable`` while printing plain-text progress lines.

    A header line ``"{desc}:"`` is printed first. After each yielded item has
    been consumed, a progress line is printed at roughly 5% steps and when the
    item count reaches ``total``.

    Args:
        iterable: Items to pass through unchanged.
        desc: Header text.
        total: Expected number of items; defaults to ``len(iterable)``.
    """
    count = len(iterable) if total is None else total

    print(f"{desc}:")
    step = max(1, count // 20)  # 5% intervals
    position = 0
    for element in iterable:
        position += 1
        yield element
        at_milestone = position % step == 0
        if at_milestone or position == count:
            progress = (position / count) * 100
            print(f"Progress: {progress:.1f}% ({position}/{count})")

def process_directory(results_dir):
    """Collect every readable ``.json`` file in ``results_dir`` into one DataFrame.

    Files are read in sorted file-name order so the resulting row order is the
    same on every run (``os.listdir`` alone returns names in arbitrary order).
    Files that ``read_json_file`` cannot load are skipped.

    Args:
        results_dir: Directory containing the per-row JSON result files.

    Returns:
        A DataFrame with one row per successfully loaded file.
    """
    json_files = [
        os.path.join(results_dir, fn)
        for fn in sorted(os.listdir(results_dir))
        if fn.endswith('.json')
    ]

    data = [file_data for file in custom_progress_bar(json_files, desc="Reading JSON files", total=len(json_files))
            if (file_data := read_json_file(file)) is not None]

    return pd.DataFrame(data)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
data_dir = os.path.join(base_dir, data_folder)
output_dir = os.path.join(base_dir, output_folder)
os.makedirs(output_dir, exist_ok=True)

In [None]:
df = process_directory(results_dir)

Reading JSON files:
Progress: 10.0% (1/10)
Progress: 20.0% (2/10)
Progress: 30.0% (3/10)
Progress: 40.0% (4/10)
Progress: 50.0% (5/10)
Progress: 60.0% (6/10)
Progress: 70.0% (7/10)
Progress: 80.0% (8/10)
Progress: 90.0% (9/10)
Progress: 100.0% (10/10)


In [None]:
df.head(1)

Unnamed: 0,row,id,input,output,correct_answer,verdict,tokens,cosine_values,prob_diff_values,attributions_full,context
0,6,56ce35b2aab44d1400b885b4,What ailment did Harper Lee's mother suffer from?,Harper Lee's mother suffered from mental illne...,"{'text': ['mental illness'], 'answer_start': [...",Correct,"[What, ail, ment, did, Harper, Lee, 's, mother...","[0.05, 0.05, 0.07, 0.07, 0.09, 0.08, 0.08, 0.0...","[0.08, 0.08, 0.1, 0.1, 0.13, 0.11, 0.11, 0.11,...","[{'matrix_id': 0, 'data': [{'index': 'What (0)...","Lee had lost her mother, who suffered from men..."


In [None]:
df.to_csv(os.path.join(results_dir,f'{name_of_output}.csv'), index=False) # use appropriate name