In [None]:
# Install the notebook-level dependencies in one resolver pass.
# `%pip` (rather than `!pip`) installs into the environment of the kernel that
# runs this notebook; `uv` is needed by the PIZZA setup cell further down.
# NOTE(review): versions are unpinned — pin them once a known-good set is identified.
%pip install -q uv python-dotenv openai datasets

In [None]:
# Fetch the PIZZA repository into ./PIZZA (presumably the home of the
# `attribution` package imported further down — confirm).
# NOTE(review): re-running this cell after a successful clone fails because the
# target directory already exists.
!git clone https://github.com/leap-laboratories/PIZZA.git

Cloning into 'PIZZA'...
remote: Enumerating objects: 1031, done.[K
remote: Counting objects: 100% (556/556), done.[K
remote: Compressing objects: 100% (260/260), done.[K
remote: Total 1031 (delta 388), reused 417 (delta 295), pack-reused 475 (from 1)[K
Receiving objects: 100% (1031/1031), 3.91 MiB | 11.77 MiB/s, done.
Resolving deltas: 100% (688/688), done.


In [None]:
# Enter the cloned repository so that requirements.txt resolves relative to it.
# NOTE(review): the path is relative, so running this cell twice walks into a
# nested directory — the recorded output shows /content/PIZZA/PIZZA.
%cd PIZZA
# Create a uv-managed virtual environment in .venv.
!uv venv
# NOTE(review): each `!` command runs in its own short-lived shell, so this
# activation does not carry over to the next line or to the notebook kernel.
!source .venv/bin/activate
# Install the repository's requirements with uv.
# NOTE(review): presumably these land in .venv rather than in the kernel's
# environment — confirm the kernel actually sees them.
!uv pip install -r requirements.txt

/content/PIZZA/PIZZA
Using Python 3.10.12 interpreter at: [36m/usr/bin/python3[39m
Creating virtualenv at: [36m.venv[39m
Activate with: [32msource .venv/bin/activate[39m
[2K[2mResolved [1m68 packages[0m [2min 684ms[0m[0m
[2K[2mInstalled [1m68 packages[0m [2min 454ms[0m[0m
 [32m+[39m [1mannotated-types[0m[2m==0.7.0[0m
 [32m+[39m [1manyio[0m[2m==4.4.0[0m
 [32m+[39m [1mcertifi[0m[2m==2024.2.2[0m
 [32m+[39m [1mcharset-normalizer[0m[2m==3.3.2[0m
 [32m+[39m [1mcontourpy[0m[2m==1.2.1[0m
 [32m+[39m [1mcycler[0m[2m==0.12.1[0m
 [32m+[39m [1mdistro[0m[2m==1.9.0[0m
 [32m+[39m [1mexceptiongroup[0m[2m==1.2.1[0m
 [32m+[39m [1mfilelock[0m[2m==3.14.0[0m
 [32m+[39m [1mfonttools[0m[2m==4.52.4[0m
 [32m+[39m [1mfsspec[0m[2m==2024.5.0[0m
 [32m+[39m [1mh11[0m[2m==0.14.0[0m
 [32m+[39m [1mhttpcore[0m[2m==1.0.5[0m
 [32m+[39m [1mhttpx[0m[2m==0.27.0[0m
 [32m+[39m [1mhuggingface-hub[0m[2m==0.23.2[0m
 [32m

In [None]:
# Write the OpenAI key to a .env file, then load that file into the process
# environment through the dotenv IPython extension.
# NOTE(review): "your_api_key" is a placeholder — replace it before running and
# never commit a notebook that contains a real key. The `>` redirect overwrites
# any existing .env each time this cell runs.
!echo "OPENAI_API_KEY=your_api_key" > .env
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [None]:
# Mount Google Drive at /content/drive (only available inside Google Colab).
# NOTE(review): presumably the dataset and results folders configured later live
# on Drive — confirm, otherwise this mount is unnecessary.
from google.colab import drive
drive.mount('/content/drive')

import os
import re
import time
import json
import pandas as pd
import numpy as np
import asyncio

import openai
from openai import OpenAI

from asyncio import Semaphore
from tqdm.asyncio import tqdm

import warnings
# Ignore every warning raised for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and data-quality
# warnings — consider restricting it to the specific categories that are noisy.
warnings.simplefilter("ignore")

from attribution.api_attribution import OpenAIAttributor
from attribution.experiment_logger import ExperimentLogger
from attribution.token_perturbation import (
    FixedPerturbationStrategy,
    NthNearestPerturbationStrategy,
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# --- Locations (placeholders: point these at the real dataset and output folder) ---
results_dir = '/results_dir'
dataset_fn = '<fn>.csv'
dataset_dir = '/path_to_defan_dataset'

# Load the question/answer table that the rest of the notebook iterates over.
df = pd.read_csv(os.path.join(dataset_dir, dataset_fn))

# Make sure the output folder exists before any per-row JSON file is written.
os.makedirs(results_dir, exist_ok=True)

In [None]:
# Shared attributor instance; get_total_attribution calls its
# hierarchical_perturbation method for every prompt.
# Model names noted by the author as options: gpt-3.5-turbo, gpt-4o-mini.
attributor = OpenAIAttributor(openai_model='gpt-4o-mini') # gpt-3.5-turbo, gpt-4o-mini

In [None]:
# Judge prompt used by get_hallucination_verdict to grade a single model answer.
# Two placeholders are filled through str.format: {Model_answer} and {Correct_answer}.
# The judge is instructed to end its reply with a line "VERDICT: <label>", which the
# caller extracts with a regular expression. The trailing .strip() removes the
# leading and trailing newlines introduced by the triple-quoted layout.
PROMPT_TEMPLATE_VERDICT = """
You are presented with a MODEL_ANSWER and the CORRECT_ANSWER to a question. Evaluate the MODEL_ANSWER using the following steps:

Compare the MODEL_ANSWER to the CORRECT_ANSWER for accuracy and correctness.

Determine if the MODEL_ANSWER falls into one of these categories:
a) Correct: The answer matches the CORRECT_ANSWER in all key aspects, and is accurate and complete. If it is correct, the last line of your RESPONSE should be VERDICT: Correct.
b) Refusal: The answer indicates an inability to provide an answer or acknowledges a lack of response, is a refusal, doubt, or expresses uncertainty. If it is a refusal, the last line of your RESPONSE should be VERDICT: Refusal.
c) Hallucination: The answer states false information that contradicts the CORRECT_ANSWER. If it is a hallucination, the last line of your RESPONSE should be VERDICT: Hallucination.

The last line of your RESPONSE must be VERDICT: $VERDICT (without quotes) where $VERDICT is Correct, Refusal, or Hallucination.

MODEL_ANSWER: {Model_answer}

CORRECT_ANSWER: {Correct_answer}

RESPONSE:
""".strip()

In [None]:
def split_array(arr):
    """Split attribution cells shaped like "<token>\\n<score>" into two parallel lists.

    Args:
        arr: Iterable of cells; each one is converted with ``str()`` first.

    Returns:
        ``(tokens, att_values)``: the stripped token strings and their float
        scores. A cell without any newline yields its stripped text and ``None``.

    Raises:
        ValueError: If a cell contains a newline but no parseable score.
    """
    tokens = []
    att_values = []

    for item in arr:
        text = str(item)
        if '\n' not in text:
            tokens.append(text.strip())
            att_values.append(None)
            continue

        parts = text.split('\n')
        try:
            # Usual layout: token on the first line, score on the second.
            token, value = parts[0], float(parts[1])
        except ValueError:
            # The token itself contains a newline (e.g. a line-break token), so
            # the score is the final line and everything before it is the token.
            token, value = '\n'.join(parts[:-1]), float(parts[-1])

        tokens.append(token.strip())
        att_values.append(value)

    return tokens, att_values

async def get_total_attribution(prompt):
    """Run one hierarchical-perturbation attribution pass over ``prompt``.

    Uses the module-level ``attributor`` with the "cosine" and "prob_diff"
    strategies, replacing perturbed tokens with the empty string at depth 1,
    and reads the results back from a fresh ExperimentLogger.

    Returns:
        ``(cosine_vals, prob_diff_vals, tokens, output, mat_full)``. The two
        value lists and the tokens come from rows 0 and 1 of the total
        attribution table with the first four columns skipped (row order is
        assumed to follow the strategy order — confirm against the library).
        ``mat_full`` is ``None`` when the matrices cannot be built. Any other
        failure is printed and reported as five ``None`` values.
    """
    run_log = ExperimentLogger()
    try:
        await attributor.hierarchical_perturbation(
            prompt,
            logger=run_log,
            attribution_strategies=["cosine", "prob_diff"],
            perturbation_strategy=FixedPerturbationStrategy(replacement_token=""),
            max_depth=1,
        )

        # Most recent experiment = the one that was just logged.
        model_output = run_log.df_experiments['original_output'].iloc[-1]
        totals = run_log.print_total_attribution(exp_id=-1)

        # The matrices are optional: a ValueError only costs us this one field.
        matrices = None
        try:
            matrices = run_log.get_attribution_matrices(exp_id=-1)
        except ValueError as e:
            print(f"Error getting attribution matrices: {str(e)}")

        score_rows = totals.values
        tokens, cosine_vals = split_array(score_rows[0][4:])
        _, prob_diff_vals = split_array(score_rows[1][4:])
    except Exception as e:
        print(f"Error in get_total_attribution: {str(e)}")
        return None, None, None, None, None

    return cosine_vals, prob_diff_vals, tokens, model_output, matrices

async def get_hallucination_verdict(output, answer):
    """Ask a GPT-4o judge whether ``output`` is Correct, a Refusal, or a Hallucination.

    The judge prompt is built from PROMPT_TEMPLATE_VERDICT and sent through the
    module-level OpenAI ``client``. The blocking SDK call runs in a worker
    thread (``asyncio.to_thread``) so the event loop is not blocked.

    Args:
        output: Model answer to grade (inserted into the prompt via ``str.format``).
        answer: Reference answer for the same question.

    Returns:
        ``(response_text, verdict)`` when a ``VERDICT:`` line is found,
        ``(response_text, None)`` when the reply contains no such line, and
        ``(None, None)`` once every attempt has raised.
    """
    system_message = "You are a helpful assistant."
    model = "gpt-4o"
    temperature = 0.7
    top_p = 1
    max_tokens = 800
    max_retries = 5  # retries after the first attempt, i.e. six calls at most
    ANSWER_PATTERN = r"(?i)VERDICT\s*:\s*(.*?)(?:\n|$)"

    message_list = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": PROMPT_TEMPLATE_VERDICT.format(Model_answer=output, Correct_answer=answer)},
    ]

    trial = 0
    while True:
        try:
            response = await asyncio.to_thread(
                client.chat.completions.create,
                model=model,
                messages=message_list,
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_tokens,
            )
            res = response.choices[0].message.content

            # The prompt requires the verdict on the LAST line, so when the judge
            # also writes "VERDICT:" earlier in its reasoning, the final
            # occurrence is the authoritative one.
            verdicts = re.findall(ANSWER_PATTERN, res)
            if verdicts:
                return res, verdicts[-1].strip()

            print('Formatting is not working or a NEW_PROMPT has not been created.')
            return res, None
        except Exception as e:
            if trial >= max_retries:
                # Out of retries: give up right away instead of sleeping once more.
                print("Max retries reached. Skipping this request.", e)
                return None, None
            exception_backoff = 2**trial  # exponential back off
            print(f"Exception occurred, will retry {trial} after {exception_backoff} seconds.", e)
            await asyncio.sleep(exception_backoff)
            trial += 1

async def process_row(idx, question, answer, max_retries=3):
    """Attribute, grade and persist a single dataset row.

    Runs get_total_attribution on ``question``, asks get_hallucination_verdict
    to grade the model output against ``answer``, prints a summary and writes
    ``new_result_{idx}.json`` into ``results_dir``. When every attempt fails, a
    ``new_result_{idx}_error.json`` file holding the last error message is
    written instead.

    Args:
        idx: Row identifier, used in log lines and in the output file name.
        question: Prompt sent to the attributed model.
        answer: Reference answer used by the judge.
        max_retries: Number of attempts before the error file is written.

    Returns:
        None. Results are reported through files and printed output.
    """
    for attempt in range(max_retries):
        try:
            cosine_vals, prob_diff_vals, tokens, output, mats_full = await get_total_attribution(question)
            if output is None:
                # get_total_attribution reports failure by returning None values
                # instead of raising. Raise here so the retry loop applies and
                # the judge is never asked to grade a missing answer.
                raise RuntimeError("attribution failed: no model output was produced")

            _, verdict = await get_hallucination_verdict(output, answer)

            if mats_full is not None:
                # One JSON-friendly record list per attribution matrix.
                mats_json_data = [
                    {
                        "matrix_id": i,
                        "data": matrix_df.reset_index().to_dict(orient="records"),
                    }
                    for i, matrix_df in enumerate(mats_full)
                ]
            else:
                mats_json_data = None

            print(f">>>>Row {idx}<<<<")
            print(f"Input: {question}")
            print(f"Output: {output}")
            print(f"Correct Answer: {answer}")
            print(f"Verdict: {verdict}")
            print(f"Tokens: {tokens}")
            print(f"Cosine values: {cosine_vals}")
            print(f"ProbDiff values: {prob_diff_vals}")
            print(f"Mat full: {mats_json_data}")

            result = {
                "row": idx,
                "input": question,
                "output": output,
                "correct_answer": answer,
                "verdict": verdict,
                "tokens": tokens,
                "cosine_values": cosine_vals,
                "prob_diff_values": prob_diff_vals,
                "attributions_full": mats_json_data
            }

            with open(os.path.join(results_dir, f"new_result_{idx}.json"), 'w') as f:
                json.dump(result, f, indent=2)

            # Same identifier as the file name and the ">>>>Row" header above.
            print(f"Saved result for row {idx} to JSON file")
            return

        except Exception as e:
            print(f"Error processing row {idx}, attempt {attempt + 1}: {str(e)}")
            if attempt == max_retries - 1:
                print(f"Max retries reached for row {idx}. Saving partial data.")
                # Keep a record of the failure; "output" carries the error text.
                result = {
                    "row": idx,
                    "input": question,
                    "output": str(e),
                    "correct_answer": answer,
                    "verdict": None,
                    "tokens": None,
                    "cosine_values": None,
                    "prob_diff_values": None,
                    "attributions_full": None
                }
                with open(os.path.join(results_dir, f"new_result_{idx}_error.json"), 'w') as f:
                    json.dump(result, f, indent=2)
            else:
                # Brief pause before the next attempt.
                await asyncio.sleep(1)

async def process_rows(df_array, max_concurrent=50):
    """Run process_row over every row with bounded concurrency.

    Args:
        df_array: Iterable of rows whose first three fields are
            ``(index, question, answer)``; any further fields are ignored.
        max_concurrent: Maximum number of rows being processed at the same time.

    Returns:
        None. Exceptions escaping a row are printed and do not stop the others.
    """
    semaphore = Semaphore(max_concurrent)

    async def worker(idx, question, answer):
        # The semaphore caps how many rows are in flight at once.
        async with semaphore:
            await process_row(idx, question, answer)

    # `*_` accepts any number of trailing columns, so datasets with extra (or no)
    # metadata columns after the answer work as well.
    tasks = [
        asyncio.create_task(worker(idx, question, answer))
        for idx, question, answer, *_ in df_array
    ]

    for finished in tqdm.as_completed(tasks, total=len(tasks), desc="Processing rows"):
        try:
            await finished
        except Exception as e:
            print(f"Unhandled error in task: {str(e)}")

In [None]:
# Rows as a plain array: reset_index() prepends the row index as the first field,
# which process_rows unpacks as `idx`.
df_array = df.reset_index().values
# OpenAI client used by get_hallucination_verdict; the key is read from the
# OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [None]:
# Peek at the first row to confirm the field order that process_rows unpacks:
# (index, question, answer, one further column).
df_array[0]

array([0,
       ' Which team won the 1930 FIFA World Cup?(Give me the name only)',
       'Uruguay', 'name'], dtype=object)

In [None]:
# Process the first 1000 rows and report the elapsed wall-clock time.
# Relies on the notebook's support for top-level `await`.
# NOTE(review): 1000 is a hard-coded cap — rows beyond it are silently left out.
start_time = time.time()
await process_rows(df_array[:1000])
end_time = time.time()

print(f"\nTotal processing time: {end_time - start_time:.2f} seconds")