In [1]:
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader, StorageContext, load_index_from_storage
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

from datasets import load_dataset
import evaluate

import dotenv
import os
import shutil
import tqdm
import time
import json

# Load variables from a local .env file (if one exists) into the process
# environment before anything below reads them.
dotenv.load_dotenv()
# NOTE(review): api_key is not referenced again in the visible cells; the OpenAI
# clients below presumably pick up their key from the environment - confirm.
api_key = os.environ.get("API_KEY")

# llama_index-wide defaults: Settings.llm is the model the question-answering and
# self-scoring functions below call; the embedding model is presumably the one
# used when indexes are built - confirm.
Settings.llm = OpenAI(
    temperature=0.2,
    model="gpt-4",
)
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
)

  from .autonotebook import tqdm as notebook_tqdm


## Indexing Functions


In [7]:
def create_index_from_text(text, title):
    """
    Create an index from a piece of context. If an index with the given title already exists, it will be returned.

    The text is written as UTF-8 to a scratch file inside a private temporary directory, loaded with
    SimpleDirectoryReader, indexed, tagged with ``title`` as its index id and persisted to the "storage"
    directory. The scratch directory is removed afterwards, even when indexing fails.

    Args:
        text (str): The context to index
        title (str): The title of the index for access later

    Returns:
        VectorStoreIndex: The index created from the context
    """
    import tempfile  # local import: only this function needs it

    index = get_index_by_title(title)
    if index is not None:
        return index

    # A private temporary directory replaces the fixed "tmp" folder so that
    #   * the reader only ever sees the file written here,
    #   * a pre-existing "tmp" folder is never deleted, and
    #   * cleanup also happens when indexing raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        scratch_file = os.path.join(scratch_dir, "tmp.txt")
        # Explicit UTF-8: the platform default encoding may be unable to encode
        # the text, or may not match what the reader decodes with.
        with open(scratch_file, "w", encoding="utf-8") as f:
            f.write(text)
        documents = SimpleDirectoryReader(scratch_dir).load_data()
        # NOTE(review): model_name is forwarded as an extra keyword argument; the
        # embedding model actually used presumably comes from Settings.embed_model - confirm.
        index = VectorStoreIndex.from_documents(documents, model_name="openai/text-embedding-3-small")
        index.set_index_id(title)
        # Same directory that get_index_by_title loads from.
        # NOTE(review): the new index carries its own storage context, so persisting it
        # here appears to replace whatever was saved in "storage" before - confirm
        # whether earlier indexes are expected to survive.
        index.storage_context.persist(persist_dir="storage")
    return index


def get_index_by_title(title):
    """
    Get an index by its title. If the index does not exist, returns None.

    Indexes are looked up in the "storage" directory, using the title as the index id.

    Args:
        title (str): The title of the index

    Returns:
        VectorStoreIndex: The index with the specified title, or None if it cannot be loaded
    """
    try:
        storage_context = StorageContext.from_defaults(persist_dir="storage")
        return load_index_from_storage(storage_context, index_id=title)
    except (ValueError, FileNotFoundError):
        # ValueError: presumably raised when no index with this id is stored.
        # FileNotFoundError: nothing has been persisted yet (the "storage"
        # directory or its files are missing), e.g. on the very first run.
        return None

## Question Answering Functions


In [8]:
def answer_reading_comprehension(question, context_title=None, context="", use_rag=False):
    """
    Answer a question about a context, either by retrieval over an index of the context or by plain prompting.

    With use_rag=True the context is indexed under context_title (an existing index with that title is reused)
    and the question is sent to that index's query engine. With use_rag=False the context, a newline and the
    question are sent to the LLM as one prompt.

    Args:
        question (str): The question to answer
        context_title (str): The title of the context index; required when use_rag is True
        context (str): The context to use
        use_rag (bool): Whether to use retrieval to answer the question

    Returns:
        str: The answer to the question

    Raises:
        ValueError: If use_rag is True and context_title is None
    """
    if not use_rag:
        # Plain prompting: the whole context goes in front of the question.
        prompt = context + "\n" + question
        return Settings.llm.complete(prompt).text

    if context_title is None:
        raise ValueError("context_title must be provided when using RAG")
    engine = create_index_from_text(context, context_title).as_query_engine()
    return engine.query(question).response


def answer_reading_comprehension_with_rag(*args, **kwargs):
    """
    Retrieval-based variant of answer_reading_comprehension: same arguments, with use_rag fixed to True.

    Args:
        question (str): The question to answer
        context_title (str): The title of the context index
        context (str): The context to use

    Returns:
        str: The answer to the question
    """
    # use_rag is pinned here, so a caller passing it as well gets a duplicate-keyword TypeError.
    return answer_reading_comprehension(*args, use_rag=True, **kwargs)


def answer_reading_comprehension_in_context(*args, **kwargs):
    """
    Plain-prompting variant of answer_reading_comprehension: same arguments, with use_rag fixed to False,
    so the context is prepended to the question.

    Args:
        question (str): The question to answer
        context (str): The context to use

    Returns:
        str: The answer to the question
    """
    # use_rag is pinned here, so a caller passing it as well gets a duplicate-keyword TypeError.
    return answer_reading_comprehension(*args, use_rag=False, **kwargs)

## Evaluation


### Load the dataset and metrics


In [9]:
# trust_remote_code=True is what the `datasets` message shown under this cell asks for;
# it avoids that message and keeps the cell runnable non-interactively.
# SECURITY: the flag allows code shipped with the dataset repository to run locally -
# only keep it for repositories you trust.
# NOTE(review): check that the installed `datasets` release still accepts this argument.
longdep_qa_ds = load_dataset("bigainlco/LooGLE", "longdep_qa", split="test", trust_remote_code=True)
rouge = evaluate.load("rouge")


def get_rouge_metrics(output_file):
    """
    Compute ROUGE for the answers logged in a .jsonl file.

    Every line of the file is parsed as a JSON object; its "generated_answer" is used as the prediction
    and its "ground_truth" as the reference.

    Args:
        output_file (str): The path to the .jsonl file

    Returns:
        dict: The ROUGE metrics
    """
    with open(output_file, "r") as f:
        records = [json.loads(raw_line) for raw_line in f]
    predictions = [record["generated_answer"] for record in records]
    references = [record["ground_truth"] for record in records]
    return rouge.compute(predictions=predictions, references=references)


def llm_self_score(output_file):
    """
    Score the generated answers in a .jsonl file using the LLM.

    For every entry that has no "correct" field yet, the LLM is asked whether the generated answer is close
    enough to the ground truth; the verdict is stored in the entry's "correct" field. The file is then
    rewritten with the verdicts. Verdicts gathered so far are written back even if an LLM call fails
    part-way, so a later call resumes at the first unscored entry.

    Args:
        output_file (str): The path to the .jsonl file

    Returns:
        float: The accuracy of the generated answers (0.0 if the file contains no entries)
    """
    with open(output_file, "r") as f:
        # Blank lines carry no record; skipping them avoids a JSON decode error.
        outputs = [json.loads(line) for line in f if line.strip()]
    llm = Settings.llm
    try:
        for output in outputs:
            if "correct" in output:
                # Already scored by an earlier run.
                continue
            question = output["question"]
            ground_truth = output["ground_truth"]
            generated_answer = output["generated_answer"]
            prompt = f'Given the question "{question}" whose answer is "{ground_truth}", is answer "{generated_answer}" similar enough to the true answer that it should be considered correct? Answer "yes" or "no" with no other characters or capitalization.'
            response = llm.complete(prompt).text.lower()
            output["correct"] = "yes" in response
    finally:
        # Persist whatever has been scored, including when the loop was interrupted,
        # so the LLM calls already made are not lost.
        with open(output_file, "w") as f:
            for output in outputs:
                json.dump(output, f)
                f.write("\n")
    if not outputs:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    num_correct = sum(output["correct"] for output in outputs)
    return num_correct / len(outputs)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


### Inference function and helpers


In [41]:
def read_output_file(output_file):
    """
    Read a .jsonl file containing generated answers and ground truth answers.

    Args:
        output_file (str): The path to the .jsonl file; may be None when no file has been chosen yet

    Returns:
        list: The outputs in the file; empty if output_file is None or the file does not exist
    """
    # os.path.exists(None) raises TypeError, so "no file yet" is handled explicitly.
    if output_file is None or not os.path.exists(output_file):
        return []
    with open(output_file, "r") as f:
        # Blank lines carry no record; skipping them avoids a JSON decode error.
        return [json.loads(line) for line in f if line.strip()]


def log_outputs(question, ground_truth, generated_answer, output_file):
    """
    Log a question, its ground truth answer, and a generated answer to a .jsonl file.

    The record is appended as one JSON line. The directory that should contain the file is created when it
    is missing, so a freshly generated answer is not lost to a missing folder.

    Args:
        question (str): The question
        ground_truth (str): The ground truth answer
        generated_answer (str): The generated answer
        output_file (str): The path to the .jsonl file; if None, a new file will be created in the "output" directory with the current time as the name

    Returns:
        tuple: The path to the .jsonl file and the outputs in the file
    """
    if output_file is None:
        output_file = f"output/{time.time()}.jsonl"
    # Create the parent directory for explicit paths too, not only for the default one.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_file, "a") as f:
        json.dump({"question": question, "ground_truth": ground_truth, "generated_answer": generated_answer}, f)
        f.write("\n")
    # Re-read the file so the caller gets everything logged so far, including this record.
    existing_output = read_output_file(output_file)
    return output_file, existing_output


def question_is_answered(question, existing_output):
    """
    Tell whether a question already appears among logged outputs.

    Args:
        question (str): The question
        existing_output (list): The outputs; each one is looked up by its "question" key

    Returns:
        bool: Whether the question has already been answered
    """
    answered_questions = [entry["question"] for entry in existing_output]
    return question in answered_questions


def test_longdep_qa(inference_function, output_file=None, debug_lim=None):
    """
    Test an inference function on the longdep_qa dataset.

    Args:
        inference_function (function): The function to test
        output_file (str): The path to the .jsonl file to log outputs to; if None, a new file will be created in the "output" directory with the current time as the name
        debug_lim (int): The number of questions to test; if None, all questions will be tested

    Returns:
        None
    """
    n_questions = sum([len(eval(env["qa_pairs"])) for env in longdep_qa_ds])
    if debug_lim is None:
        debug_lim = n_questions
    existing_output = read_output_file(output_file)
    with tqdm.tqdm(total=debug_lim) as pbar:
        for environment in longdep_qa_ds:
            context = environment["input"]
            title = environment["title"]
            qa_pairs = eval(environment["qa_pairs"])
            for question_dict in qa_pairs:
                question = question_dict["Q"]
                ground_truth = question_dict["A"]
                if not question_is_answered(question, existing_output):
                    generated_answer = inference_function(question, context_title=title, context=context)
                    output_file, existing_output = log_outputs(question, ground_truth, generated_answer, output_file)
                pbar.update(1)
                if pbar.n >= debug_lim:
                    break
            if pbar.n >= debug_lim:
                break

### Inference with RAG


In [35]:
# Run the RAG pipeline over the first 100 questions of the dataset. Answers are appended to
# the given .jsonl file; questions already present in that file are skipped, so re-running
# this cell resumes instead of re-querying.
test_longdep_qa(answer_reading_comprehension_with_rag, output_file="output/baseline_with_rag_100.jsonl", debug_lim=100)


100%|██████████| 100/100 [00:00<00:00, 22030.06it/s]


### Calculate metrics


In [63]:
rouge_metrics = get_rouge_metrics("output/baseline_with_rag_100.jsonl")
print("Rouge Metrics:", rouge_metrics)

# Keep the result under its own name: assigning it to `llm_self_score` would replace the
# function with a float, and this cell would then fail when run a second time.
llm_self_score_accuracy = llm_self_score("output/baseline_with_rag_100.jsonl")
print("LLM Self-Score:", llm_self_score_accuracy)

Rouge Metrics: {'rouge1': 0.19845643848785122, 'rouge2': 0.07862381462209828, 'rougeL': 0.1619569034833745, 'rougeLsum': 0.17183036257033626}
LLM Self-Score: 0.53
