# Benchmarking of LitQA2 (revised)

Why? 

A new benchmark was developed for the LitQA2 dataset. 

### inspect_evals package

The `inspect_evals` package was developed as a unified collection of benchmarks for LLM evaluation, and it includes a task for LitQA2.

The eval can be run from the command line using:
```
inspect eval inspect_evals/lab_bench_litqa --model openai/gpt-4o-mini
```

Running this command from the CLI with gpt-4o-mini gives an accuracy of 0.291, a precision of 0.389, and a coverage of 0.749.

How do we implement this into PaperQA2?

We need to make a custom task for PaperQA2 to run. 

Develop the custom task using the test dataset and scale up to the full dataset. 

Breaking down the Task function, we have: 
- Dataset
- Solver
- Scorer
- Metrics

In [1]:
# Importing Libraries

import random

import pandas as pd
from pandas import DataFrame
import nest_asyncio
import asyncio

# PaperQA2 Imports 
from paperqa import ask, Settings, agent_query
from paperqa.settings import AgentSettings, AnswerSettings

# Inspect AI Imports
from inspect_ai import eval
from inspect_ai import task, Task, Epochs
from inspect_ai.dataset import MemoryDataset, json_dataset, FieldSpec, Sample
from inspect_ai.solver._solver import solver, Solver, Generate
from inspect_ai.solver._task_state import TaskState, ChatMessageUser
from inspect_ai.agent import bridge
from inspect_ai.solver import _multiple_choice
from inspect_ai.scorer import Target, Scorer


# Inspect Evals Imports
from inspect_evals.lab_bench.record_to_sample_helpers import record_to_sample_base
from inspect_evals.lab_bench.scorer import precision_choice

In [2]:
# Dataset: load the LitQA2 test split from its parquet file and preview it
litqa2_test_data = pd.read_parquet(
    "/root/paperQA2_analysis/data/LitQA_data/test-00000-of-00001.parquet"
)
# Last expression of the cell, so the notebook renders the first rows
litqa2_test_data.head()

Unnamed: 0,id,question,ideal,distractors,canary,tag,version,sources,is_opensource,subtask,key-passage
0,e6ece709-c919-4388-9f64-ab0e0822b03a,Approximately what percentage of topologically...,31%,"[21%, 11%, 41%, 51%]",BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,litqa,1.1-dev,[https://doi.org/10.1038/s41467-024-44782-6],True,litqa-v2-test,Good control in FPR does not necessarily repre...
1,813a9053-3f67-4d58-80af-02153de90ae4,At least how long do SynNotch-MCF10DCIS cells ...,72 h,"[24, 48 h, 0 h, 12 h, 6 h, 96 h]",BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,litqa,1.1-dev,[https://doi.org/10.1073/pnas.2322688121],True,litqa-v2-test,Spatial heterogeneity within tumors due to var...
2,831621de-5e32-4006-af84-a40dba100866,DK015 and DK038 strains of Verticillium dahlia...,95%,"[94%, 96%, 97%, 98%]",BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,litqa,1.1-dev,[https://doi.org/10.1186/s12915-024-01900-6],True,litqa-v2-test,"The strains DK015 and DK038, with opposite MAT..."
3,3e6d7a54-5b8a-4aa0-ac6e-1fce986d1636,Expression of which of the following genes was...,Aldh1l1,"[MAPK, Actin, none of the above]",BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,litqa,1.1-dev,[https://doi.org/10.1073/pnas.2321711121],True,litqa-v2-test,The mitogen-activated protein kinase (MAPK) pa...
4,e4579ca5-c7d4-47a0-88f5-8adc460fc936,For which of the following Trub1 substrates di...,SCP2,"[FBXO5, HECTD1, NKAIN1, CCDC22, IDI1]",BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,litqa,1.1-dev,[https://doi.org/10.1101/2024.03.26.586895],True,litqa-v2-test,"Among the Trub1 substrates, FBXO5 (chr6:152975..."


In [4]:
# Extra option appended to every question's choices so an "unsure" answer can be selected
UNCERTAIN_ANSWER_CHOICE = "Insufficient information to answer the question."

def record_to_sample_custom(record: dict) -> Sample:
    """Turn one LitQA2 record into an unshuffled multiple-choice ``Sample``.

    The ideal answer is placed first, followed by the distractors and then
    ``UNCERTAIN_ANSWER_CHOICE``; because the ideal answer is always first,
    the target is always ``"A"``.

    Args:
        record: Mapping with the keys ``"question"``, ``"ideal"`` and
            ``"distractors"`` (an iterable of wrong answers).

    Returns:
        A ``Sample`` with the raw question as ``input``, the ordered options
        as ``choices`` and ``"A"`` as ``target``.
    """
    options = [record["ideal"], *record["distractors"], UNCERTAIN_ANSWER_CHOICE]
    return Sample(input=record["question"], choices=options, target="A")

def convert_pandas_to_dataset(data: DataFrame) -> MemoryDataset:
    """Convert every row of ``data`` into a ``Sample`` and wrap them in a dataset.

    Args:
        data: DataFrame whose rows are converted with
            ``record_to_sample_custom``.

    Returns:
        A ``MemoryDataset`` holding one ``Sample`` per row, in row order.
    """
    return MemoryDataset(
        [record_to_sample_custom(row) for row in data.to_dict(orient="records")]
    )
    

In [3]:
# Set up LLM config (main LLM for reasoning, extract metadata, ...)
llm_config_dict = {
    "model_list": [
        {
            "model_name": "gpt-4o-mini",
            "litellm_params": {
                "model": "gpt-4o-mini",
                "temperature": 0,
                "max_tokens": 4096,
            },
        }
    ],
    # Rate limits are keyed by model name
    "rate_limit": {"gpt-4o-mini": "30000 per 1 minute"},
}

# Set up agent (answer search and selecting tools).
# The rate limit uses the same {model name: limit} mapping as the other two
# LLM configs; the original passed a bare string here.
agent_settings = AgentSettings(
    agent_llm="gpt-4o-mini",
    agent_llm_config={
        "rate_limit": {"gpt-4o-mini": "30000 per 1 minute"},
    },
)

# Set up summary LLM config
summary_config_dict = {
    "rate_limit": {"gpt-4o-mini": "30000 per 1 minute"},
}

# Set up answer format
answer_settings = AnswerSettings(
    evidence_k=30,
    evidence_detailed_citations=False,
    evidence_retrieval=False,
    evidence_summary_length="around 100 words",
    evidence_skip_summary=False,
    answer_max_sources=5,
    max_answer_attempts=5,
    answer_length="1 letter",
)

# Set up the final settings object.
# `answer=answer_settings` was missing in the original, so the answer
# configuration built above was never applied to any query.
paperqa_settings = Settings(
    llm="gpt-4o-mini",
    llm_config=llm_config_dict,
    summary_llm="gpt-4o-mini",
    summary_llm_config=summary_config_dict,
    agent=agent_settings,
    answer=answer_settings,
    temperature=0,
    batch_size=1,
    verbosity=1,
    paper_directory="/root/paperQA2_analysis/data/LitQA_data/LitQA2_test_pdfs",
)


In [4]:
# Prompt wrapper sent to PaperQA2. `{question}` is the only placeholder and is
# filled via str.format with the question text plus its lettered options.
MULTIPLE_CHOICE_TEMPLATE_CUSTOM = """
The following is a multiple choice question about biology.
Please answer by responding with the letter of the correct answer.

Think step by step.

{question}

Return your answer in the following format:

"letter".

where the letter denotes your chosen answer from the available options. You MUST only include the letter (with no quotation marks) and NOTHING ELSE.
"""

# Extra "unsure" option added to every question's choices; it is also passed to
# the scorer as `no_answer`. Same value as the earlier definition in this notebook.
UNCERTAIN_ANSWER_CHOICE = "Insufficient information to answer the question."


# You MUST include the letter of the correct answer within the following format: 'ANSWER: $LETTER' (without quotes). For example, ’ANSWER: <answer>’, where <answer> is the correct letter. Always answer in exactly this format of a single letter, even if you are unsure. We require this because we use automatic parsing. Include your reasoning and context after this, separated by a line.  


# Record to Sample Custom
def record_to_sample_custom(record: dict) -> Sample:
    """Build a shuffled, lettered multiple-choice ``Sample`` from one record.

    The ideal answer, the distractors and ``UNCERTAIN_ANSWER_CHOICE`` are
    shuffled together, lettered ``A``, ``B``, ... and appended to the
    question, so the complete prompt travels in ``Sample.input``.

    The order comes from the module-level ``random`` generator, so call
    ``random.seed(...)`` beforehand if reproducible option order is needed.

    Args:
        record: Mapping with the keys ``"question"``, ``"ideal"`` and
            ``"distractors"`` (an iterable of wrong answers).

    Returns:
        A ``Sample`` whose ``input`` is the question followed by the lettered
        options, whose ``choices`` are the options in shuffled order, and
        whose ``target`` is the letter assigned to the ideal answer.
    """
    # Single quotes inside the f-string keep this valid on Python < 3.12
    message = f"Question: {record['question']} \n"

    # Pool the correct answer, the distractors and the "unsure" option
    choices = [record["ideal"], *record["distractors"], UNCERTAIN_ANSWER_CHOICE]

    # Shuffle the choices so the correct answer is not always in one position
    random.shuffle(choices)

    # Locate the ideal answer after shuffling to derive the target letter
    ideal_idx = choices.index(record["ideal"])

    # Prefix each option with its letter: chr(65) is "A"
    message += "\n".join(
        f"{chr(65 + i)}) {choice}" for i, choice in enumerate(choices)
    )

    return Sample(
        input=message,
        choices=choices,
        target=chr(65 + ideal_idx),
    )
    
    

# Preprocessing Code for Bridge Method
def df_2_sample_bridge(data: DataFrame) -> MemoryDataset:
    """Build the dataset used with the bridge solver from a DataFrame.

    Args:
        data: DataFrame whose rows are converted with
            ``record_to_sample_custom``.

    Returns:
        A ``MemoryDataset`` with one ``Sample`` per row, in row order.
    """
    rows = data.to_dict(orient="records")
    return MemoryDataset(list(map(record_to_sample_custom, rows)))

In [5]:
# Build the full test dataset and print its first two samples as a sanity check
test_dataset = df_2_sample_bridge(litqa2_test_data)
for position in (0, 1):
    print(test_dataset.samples[position])

input='Question: Approximately what percentage of topologically associated domains in the GM12878 blood cell line does DiffDomain classify as reorganized in the K562 cell line? \nA) 51%\nB) 41%\nC) Insufficient information to answer the question.\nD) 21%\nE) 11%\nF) 31%' choices=['51%', '41%', 'Insufficient information to answer the question.', '21%', '11%', '31%'] target='F' id=None metadata=None sandbox=None files=None setup=None
input='Question: At least how long do SynNotch-MCF10DCIS cells express BFP after contact with GFP+BMSC3 cells? \nA) 12 h\nB) Insufficient information to answer the question.\nC) 24\nD) 6 h\nE) 72 h\nF) 48 h\nG) 96 h\nH) 0 h' choices=['12 h', 'Insufficient information to answer the question.', '24', '6 h', '72 h', '48 h', '96 h', '0 h'] target='E' id=None metadata=None sandbox=None files=None setup=None


In [6]:
def paperqa_agent(
    template: str,
    settings: "Settings",
):
    """Create the coroutine that answers one sample with PaperQA2.

    The returned coroutine is meant to be wrapped with ``bridge(...)`` and
    used as a task solver.

    Args:
        template: Prompt template with a ``{question}`` placeholder.
        settings: PaperQA2 ``Settings`` object used for every query.

    Returns:
        An async function that takes a sample dict and returns
        ``{"output": <answer text>}``.
    """

    async def run(sample: dict) -> dict:
        # Presumably the first message carries the sample's input text
        # (question plus lettered options) - confirm against the bridge docs.
        query = template.format(question=sample["messages"][0]["content"])

        # Await PaperQA2's async entry point directly. The synchronous `ask()`
        # wrapper drives an event loop itself, which is not appropriate inside
        # a coroutine that is already running on one.
        # NOTE(review): `ask()` may also do extra setup (e.g. logging) that is
        # skipped here - confirm whether that is needed.
        response = await agent_query(
            query=query,
            settings=settings,
            agent_type=settings.agent.agent_type,
        )

        return {"output": response.session.answer}

    return run

Create a mini-version with 1 sample

In [11]:
from scorers.paperqa_scorer import paperqa_scorer

# Mini example: a one-sample dataset built from the first row of the test data
example = {
    field: litqa2_test_data[field][0]
    for field in ("question", "ideal", "distractors")
}

sample = record_to_sample_custom(example)
mini_dataset = MemoryDataset([sample])
# mini_dataset.shuffle_choices()

@task
def paperqa_eval_mini():
    """Define the one-sample PaperQA2 task.

    Returns:
        A ``Task`` over ``mini_dataset`` that is solved by the bridged
        PaperQA2 agent, scored by ``paperqa_scorer`` and run for a single
        epoch reduced with ``"mode"``.
    """
    agent = paperqa_agent(
        template=MULTIPLE_CHOICE_TEMPLATE_CUSTOM,
        settings=paperqa_settings,
    )
    return Task(
        dataset=mini_dataset,
        solver=bridge(agent),
        scorer=paperqa_scorer(no_answer=UNCERTAIN_ANSWER_CHOICE),
        epochs=Epochs(1, "mode"),
    )
    
# asyncio.run(eval(paperqa_eval_mini()))
# Run the mini task. `eval` here is the function imported from inspect_ai,
# which shadows the Python builtin of the same name in this notebook.
eval(paperqa_eval_mini())

Output()