# Benchmarking of LitQA2 (revised)

Why? 

A new benchmark was developed for the LitQA2 dataset. 

### inspect_evals package

The `inspect_evals` package was developed to provide a unified set of benchmarks for LLM evaluation.

The eval can be run from the command line using:
```
inspect eval inspect_evals/lab_bench_litqa --model openai/gpt-4o-mini
```

Running this from the CLI with gpt-4o-mini gives an accuracy of 0.291, a precision of 0.389, and a coverage of 0.749.

How do we implement this into PaperQA2?

We need to make a custom task for PaperQA2 to run. 

Develop the custom task using the test dataset and scale up to the full dataset. 

Breaking down the Task function, we have: 
- Dataset
- Solver
- Scorer
- Metrics

In [12]:
# Importing libraries

import random

import pandas as pd
from pandas import DataFrame
import nest_asyncio
import asyncio

# PaperQA2 Imports 
from paperqa import ask, Settings, agent_query
from paperqa.settings import AgentSettings, AnswerSettings

# Inspect AI Imports
from inspect_ai import eval
from inspect_ai import task, Task, Epochs
from inspect_ai.dataset import MemoryDataset, json_dataset, FieldSpec, Sample
from inspect_ai.solver._solver import solver, Solver, Generate
from inspect_ai.solver._task_state import TaskState, ChatMessageUser#
from inspect_ai.agent import bridge
from inspect_ai.solver import _multiple_choice


# Inspect Evals Imports
from inspect_evals.lab_bench.record_to_sample_helpers import record_to_sample_base
from inspect_evals.lab_bench.scorer import precision_choice

In [14]:
# Dataset
# Load the LitQA2 test split from a local parquet file into a DataFrame.
litqa2_test_data = pd.read_parquet("/root/paperQA2_analysis/data/LitQA_data/test-00000-of-00001.parquet")
# Preview the first rows (shown as the cell output in the notebook).
litqa2_test_data.head()

Unnamed: 0,id,question,ideal,distractors,canary,tag,version,sources,is_opensource,subtask,key-passage
0,e6ece709-c919-4388-9f64-ab0e0822b03a,Approximately what percentage of topologically...,31%,"[21%, 11%, 41%, 51%]",BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,litqa,1.1-dev,[https://doi.org/10.1038/s41467-024-44782-6],True,litqa-v2-test,Good control in FPR does not necessarily repre...
1,813a9053-3f67-4d58-80af-02153de90ae4,At least how long do SynNotch-MCF10DCIS cells ...,72 h,"[24, 48 h, 0 h, 12 h, 6 h, 96 h]",BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,litqa,1.1-dev,[https://doi.org/10.1073/pnas.2322688121],True,litqa-v2-test,Spatial heterogeneity within tumors due to var...
2,831621de-5e32-4006-af84-a40dba100866,DK015 and DK038 strains of Verticillium dahlia...,95%,"[94%, 96%, 97%, 98%]",BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,litqa,1.1-dev,[https://doi.org/10.1186/s12915-024-01900-6],True,litqa-v2-test,"The strains DK015 and DK038, with opposite MAT..."
3,3e6d7a54-5b8a-4aa0-ac6e-1fce986d1636,Expression of which of the following genes was...,Aldh1l1,"[MAPK, Actin, none of the above]",BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,litqa,1.1-dev,[https://doi.org/10.1073/pnas.2321711121],True,litqa-v2-test,The mitogen-activated protein kinase (MAPK) pa...
4,e4579ca5-c7d4-47a0-88f5-8adc460fc936,For which of the following Trub1 substrates di...,SCP2,"[FBXO5, HECTD1, NKAIN1, CCDC22, IDI1]",BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,litqa,1.1-dev,[https://doi.org/10.1101/2024.03.26.586895],True,litqa-v2-test,"Among the Trub1 substrates, FBXO5 (chr6:152975..."


In [15]:
# Choice text appended to every sample; the tasks in this file also pass it to
# the scorer as its `no_answer` value.
UNCERTAIN_ANSWER_CHOICE = "Insufficient information to answer the question."


def record_to_sample_custom(record: dict) -> Sample:
    """Build a multiple-choice ``Sample`` from one LitQA2 record.

    The choices are ordered as: the ideal answer, then every distractor, then
    the uncertain-answer option. Because the ideal answer always comes first,
    the target is always "A".

    Args:
        record: Mapping with "question", "ideal" and "distractors" entries.

    Returns:
        A ``Sample`` whose input is the question text.
    """
    options = [record["ideal"], *record["distractors"], UNCERTAIN_ANSWER_CHOICE]
    return Sample(input=record["question"], choices=options, target="A")

def convert_pandas_to_dataset(data: DataFrame) -> MemoryDataset:
    """Convert a DataFrame of LitQA2 records into an in-memory dataset.

    Args:
        data: Frame whose rows each provide the fields read by
            ``record_to_sample_custom``.

    Returns:
        A ``MemoryDataset`` holding one sample per row, in row order.
    """
    return MemoryDataset(
        [record_to_sample_custom(row) for row in data.to_dict(orient="records")]
    )
    

In [16]:
# Convert the full test DataFrame and print the first two samples as a sanity check.
test_dataset = convert_pandas_to_dataset(litqa2_test_data)
print(test_dataset.samples[0])
print(test_dataset.samples[1])

input='Approximately what percentage of topologically associated domains in the GM12878 blood cell line does DiffDomain classify as reorganized in the K562 cell line?' choices=['31%', '21%', '11%', '41%', '51%', 'Insufficient information to answer the question.'] target='A' id=None metadata=None sandbox=None files=None setup=None
input='At least how long do SynNotch-MCF10DCIS cells express BFP after contact with GFP+BMSC3 cells?' choices=['72 h', '24', '48 h', '0 h', '12 h', '6 h', '96 h', 'Insufficient information to answer the question.'] target='A' id=None metadata=None sandbox=None files=None setup=None


In [17]:
# Set up LLM config (main LLM for reasoning, extract metadata, ...)
llm_config_dict = {
    "model_list": [
        {
            "model_name": "gpt-4o-mini",
            "litellm_params": {
                "model": "gpt-4o-mini",
                "temperature": 0,
                "max_tokens": 4096
            }
        }
    ],
    "rate_limit": {"gpt-4o-mini": "30000 per 1 minute"}
}

# Set up agent (answer search and selecting tools).
# The rate limit is keyed by model name, matching the shape used by
# `llm_config_dict` and `summary_config_dict` (previously a bare string).
agent_settings = AgentSettings(
    agent_llm="gpt-4o-mini",
    agent_llm_config={
        "rate_limit": {"gpt-4o-mini": "30000 per 1 minute"}
    }
)

# Set up summary LLM config
summary_config_dict = {
    "rate_limit": {"gpt-4o-mini": "30000 per 1 minute"}
}

# Set up answer format
answer_settings = AnswerSettings(
    evidence_k=30,
    evidence_detailed_citations=False,
    evidence_retrieval=False,
    evidence_summary_length="around 100 words",
    evidence_skip_summary=False,
    answer_max_sources=5,
    max_answer_attempts=5,
    answer_length="1 letter"
)

# Set up the final settings object.
# `answer=answer_settings` wires in the answer configuration built above;
# it was previously constructed but never handed to `Settings`, so it had no
# effect on the run.
paperqa_settings = Settings(
    llm="gpt-4o-mini",
    llm_config=llm_config_dict,
    summary_llm="gpt-4o-mini",
    summary_llm_config=summary_config_dict,
    agent=agent_settings,
    answer=answer_settings,
    temperature=0,
    batch_size=1,
    verbosity=1,
    paper_directory="/root/paperQA2_analysis/data/LitQA_data/LitQA2_test_pdfs"
)

In [80]:
# Answer templates
# Prompt template with {question} and {choices} placeholders; paperqa_solver
# fills it in through `_multiple_choice.prompt`.
MULTIPLE_CHOICE_TEMPLATE = """
The following is a multiple choice question about biology.
Please answer by responding with the letter of the correct answer.

Think step by step.

Question: {question}
Options:
{choices}

You MUST include the letter of the correct answer within the following format: 'ANSWER: $LETTER' (without quotes). For example, ’ANSWER: <answer>’, where <answer> is the correct letter. Always answer in exactly this format of a single letter, even if you are unsure. We require this because we use automatic parsing.
"""

In [137]:
# The Solver should use the bridge function
@solver
def paperqa_solver(
    template: str,
    settings: dict,
    shuffle: bool | None=None
) -> Solver:
    """Create a solver that answers multiple-choice samples with PaperQA2.

    Args:
        template: Prompt template handed to ``_multiple_choice.prompt``
            together with the sample's question and choices.
        settings: Passed unchanged to ``agent_query``. NOTE(review): annotated
            as ``dict``, but the tasks in this file pass a paperqa
            ``Settings`` object.
        shuffle: Accepted but never read by this implementation.

    Returns:
        The ``solve`` coroutine function.
    """

    async def solve(state: TaskState, generate: Generate):
        # Debug output: echo the raw question before anything else happens.
        print(state.user_prompt.text)

        # Guard clause: a sample without choices cannot be formatted.
        if not state.choices:
            raise ValueError("The multiple_choice solver requires samples with choices")

        # Render question + choices into the template, then query PaperQA2.
        paperqa_response = await agent_query(
            query=_multiple_choice.prompt(
                question=state.user_prompt.text,
                choices=state.choices,
                template=str(template),
            ),
            settings=settings,
        )

        print("response completed")
        print(type(paperqa_response))

        # The model is not invoked through `generate`; the state is returned
        # with the PaperQA2 answer appended.
        # NOTE(review): the answer is appended as a *user* message and neither
        # `state.output` nor the choices are updated here — confirm the scorer
        # can read the answer from this location.
        answer_message = ChatMessageUser(content=paperqa_response.completion)
        state.messages.append(answer_message)
        return state

    return solve

In [None]:
# Mini example: run the solver-based task on a single sample.
# The record is built from row label 0 of the three columns the converter reads.
example = {
    field: litqa2_test_data[field][0]
    for field in ("question", "ideal", "distractors")
}

sample = record_to_sample_custom(example)
mini_dataset = MemoryDataset([sample])
# Choice shuffling via mini_dataset.shuffle_choices() is left disabled.

@task
def paperqa_eval_mini():
    """Single-sample task that uses ``paperqa_solver`` directly as the solver."""
    single_sample_solver = paperqa_solver(
        template=MULTIPLE_CHOICE_TEMPLATE,
        settings=paperqa_settings,
    )
    letter_scorer = precision_choice(no_answer=UNCERTAIN_ANSWER_CHOICE)
    return Task(
        dataset=mini_dataset,
        solver=single_sample_solver,
        scorer=letter_scorer,
        epochs=Epochs(1, "mode"),
    )

# Patch asyncio before launching the eval from inside the notebook.
nest_asyncio.apply()
eval(paperqa_eval_mini())


In [None]:
# Scratch check: build one prompt by hand and send it through the synchronous
# `ask` entry point.
nest_asyncio.apply()

test_template = _multiple_choice.resource(MULTIPLE_CHOICE_TEMPLATE)

test_sample = record_to_sample_custom(example)
print(test_sample)

# Render the options as lettered lines ("A) ...", "B) ...") before filling the
# template. Passing the raw list would insert its Python repr under "Options:",
# leaving the model with no letters to answer with.
test_prompt = MULTIPLE_CHOICE_TEMPLATE.format(
    question=test_sample.input,
    choices="\n".join(
        f"{chr(65 + i)}) {choice}" for i, choice in enumerate(test_sample.choices)
    ),
)

# Test if the ask function is working correctly:
test_repsonse = ask(
    query=test_prompt,
    settings=paperqa_settings
)

In [98]:
@task
def paperqa_eval():
    """Task that runs ``paperqa_solver`` over ``test_dataset``.

    Returns:
        A ``Task`` scored with ``precision_choice`` and reduced over a single
        epoch with the "mode" reducer.
    """
    return Task(
        dataset=test_dataset,
        # paperqa_solver already returns an Inspect solver, so it is passed
        # directly (as paperqa_eval_mini does). `bridge` is meant for
        # dict-in/dict-out agent coroutines and would call the solver's
        # `solve(state, generate)` with the wrong arguments.
        solver=paperqa_solver(template=MULTIPLE_CHOICE_TEMPLATE, settings=paperqa_settings),
        scorer=precision_choice(no_answer=UNCERTAIN_ANSWER_CHOICE),
        epochs=Epochs(1, "mode")
    )

In [2]:
# Prompt template for the bridge-based agent. Only {question} is substituted,
# because the lettered options are already embedded in the sample input built
# by the shuffling version of record_to_sample_custom.
MULTIPLE_CHOICE_TEMPLATE_CUSTOM = """
The following is a multiple choice question about biology.
Please answer by responding with the letter of the correct answer.

Think step by step.

{question}

You MUST include the letter of the correct answer within the following format: 'ANSWER: $LETTER' (without quotes). For example, ’ANSWER: <answer>’, where <answer> is the correct letter. Always answer in exactly this format of a single letter, even if you are unsure. We require this because we use automatic parsing.
Strictly include none of your reasoning. 
"""



# Record to Sample Custom
def record_to_sample_custom(record: dict) -> Sample:
    """Build a shuffled multiple-choice ``Sample`` with the options in the input.

    The ideal answer, the distractors and the uncertain-answer option are
    shuffled together (using the module-level ``random`` state), lettered
    "A)", "B)", ... and appended to the question text. The target is the
    letter that the ideal answer ended up with.

    Args:
        record: Mapping with "question", "ideal" and "distractors" entries.

    Returns:
        A ``Sample`` whose input holds the question followed by the lettered
        options, whose choices are the shuffled option texts, and whose
        target is the ideal answer's letter.
    """
    # Single quotes inside the f-string: reusing the outer quote character is
    # a syntax error before Python 3.12.
    message = f"Question: {record['question']} \n"

    # Collect every option, then shuffle so the ideal answer is not always "A".
    choices = [record["ideal"], *record["distractors"], UNCERTAIN_ANSWER_CHOICE]
    random.shuffle(choices)

    # Position of the ideal answer after shuffling determines the target letter.
    ideal_idx = choices.index(record["ideal"])

    # Letter the options: index 0 -> "A", 1 -> "B", ...
    message += "\n".join(
        f"{chr(65 + i)}) {choice}" for i, choice in enumerate(choices)
    )

    return Sample(
        input=message,
        choices=choices,
        target=chr(65 + ideal_idx),
    )
    
    

# Preprocessing Code for Bridge Method
def df_2_sample_bridge(data: DataFrame) -> MemoryDataset:
    """Turn every row of ``data`` into a sample and wrap them in a dataset.

    Args:
        data: Frame whose rows each provide the fields read by
            ``record_to_sample_custom``.

    Returns:
        A ``MemoryDataset`` with one sample per row, in row order.
    """
    rows = data.to_dict(orient="records")
    return MemoryDataset(list(map(record_to_sample_custom, rows)))

In [18]:
# Rebuild the dataset with the bridge preprocessing and print the first two samples.
test_dataset = df_2_sample_bridge(litqa2_test_data)
print(test_dataset.samples[0])
print(test_dataset.samples[1])

input='Approximately what percentage of topologically associated domains in the GM12878 blood cell line does DiffDomain classify as reorganized in the K562 cell line?' choices=['31%', '21%', '11%', '41%', '51%', 'Insufficient information to answer the question.'] target='A' id=None metadata=None sandbox=None files=None setup=None
input='At least how long do SynNotch-MCF10DCIS cells express BFP after contact with GFP+BMSC3 cells?' choices=['72 h', '24', '48 h', '0 h', '12 h', '6 h', '96 h', 'Insufficient information to answer the question.'] target='A' id=None metadata=None sandbox=None files=None setup=None


In [20]:
import os
import copy
from autogen import LLMConfig, ConversableAgent
from pydantic import BaseModel, Field

class StructuredModel(BaseModel):
    """Schema for a parsed answer: answer letter, explanation and citations."""

    answer: str = Field(..., description="Answer, the single letter answer to the question, in the format of $LETTER")
    explanation: str = Field(..., description="Explanation, a short explanation of the answer with any citations found within the text.")
    citations: list[str] = Field(..., description="Citations, a list of citations found within the text.")

    def format(self) -> str:
        """Render the three fields as labelled lines separated by newlines."""
        labelled_lines = (
            f"Answer: {self.answer}",
            f"Explanation: {self.explanation}",
            f"Citations: {self.citations}",
        )
        return "\n".join(labelled_lines)

# LLM configuration for the AG2 agent. `response_format` is set to
# StructuredModel, and the API key is read from the OPENAI_API_KEY environment
# variable (None when the variable is not set).
llm_config = LLMConfig(
    api_type="openai",
    api_key=os.getenv("OPENAI_API_KEY"),
    model="gpt-4o-mini",
    temperature=0.1,
    response_format=StructuredModel,
)
    
    
def structured_agent(output_text: str, structure: BaseModel, llm_config: LLMConfig):
    """Ask a single-turn AG2 agent to parse ``output_text``.

    Args:
        output_text: Free text to be parsed; inserted into the request message.
        structure: Accepted but never read by this function.
        llm_config: Configuration handed to the ``ConversableAgent``.

    Returns:
        The ``output`` attribute of the run response, read after
        ``process()`` has been called on it.
        NOTE(review): confirm the AG2 run response exposes ``output``.
    """
    # Same text as a triple-quoted block indented by four spaces, spelled out
    # so the whitespace is explicit.
    request_template = (
        "\n"
        "    Please could you parse the following text and return the desired output.\n"
        "    \n"
        "    Text:\n"
        "    {text}\n"
        "    "
    )

    parser = ConversableAgent(
        name="structured_agent",
        llm_config=llm_config,
        system_message="You are an agent that is able to parse the output of a given text and return the desired output.",
    )

    # One turn only: send the request and collect the reply.
    run_result = parser.run(
        message=request_template.format(text=output_text),
        max_turns=1,
    )
    run_result.process()
    return run_result.output
    

In [11]:
!echo $HOME

/root


In [9]:
# Report only whether the key is configured; printing the key itself would
# store the secret in the saved notebook output.
print("OPENAI_API_KEY is set:", bool(os.getenv("OPENAI_API_KEY")))

None


In [None]:
# Testing the output of the agent

nest_asyncio.apply()

test_text = """Answer: DiffDomain identifies that approximately 30.771% of topologically associated domains (TADs) in  
           the GM12878 blood cell line are reorganized in the K562 cell line                                       
           (hua2024diffdomainenablesidentification pages 4-4). This finding is significant when compared to other  
           methods, such as TADCompare, HiCcompare, and HiC-DC+, which only identify ≤8.256% of GM12878 TADs as    
           reorganized in K562. The benchmarking results highlight the efficacy of DiffDomain in detecting         
           reorganized TADs between these cell lines (hua2024diffdomainenablesidentification pages 4-4).           
                                                                                                                   
           Additionally, the analysis indicates that the majority of identified reorganized TADs have a minimum of 
           43.137%, a median of 81.357%, and a maximum of 98.022% represented by other subtypes                    
           (hua2024diffdomainenablesidentification pages 4-5). This suggests a robust capability of DiffDomain in  
           identifying reorganized TADs, establishing a notable extent of reorganization between GM12878 and K562  
           (hua2024diffdomainenablesidentification pages 4-5).                                                     
                                                                                                                   
           In summary, the percentage of TADs in GM12878 classified as reorganized in K562 by DiffDomain is        
           approximately 30.771%, which aligns with option E in the multiple-choice question.                      
                                                                                                                   
           ANSWER: E"""

# Run the parsing agent on the sample PaperQA2 answer text defined above.
# NOTE(review): `structure` is passed here, but structured_agent never reads
# it; presumably the schema comes from `llm_config` (response_format) instead.
structured_answer = structured_agent(
    output_text=test_text,
    structure=StructuredModel,
    llm_config=llm_config,
)



In [6]:
def extract_answer_letter(answer: str) -> str:
    """Return the answer letter found in PaperQA2 answer text.

    Looks for the last ``ANSWER: <letter>`` marker (the format requested by the
    prompt templates, optionally with parentheses around the letter) and
    returns that letter upper-cased. Without a marker it falls back to the last
    non-whitespace character, and it returns an empty string for empty text.

    Args:
        answer: Full answer text.

    Returns:
        A single character, or "" when ``answer`` is empty or whitespace.
    """
    import re

    # Case-sensitive on purpose: a leading "Answer: <prose>" must not match.
    letters = re.findall(r"ANSWER:\s*\(?([A-Za-z])\b", answer)
    if letters:
        return letters[-1].upper()

    stripped = answer.strip()
    return stripped[-1] if stripped else ""


def paperqa_agent(
    template: str,
    settings: dict,
):
    """Create a dict-in/dict-out agent coroutine that answers with PaperQA2.

    Args:
        template: Prompt template with a ``{question}`` placeholder.
        settings: Passed unchanged to ``agent_query``. NOTE(review): annotated
            as ``dict``, but the task in this file passes a paperqa
            ``Settings`` object.

    Returns:
        The ``run`` coroutine function, which reads the first message's
        content from the sample and returns ``{"output": <answer letter>}``.
    """

    async def run(sample: dict) -> dict:
        question = sample["messages"][0]["content"]

        # Await the async entry point instead of calling the blocking `ask`
        # from inside a coroutine; this mirrors the `agent_query` call used by
        # paperqa_solver.
        response = await agent_query(
            query=template.format(question=question),
            settings=settings,
        )

        # TODO: Implement another AG2 agent to take the output of the first agent and return the desired output.

        # Indexing the answer with [-1] broke on trailing punctuation or
        # whitespace and raised on an empty answer; parse the marker instead.
        return {"output": extract_answer_letter(response.session.answer)}

    return run

Create a mini-version with 1 sample

In [None]:
# Mini example: run the bridged PaperQA2 agent on a single sample.
# The record is built from row label 0 of the three columns the converter reads.
example = {
    field: litqa2_test_data[field][0]
    for field in ("question", "ideal", "distractors")
}

sample = record_to_sample_custom(example)
mini_dataset = MemoryDataset([sample])
# Choice shuffling via mini_dataset.shuffle_choices() is left disabled.

@task
def paperqa_eval_mini():
    """Single-sample task whose solver is ``paperqa_agent`` wrapped in ``bridge``."""
    bridged_agent = bridge(
        paperqa_agent(
            template=MULTIPLE_CHOICE_TEMPLATE_CUSTOM,
            settings=paperqa_settings,
        )
    )
    letter_scorer = precision_choice(no_answer=UNCERTAIN_ANSWER_CHOICE)
    return Task(
        dataset=mini_dataset,
        solver=bridged_agent,
        scorer=letter_scorer,
        epochs=Epochs(1, "mode"),
    )

# asyncio.run(eval(paperqa_eval_mini()))
eval(paperqa_eval_mini())