In [1]:
import os
import sys
import torch

# Add src directory to path
sys.path.insert(0, os.path.abspath('../src'))

from parser import process_pdf
from rag import build_rag_index_from_text
from multiagent import DocumentMultiAgent
from section_splitter import split_into_sections
from section_classifier import embed, TARGET_QUERIES
from agents import *



In [2]:
parsed_data = process_pdf('../data/protocol.pdf')

In [None]:
print(parsed_data)

In [3]:
sections = split_into_sections(parsed_data)

In [7]:
sections

{'PROTOCOL   SUMMARY: Synopsis': "1.1     Synopsis\nProtocol Title: A Phase III Randomized, Double-blind, Placebo-controlled Multicenter Study\nin Adults to Determine the Safety, Efficacy, and Immunogenicity of AZD1222, a\nNon-replicating ChAdOx1 Vector Vaccine, for the Prevention of COVID-19\nShort Title: Phase III Double-blind, Placebo-controlled Study of AZD1222 for the Prevention\nof COVID-19 in Adults\nRationale: The aim of the study is to assess the safety, efficacy, and immunogenicity of\nAZD1222 for the prevention of COVID-19. The COVID-19 pandemic has caused major\ndisruption to healthcare systems with significant socioeconomic impacts. Currently, there are\nno specific treatments available against COVID-19 and accelerated vaccine development is\nurgently needed. A safe and effective vaccine for COVID-19 prevention would have\nsignificant public health impact.\nObjectives and Endpoints\n          | Objective a                                 | Estimand bDescription/Endpoint   

In [None]:
# Agent configuration mapping
AGENT_CONFIG = {
    "objectives": {
        "target": "objectives and endpoints",
        "function": extract_objectives,
    },
    "eligibility": {
        "target": "eligibility",
        "function": extract_eligibility,
    },
    "soa": {
        "target": "schedule of activities",
        "function": extract_soa,
    },
    "visit_definitions": {
        "target": "visit_definitions",
        "function": extract_visit_definitions,
    },
    "key_assessments": {
        "target": "key_assessments",
        "function": extract_key_assessments,
    },
}


def _top_sections_for_target(sections, target, num_sections=2):
    section_titles = list(sections.keys())
    section_embeddings = embed(list(sections.keys()))

    query_embedding = embed([TARGET_QUERIES[target]])[0]
    scores = torch.matmul(section_embeddings, query_embedding)

    top_scores, top_indices = torch.topk(scores, k=min(num_sections, len(section_titles)))

    results = []
    for score, idx in zip(top_scores.tolist(), top_indices.tolist()):
        results.append((section_titles[idx], score))

    return results


def run_agent_extraction(sections, agent_name, num_sections=2):
    """
    Run extraction for a specific agent.
    
    Args:
        sections: Dictionary of section name -> content
        agent_name: One of "objectives", "eligibility", "soa", "visit_definitions", "key_assessments"
        num_sections: Number of top sections to retrieve
    
    Returns:
        Validated Pydantic model output from the agent
    """
    if agent_name not in AGENT_CONFIG:
        raise ValueError(f"Unknown agent: {agent_name}. Must be one of {list(AGENT_CONFIG.keys())}")
    
    config = AGENT_CONFIG[agent_name]
    target = config["target"]
    extraction_func = config["function"]
    
    # Get top relevant sections
    top_sections = _top_sections_for_target(sections, target, num_sections=num_sections)
    
    print(f"{agent_name} -> " + ", ".join([f"{title} (score={score:.3f})" for title, score in top_sections]))
    
    # Combine content
    combined_content = "\n\n".join([sections[title] for title, _ in top_sections])
    
    # Run extraction
    return extraction_func(combined_content)

In [None]:
objectives_output = run_agent_extraction(sections, "objectives", num_sections=2)
# eligibility_output = run_agent_extraction(sections, "eligibility", num_sections=2)
# soa_output = run_agent_extraction(sections, "soa", num_sections=2)
# visit_definitions_output = run_agent_extraction(sections, "visit_definitions", num_sections=2)
key_assessments_output = run_agent_extraction(sections, "key_assessments", num_sections=5)

objectives -> STATISTICAL  CONSIDERATIONS: Statistical Analyses: Efficacy: Primary Endpoint (score=0.681), STATISTICAL  CONSIDERATIONS: Statistical Analyses: Efficacy: Secondary Endpoints (score=0.679)
key assessments -> STUDY  ASSESSMENTS   AND  PROCEDURES: Safety Assessments: Clinical Laboratory Assessments (score=0.697), STUDY  ASSESSMENTS   AND  PROCEDURES: Safety Assessments: Vital Signs (score=0.681), STUDY  ASSESSMENTS   AND  PROCEDURES: Safety Assessments: Physical Examinations
A complete physical examination will be performed at screening followed bytargeted (score=0.648), STUDY  ASSESSMENTS   AND  PROCEDURES: Adverse Events and Serious Adverse Events
The principal investigator is responsible for ensuring that all staff involved in the study are: Adverse Events Based on Examinations and Tests
The results from the Clinical Study Protocol-mandated vital signs will be summarized in the (score=0.599), STUDY  ASSESSMENTS   AND  PROCEDURES: Adverse Events and Serious Adverse Events


In [18]:
# json printing for better readability
print(objectives_output)

primary=[Objective(objective='The primary endpoint is the first case of SARS-CoV-2 RT-PCR-positive symptomatic illness occurring ≥ 15 days post second doseof study intervention, in a participant with negative serostatus at baseline.', endpoints=['first case of SARS-CoV-2 RT-PCR-positive symptomatic illness occurring ≥ 15 days post second doseof study intervention, in a participant with negative serostatus at baseline'])] secondary=[Objective(objective='Incidence of the first post-treatment response (negative at baseline to positive post treatment with study intervention) for SARS-CoV-2 Nucleocapsid antibodiesoccurring ≥ 15 days post second dose of study intervention(key secondary endpoint)', endpoints=['Incidence of the first post-treatment response (negative at baseline to positive post treatment with study intervention) for SARS-CoV-2 Nucleocapsid antibodiesoccurring ≥ 15 days post second dose of study intervention']), Objective(objective='Incidence of the first case of SARS-CoV-2 RT

In [None]:
rag = build_rag_index_from_text(
    text=parsed_data,
    persist_dir="../data/rag_index",
    use_existing=False  # rebuild
)

question = "What are the key assessments and procedures in this study?"
rag_answer = rag.answer(question, top_k=8)
print(rag_answer)

In [None]:
multiagent = DocumentMultiAgent(sections, parsed_data)

question = "What are the key assessments and procedures in this study?"
response_json = multiagent.answer(question)
print(response_json)

In [None]:
response_json