In [1]:
import os
import sys
import torch

# Add src directory to path
sys.path.insert(0, os.path.abspath('../src'))

from parser import process_pdf
from rag import build_rag_index_from_text
from multiagent import DocumentMultiAgent
from section_splitter import split_into_sections
from section_classifier import embed, TARGET_QUERIES
from structure_chunker import *
from structured_retriever import *
from agents import *



In [2]:
parsed_data = process_pdf('../data/protocol.pdf')

In [None]:
parsed_data

In [3]:
chunks=build_structured_chunks(parsed_data)

In [4]:
retriever = BM25StructuredRetriever(chunks, TARGET_QUERIES)

results = retriever.retrieve_context("eligibility", top_k=3)


In [5]:
results



In [None]:
sections = split_into_sections(parsed_data)

In [None]:
sections

In [6]:
# Agent configuration mapping
AGENT_CONFIG = {
    "objectives": {
        "target": "objectives and endpoints",
        "function": extract_objectives,
    },
    "eligibility": {
        "target": "eligibility",
        "function": extract_eligibility,
    },
    "soa": {
        "target": "schedule of activities",
        "function": extract_soa,
    },
    "visit_definitions": {
        "target": "visit_definitions",
        "function": extract_visit_definitions,
    },
    "key_assessments": {
        "target": "key_assessments",
        "function": extract_key_assessments,
    },
}


# def _top_sections_for_target(sections, target, num_sections=2):
#     section_titles = list(sections.keys())
#     section_embeddings = embed(list(sections.keys()))

#     query_embedding = embed([TARGET_QUERIES[target]])[0]
#     scores = torch.matmul(section_embeddings, query_embedding)

#     top_scores, top_indices = torch.topk(scores, k=min(num_sections, len(section_titles)))

#     results = []
#     for score, idx in zip(top_scores.tolist(), top_indices.tolist()):
#         results.append((section_titles[idx], score))

#     return results


def run_agent_extraction(chunks, agent_name, num_sections=3):
    """
    Run extraction for a specific agent.
    
    Args:
        chunks: List of chunk objects with attributes like section_id, full_title, content
        agent_name: One of "objectives", "eligibility", "soa", "visit_definitions", "key_assessments"
        num_sections: Number of top sections to retrieve
    
    Returns:
        Validated Pydantic model output from the agent
    """
    if agent_name not in AGENT_CONFIG:
        raise ValueError(f"Unknown agent: {agent_name}. Must be one of {list(AGENT_CONFIG.keys())}")
    
    config = AGENT_CONFIG[agent_name]
    target = config["target"]
    extraction_func = config["function"]
    
    # Get top relevant sections
    retriever = BM25StructuredRetriever(chunks, TARGET_QUERIES)
    top_sections = retriever.retrieve_context(target, top_k=num_sections)
    
    # Run extraction
    return extraction_func(top_sections)

In [None]:
# objectives_output = run_agent_extraction(chunks, "objectives", num_sections=2)
# eligibility_output = run_agent_extraction(sections, "eligibility", num_sections=2)
# soa_output = run_agent_extraction(chunks, "soa", num_sections=2)
# visit_definitions_output = run_agent_extraction(sections, "visit_definitions", num_sections=2)
key_assessments_output = run_agent_extraction(chunks, "key_assessments", num_sections=5)

In [None]:
# json printing for better readability
print(json.dumps(key_assessments_output, indent=4))

ObjectivesByCategory(primary=[Objective(objective='To estimate the efficacy of 2 IM doses of AZD1222 compared to saline placebo for the prevention of COVID-19', endpoints=['A binary response, whereby a participant with negative serostatus at baseline is defined as a COVID-19 case if their first case of SARS-CoV-2 RT-PCR-positive symptomatic illness occurs ≥15days post second dose of study intervention. Otherwise, a participant is not defined as a COVID-19 case.']), Objective(objective='To assess the safety and tolerability of 2IM doses of AZD1222 compared to saline placebo', endpoints=['Incidence of AEs for 28days post each dose of study intervention', 'Incidence of SAEs, MAAEs, and AESIs from Day 1 post treatment through Day 730']), Objective(objective='To assess the reactogenicity of 2 IM doses of AZD1222 compared to saline placebo', endpoints=['Incidence of local and systemic solicited AEs for 7days post each dose of study intervention'])], secondary=[Objective(objective='To estimat

In [None]:
rag = build_rag_index_from_text(
    text=parsed_data,
    persist_dir="../data/rag_index",
    use_existing=False  # rebuild
)

question = "What are the key assessments and procedures in this study?"
rag_answer = rag.answer(question, top_k=8)
print(rag_answer)

In [None]:
multiagent = DocumentMultiAgent(sections, parsed_data)

question = "What are the key assessments and procedures in this study?"
response_json = multiagent.answer(question)
print(response_json)

In [None]:
response_json