In [1]:
import os
import sys
import torch

# Add src directory to path
sys.path.insert(0, os.path.abspath('../src'))

from parser import process_pdf
from rag import build_rag_index_from_sections
from multiagent import DocumentMultiAgent
from section_splitter import split_into_sections
from section_classifier import embed, TARGET_QUERIES
from agents import *



In [2]:
# Parse the protocol PDF; the result is passed to split_into_sections in a later cell.
parsed_data = process_pdf('../data/protocol.pdf')

In [None]:
# Debug aid: prints the complete parser output (presumably large for a full protocol).
print(parsed_data)

In [3]:
# Split the parsed document into sections. Later cells treat `sections` as a
# mapping of section title -> section content (they use .keys() and [title]).
sections = split_into_sections(parsed_data)

In [4]:
def _top_sections_for_target(sections, target, num_sections=2):
    section_titles = list(sections.keys())
    section_embeddings = embed(list(sections.keys()))

    query_embedding = embed([TARGET_QUERIES[target]])[0]
    scores = torch.matmul(section_embeddings, query_embedding)

    top_scores, top_indices = torch.topk(scores, k=min(num_sections, len(section_titles)))

    results = []
    for score, idx in zip(top_scores.tolist(), top_indices.tolist()):
        results.append((section_titles[idx], score))

    return results


def _extract_from_top_sections(sections, target, label, extractor, num_sections):
    """
    Shared pipeline behind the ``run_*_extraction`` entry points.

    Selects the best-matching section titles for ``target``, prints them with
    their scores under ``label``, joins the matching section contents with a
    blank line between them, and passes the combined text to ``extractor``.

    Args:
        sections: Mapping of section title -> section content.
        target: Target key forwarded to ``_top_sections_for_target``.
        label: Prefix used in the printed summary line.
        extractor: Callable that receives the combined section text.
        num_sections: Maximum number of sections to combine.

    Returns:
        Whatever ``extractor`` returns for the combined text.
    """
    top_sections = _top_sections_for_target(sections, target, num_sections=num_sections)

    # Show which sections were selected so a poor retrieval is easy to spot.
    summary = ", ".join(f"{title} (score={score:.3f})" for title, score in top_sections)
    print(f"{label} -> " + summary)

    combined_content = "\n\n".join(sections[title] for title, _ in top_sections)

    return extractor(combined_content)


def run_objectives_extraction(sections, num_sections=2):
    """
    Extract objectives/endpoints without enforced structure.

    Args:
        sections: Mapping of section title -> section content.
        num_sections: Maximum number of top-ranked sections to feed the extractor.

    Returns:
        The result of ``extract_objectives`` on the combined section text.
    """
    return _extract_from_top_sections(
        sections, "objectives and endpoints", "objectives", extract_objectives, num_sections
    )


def run_eligibility_extraction(sections, num_sections=2):
    """
    Extract eligibility criteria without enforced structure.

    Args:
        sections: Mapping of section title -> section content.
        num_sections: Maximum number of top-ranked sections to feed the extractor.

    Returns:
        The result of ``extract_eligibility`` on the combined section text.
    """
    return _extract_from_top_sections(
        sections, "eligibility", "eligibility", extract_eligibility, num_sections
    )


def run_soa_extraction(sections, num_sections=2):
    """
    Extract schedule of activities without enforced structure.

    Args:
        sections: Mapping of section title -> section content.
        num_sections: Maximum number of top-ranked sections to feed the extractor.

    Returns:
        The result of ``extract_soa`` on the combined section text.
    """
    return _extract_from_top_sections(
        sections, "schedule of activities", "schedule of activities", extract_soa, num_sections
    )


def run_visit_definitions_extraction(sections, num_sections=2):
    """
    Extract visit definitions and timing without enforced structure.

    Args:
        sections: Mapping of section title -> section content.
        num_sections: Maximum number of top-ranked sections to feed the extractor.

    Returns:
        The result of ``extract_visit_definitions`` on the combined section text.
    """
    # NOTE(review): this target key uses an underscore while the other three
    # use spaces — confirm it matches the key defined in TARGET_QUERIES.
    return _extract_from_top_sections(
        sections, "visit_definitions", "visit definitions", extract_visit_definitions, num_sections
    )

In [8]:
# Run the extractors. Only the schedule-of-activities extraction is enabled;
# the other three calls are kept commented out as toggles. Each call prints
# the sections it selected (with scores) before returning the extractor output.
# objectives_output = run_objectives_extraction(sections, num_sections=2)
# eligibility_output = run_eligibility_extraction(sections, num_sections=2)
soa_output = run_soa_extraction(sections, num_sections=2)
# visit_definitions_output = run_visit_definitions_extraction(sections, num_sections=2)

schedule of activities -> PROTOCOL   SUMMARY: Schedule of Activities
The SoA tables include: (score=0.483), STUDY  ASSESSMENTS   AND  PROCEDURES: Medical Resource Utilization and Health Economics (score=0.477)


In [9]:
# Show the schedule-of-activities extraction result.
print(soa_output)

| Procedure / Study Day                                      | Day -14 to Day 1a   | For details see Section   |
|:-----------------------------------------------------------|:--------------------|:--------------------------|
| Informed consent: main study                               | X                   | 5.1                       |
| Assignment SID number                                      | X                   | 6.3                       |
| Medical history                                            | X                   | 5.1, 5.2                  |
| Complete physical examination, including height and weight | X                   | 8.2.1                     |
| Vital signs (including pulse oximetry)                     | X                   | 8.2.2                     |
| Pregnancy test –urine or serum (WOCBP only) b              | X                   | 8.2.3                     |
| Assessment of SAEs                                         | X                   | 8.3        

In [None]:
# Build a RAG index over the split sections.
rag_index = build_rag_index_from_sections(sections)

# Ask a single question against the index; top_k=5 is forwarded to answer()
# (presumably the number of retrieved chunks — confirm in rag.py).
question = "What are the primary objectives of the study?"
rag_answer = rag_index.answer(question, top_k=5)
print(rag_answer)

In [None]:
# Sanity check: size of rag_index.documents.
len(rag_index.documents)

In [None]:
# Build the multi-agent pipeline over the same sections mapping.
multiagent = DocumentMultiAgent(sections)

# Same question as the RAG cell, so the two approaches can be compared.
question = "What are the primary objectives of the study?"
response_json = multiagent.answer(question)
print(response_json)

In [None]:
# Display the multi-agent response via the notebook's rich repr.
response_json