In [1]:
import os
import sys
import torch

# Add src directory to path
sys.path.insert(0, os.path.abspath('../src'))

from parser import process_pdf
from section_splitter import split_into_sections
from section_classifier import embed, TARGET_QUERIES
from agents import *

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [2]:
# Run the project's `process_pdf` on the protocol PDF. The path is relative,
# so it resolves against the kernel's current working directory.
parsed_data = process_pdf('../data/protocol.pdf')

In [3]:
# Split the parsed document into sections. Later cells treat `sections` as a
# mapping: its keys are embedded as section titles and `sections[title]` is
# joined as the section text.
sections = split_into_sections(parsed_data)

In [4]:
def _top_sections_for_target(sections, target, num_sections=2):
    """
    Rank section titles against a target query and return the best matches.

    Args:
        sections: Mapping whose keys are section titles. Only the keys are
            embedded and scored here; the values are not read.
        target: Key into ``TARGET_QUERIES`` selecting the query text to embed.
        num_sections: Maximum number of titles to return; capped at the number
            of sections available.

    Returns:
        List of ``(title, score)`` tuples in the order produced by
        ``torch.topk`` (highest score first). Empty when ``sections`` is empty.

    Raises:
        KeyError: If ``target`` is not a key of ``TARGET_QUERIES`` (not checked
            when ``sections`` is empty, because nothing is embedded then).
    """
    section_titles = list(sections.keys())
    if not section_titles:
        # Nothing to rank: return early so an empty input never reaches
        # embed() / matmul / topk, whose behaviour on empty batches is not
        # something this helper should depend on.
        return []

    # Embed the titles once, reusing the list above so that the indices
    # returned by topk map straight back to `section_titles`.
    section_embeddings = embed(section_titles)
    query_embedding = embed([TARGET_QUERIES[target]])[0]

    # One dot-product score per title.
    # NOTE(review): this equals cosine similarity only if embed() returns
    # unit-normalised vectors — confirm in section_classifier.
    scores = torch.matmul(section_embeddings, query_embedding)

    # Never ask topk for more entries than there are titles.
    k = min(num_sections, len(section_titles))
    top_scores, top_indices = torch.topk(scores, k=k)

    return [
        (section_titles[idx], score)
        for score, idx in zip(top_scores.tolist(), top_indices.tolist())
    ]


def _run_extraction(sections, target, label, extractor, num_sections=2):
    """
    Shared pipeline behind the ``run_*_extraction`` entry points.

    Ranks section titles against ``target`` with ``_top_sections_for_target``,
    prints the selected titles and their scores prefixed by ``label``, joins
    the text of those sections with blank lines, and returns ``extractor``
    applied to the joined text.

    Args:
        sections: Mapping of section title -> section text.
        target: Target key forwarded to ``_top_sections_for_target``.
        label: Prefix used in the printed ``"<label> -> ..."`` summary line.
        extractor: Callable invoked with the combined section text.
        num_sections: How many top-ranked sections to combine.

    Returns:
        Whatever ``extractor`` returns for the combined text.
    """
    top_sections = _top_sections_for_target(sections, target, num_sections=num_sections)

    # Show which sections were selected so the ranking can be checked by eye.
    ranked = ", ".join(f"{title} (score={score:.3f})" for title, score in top_sections)
    print(f"{label} -> {ranked}")

    combined_content = "\n\n".join(sections[title] for title, _ in top_sections)

    return extractor(combined_content)


def run_objectives_extraction(sections, num_sections=2):
    """
    Extract objectives/endpoints without enforced structure.

    Combines the ``num_sections`` sections ranked best for the
    "objectives and endpoints" target and passes them to
    ``extract_objectives``, returning its result.
    """
    return _run_extraction(
        sections,
        "objectives and endpoints",
        "objectives",
        extract_objectives,
        num_sections=num_sections,
    )


def run_eligibility_extraction(sections, num_sections=2):
    """
    Extract eligibility criteria without enforced structure.

    Combines the ``num_sections`` sections ranked best for the "eligibility"
    target and passes them to ``extract_eligibility``, returning its result.
    """
    return _run_extraction(
        sections,
        "eligibility",
        "eligibility",
        extract_eligibility,
        num_sections=num_sections,
    )


def run_soa_extraction(sections, num_sections=2):
    """
    Extract schedule of activities without enforced structure.

    Combines the ``num_sections`` sections ranked best for the
    "schedule of activities" target and passes them to ``extract_soa``,
    returning its result.
    """
    return _run_extraction(
        sections,
        "schedule of activities",
        "schedule of activities",
        extract_soa,
        num_sections=num_sections,
    )


def run_visit_definitions_extraction(sections, num_sections=2):
    """
    Extract visit definitions and timing without enforced structure.

    Combines the ``num_sections`` sections ranked best for the
    "visit_definitions" target and passes them to
    ``extract_visit_definitions``, returning its result.
    """
    return _run_extraction(
        sections,
        "visit_definitions",
        "visit definitions",
        extract_visit_definitions,
        num_sections=num_sections,
    )

In [5]:
# Run the four extractions, each over its two top-ranked sections.
# Every call also prints one "<label> -> title (score=...)" line listing the
# sections it selected, so the ranking can be inspected in the cell output.
objectives_output = run_objectives_extraction(sections, num_sections=2)
eligibility_output = run_eligibility_extraction(sections, num_sections=2)
soa_output = run_soa_extraction(sections, num_sections=2)
visit_definitions_output = run_visit_definitions_extraction(sections, num_sections=2)

objectives -> OBJECTIVES   AND ENDPOINTS (score=0.571), STUDY  DESIGN: Scientific Rationale for Study Design: Rationale for Study Design and Participant Population, Rationale for Study Endpoints
The efficacy endpoints in this study are analogous to endpoints used for evaluating the (score=0.561)
eligibility -> STUDY  POPULATION: Inclusion Criteria
Participants are eligible to be included inthe study only if all of the following criteria apply:
Age (score=0.803), STUDY  POPULATION: Exclusion Criteria
Participants are excluded from the study if any of the following criteria apply:
Medical Conditions
1  History of allergy to any component of the vaccine (score=0.620)
schedule of activities -> PROTOCOL   SUMMARY: Schedule of Activities
The SoA tables include: (score=0.483), STUDY  ASSESSMENTS   AND  PROCEDURES: Medical Resource Utilization and Health Economics (score=0.477)
visit definitions -> STUDY  ASSESSMENTS   AND  PROCEDURES: Safety Assessments: Physical Examinations
A complete physi

In [10]:
# Show the raw result returned by run_eligibility_extraction for inspection.
print(eligibility_output)

```json
{
  "inclusion_criteria": [
    "Adult, ≥ 18 years of age at thetime of consent",
    "Increased risk of SARS-CoV-2 infection\n    Defined as adults whose locations or circumstances put them at appreciable risk of\n     exposure to SARS-CoV-2 and COVID-19, based on available risk assessment\n     contemporaneous to enrollment (believed to be at risk/exposure)",
    "Medically stable such that, according to the judgment of the investigator, hospitalization\n   within the study period is not anticipated and the participant appears likely to be able to\n   remain on study through the end of protocol-specified follow-up\n    A stable medical condition is defined as disease not requiring significant change in\n     therapy or hospitalization for worsening disease during the 3 months prior to\n     enrollment",
    "Able to understand and comply with study requirements/procedures (if applicable, with\n   assistance by caregiver, surrogate, or legally authorized representative) base