In [1]:
from src.bda_processor import BDAProcessor
from src.evaluator import Evaluator

  from pandas.core.computation.check import NUMEXPR_INSTALLED


# Intelligent Document Processing using Bedrock Data Automation (BDA) 

#### This notebook demonstrates how to transform unstructured medical data into structured data using Bedrock Data Automation (BDA). The `BDAProcessor` class has already been implemented for use in this notebook. It provides the methods required to:
* Create a BDA Project
* Create a BDA Blueprint
* Start a BDA Invocation

## Configure Bedrock Data Automation Project 

#### Initialize BDAProcessor class

In [2]:
# Single processor instance reused by every BDA call in this notebook
# (project creation, blueprint creation, job start, result retrieval).
bda_processor = BDAProcessor()

#### Create BDA project (Note: the project name must be unique)

In [4]:
# Create the BDA project and keep the value it returns (the project ARN).
# The project name must be unique, so change it before re-running this cell.
project_arn = bda_processor.create_bda_project(
    project_name="bda-project-demo-1",
)

In [5]:
# Display the ARN returned by create_bda_project.
project_arn

'arn:aws:bedrock:us-east-1:839300737906:data-automation-project/4fe5c8f03f96'

## Configure Bedrock Data Automation Blueprint 

#### TODO: Your task is to edit the blueprint schema, which contains the instructions used by the LLM to guide the extraction process

In [None]:
# Blueprint schema passed to create_blueprint. Each entry under "properties"
# names one field to extract from the pathology report; its "instruction" is
# the natural-language guidance given to the model for that field.
# Every field is declared as a string with inferenceType "explicit".
#
# Date instructions ask for MM/DD/YYYY (four-digit year).
# NOTE: only serum_receiving_date currently carries an explicit
# "return 'Unknown'" fallback rule; the other fields have no fallback.
blueprint_schema = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "description": "This is a blueprint for a pathology report",
    "class": "Pathology Report",
    "type": "object",
    "definitions": {},
    "properties": {
        # --- Report header ---
        "hospital_name": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Name of hospital"
        },
        "lab_name": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Name of lab"
        },
        "physician_name": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Name of physician. Return first name and last name as a single string value"
        },
        # --- Serum specimen ---
        "has_serum_specimen": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Whether a serum specimen was collected. Return Yes or No"
        },
        "serum_receiving_date": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Date in which serum specimen was received. Return the date in this format: MM/DD/YYYY. If the date is not explicitly labeled 'Receiving Date', return 'Unknown'."
        },
        "serum_reporting_date": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Date in which serum specimen was reported. Return the date in this format: MM/DD/YYYY"
        },
        "serum_turnaround_time": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Turnaround time (TAT) for serum specimen"
        },
        # --- Bilirubin: total ---
        "bilirubin_total": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Total bilirubin level"
        },
        "bilirubin_total_unit": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Total bilirubin unit"
        },
        "bilirubin_level": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "High, low, or normal total bilirubin level. Return H, L, or N"
        },
        # --- Bilirubin: conjugated ---
        "bilirubin_conjugated": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Conjugated bilirubin level"
        },
        "bilirubin_conjugated_unit": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Conjugated bilirubin unit"
        },
        "bilirubin_conjugated_level": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "High, low, or normal conjugated bilirubin level. Return H, L, or N"
        },
        # --- Bilirubin: unconjugated ---
        "bilirubin_unconjugated": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Unconjugated bilirubin level"
        },
        "bilirubin_unconjugated_unit": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Unconjugated bilirubin unit"
        },
        "bilirubin_unconjugated_level": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "High, low, or normal unconjugated bilirubin level. Return H, L, or N"
        },
        # --- Blood specimen ---
        "has_blood_specimen": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Whether a blood specimen was collected. Return Yes or No"
        },
        "blood_receiving_date": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Date in which blood specimen was received. Return the date in this format: MM/DD/YYYY"
        },
        "blood_reporting_date": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Date in which blood specimen was reported. Return the date in this format: MM/DD/YYYY"
        },
        "blood_turnaround_time": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Turnaround time (TAT) for blood specimen"
        },
        # --- Ammonia ---
        "ammonia": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Ammonia level"
        },
        "ammonia_unit": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Ammonia unit"
        },
        "ammonia_level": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "High, low, or normal ammonia level. Return H, L, or N"
        }
    }
}


#### Create BDA blueprint (Note: the blueprint name must be unique)

In [8]:
# Register blueprint_schema as a new blueprint and keep the value returned
# (the blueprint ARN). The blueprint name must be unique, so change it
# before re-running this cell.
blueprint_arn = bda_processor.create_blueprint(
    blueprint_name="bda-blueprint-demo-1",
    blueprint_schema=blueprint_schema,
)

In [9]:
# Display the ARN returned by create_blueprint.
blueprint_arn

'arn:aws:bedrock:us-east-1:839300737906:blueprint/d8a1b1b72c88'

## Invoke Bedrock Data Automation Job

#### The BDA job is asynchronous. Starting it returns a job ID, which can be used later to get the BDA results.

In [10]:
# Start the BDA job for one input PDF using the blueprint created above.
# The returned job ID is what get_data_automation_results takes later on.
job_id = bda_processor.start_data_automation(
    file_path="input_files/patient_02.pdf",
    blueprint_arn=blueprint_arn,
)

In [11]:
# Display the job ID returned by start_data_automation.
job_id

'7d366621-582b-4d4c-a11f-a7881e4c227f'

#### Get BDA results using the job ID

In [12]:
# Retrieve the results of the job started above, identified by its job ID.
# NOTE(review): the comparison cell further down reads a CSV under
# output/results/bda_results/ rather than bda_results_df; presumably this call
# also writes that file. Confirm against BDAProcessor.
bda_results_df = bda_processor.get_data_automation_results(job_id=job_id)

## BDA Extraction Results

#### Initialize Evaluator class

In [13]:
# Evaluator instance used below to build the comparison frame and to score it.
evaluator = Evaluator()

#### Create comparison dataframe that joins the ground truth and LLM results

In [14]:
# Build the side-by-side comparison of ground truth and BDA output for
# patient_02. Both inputs are read from CSV paths relative to the notebook's
# working directory.
comparison_df = evaluator.create_comparison_df(
    ground_truth_path="ground_truth.csv",
    results_path="output/results/bda_results/patient_02_bda_results.csv",
)

In [15]:
# Display the comparison frame: field_value is the ground truth and bda_value
# is the value extracted by BDA.
comparison_df

Unnamed: 0,field_name,field_value,bda_value
0,hospital_name,Sarvodaya Hospital,SARVODAYA HOSPITAL
1,lab_name,Diagnostic Point Pathology Labs,Diagnostic Point Pathology Labs
2,physician_name,Atul Kapila,Dr. (Capt) Atul Kapila
3,has_serum_specimen,Yes,Yes
4,serum_receiving_date,Unknown,04/18/2025
5,serum_reporting_date,Unknown,04/18/2025
6,serum_turnaround_time,Unknown,2 Hours 29 Minute
7,bilirubin_total,9.42,9.42
8,bilirubin_total_unit,mg/dl,mg/dl
9,bilirubin_level,H,H


## Perform LLM Evaluation

#### Oftentimes, the extracted results do not perfectly match the ground truth results. As such, we may want to use additional evaluation methods, such as fuzzy matching or LLM-based matching, to handle these imperfections. The following values can be specified for `match_type` when calculating the accuracy of the extraction results:
* EXACT
* FUZZY
* LLM
* FUZZY_AND_LLM

In [16]:
# Score the same comparison frame once per supported match_type.
# The calls are kept as four separate statements so that, if a later call
# fails, the results of the earlier ones remain available in the namespace.
# Each call reports its accuracy percentage in the cell output.
exact_match_df = evaluator.calculate_accuracy(comparison_df, match_type="EXACT")
fuzzy_match_df = evaluator.calculate_accuracy(comparison_df, match_type="FUZZY")
llm_match_df = evaluator.calculate_accuracy(comparison_df, match_type="LLM")
llm_and_fuzzy_df = evaluator.calculate_accuracy(comparison_df, match_type="FUZZY_AND_LLM")

Exact match accuracy: 56.52%
Fuzzy match accuracy: 82.61%
LLM match accuracy: 86.96%
LLM and Fuzzy match accuracy: 86.96%
