In [None]:
!pip install -qq -r requirements.txt

In [None]:
from src.bda_processor import BDAProcessor
from src.evaluator import Evaluator

# Intelligent Document Processing using Bedrock Data Automation (BDA) 

#### The purpose of this notebook is to demonstrate how to transform unstructured medical data into structured data using Bedrock Data Automation. The `BDAProcessor` class has already been implemented for use in this notebook. It provides the methods required to:
* Create a BDA Project
* Create a BDA Blueprint
* Start a BDA Invocation

## Configure Bedrock Data Automation Project 

#### Initialize BDAProcessor class

In [None]:
# Instantiate the helper that wraps the BDA project, blueprint, and invocation
# calls used in the rest of this notebook.
bda_processor = BDAProcessor()

#### Create BDA project (Note: the project name must be unique)

In [None]:
# Create the BDA project. The project name must be unique, so pick a different
# name if this one is already taken. The returned value is kept as `project_arn`.
project_arn = bda_processor.create_bda_project(project_name="bda-project-demo-1")

In [None]:
# Display the value returned by create_bda_project (presumably the project ARN).
project_arn

## Configure Bedrock Data Automation Blueprint 

#### TODO: Your task is to edit the blueprint schema, which contains the instructions used by the LLM to guide the extraction process

In [None]:
# Blueprint schema sent to Bedrock Data Automation for pathology reports.
# Each entry under "properties" is one field to extract; its "instruction"
# text guides the extraction, so the wording (including any requested output
# format) must be precise.
# NOTE(review): "inferenceType": "explicit" presumably means the value is read
# directly from the document rather than derived -- confirm against BDA docs.
blueprint_schema = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "description": "This is a blueprint for a pathology report",
    "class": "Pathology Report",
    "type": "object",
    "definitions": {},
    "properties": {
        "hospital_name": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Name of hospital"
        },
        "lab_name": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Name of lab"
        },
        "physician_name": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Name of physician. Return first name and last name as a single string value"
        },
        "has_serum_specimen": {
            "type": "string",
            "inferenceType": "explicit",
            "instruction": "Whether a serum specimen was collected. Return Yes or No"
        },
        "serum_receiving_date": {
            "type": "string",
            "inferenceType": "explicit",
            # Four-digit year: the previous "MM/DD/YYY" was a typo and gave the
            # model an invalid date format to follow.
            "instruction": "Date in which serum specimen was received. Return the date in this format: MM/DD/YYYY."
        }
    }
}


#### Create BDA blueprint (Note: the blueprint name must be unique)

In [None]:
# Register `blueprint_schema` as a BDA blueprint. The blueprint name must be
# unique; the returned value is later passed to start_data_automation.
blueprint_arn = bda_processor.create_blueprint(
    blueprint_name="bda-blueprint-demo-1", 
    blueprint_schema=blueprint_schema)

In [None]:
# Display the value returned by create_blueprint (presumably the blueprint ARN).
blueprint_arn

## Invoke Bedrock Data Automation Job

#### The BDA job is asynchronous. A job ID is returned, which can be used later to retrieve the BDA results.

In [None]:
# Start an asynchronous BDA job for a single input PDF using the blueprint
# created earlier. Only a job ID is returned here; the extraction results are
# fetched in a separate step.
job_id = bda_processor.start_data_automation(
    file_path="input_files/patient_02.pdf", 
    blueprint_arn=blueprint_arn)

In [None]:
# Display the job ID returned by start_data_automation.
job_id

#### Get BDA results using the job ID

In [None]:
# Fetch the extraction results for the job started earlier.
# NOTE(review): the job is asynchronous -- whether this call waits for
# completion or must be re-run once the job finishes is not visible here;
# confirm in BDAProcessor.
# NOTE(review): the `_df` suffix suggests a DataFrame is returned -- confirm.
bda_results_df = bda_processor.get_data_automation_results(job_id=job_id)

## BDA Extraction Results

#### Initialize Evaluator class

In [None]:
# Helper used below to build the comparison dataframe and compute accuracy.
evaluator = Evaluator()

#### Create comparison dataframe that joins the ground truth and LLM results

In [None]:
# Join the ground truth with the extraction results for patient_02.
# NOTE(review): results_path is hard-coded and is presumably the file written
# by get_data_automation_results; it has to stay in sync with the file_path
# passed to start_data_automation -- confirm.
comparison_df = evaluator.create_comparison_df(
    ground_truth_path="ground_truth.csv",
    results_path="output/results/bda_results/patient_02_bda_results.csv"
)

In [None]:
# Display the joined ground-truth / extraction comparison.
comparison_df

## Perform LLM Evaluation

#### Oftentimes, the extracted results do not perfectly match the ground truth results. As such, we may want to use additional evaluation methods, such as fuzzy match or LLM match to handle these imperfections. The following values can be specified for the "match_type" when calculating the accuracy of the extraction results. In order to use the LLM match, you must first request access to "Claude 3.5 Haiku" from the Bedrock Console.
* EXACT
* FUZZY
* LLM
* FUZZY_AND_LLM

In [None]:
# Compute accuracy for the same comparison dataframe under each supported
# match_type so the matching strategies can be compared.
exact_match_df = evaluator.calculate_accuracy(comparison_df, match_type="EXACT")
fuzzy_match_df = evaluator.calculate_accuracy(comparison_df, match_type="FUZZY")
# The LLM-based match types require Bedrock model access to Claude 3.5 Haiku.
llm_match_df = evaluator.calculate_accuracy(comparison_df, match_type="LLM")
# NOTE(review): the variable is named "llm_and_fuzzy" while the match_type is
# "FUZZY_AND_LLM" -- same mode, different word order.
llm_and_fuzzy_df = evaluator.calculate_accuracy(comparison_df, match_type="FUZZY_AND_LLM")