In [56]:
%pip install boto3 PyPDF2

Note: you may need to restart the kernel to use updated packages.


In [62]:
import boto3
import json
import io
import base64
from pathlib import Path
# from pypdf import PdfReader, PdfWriter
from PyPDF2 import PdfWriter, PdfReader, PdfFileMerger
from botocore.config import Config
import logging
import re
# logging.basicConfig(level=logging.DEBUG)
# logging.getLogger("botocore").setLevel(logging.DEBUG)

# (Optional) customize timeouts if you found you needed it:
cfg = Config(connect_timeout=30, read_timeout=300)
session = boto3.Session(profile_name="par_servicios")
# Create the Bedrock client:
region = "us-east-1"
bedrock = session.client(
    "bedrock-runtime",
    region_name=region,
    config=cfg
)


DEBUG:botocore.hooks:Changing event name from creating-client-class.iot-data to creating-client-class.iot-data-plane
DEBUG:botocore.hooks:Changing event name from before-call.apigateway to before-call.api-gateway
DEBUG:botocore.hooks:Changing event name from request-created.machinelearning.Predict to request-created.machine-learning.Predict
DEBUG:botocore.hooks:Changing event name from before-parameter-build.autoscaling.CreateLaunchConfiguration to before-parameter-build.auto-scaling.CreateLaunchConfiguration
DEBUG:botocore.hooks:Changing event name from before-parameter-build.route53 to before-parameter-build.route-53
DEBUG:botocore.hooks:Changing event name from request-created.cloudsearchdomain.Search to request-created.cloudsearch-domain.Search
DEBUG:botocore.hooks:Changing event name from docs.*.autoscaling.CreateLaunchConfiguration.complete-section to docs.*.auto-scaling.CreateLaunchConfiguration.complete-section
DEBUG:botocore.hooks:Changing event name from before-parameter-buil

In [None]:
def read_document(file_to_read):
    with open(file_to_read, "rb") as document:
        raw = document.read()
        return raw

def markdown_to_plain(md_path: Path) -> str:
    """Return a prompt string with most Markdown chrome removed."""
    text = md_path.read_text(encoding="utf-8")

    # 1) Remove HTML comments (often used for model delimiters)
    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)

    # 2) Drop <details> … </details> blocks (examples already in your codebase)
    text = re.sub(r"<details>.*?</details>", "", text, flags=re.DOTALL | re.IGNORECASE)

    # 3) Remove fenced-code blocks ```…``` (they tend to confuse the model)
    text = re.sub(r"```.*?```", "", text, flags=re.DOTALL)

    # 4) Strip Markdown headings, bold, italics, tables
    text = re.sub(r"^#+\s*", "", text, flags=re.MULTILINE)      # headings
    text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)               # bold → plain
    text = re.sub(r"__(.+?)__", r"\1", text)
    text = re.sub(r"\*(.+?)\*",  r"\1", text)                  # italics
    text = re.sub(r"_([^_]+)_", r"\1", text)
    text = re.sub(r"^\|.*\|\s*$", "", text, flags=re.MULTILINE) # tables rows

    # 5) Collapse multiple blank lines
    text = re.sub(r"\n{2,}", "\n", text).strip()

    return text

def get_instructions(action: str) -> str:
    mapping = {"clasification": "./instructions/clasification.txt"}
    fn = mapping.get(action, "./instructions/clasification.txt")
    return Path(fn).read_text()

def create_messages(prompt: str, pdf_bytes: bytes):
    # One single message that contains both:
    #  1) A text prompt block to tell the model what to do, and
    #  2) A document block carrying your PDF
    msg = {
        "role": "user",
        "content": [
            # 1) prompt about what you want done
            {"text": prompt},

            # 2) the PDF itself, under a "document" key:
            {
                "document": {
                    "name":   "document_to_evaluate.pdf",  # an arbitrary label
                    "format": "pdf",                       # file format
                    "source": {
                        # "bytes": pdf_bytes                # raw bytes → SDK handles the rest
                        "bytes": base64.b64encode(pdf_bytes).decode("utf-8")
                    }
                }
            }
        ]
    }
    return [msg]

def set_model_params(max_tokens=300, top_p=0.1, temperature=0.3):
    return {
        "maxTokens":    max_tokens,
        "topP":         top_p,
        "temperature":  temperature
    }

def invoke_nova(model_id: str, messages: list, inference_cfg: dict):
    payload = {
        "messages":        messages,
        "inferenceConfig": inference_cfg
    }

    print(">>> PAYLOAD:", json.dumps(payload, indent=2, ensure_ascii=False))

    resp = bedrock.invoke_model(
        modelId=model_id,
        contentType="application/json",
        accept="application/json",
        body=json.dumps(payload).encode("utf-8")
    )
    body = resp["body"].read().decode("utf-8")
    print(">>> RAW RESPONSE (first 300 chars):", body[:300])
    return json.loads(body)


In [None]:
def split_pdf_into_chunks(pdf_path: Path, chunk_size: int) -> list[bytes]:
    """
    Split the PDF into chunks of `chunk_size` pages.
    Returns a list where each element is the raw-PDF bytes for that chunk.
    """
    reader = PdfReader(str(pdf_path))
    total_pages = len(reader.pages)
    chunks = []

    for start in range(0, total_pages, chunk_size):
        writer = PdfWriter()
        # add pages [start .. start+chunk_size)
        for i in range(start, min(start + chunk_size, total_pages)):
            writer.add_page(reader.pages[i])
        buf = io.BytesIO()
        writer.write(buf)
        chunks.append(buf.getvalue())

    return chunks


def escape_controls(s):
    def esc(m): return r"\u%04x" % ord(m.group())
    # return re.sub(r"[\x00-\x1F\x7F]", esc, s)
    return re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", esc, s)

def resume_json(partial: str, model_id: str, cfg: dict) -> str:
    follow = (
        "Your last JSON response was cut off. "
        "Please continue exactly from where you left off, "
        "completing the JSON without repeating keys."
    )

    msgs = [
        {"role":"user", "content":[{"text": partial}]},
        {"role":"user", "content":[{"text": follow}]}
    ]

    cont = invoke_nova(model_id, msgs, cfg)
    return cont["output"]["message"]["content"][0]["text"]


def parse_json_blob(text: str) -> dict:
    """
    Parse JSON from the model response text.
    The text should contain a JSON object with the classification results.
    """
    text = text.strip()

    # If the text starts and ends with curly braces, it's likely already the JSON we want
    if text.startswith('{') and text.endswith('}'):
        try:
            return json.loads(text)
        except json.JSONDecodeError as e:
            print(f"Direct JSON parse failed: {e}")
            # Fall through to try other methods

    # Try to find JSON within the text
    # Look for content between first { and last }
    start_idx = text.find('{')
    end_idx = text.rfind('}')

    if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
        json_str = text[start_idx:end_idx + 1]
        try:
            return json.loads(json_str)
        except json.JSONDecodeError as e:
            print(f"Extracted JSON parse failed: {e}")
            print(f"Attempted to parse: {repr(json_str[:200])}")

    # If all else fails, return an error dict
    raise ValueError(f"Could not extract valid JSON from text: {repr(text[:200])}")

def classify_chunks(
    pdf_chunks: list[bytes],
    prompt: str,
    model_id: str,
    cfg: dict,
    chunk_size: int
) -> list[dict]:
    """
    For each PDF‐blob chunk, invoke the model and return a dict with:
        - parsed JSON fields
        - role
        - stopReason
        - usage
    """
    results      = []
    total_chunks = len(pdf_chunks)

    for idx, blob in enumerate(pdf_chunks, start=1):
        # 1) prompt header
        pages_in_blob = len(PdfReader(io.BytesIO(blob)).pages)
        start_page    = (idx-1)*chunk_size + 1
        end_page      = start_page + pages_in_blob - 1

        chunk_prompt = (
            f"{prompt}\n\n"
            f"--- Chunk {idx}/{total_chunks} "
            f"(pages {start_page} to {end_page}) ---"
        )

        # 2) invoke
        msgs = create_messages(chunk_prompt, blob)
        raw  = invoke_nova(model_id, msgs, cfg)
        raw_text = raw["output"]["message"]["content"][0]["text"]
        sr       = raw.get("stopReason")

        # Only try to resume if the response was actually truncated AND ends with incomplete JSON
        if sr == "max_tokens" and not raw_text.strip().endswith("}"):
            print(f"⚠️ Response truncated for chunk {idx}, attempting to resume...")
            # get the rest
            suffix   = resume_json(raw_text, model_id, cfg)
            # stitch, dropping any duplicate "{" or "}"
            raw_text = raw_text.rstrip("\n") + suffix

        # Debug: Print the raw response to see what we're actually getting
        print(f"Raw response for chunk {idx} (first 200 chars):", repr(raw_text[:200]))

        save_to_json(raw, f"response_{idx:02d}.json")
        save_to_json(raw_text, f"response_{idx:02d}_raw.json")

        # 3) extract the assistant role + stopReason + usage
        role       = raw["output"]["message"]["role"]
        stop_reason= raw.get("stopReason")
        usage      = raw.get("usage", {})

        try:
            safe = escape_controls(raw_text)
            parsed = parse_json_blob(safe)

            # Ensure we have the required fields, provide defaults if missing
            if "document_number" not in parsed:
                parsed["document_number"] = f"UNKNOWN_{idx}"
            if "document_type" not in parsed:
                parsed["document_type"] = "unknown"
            if "Category" not in parsed and "category" in parsed:
                # Handle case sensitivity
                parsed["Category"] = parsed["category"]
            elif "Category" not in parsed:
                parsed["Category"] = "unknown"

        except ValueError as ve:
            print(f"⚠️ Chunk {idx} JSON parse error:", ve)
            print(f"Raw text that failed to parse: {repr(raw_text)}")
            # Provide a fallback structure with required fields
            parsed = {
                "document_number": f"ERROR_{idx}",
                "document_type": "error",
                "Category": "error",
                "error": f"Failed to parse JSON: {str(ve)}",
                "raw_response": raw_text[:500]  # First 500 chars for debugging
            }

        # 5) merge everything into one dict
        parsed.update({
            "index":      idx,
            "role":       role,
            "stopReason": stop_reason,
            "usage":      usage
        })

        results.append(parsed)

    return results

def combine_responses(chunks: list[dict]) -> dict:
    """
    Combine multiple chunk responses into a single document structure.
    Pulls header info from the first valid chunk.
    """
    if not chunks:
        raise ValueError("No chunks to combine")

    # Find the first chunk with valid header info
    header = None
    for chunk in chunks:
        if (chunk.get("document_number") and
            chunk.get("document_type") and
            chunk.get("Category")):
            header = chunk
            break

    if not header:
        # If no valid header found, use the first chunk with defaults
        header = chunks[0]

    out = {
        "document_number": header.get("document_number", "UNKNOWN"),
        "document_type":   header.get("document_type", "unknown"),
        "Category":        header.get("Category", "unknown"),
        "path":            header.get("path"),
        "chunks": chunks   # embed all of our enriched chunk-dicts
    }
    return out

def save_to_json(response, output_path="response_indented.json", indent=2):
    """
    Dumps `response` to JSON at `output_path`.
    If necessary, creates parent directories.
    Catches and reports serialization errors.
    """
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    try:
        with out.open("w", encoding="utf-8") as f:
            json.dump(response, f, indent=indent, ensure_ascii=False)
        print(f"✅ Saved JSON to {out}")
    except TypeError as e:
        print(f"⚠️ Could not JSON-serialize response: {e}")
        # Fall back to writing the raw repr
        with out.open("w", encoding="utf-8") as f:
            f.write(repr(response))
        print(f"🔧 Wrote raw Python repr to {out}")

In [60]:
pdf_path = Path("../files_examples/CERL/800035887/9_CamCom_2020-02-28.pdf")
chunk_size = 2

modelId = "us.amazon.nova-pro-v1:0"
temperature = 0.1
top_p = 0.9
max_tokens = 10000

pdf_chunks = split_pdf_into_chunks(pdf_path, chunk_size)
clasification_prompt_raw = get_instructions("clasification")
clasification_prompt = markdown_to_plain(clasification_prompt_raw)
cfg = set_model_params(max_tokens, top_p, temperature)
chunk_responses = classify_chunks(
    pdf_chunks, clasification_prompt, modelId, cfg, chunk_size
)

final_response = combine_responses(chunk_responses)
print(final_response)
save_to_json(final_response, "outputs/my_result_v2.json", 2)


DEBUG:botocore.hooks:Event before-parameter-build.bedrock-runtime.InvokeModel: calling handler <function generate_idempotent_uuid at 0x79e97b98a160>
DEBUG:botocore.hooks:Event before-parameter-build.bedrock-runtime.InvokeModel: calling handler <function _handle_request_validation_mode_member at 0x79e97b9acd60>


DEBUG:botocore.regions:Calling endpoint provider with parameters: {'Region': 'us-east-1', 'UseDualStack': False, 'UseFIPS': False}
DEBUG:botocore.regions:Endpoint provider result: https://bedrock-runtime.us-east-1.amazonaws.com
DEBUG:botocore.hooks:Event before-call.bedrock-runtime.InvokeModel: calling handler <function add_recursion_detection_header at 0x79e97b989080>
DEBUG:botocore.hooks:Event before-call.bedrock-runtime.InvokeModel: calling handler <function add_query_compatibility_header at 0x79e97b9accc0>
DEBUG:botocore.hooks:Event before-call.bedrock-runtime.InvokeModel: calling handler <function inject_api_version_header_if_needed at 0x79e97b98bc40>
DEBUG:botocore.endpoint:Making request for OperationModel(name=InvokeModel) with params: {'url_path': '/model/us.amazon.nova-pro-v1%3A0/invoke', 'query_string': {}, 'method': 'POST', 'headers': {'Content-Type': 'application/json', 'Accept': 'application/json', 'User-Agent': 'Boto3/1.38.21 md/Botocore#1.38.21 ua/2.1 os/linux#6.11.0-25

>>> PAYLOAD: {
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "text": "SYSTEM: You are a front-line support agent at PAR Servicios’ Document Filing Desk.\nYour job is to verify each incoming PDF has extractable content, then classify it into one of five legal-document categories and output exactly one JSON object (no extra text).\n\nCATEGORIES:\n• CERL   → Certificados de Existencia y Representación Legal\n• CECRL  → Copia de cédulas de ciudadanía del Representante Legal\n• RUT    → Registro Único Tributario\n• RUB    → Registro Único de Beneficiarios\n• ACC    → Composiciones Accionarias\n\nINSTRUCTIONS:\n\nIf the PDF is blank or contains only whitespace, set \"Category\": \"BLANK\" and \"Text\": \"\".\n\nIf it contains only a hyperlink (no other text), set \"Category\": \"LINK_ONLY\" and \"Text\": \"\".\n\nOtherwise, read the text and choose the correct one of the five categories above.\n\nAlways output valid JSON matching this schema exactly:\n\n{

DEBUG:urllib3.connectionpool:https://bedrock-runtime.us-east-1.amazonaws.com:443 "POST /model/us.amazon.nova-pro-v1%3A0/invoke HTTP/1.1" 200 9089
DEBUG:botocore.parsers:Response headers: {'Date': 'Fri, 23 May 2025 21:03:09 GMT', 'Content-Type': 'application/json', 'Content-Length': '9089', 'Connection': 'keep-alive', 'x-amzn-RequestId': '52aac17e-2789-4f4b-97cc-8dba0cbce352', 'X-Amzn-Bedrock-Invocation-Latency': '100997', 'X-Amzn-Bedrock-Cache-Write-Input-Token-Count': '0', 'X-Amzn-Bedrock-Cache-Read-Input-Token-Count': '0', 'X-Amzn-Bedrock-Output-Token-Count': '5269', 'X-Amzn-Bedrock-Input-Token-Count': '27466'}
DEBUG:botocore.parsers:Response body:
<botocore.response.StreamingBody object at 0x79e97ad77580>
DEBUG:botocore.hooks:Event needs-retry.bedrock-runtime.InvokeModel: calling handler <botocore.retryhandler.RetryHandler object at 0x79e97220ab70>
DEBUG:botocore.retryhandler:No retry needed.
DEBUG:botocore.hooks:Event before-parameter-build.bedrock-runtime.InvokeModel: calling hand

>>> RAW RESPONSE (first 300 chars): {"output":{"message":{"content":[{"text":"{\n  \"document_number\": \"800.035.887-9\",\n  \"document_type\": \"company\",\n  \"category\": \"CERL\",\n  \"text\": \"Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\nFecha Expedición: 9 de diciemb
Raw response for chunk 1 (first 200 chars): '{\n  "document_number": "800.035.887-9",\n  "document_type": "company",\n  "category": "CERL",\n  "text": "Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\'
✅ Saved JSON to response_01.json
✅ Saved JSON to response_01_raw.json
⚠️ Chunk 1 JSON parse error: Could not extract valid JSON from text: '{\n  "document_number": "800.035.887-9",\n  "document_type": "company",\n  "category": "CERL",\n  "text": "Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\'
Raw text that failed to parse: '{\n  "document_number": 

DEBUG:urllib3.connectionpool:https://bedrock-runtime.us-east-1.amazonaws.com:443 "POST /model/us.amazon.nova-pro-v1%3A0/invoke HTTP/1.1" 200 13655
DEBUG:botocore.parsers:Response headers: {'Date': 'Fri, 23 May 2025 21:04:48 GMT', 'Content-Type': 'application/json', 'Content-Length': '13655', 'Connection': 'keep-alive', 'x-amzn-RequestId': '8f09b71b-2da6-42dc-855b-f89ae47486b0', 'X-Amzn-Bedrock-Invocation-Latency': '98578', 'X-Amzn-Bedrock-Cache-Write-Input-Token-Count': '0', 'X-Amzn-Bedrock-Cache-Read-Input-Token-Count': '0', 'X-Amzn-Bedrock-Output-Token-Count': '5132', 'X-Amzn-Bedrock-Input-Token-Count': '27466'}
DEBUG:botocore.parsers:Response body:
<botocore.response.StreamingBody object at 0x79e972308940>
DEBUG:botocore.hooks:Event needs-retry.bedrock-runtime.InvokeModel: calling handler <botocore.retryhandler.RetryHandler object at 0x79e97220ab70>
DEBUG:botocore.retryhandler:No retry needed.
DEBUG:botocore.hooks:Event before-parameter-build.bedrock-runtime.InvokeModel: calling han

>>> RAW RESPONSE (first 300 chars): {"output":{"message":{"content":[{"text":"{\n  \"document_number\": \"AB24817719\",\n  \"document_type\": \"company\",\n  \"category\": \"CERL\",\n  \"text\": \"Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\nFecha Expedición: 9 de diciembre 
Raw response for chunk 2 (first 200 chars): '{\n  "document_number": "AB24817719",\n  "document_type": "company",\n  "category": "CERL",\n  "text": "Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\n'
✅ Saved JSON to response_02.json
✅ Saved JSON to response_02_raw.json
Direct JSON parse failed: Invalid \escape: line 5 column 5972 (char 6060)
Extracted JSON parse failed: Invalid \escape: line 5 column 5972 (char 6060)
Attempted to parse: '{\n  "document_number": "AB24817719",\n  "document_type": "company",\n  "category": "CERL",\n  "text": "Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFIC

DEBUG:urllib3.connectionpool:https://bedrock-runtime.us-east-1.amazonaws.com:443 "POST /model/us.amazon.nova-pro-v1%3A0/invoke HTTP/1.1" 200 8489
DEBUG:botocore.parsers:Response headers: {'Date': 'Fri, 23 May 2025 21:06:29 GMT', 'Content-Type': 'application/json', 'Content-Length': '8489', 'Connection': 'keep-alive', 'x-amzn-RequestId': '747526dc-b515-4f34-9e1b-dd984ed7f8fb', 'X-Amzn-Bedrock-Invocation-Latency': '101119', 'X-Amzn-Bedrock-Cache-Write-Input-Token-Count': '0', 'X-Amzn-Bedrock-Cache-Read-Input-Token-Count': '0', 'X-Amzn-Bedrock-Output-Token-Count': '5269', 'X-Amzn-Bedrock-Input-Token-Count': '27466'}
DEBUG:botocore.parsers:Response body:
<botocore.response.StreamingBody object at 0x79e97ad76da0>
DEBUG:botocore.hooks:Event needs-retry.bedrock-runtime.InvokeModel: calling handler <botocore.retryhandler.RetryHandler object at 0x79e97220ab70>
DEBUG:botocore.retryhandler:No retry needed.
DEBUG:botocore.hooks:Event before-parameter-build.bedrock-runtime.InvokeModel: calling hand

>>> RAW RESPONSE (first 300 chars): {"output":{"message":{"content":[{"text":"{\n  \"document_number\": \"AB24817719\",\n  \"document_type\": \"company\",\n  \"category\": \"CERL\",\n  \"text\": \"Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\nFecha Expedición: 9 de diciembre 
Raw response for chunk 3 (first 200 chars): '{\n  "document_number": "AB24817719",\n  "document_type": "company",\n  "category": "CERL",\n  "text": "Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\n'
✅ Saved JSON to response_03.json
✅ Saved JSON to response_03_raw.json
⚠️ Chunk 3 JSON parse error: Could not extract valid JSON from text: '{\n  "document_number": "AB24817719",\n  "document_type": "company",\n  "category": "CERL",\n  "text": "Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\n'
Raw text that failed to parse: '{\n  "document_number"

DEBUG:urllib3.connectionpool:https://bedrock-runtime.us-east-1.amazonaws.com:443 "POST /model/us.amazon.nova-pro-v1%3A0/invoke HTTP/1.1" 200 7471
DEBUG:botocore.parsers:Response headers: {'Date': 'Fri, 23 May 2025 21:07:09 GMT', 'Content-Type': 'application/json', 'Content-Length': '7471', 'Connection': 'keep-alive', 'x-amzn-RequestId': '78989c50-f4b9-4f75-bc4a-72d59fcc8a24', 'X-Amzn-Bedrock-Invocation-Latency': '38730', 'X-Amzn-Bedrock-Cache-Write-Input-Token-Count': '0', 'X-Amzn-Bedrock-Cache-Read-Input-Token-Count': '0', 'X-Amzn-Bedrock-Output-Token-Count': '1881', 'X-Amzn-Bedrock-Input-Token-Count': '27466'}
DEBUG:botocore.parsers:Response body:
<botocore.response.StreamingBody object at 0x79e97b4f3b50>
DEBUG:botocore.hooks:Event needs-retry.bedrock-runtime.InvokeModel: calling handler <botocore.retryhandler.RetryHandler object at 0x79e97220ab70>
DEBUG:botocore.retryhandler:No retry needed.
DEBUG:botocore.hooks:Event before-parameter-build.bedrock-runtime.InvokeModel: calling handl

>>> RAW RESPONSE (first 300 chars): {"output":{"message":{"content":[{"text":"{\n  \"document_number\": \"AB24817719\",\n  \"document_type\": \"company\",\n  \"category\": \"CERL\",\n  \"text\": \"Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\nFecha Expedición: 9 de diciembre 
Raw response for chunk 4 (first 200 chars): '{\n  "document_number": "AB24817719",\n  "document_type": "company",\n  "category": "CERL",\n  "text": "Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\n'
✅ Saved JSON to response_04.json
✅ Saved JSON to response_04_raw.json
>>> PAYLOAD: {
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "text": "SYSTEM: You are a front-line support agent at PAR Servicios’ Document Filing Desk.\nYour job is to verify each incoming PDF has extractable content, then classify it into one of five legal-document categories and output exactly o

DEBUG:urllib3.connectionpool:https://bedrock-runtime.us-east-1.amazonaws.com:443 "POST /model/us.amazon.nova-pro-v1%3A0/invoke HTTP/1.1" 200 18086
DEBUG:botocore.parsers:Response headers: {'Date': 'Fri, 23 May 2025 21:08:51 GMT', 'Content-Type': 'application/json', 'Content-Length': '18086', 'Connection': 'keep-alive', 'x-amzn-RequestId': '824b61e2-e655-4d46-a637-139d1db6267e', 'X-Amzn-Bedrock-Invocation-Latency': '102064', 'X-Amzn-Bedrock-Cache-Write-Input-Token-Count': '0', 'X-Amzn-Bedrock-Cache-Read-Input-Token-Count': '0', 'X-Amzn-Bedrock-Output-Token-Count': '5268', 'X-Amzn-Bedrock-Input-Token-Count': '27467'}
DEBUG:botocore.parsers:Response body:
<botocore.response.StreamingBody object at 0x79e97ad742b0>
DEBUG:botocore.hooks:Event needs-retry.bedrock-runtime.InvokeModel: calling handler <botocore.retryhandler.RetryHandler object at 0x79e97220ab70>
DEBUG:botocore.retryhandler:No retry needed.
DEBUG:botocore.hooks:Event before-parameter-build.bedrock-runtime.InvokeModel: calling ha

>>> RAW RESPONSE (first 300 chars): {"output":{"message":{"content":[{"text":"{\n  \"document_number\": \"AB24817719\",\n  \"document_type\": \"company\",\n  \"category\": \"CERL\",\n  \"text\": \"Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\nFecha Expedición: 9 de diciembre 
Raw response for chunk 5 (first 200 chars): '{\n  "document_number": "AB24817719",\n  "document_type": "company",\n  "category": "CERL",\n  "text": "Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\n'
✅ Saved JSON to response_05.json
✅ Saved JSON to response_05_raw.json
⚠️ Chunk 5 JSON parse error: Could not extract valid JSON from text: '{\n  "document_number": "AB24817719",\n  "document_type": "company",\n  "category": "CERL",\n  "text": "Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\n'
Raw text that failed to parse: '{\n  "document_number"

DEBUG:urllib3.connectionpool:https://bedrock-runtime.us-east-1.amazonaws.com:443 "POST /model/us.amazon.nova-pro-v1%3A0/invoke HTTP/1.1" 200 21410
DEBUG:botocore.parsers:Response headers: {'Date': 'Fri, 23 May 2025 21:10:33 GMT', 'Content-Type': 'application/json', 'Content-Length': '21410', 'Connection': 'keep-alive', 'x-amzn-RequestId': 'cda938bd-d20c-4af5-a354-c4e38d9f874e', 'X-Amzn-Bedrock-Invocation-Latency': '101352', 'X-Amzn-Bedrock-Cache-Write-Input-Token-Count': '0', 'X-Amzn-Bedrock-Cache-Read-Input-Token-Count': '0', 'X-Amzn-Bedrock-Output-Token-Count': '5267', 'X-Amzn-Bedrock-Input-Token-Count': '27468'}
DEBUG:botocore.parsers:Response body:
<botocore.response.StreamingBody object at 0x79e9781f2980>
DEBUG:botocore.hooks:Event needs-retry.bedrock-runtime.InvokeModel: calling handler <botocore.retryhandler.RetryHandler object at 0x79e97220ab70>
DEBUG:botocore.retryhandler:No retry needed.
DEBUG:botocore.hooks:Event before-parameter-build.bedrock-runtime.InvokeModel: calling ha

>>> RAW RESPONSE (first 300 chars): {"output":{"message":{"content":[{"text":"{\n  \"document_number\": \"AB24817719\",\n  \"document_type\": \"company\",\n  \"category\": \"CERL\",\n  \"text\": \"Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\nFecha Expedición: 9 de diciembre 
Raw response for chunk 6 (first 200 chars): '{\n  "document_number": "AB24817719",\n  "document_type": "company",\n  "category": "CERL",\n  "text": "Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\n'
✅ Saved JSON to response_06.json
✅ Saved JSON to response_06_raw.json
⚠️ Chunk 6 JSON parse error: Could not extract valid JSON from text: '{\n  "document_number": "AB24817719",\n  "document_type": "company",\n  "category": "CERL",\n  "text": "Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\n'
Raw text that failed to parse: '{\n  "document_number"

DEBUG:urllib3.connectionpool:https://bedrock-runtime.us-east-1.amazonaws.com:443 "POST /model/us.amazon.nova-pro-v1%3A0/invoke HTTP/1.1" 200 4347
DEBUG:botocore.parsers:Response headers: {'Date': 'Fri, 23 May 2025 21:11:10 GMT', 'Content-Type': 'application/json', 'Content-Length': '4347', 'Connection': 'keep-alive', 'x-amzn-RequestId': '04e6c1fa-e80f-4ebb-b569-4e8024ed656e', 'X-Amzn-Bedrock-Invocation-Latency': '36814', 'X-Amzn-Bedrock-Cache-Write-Input-Token-Count': '0', 'X-Amzn-Bedrock-Cache-Read-Input-Token-Count': '0', 'X-Amzn-Bedrock-Output-Token-Count': '1825', 'X-Amzn-Bedrock-Input-Token-Count': '27468'}
DEBUG:botocore.parsers:Response body:
<botocore.response.StreamingBody object at 0x79e97b32cee0>
DEBUG:botocore.hooks:Event needs-retry.bedrock-runtime.InvokeModel: calling handler <botocore.retryhandler.RetryHandler object at 0x79e97220ab70>
DEBUG:botocore.retryhandler:No retry needed.
DEBUG:botocore.hooks:Event before-parameter-build.bedrock-runtime.InvokeModel: calling handl

>>> RAW RESPONSE (first 300 chars): {"output":{"message":{"content":[{"text":"{\n  \"document_number\": \"AB24817719\",\n  \"document_type\": \"company\",\n  \"category\": \"CERL\",\n  \"text\": \"Cámara de Comercio de Bogotá\\n\\nSe de Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\nFecha Expedición: 9 de diciembre
Raw response for chunk 7 (first 200 chars): '{\n  "document_number": "AB24817719",\n  "document_type": "company",\n  "category": "CERL",\n  "text": "Cámara de Comercio de Bogotá\\n\\nSe de Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\'
✅ Saved JSON to response_07.json
✅ Saved JSON to response_07_raw.json
>>> PAYLOAD: {
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "text": "SYSTEM: You are a front-line support agent at PAR Servicios’ Document Filing Desk.\nYour job is to verify each incoming PDF has extractable content, then classify it into one of five legal-document categories and output exactly o

DEBUG:urllib3.connectionpool:https://bedrock-runtime.us-east-1.amazonaws.com:443 "POST /model/us.amazon.nova-pro-v1%3A0/invoke HTTP/1.1" 200 4979
DEBUG:botocore.parsers:Response headers: {'Date': 'Fri, 23 May 2025 21:11:47 GMT', 'Content-Type': 'application/json', 'Content-Length': '4979', 'Connection': 'keep-alive', 'x-amzn-RequestId': '110faa5e-62cc-42b9-a1c6-1151818b23cc', 'X-Amzn-Bedrock-Invocation-Latency': '35894', 'X-Amzn-Bedrock-Cache-Write-Input-Token-Count': '0', 'X-Amzn-Bedrock-Cache-Read-Input-Token-Count': '0', 'X-Amzn-Bedrock-Output-Token-Count': '1766', 'X-Amzn-Bedrock-Input-Token-Count': '27468'}
DEBUG:botocore.parsers:Response body:
<botocore.response.StreamingBody object at 0x79e972308940>
DEBUG:botocore.hooks:Event needs-retry.bedrock-runtime.InvokeModel: calling handler <botocore.retryhandler.RetryHandler object at 0x79e97220ab70>
DEBUG:botocore.retryhandler:No retry needed.
DEBUG:botocore.hooks:Event before-parameter-build.bedrock-runtime.InvokeModel: calling handl

>>> RAW RESPONSE (first 300 chars): {"output":{"message":{"content":[{"text":"{\n  \"document_number\": \"00357483\",\n  \"document_type\": \"company\",\n  \"category\": \"CERL\",\n  \"text\": \"Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\nFecha Expedición: 9 de diciembre de
Raw response for chunk 8 (first 200 chars): '{\n  "document_number": "00357483",\n  "document_type": "company",\n  "category": "CERL",\n  "text": "Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\nFe'
✅ Saved JSON to response_08.json
✅ Saved JSON to response_08_raw.json
>>> PAYLOAD: {
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "text": "SYSTEM: You are a front-line support agent at PAR Servicios’ Document Filing Desk.\nYour job is to verify each incoming PDF has extractable content, then classify it into one of five legal-document categories and output exactly o

DEBUG:urllib3.connectionpool:https://bedrock-runtime.us-east-1.amazonaws.com:443 "POST /model/us.amazon.nova-pro-v1%3A0/invoke HTTP/1.1" 200 11486
DEBUG:botocore.parsers:Response headers: {'Date': 'Fri, 23 May 2025 21:13:28 GMT', 'Content-Type': 'application/json', 'Content-Length': '11486', 'Connection': 'keep-alive', 'x-amzn-RequestId': '3be3e6fc-b860-421a-a6f3-5b9369fa97b9', 'X-Amzn-Bedrock-Invocation-Latency': '101025', 'X-Amzn-Bedrock-Cache-Write-Input-Token-Count': '0', 'X-Amzn-Bedrock-Cache-Read-Input-Token-Count': '0', 'X-Amzn-Bedrock-Output-Token-Count': '5267', 'X-Amzn-Bedrock-Input-Token-Count': '27468'}
DEBUG:botocore.parsers:Response body:
<botocore.response.StreamingBody object at 0x79e97ad75d80>
DEBUG:botocore.hooks:Event needs-retry.bedrock-runtime.InvokeModel: calling handler <botocore.retryhandler.RetryHandler object at 0x79e97220ab70>
DEBUG:botocore.retryhandler:No retry needed.


>>> RAW RESPONSE (first 300 chars): {"output":{"message":{"content":[{"text":"{\n  \"document_number\": \"AB24817719\",\n  \"document_type\": \"company\",\n  \"category\": \"CERL\",\n  \"text\": \"Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\nFecha Expedición: 9 de diciembre 
Raw response for chunk 9 (first 200 chars): '{\n  "document_number": "AB24817719",\n  "document_type": "company",\n  "category": "CERL",\n  "text": "Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\n'
✅ Saved JSON to response_09.json
✅ Saved JSON to response_09_raw.json
⚠️ Chunk 9 JSON parse error: Could not extract valid JSON from text: '{\n  "document_number": "AB24817719",\n  "document_type": "company",\n  "category": "CERL",\n  "text": "Cámara de Comercio de Bogotá\\n\\nSede Virtual\\n\\nCERTIFICADO DE EXISTENCIA Y REPRESENTACIÓN LEGAL\\n\\n'
Raw text that failed to parse: '{\n  "document_number"