In [2]:
# Install the third-party packages imported by the next cell into the kernel's environment.
# NOTE(review): versions are not pinned — consider pinning them for reproducible runs.
%pip install boto3 PyPDF2 pydantic

Note: you may need to restart the kernel to use updated packages.


In [3]:
import boto3, json, io, base64, re, datetime, logging
from __future__ import annotations
from pathlib import Path, PurePosixPath
# from pypdf import PdfReader, PdfWriter
from PyPDF2 import PdfWriter, PdfReader, PdfFileMerger
from botocore.config import Config
from pydantic import BaseModel, ValidationError
from typing   import Literal, Optional, List
from string import Template
# Root logger at DEBUG so the logging.info(...) calls in later cells are displayed.
logging.basicConfig(level=logging.DEBUG)
# NOTE(review): botocore at DEBUG logs every outgoing request, including the request
# body (see the botocore.endpoint output of the Converse call further down).
# Consider a higher level once debugging is done.
logging.getLogger("botocore").setLevel(logging.DEBUG)

# (Optional) customize timeouts if you found you needed it:
# NOTE(review): the name `cfg` is rebound to the inference-parameter dict in a later
# cell; here it is the botocore client Config.
cfg = Config(connect_timeout=30, read_timeout=300)
# Credentials come from the named local AWS profile.
session = boto3.Session(profile_name="par_servicios")
# Create the Bedrock client:
region = "us-east-1"
# Module-level client used by converse_with_nova().
bedrock = session.client(
    "bedrock-runtime",
    region_name=region,
    config=cfg
)

DEBUG:botocore.hooks:Changing event name from creating-client-class.iot-data to creating-client-class.iot-data-plane
DEBUG:botocore.hooks:Changing event name from before-call.apigateway to before-call.api-gateway
DEBUG:botocore.hooks:Changing event name from request-created.machinelearning.Predict to request-created.machine-learning.Predict
DEBUG:botocore.hooks:Changing event name from before-parameter-build.autoscaling.CreateLaunchConfiguration to before-parameter-build.auto-scaling.CreateLaunchConfiguration
DEBUG:botocore.hooks:Changing event name from before-parameter-build.route53 to before-parameter-build.route-53
DEBUG:botocore.hooks:Changing event name from request-created.cloudsearchdomain.Search to request-created.cloudsearch-domain.Search
DEBUG:botocore.hooks:Changing event name from docs.*.autoscaling.CreateLaunchConfiguration.complete-section to docs.*.auto-scaling.CreateLaunchConfiguration.complete-section
DEBUG:botocore.hooks:Changing event name from before-parameter-buil

In [4]:
def read_document(file_to_read):
    """Return the complete content of ``file_to_read`` as raw bytes."""
    with open(file_to_read, mode="rb") as handle:
        return handle.read()


def markdown_to_plain(md_path: Path) -> str:
    """Read a Markdown file and return its text with most Markdown chrome removed.

    The following are stripped, in order: HTML comments, ``<details>`` blocks,
    fenced code blocks, heading markers, bold/italic markers and table rows.
    Finally every run of newlines is collapsed to a single newline (so blank
    lines disappear) and the result is stripped.

    Emphasis markers are only removed when they actually delimit emphasis:
    underscores inside identifiers such as ``pdf_path`` and a leading ``* ``
    bullet marker are left untouched.

    :param md_path: Path of the UTF-8 encoded Markdown file.
    :return: The cleaned prompt text.
    """
    text = md_path.read_text(encoding="utf-8")

    # 1) Remove HTML comments (often used for model delimiters)
    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)

    # 2) Drop <details> … </details> blocks
    text = re.sub(r"<details>.*?</details>", "", text,
                  flags=re.DOTALL | re.IGNORECASE)

    # 3) Remove fenced-code blocks ```…``` (they tend to confuse the model)
    text = re.sub(r"```.*?```", "", text, flags=re.DOTALL)

    # 4) Strip Markdown headings, bold, italics, tables
    text = re.sub(r"^#+\s*", "", text, flags=re.MULTILINE)      # headings
    text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)               # bold → plain
    text = re.sub(r"__(.+?)__", r"\1", text)
    # Italics with '*': the opening star must be followed by, and the closing
    # star preceded by, a non-space character, so "* bullet *word*" keeps its
    # bullet marker.
    text = re.sub(r"\*(?!\s)(.+?)(?<!\s)\*", r"\1", text)
    # Italics with '_': the markers must not touch a word character on the
    # outside and the span must stay on one line, so snake_case identifiers
    # (e.g. template placeholders) survive intact.
    text = re.sub(r"(?<!\w)_([^_\n]+)_(?!\w)", r"\1", text)
    text = re.sub(r"^\|.*\|\s*$", "", text, flags=re.MULTILINE)  # tables rows

    # 5) Collapse every run of newlines into one (removes blank lines)
    text = re.sub(r"\n{2,}", "\n", text).strip()

    return text


def set_model_params(max_tokens=300, top_p=0.1, temperature=0.3):
    """Build the inference-configuration dict passed to the Converse call.

    :param max_tokens: Stored under ``"maxTokens"``.
    :param top_p: Stored under ``"topP"``.
    :param temperature: Stored under ``"temperature"``.
    :return: Dict with the three camelCase keys above.
    """
    return dict(maxTokens=max_tokens, topP=top_p, temperature=temperature)


def converse_with_nova(model_id: str, messages: list, inference_cfg: dict,
                       toolConfig: dict = None, system=None):
    """Call the Bedrock Converse API through the module-level ``bedrock`` client.

    :param model_id: Model or inference-profile identifier (sent as ``modelId``).
    :param messages: Conversation messages (sent as ``messages``).
    :param inference_cfg: Inference parameters (sent as ``inferenceConfig``).
    :param toolConfig: Optional tool configuration; only sent when not ``None``.
    :param system: Optional system prompt. Either a plain string, which is
        wrapped as ``[{"text": system}]``, or an already-built list of system
        content blocks. Only sent when not ``None``, so existing callers are
        unaffected.
    :return: The response returned by ``bedrock.converse``.
    """
    payload = {
        "modelId": model_id,
        "messages": messages,
        "inferenceConfig": inference_cfg
    }
    if toolConfig is not None:
        payload["toolConfig"] = toolConfig
    if system is not None:
        # Converse takes system instructions in a dedicated top-level field
        # rather than as a conversation message.
        payload["system"] = [{"text": system}] if isinstance(system, str) else system

    return bedrock.converse(**payload)

In [5]:
def get_instructions(action: str) -> Template:
    """Load the instruction file for ``action`` and wrap it in a ``string.Template``.

    ``"system"`` selects the system instructions; ``"user"`` and any other
    value select the user instructions. Files are read as UTF-8 from paths
    relative to the current working directory.
    """
    user_file = Path("./instructions/ACC/user.txt")
    system_file = Path("./instructions/ACC/system.txt")
    # Unknown actions fall back to the user instructions.
    chosen = {"user": user_file, "system": system_file}.get(action, user_file)
    return Template(chosen.read_text(encoding="utf-8"))

In [6]:
def build_user_prompt(
    pdf_path: str,
    document_number: str,
    document_type: str = "company",
    category: str = "ACC"
) -> str:
    """
    Fill the user-instructions template with the runtime values.

    The template receives ``pdf_path``, ``document_number``, ``document_type``,
    ``category`` and ``save_key``, where ``save_key`` is the POSIX-style path
    ``<category>/<document_number>/<category>_<document_number>.json``.
    A placeholder in the template without a matching value makes
    ``Template.substitute`` raise ``KeyError``.
    """
    json_name = f"{category}_{document_number}.json"
    values = {
        "pdf_path": pdf_path,
        "document_number": document_number,
        "document_type": document_type,
        "category": category,
        # Key under which the resulting JSON is meant to be stored.
        "save_key": PurePosixPath(category, document_number, json_name),
    }
    # get_instructions returns a string.Template, not a str.
    return get_instructions("user").substitute(values)

def build_system_prompt(schema_path: str, examples_dir: str) -> str:
    """
    Build the system prompt by injecting the JSON schema and
    all example JSON outputs into the system-instructions template.

    Files are read as UTF-8 (matching how the instruction templates are read)
    and the examples are added in sorted file-name order, so the prompt is the
    same on every run and platform.

    :param schema_path: Path (str or Path) to the JSON schema file
    :param examples_dir: Directory (str or Path) containing example ``*.json`` files
    :return: The template with ``schema`` and ``examples_section`` substituted
    """
    # Load the schema
    schema = Path(schema_path).read_text(encoding="utf-8").strip()
    logging.info(f"schema:\n{schema}")

    # Load each example and wrap in ```json ... ``` fences.
    # glob() yields entries in an unspecified order, hence the sort.
    example_blocks = []
    for example_file in sorted(Path(examples_dir).glob("*.json")):
        content = example_file.read_text(encoding="utf-8").strip()
        example_blocks.append(f"```json\n{content}\n```")

    # Join all example blocks with spacing
    examples_section = "\n\n".join(example_blocks)
    system_template = get_instructions("system")
    return system_template.substitute(
        schema=schema,
        examples_section=examples_section
    )

def sanitize_name(raw_name: str) -> str:
    """
    Turn a file name or path into a safe document name.

    Only the stem (final path component without its extension) is used. Every
    character that is not alphanumeric, a space, a hyphen, a parenthesis or a
    square bracket is dropped (this includes underscores and periods). Runs of
    spaces, including those created by dropping characters in between, are
    collapsed to a single space.

    :param raw_name: File name or path, e.g. ``"231_CA_2020-02-29.pdf"``.
    :return: The sanitized name, or ``"document"`` when nothing is left.
    """
    stem = Path(raw_name).stem  # e.g. "231_CA_2020-02-29"
    safe = "".join(ch for ch in stem
                   if ch.isalnum()
                   or ch in " -()[]")
    # "a _ b" becomes "a  b" after filtering; squeeze such runs to one space.
    safe = re.sub(r" {2,}", " ", safe)
    return safe or "document"  # fallback if everything got stripped


def create_message( prompt: str,
                    role: str,
                    pdf_bytes: bytes | None = None,
                    pdf_path: str | None = None
                    ):
    """
    Build a single Bedrock message that contains:
        • main prompt (instructions + result envelope)
        • schema.json as plain text
        • N example JSON files as plain text
        • optional PDF document block
    """
    content = []

    content.append({"text": prompt})

    # 4) Optional PDF document
    if pdf_bytes is not None:
        raw_name = pdf_path or "document.pdf"
        name = sanitize_name(raw_name)

        content.append({
            "document": {
                "name":   name,
                "format": "pdf",
                "source": {
                    "bytes": pdf_bytes
                }
            }
        })

    # Assemble the message envelope Bedrock expects
    msg = {"role": role, "content": content}
    return msg



In [7]:
def parse_bedrock_response(resp: dict) -> dict:
    """
    Extract the JSON object embedded in a Bedrock Converse response.

    The text of all text blocks in ``resp["output"]["message"]["content"]`` is
    joined. A fenced ```` ```json ... ``` ```` block is preferred; when the
    model answers without a fence, the span from the first ``{`` to the last
    ``}`` is used instead.

    :param resp: The response dict returned by the Converse call.
    :return: The decoded JSON object.
    :raises ValueError: if no JSON object is found or it cannot be parsed.
    """
    # 1) Collect the assistant's text; non-text blocks are ignored.
    content = resp["output"]["message"]["content"]
    text = "\n".join(block["text"] for block in content if "text" in block)

    # 2) Prefer the snippet between ```json and ```
    fenced = re.search(r"```json\s*(\{[\s\S]*?\})\s*```", text)
    if fenced:
        raw = fenced.group(1)
    else:
        # Fallback: the model answered with bare JSON (optionally surrounded by prose).
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end < start:
            raise ValueError("Embedded JSON not found in Bedrock response")
        raw = text[start:end + 1]

    # 3) Parse it, keeping the decoder error as the cause for debugging.
    try:
        return json.loads(raw)
    except json.JSONDecodeError as e:
        raise ValueError(f"Failed to parse embedded JSON: {e}") from e

def generate_accionary_data(data):
    """
    Reduce a parsed extraction result to the fields of interest.

    The fields are read from ``data["result"]`` when that key exists, otherwise
    from ``data`` itself. ``companyName``, ``documentType``, ``taxId`` and
    ``relatedParties`` are mandatory (``KeyError`` when absent); ``country`` and
    ``identificationDetails`` are copied only when present.
    """
    payload = data.get("result", data)

    mandatory = ("companyName", "documentType", "taxId", "relatedParties")
    optional = ("country", "identificationDetails")

    response = {field: payload[field] for field in mandatory}
    response.update({field: payload[field] for field in optional if field in payload})
    return response

In [8]:
def save_to_json(response, output_path="response_indented.json", indent=2):
    """
    Write ``response`` as UTF-8 JSON (non-ASCII kept as-is) to ``output_path``.

    Missing parent directories are created first. If ``json.dump`` raises
    ``TypeError``, the problem is reported and the file is overwritten with
    ``repr(response)`` instead.
    """
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    try:
        with target.open("w", encoding="utf-8") as handle:
            json.dump(response, handle, indent=indent, ensure_ascii=False)
    except TypeError as err:
        print(f"⚠️ Could not JSON-serialize response: {err}")
        # Fall back to the raw repr; reopening with "w" discards the partial JSON.
        with target.open("w", encoding="utf-8") as handle:
            handle.write(repr(response))
        print(f"🔧 Wrote raw Python repr to {target}")
    else:
        print(f"✅ Saved JSON to {target}")

In [9]:
# Load the classification metadata for the chosen category and build the
# Bedrock messages (user prompt + PDF, then the system instructions).
# pdf_path = Path("../../shared/file_examples/CERL/800035887/9_CamCom_2020-02-28.pdf")
source_category = "ACC"
origin_path = Path(f"../../shared/clasification_results_examples/meta_{source_category}.json")
# origin_path = Path("../../shared/clasification_results_examples/meta_v8.json")


# The metadata JSON supplies the document folder, number, type and category.
with open(origin_path, "r") as file:
    print(origin_path)
    meta = json.load(file)
    pdf_path = meta["path"]
    document_number = meta["document_number"]
    document_type = meta["document_type"]
    category = meta["category"]

# NOTE(review): only the folder comes from the metadata; the PDF file name is hard-coded.
final_pdf_path = Path(f"../../shared/{pdf_path}/231_CA_2020-02-29.pdf")
user_message = build_user_prompt(
    pdf_path=str(final_pdf_path),
    document_number=document_number,
    document_type=document_type,
    category=category
)

# NOTE(review): schema and examples paths are hard-coded to ACC rather than derived from `category`.
system_message = build_system_prompt(
    schema_path=Path("../../shared/evaluation_type/ACC/schema.json"),
    examples_dir=Path("../../shared/evaluation_type/ACC/examples")
)

# No pdf_path is passed to create_message, so the attached document is named "document".
# NOTE(review): the system instructions are sent as an "assistant"-role message after
# the user turn; the Converse API has a dedicated `system` field — confirm this is intended.
messages = [
    create_message(user_message, "user", read_document(final_pdf_path)),
    create_message(system_message, "assistant")
]


INFO:root:schema:
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "title": "CompanyParticipationInfo",
  "type": "object",
  "properties": {
    "companyName": { "type": "string" },
    "country": { "type": "string" },
    "documentType": { "type": "string" },
    "taxId": {
      "type": "string",
      "pattern": "^[0-9.\\-]+$"
    },

    "relatedParties": {
      "type": "array",
      "items": {
        "oneOf": [
          { "$ref": "#/$defs/PersonRelatedParty" },
          { "$ref": "#/$defs/CompanyRelatedParty" }
        ]
      }
    }
  },
  "required": ["companyName", "documentType", "taxId", "relatedParties"],
  "additionalProperties": false,
  "$defs": {
    "CommonFields": {
      "type": "object",
      "properties": {
        "identificationType": { "type": "string" },
        "identificationNumber": {
          "type": "string",
          "pattern": "^[0-9A-Za-z.\\-]+$"
        },
        "relationshipType": { "type": "string" },
        "participation

../../shared/clasification_results_examples/meta_ACC.json


In [10]:
# Model and inference settings for the extraction call.
modelId = "us.amazon.nova-pro-v1:0"
temperature = 0.1
top_p = 0.9
max_tokens = 8192

# NOTE(review): this rebinds `cfg`, which held the botocore client Config in the setup cell.
cfg = set_model_params(max_tokens, top_p, temperature)
# `version` is used in both the output folder and the output file names.
version = 2
folder_route = f"outputs/extraction/ACC/exec{version}/v{version}"
# folder_route = "outputs/prompt_md"

# NOTE(review): `messages` contains the raw PDF bytes, so this line writes them to the log output.
logging.info(f"messages: {messages}")
logging.info(f"cfg: {cfg}")

# Call the model and keep the full raw response on disk.
resp_json = converse_with_nova(modelId, messages, cfg)
save_to_json(resp_json, f"{folder_route}/response_model_test_v{version}.json", 2)

# NOTE(review): this rebinds `meta`, which held the classification metadata loaded in the previous cell.
meta    = parse_bedrock_response(resp_json)
logging.info(f"raw meta:\n{meta}")
# Keep only the fields of interest and save both the full and the reduced result.
payload = generate_accionary_data(meta)
save_to_json(meta, f"{folder_route}/meta_v{version}.json",   2)
save_to_json(payload, f"{folder_route}/payload_v{version}.json",2)

INFO:root:messages: [{'role': 'user', 'content': [{'text': 'Extract company data from the PDF document following these strict guidelines:\n\n# Extraction Requirements\n1. Country Identification:\n   - Primary: Explicit references (country/city names, official stamps)\n   - Fallback: ID analysis using COMPANY_ID/PERSON_ID tables\n   - Document conflicts in identificationDetails.requiresReview\n\n2. Entity Handling:\n   - Create separate entries for repeated entities with timeFound counter\n   - Add job field when titles like "Representante Legal" are present\n   - Strictly follow PERSON_ID/COMPANY_ID classification rules\n\n3. Validation:\n   - Verify against schema.json\n   - Ensure all ID types match country\'s allowed values\n   - Confirm timeFound increments correctly\n   - Remove any extra fields not in schema\n\n# Required Output\n```json\n{\n  "path": "../../shared/file_examples/ACC/800216686/231_CA_2020-02-29.pdf",\n  "result": {\n    "companyName": "",\n    "documentType": "Equ

DEBUG:botocore.endpoint:Making request for OperationModel(name=Converse) with params: {'url_path': '/model/us.amazon.nova-pro-v1%3A0/converse', 'query_string': {}, 'method': 'POST', 'headers': {'Content-Type': 'application/json', 'User-Agent': 'Boto3/1.38.21 md/Botocore#1.38.21 ua/2.1 os/linux#6.11.0-26-generic md/arch#x86_64 lang/python#3.12.9 md/pyimpl#CPython m/b,Z cfg/retry-mode#legacy Botocore/1.38.21'}, 'body': b'{"messages": [{"role": "user", "content": [{"text": "Extract company data from the PDF document following these strict guidelines:\\n\\n# Extraction Requirements\\n1. Country Identification:\\n   - Primary: Explicit references (country/city names, official stamps)\\n   - Fallback: ID analysis using COMPANY_ID/PERSON_ID tables\\n   - Document conflicts in identificationDetails.requiresReview\\n\\n2. Entity Handling:\\n   - Create separate entries for repeated entities with timeFound counter\\n   - Add job field when titles like \\"Representante Legal\\" are present\\n   -

✅ Saved JSON to outputs/extraction/ACC/exec2/v2/response_model_test_v2.json
✅ Saved JSON to outputs/extraction/ACC/exec2/v2/meta_v2.json
✅ Saved JSON to outputs/extraction/ACC/exec2/v2/payload_v2.json
