In [1]:
%pip install boto3 PyPDF2 pydantic

Note: you may need to restart the kernel to use updated packages.


In [2]:
import boto3, json, io, base64, re, datetime
from __future__ import annotations
from pathlib import Path
# from pypdf import PdfReader, PdfWriter
from PyPDF2 import PdfWriter, PdfReader, PdfFileMerger
from botocore.config import Config
from pydantic import BaseModel, ValidationError
from typing   import Literal, Optional

# Botocore transport settings: connect_timeout=30 and read_timeout=300
# (seconds), so slow model responses are not cut off by the defaults.
cfg = Config(connect_timeout=30, read_timeout=300)

# Credentials come from the named local AWS profile.
session = boto3.Session(profile_name="par_servicios")

# Bedrock runtime client; invoke_nova() sends its requests through it.
region = "us-east-1"
bedrock = session.client(service_name="bedrock-runtime", region_name=region, config=cfg)

In [3]:
def read_document(file_to_read):
    """Return the full contents of `file_to_read` as bytes.

    The file is opened in binary mode and is closed again before the
    function returns.
    """
    with open(file_to_read, mode="rb") as handle:
        return handle.read()


def markdown_to_plain(md_path: Path) -> str:
    """Read a Markdown prompt file and return it with most Markdown chrome removed.

    Args:
        md_path: Path of the UTF-8 encoded prompt file.

    Returns:
        The text with HTML comments, ``<details>`` blocks, fenced code blocks
        and table rows dropped, heading markers and emphasis markers stripped,
        every blank line removed, and surrounding whitespace trimmed.

    Emphasis markers are only stripped where they actually delimit emphasis:
    ``_`` / ``__`` must not touch a word character on the outside, and the
    emphasised text must not start or end with whitespace. This keeps
    identifiers such as ``document_number`` or ``LINK_ONLY`` intact and leaves
    ``* `` list bullets alone.
    """
    text = md_path.read_text(encoding="utf-8")

    # 1) Remove HTML comments (often used for model delimiters)
    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)

    # 2) Drop <details> … </details> blocks
    text = re.sub(r"<details>.*?</details>", "", text,
                  flags=re.DOTALL | re.IGNORECASE)

    # 3) Remove fenced-code blocks ```…```
    text = re.sub(r"```.*?```", "", text, flags=re.DOTALL)

    # 4) Strip Markdown headings, bold, italics, tables
    text = re.sub(r"^#+\s*", "", text, flags=re.MULTILINE)       # headings
    text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)                 # **bold**
    # Underscore forms never apply inside a word, so snake_case survives.
    text = re.sub(r"(?<!\w)__(.+?)__(?!\w)", r"\1", text)        # __bold__
    # The opening '*' must be followed by non-space (a bullet "* " is not one)
    # and the closing '*' must be preceded by non-space.
    text = re.sub(r"\*(?=\S)(.+?)(?<=\S)\*", r"\1", text)        # *italic*
    text = re.sub(r"(?<!\w)_(?=\S)([^_\n]+)(?<=\S)_(?!\w)", r"\1", text)  # _italic_
    text = re.sub(r"^\|.*\|\s*$", "", text, flags=re.MULTILINE)  # table rows

    # 5) Collapse every run of newlines to one (this removes all blank lines)
    text = re.sub(r"\n{2,}", "\n", text).strip()

    return text


def set_model_params(max_tokens=300, top_p=0.1, temperature=0.3):
    """Build the inference-configuration mapping passed along with a request.

    Args:
        max_tokens: Value stored under ``"maxTokens"``.
        top_p: Value stored under ``"topP"``.
        temperature: Value stored under ``"temperature"``.

    Returns:
        A new dict with exactly those three camelCase keys.
    """
    inference_cfg = dict(maxTokens=max_tokens, topP=top_p, temperature=temperature)
    return inference_cfg


def invoke_nova(model_id: str, messages: list, inference_cfg: dict):
    """Send `messages` to a Bedrock model and return the decoded JSON reply.

    Args:
        model_id: Passed to ``invoke_model`` as ``modelId``.
        messages: Stored under the ``"messages"`` key of the request body.
        inference_cfg: Stored under the ``"inferenceConfig"`` key.

    Returns:
        The response body, read in full, decoded as UTF-8 and parsed as JSON.

    Uses the module-level ``bedrock`` client.
    """
    request_body = json.dumps(
        {"messages": messages, "inferenceConfig": inference_cfg}
    ).encode("utf-8")

    raw_response = bedrock.invoke_model(
        modelId=model_id,
        contentType="application/json",
        accept="application/json",
        body=request_body,
    )

    response_text = raw_response["body"].read().decode("utf-8")
    return json.loads(response_text)

In [4]:
# Local directory holding the example PDFs; build_folder_path() expresses a
# PDF's location relative to this root.
LOCAL_ROOT = Path("../../shared/file_examples")
# Bucket name build_folder_path() uses when asked for an s3:// location.
S3_BUCKET  = "par-servicios-docs"

def create_messages(prompt: str, pdf_bytes: bytes):
    """Build a one-element message list carrying a prompt and a PDF.

    Args:
        prompt: Instruction text; becomes the first content part.
        pdf_bytes: Raw PDF bytes; becomes the second content part.

    Returns:
        A list with a single ``"user"`` message whose content is the text
        part followed by a ``"document"`` part.
    """
    # The PDF is base64-encoded to a str: invoke_nova() serialises the
    # messages with json.dumps, which cannot encode raw bytes.
    encoded_pdf = base64.b64encode(pdf_bytes).decode("utf-8")

    text_part = {"text": prompt}
    document_part = {
        "document": {
            "name": "document_to_evaluate.pdf",   # an arbitrary label
            "format": "pdf",
            "source": {"bytes": encoded_pdf},
        }
    }

    return [{"role": "user", "content": [text_part, document_part]}]

def add_now_process(folder_path):
    """Return the "NOW PROCESS" suffix that is appended to the base prompt.

    The suffix starts with a newline, names `folder_path` wrapped in
    backticks, and has no trailing newline.
    """
    suffix_lines = [
        "",
        "NOW PROCESS:",
        f"Folder path: `{folder_path}`",
        "Extracted text follows the PDF below.",
    ]
    return "\n".join(suffix_lines)


def build_folder_path(pdf_path: Path, use_s3: bool = False) -> str:
    """Return the folder location of `pdf_path`, expressed relative to LOCAL_ROOT.

    Args:
        pdf_path: Path of a PDF located under ``LOCAL_ROOT``.
        use_s3: When true, return an ``s3://`` location in ``S3_BUCKET``;
            otherwise return a ``file_examples/``-prefixed location.

    Returns:
        The location string; the file name itself is not included.

    Raises:
        ValueError: from ``Path.relative_to`` when `pdf_path` is not under
            ``LOCAL_ROOT``.
    """
    containing_dir = pdf_path.relative_to(LOCAL_ROOT).parent   # drop filename
    key = "/".join(containing_dir.parts)                       # e.g. ACC/800216686
    if use_s3:
        return f"s3://{S3_BUCKET}/{key}"
    return f"file_examples/{key}"

In [5]:
def get_first_pdf_page(file_name):
    """Return a new single-page PDF, as bytes, holding page 1 of `file_name`.

    Args:
        file_name: Path of the source PDF.

    Returns:
        The serialised one-page PDF.

    Raises:
        IndexError: if the source PDF has no pages (from ``pages[0]``).
    """
    # The source file is opened in a `with` block so the handle is always
    # closed. Writing happens inside the block because the page copied into
    # the writer may still need to read from the open source file.
    with open(file_name, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        writer = PdfWriter()
        writer.add_page(reader.pages[0])
        buffer = io.BytesIO()
        writer.write(buffer)
    return buffer.getvalue()

In [6]:
def _normalise(raw_obj: dict, *, file_path: str | None = None) -> dict:
    """
    • rename documenttype → document_type (case-insensitive)
    • ensure document_number and path exist (fallbacks)
    • keep snippet only if present
    """
    norm = {k.lower(): v for k, v in raw_obj.items()}          # case-fold keys

    # key mapping
    if "documenttype" in norm and "document_type" not in norm:
        norm["document_type"] = norm.pop("documenttype")

    # fallback values
    if "document_number" not in norm:
        # try to derive from the file path    e.g.  .../800035887/...
        if file_path:
            m = re.search(r"/(\d{6,})/", file_path)
            norm["document_number"] = m.group(1) if m else "UNKNOWN"
        else:
            norm["document_number"] = "UNKNOWN"

    if "path" not in norm and file_path:
        norm["path"] = file_path
    elif "path" not in norm:
        norm["path"] = "UNKNOWN"

    return norm

def _strip_fences(text: str) -> str:
    """
    Removes ```json ... ``` or ``` ... ``` even if the opening fence is
    immediately followed by '{'.
    """
    text = text.strip()

    # opening fence
    text = re.sub(r'^```(?:json)?', '', text, flags=re.IGNORECASE).lstrip()
    # closing fence
    text = re.sub(r'```$', '', text).rstrip()
    return text

In [7]:
# ──────────────────────────────────────────────────────────────────
# 1)  Pydantic schema – guarantees we got the 5 required keys
# ──────────────────────────────────────────────────────────────────
class ClassMeta(BaseModel):
    """Validated classification record for a single document.

    Instances are built by ``parse_classification`` via
    ``ClassMeta.model_validate`` and consumed by ``build_payload``.
    """
    # Document number as a string; _normalise() falls back to a numeric
    # folder name from the file path, or "UNKNOWN".
    document_number: str
    # Must be exactly "person" or "company".
    document_type:   Literal["person", "company"]
    # Must be exactly one of the listed category codes.
    category:        Literal["CERL", "CECRL", "RUT", "RUB", "ACC",
                            "BLANK", "LINK_ONLY"]
    # Document location; _normalise() falls back to the input file path, or "UNKNOWN".
    path:            str
    # May be None. No default is declared here.
    # NOTE(review): under Pydantic v2 a field without a default is still
    # required even when Optional — confirm that is the intent.
    text: Optional[str]

# ──────────────────────────────────────────────────────────────────
# 2)  Pull the assistant’s line of JSON out of Bedrock’s response
# ──────────────────────────────────────────────────────────────────
def _extract_text(resp_json: dict) -> str:
    """
    Bedrock Nova returns:
        {"output":{"message":{"content":[{"text":"..."}]}}}
    """
    try:
        return resp_json["output"]["message"]["content"][0]["text"].strip()
    except (KeyError, IndexError, TypeError):
        raise RuntimeError("Unexpected response shape from Bedrock") from None

# ──────────────────────────────────────────────────────────────────
# 3)  Validate + return a ClassMeta instance (raises on error)
# ──────────────────────────────────────────────────────────────────
def parse_classification(resp_json: dict, *, pdf_path: str | None = None) -> ClassMeta:
    """Turn a raw model response into a validated ``ClassMeta``.

    Steps: extract the assistant text, strip code fences, parse it as JSON
    (falling back to the span from the first '{' to the last '}'), normalise
    the keys, then validate.

    Args:
        resp_json: Decoded response as returned by ``invoke_nova``.
        pdf_path: Optional source path, forwarded to ``_normalise`` for its
            fallbacks.

    Returns:
        The validated ``ClassMeta`` instance.

    Raises:
        RuntimeError: if the response has an unexpected shape, if no JSON can
            be parsed from the text, or if the parsed JSON is not an object.
        Validation errors raised by ``ClassMeta.model_validate`` propagate.
    """
    raw_text = _strip_fences(_extract_text(resp_json))   # remove ```json … ```

    try:
        raw_obj = json.loads(raw_text)
    except json.JSONDecodeError as first_err:
        # last-chance: grab first '{' … last '}'
        start, end = raw_text.find("{"), raw_text.rfind("}")
        if start == -1 or end < start:
            raise RuntimeError(f"Assistant did not return JSON: {first_err}") from None
        try:
            raw_obj = json.loads(raw_text[start:end + 1])
        except json.JSONDecodeError as second_err:
            # Report a failed fallback the same way as a missing JSON span,
            # instead of leaking a bare JSONDecodeError.
            raise RuntimeError(f"Assistant did not return JSON: {second_err}") from None

    # _normalise() needs a mapping; a JSON array or scalar is not usable.
    if not isinstance(raw_obj, dict):
        raise RuntimeError(
            f"Assistant returned JSON of type {type(raw_obj).__name__}, expected an object"
        )

    patched = _normalise(raw_obj, file_path=pdf_path)

    return ClassMeta.model_validate(patched)

# ──────────────────────────────────────────────────────────────────
# 4)  Build the payload that Phase-2 expects
# ──────────────────────────────────────────────────────────────────
def build_payload(meta: ClassMeta) -> dict:
    """Build the payload dict from a validated classification.

    The dict carries ``path``, the full JSON-mode dump of `meta` under
    ``result``, and copies of ``document_type``, ``document_number`` and
    ``category`` at the top level, in that key order.
    """
    payload = {
        "path":   meta.path,
        "result": meta.model_dump(mode="json"),   # dict
    }
    for field_name in ("document_type", "document_number", "category"):
        payload[field_name] = getattr(meta, field_name)
    return payload

In [8]:
def save_to_json(response, output_path="response_indented.json", indent=2):
    """
    Write `response` as JSON to `output_path` (UTF-8, non-ASCII kept as-is).

    Missing parent directories are created first. If `response` cannot be
    serialised (TypeError), a warning is printed and the file is overwritten
    with ``repr(response)`` instead. A status line is printed in both cases.
    """
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    try:
        with target.open("w", encoding="utf-8") as fh:
            json.dump(response, fh, indent=indent, ensure_ascii=False)
    except TypeError as err:
        print(f"⚠️ Could not JSON-serialize response: {err}")
        # Fall back to the raw repr; this replaces any partial JSON output.
        target.write_text(repr(response), encoding="utf-8")
        print(f"🔧 Wrote raw Python repr to {target}")
    else:
        print(f"✅ Saved JSON to {target}")

In [9]:
def get_instructions(action: str) -> Path:
    """Return the path of the instruction file for `action`.

    Args:
        action: Name of the task, e.g. ``"clasification"``.

    Returns:
        A ``Path`` (not the file's contents). Unknown actions fall back to
        the classification instructions.
    """
    default_file = Path("./instructions/clasification.txt")
    mapping = {
        "clasification": default_file,
    }
    return mapping.get(action, default_file)

In [13]:
# Sample document for this run. It is built from LOCAL_ROOT because
# build_folder_path() resolves the folder relative to that same root, so the
# two cannot drift apart.
pdf_path = LOCAL_ROOT / "CERL" / "860006752" / "22_CamCom_2020-02-28.pdf"
# Alternative sample: LOCAL_ROOT / "ACC" / "800216686" / "231_CA_2020-02-29.pdf"
folder_path = build_folder_path(pdf_path)

# Only page 1 is sent to the model. get_first_pdf_page() already returns the
# page as bytes, so it is used directly rather than going through read_document().
first_page = get_first_pdf_page(pdf_path)
file_to_read = first_page

# Prompt = cleaned instruction file + the "NOW PROCESS" suffix for this folder.
clasification_prompt_raw = get_instructions("clasification")
base_prompt = markdown_to_plain(clasification_prompt_raw)
clasification_prompt = base_prompt + add_now_process(folder_path)

messages = create_messages(clasification_prompt, file_to_read)


In [14]:
# Model id and sampling settings for this run.
modelId = "us.amazon.nova-pro-v1:0"
temperature = 0.1
top_p = 0.9
max_tokens = 8192

# NOTE(review): this rebinds the global `cfg`, which an earlier cell assigned
# to the botocore Config used to build the client — consider a distinct name.
cfg = set_model_params(max_tokens, top_p, temperature)
# `version` is interpolated after a literal "v" in the folder and file names below.
version = "cerl_9"
folder_route = f"outputs/classification/prompt_txt/v{version}"
# folder_route = "outputs/prompt_md"

# Call the model and save the raw response before attempting to parse it.
resp_json = invoke_nova(modelId, messages, cfg)
save_to_json(resp_json, f"{folder_route}/response_model_test_v{version}.json", 2)

# Validate the reply, then save both the validated record and the payload built from it.
meta    = parse_classification(resp_json, pdf_path = str(pdf_path) )
payload = build_payload(meta)
save_to_json(meta.model_dump(), f"{folder_route}/meta_v{version}.json",   2)
save_to_json(payload, f"{folder_route}/payload_v{version}.json",2)

✅ Saved JSON to outputs/classification/prompt_txt/vcerl_9/response_model_test_vcerl_9.json
✅ Saved JSON to outputs/classification/prompt_txt/vcerl_9/meta_vcerl_9.json
✅ Saved JSON to outputs/classification/prompt_txt/vcerl_9/payload_vcerl_9.json
