In [None]:
import boto3, json
from botocore.config import Config

In [3]:

session = boto3.Session(profile_name="par_servicios")
client = session.client("bedrock-runtime",region_name='us-east-2')

In [4]:
def setup_bedrock_client(region='us-east-2'):
    """Setup Bedrock client with retry configuration"""
    config = Config(
        region_name=region,
        retries=dict(max_attempts=1000)
    )
    
    session = boto3.Session(profile_name="par_servicios")
    client = session.client(
        service_name='bedrock-runtime',
        config=config
    )
    
    return client

# Initialize the client
bedrock_client = setup_bedrock_client()

In [85]:
import json
import boto3
import base64

# Read and encode the PDF
with open("prueba (1).pdf", "rb") as pdf_file:
    encoded_pdf = base64.b64encode(pdf_file.read()).decode("utf-8")

In [86]:
with open("prueba.txt", "r", encoding="utf-8") as file:
    contenido = file.read()

In [87]:
request_body = {
    "anthropic_version": "bedrock-2023-05-31",
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": contenido,
                    "cache_control": {"type": "ephemeral"} 
                },
                {
                    "type": "document",
                    "source": {
                        "type": "base64",
                        "media_type": "application/pdf",
                        "data": encoded_pdf
                    },
                    "title": "ACC Report",
                    "citations": {"enabled": True}
                    #"cache_control": {"type": "ephemeral"} 
                     # Enable prompt caching!
                }
            ]
        }
    ],
    "max_tokens": 500
}

In [88]:
response = bedrock_client.invoke_model(
    modelId="us.anthropic.claude-sonnet-4-20250514-v1:0",
    contentType='application/json',
    accept='application/json',
    body=json.dumps(request_body)
)

In [89]:
response_body = json.loads(response['body'].read())
response_body

{'id': 'msg_bdrk_01WynQWXhtcPnHMVQAuEpA1h',
 'type': 'message',
 'role': 'assistant',
 'model': 'claude-sonnet-4-20250514',
 'content': [{'type': 'text',
   'text': 'Looking at this document, I can see it\'s clearly a tax form issued by DIAN (Dirección de Impuestos y Aduanas Nacionales), which is Colombia\'s tax authority.\n\nKey identifying elements:\n- Header shows "DIAN" and "Formulario del Registro Único Tributario"\n- Form number 001 and document number 14280290057\n- Contains tax identification number (NIT): 900686567-2\n- Company name: MOLIAGRO NEIVA S.A.S.\n- Structured sections for tax classification, economic activities, and tax responsibilities\n- Contains CIIU codes for economic activities\n- Tax responsibility codes (05, 07, 09, 11, 14, 35)\n- Official DIAN signature and stamps\n\nThis is definitively a RUT (Registro Único Tributario) document - the official tax registration form that consolidates taxpayer data, tax classification, economic activities, and responsibilities

In [84]:
response_body = json.loads(response['body'].read())
response_body

{'id': 'msg_bdrk_01V73ACGZtE1GFcoipaGSVNP',
 'type': 'message',
 'role': 'assistant',
 'model': 'claude-sonnet-4-20250514',
 'content': [{'type': 'text',
   'text': 'Looking at this document, I need to analyze its content and purpose to classify it correctly.\n\n**TEXT EXTRACTION:**\nThe document contains extensive text including form numbers, DIAN references, beneficiary information, and corporate data.\n\n**DOCUMENT FUNCTION ANALYSIS:**\n'},
  {'citations': [{'type': 'page_location',
     'cited_text': ' Espacio reservado para la DIAN\r\n4. ',
     'document_index': 0,
     'document_title': 'ACC Report',
     'start_page_number': 1,
     'end_page_number': 2},
    {'type': 'page_location',
     'cited_text': ' Espacio reservado para la DIAN\r\n4. ',
     'document_index': 0,
     'document_title': 'ACC Report',
     'start_page_number': 2,
     'end_page_number': 3}],
   'type': 'text',
   'text': 'This document has "Espacio reservado para la DIAN" (Space reserved for DIAN)'},
  {'t

In [79]:
response_body = json.loads(response['body'].read())
response_body

{'id': 'msg_bdrk_01JMiVq6Yp31DyNFCSmBmo2c',
 'type': 'message',
 'role': 'assistant',
 'model': 'claude-sonnet-4-20250514',
 'content': [{'type': 'text',
   'text': 'Looking at this document, I can see it is a tax registration form with clear identifying elements.\n\n**TEXT EXTRACTION:**\nThe document contains "Formulario del Registro Único Tributario" at the top, issued by DIAN, with form number 14451412687. It includes sections for identification, location, classification, economic activities, tax responsibilities, and other tax-related information.\n\n**CLASSIFICATION ANALYSIS:**\n\n1. **Issuer**: '},
  {'type': 'text',
   'text': 'DIAN (Dirección de Impuestos y Aduanas Nacionales)'},
  {'type': 'text', 'text': '\n2. **Document Type**: '},
  {'type': 'text', 'text': 'Formulario del Registro Único Tributario'},
  {'type': 'text',
   'text': '\n3. **Structure**: Numbered form with sections for tax classification, economic activities, and tax responsibilities\n4. **Purpose**: Tax regis

In [74]:
response_body = json.loads(response['body'].read())
response_body #cache en el prompt

{'id': 'msg_bdrk_01ChdH5VM1FTdZtcmz5xbdhr',
 'type': 'message',
 'role': 'assistant',
 'model': 'claude-sonnet-4-20250514',
 'content': [{'type': 'text',
   'text': 'Looking at this document, I can see it\'s a form from DIAN (Dirección de Impuestos y Aduanas Nacionales) which is Colombia\'s tax authority. The document shows:\n\n1. **DIAN header and official formatting** - This is clearly a tax authority document\n2. **Form number 14128054034** - Structured numbering typical of tax forms\n3. **"Formulario del Registro Único Tributario"** - This translates to "Single Tax Registry Form"\n4. **Tax-related sections** including:\n   - Identification sections\n   - Economic activity codes (CIIU codes visible)\n   - Tax responsibilities and classifications\n   - Business location and establishment information\n\nThis is clearly a RUT (Registro Único Tributario) document - the official tax registration form that consolidates taxpayer data, tax classification, economic activities, and responsibi

In [69]:
response_body = json.loads(response['body'].read())
response_body #los tokens en total son como 5k

{'id': 'msg_bdrk_01NgNcZMvQ7zRXsH3wQVz88F',
 'type': 'message',
 'role': 'assistant',
 'model': 'claude-sonnet-4-20250514',
 'content': [{'type': 'text',
   'text': 'Looking at this document, I can extract the following text and analyze its content:\n\n**TEXT EXTRACTION:**\nThe document contains fragmented text including "Fa$}le;éd R@rlhlÚf,á,riai¡", "Hrii *iieliai", "i41200s4034", "E.c.¡ 2.9 I 1,0 ¡i-la ib*!", "DIAN", "Formulario del Registro Único Tributario", "Hoja Principal", "14128094034", and various form fields and sections typical of a tax registration document.\n\n**DOCUMENT FUNCTION ANALYSIS:**\n'},
  {'citations': [{'type': 'page_location',
     'cited_text': 'Fa$}le;éd R@rlhlÚf,á,riai¡\r\nHrii *iieliai\r\ni41200s4034\r\nE.c.¡ 2.9 I 1,0 ¡i-la ib*!"ea{\r\n--l\r\nj:ti...;;";*-\r\ni5;CK Cal.)lAlNEA ilM¡:ArJA\r\nI - ¡*;;;.*\'\r\n¡ii!éé6 tfui¡eb ñrd dr\'€\r\ne. ñdej r4r!^Ho M!1aJ r,1A\r\n$1c.9o61\'rtJ\r\n::\' ::::T:i :t l: -\'-:-*\' "1:r1# "',
     'document_index': 0,
     'docu

In [64]:
response_body = json.loads(response['body'].read())
response_body #prompt caching en el documento

{'id': 'msg_bdrk_01AySYCGk42p4LRnRD5sX9PP',
 'type': 'message',
 'role': 'assistant',
 'model': 'claude-sonnet-4-20250514',
 'content': [{'type': 'text',
   'text': 'Looking at this document, I can see it\'s a tax form from DIAN (Dirección de Impuestos y Aduanas Nacionales), which is Colombia\'s tax authority.\n\nKey indicators that make this a RUT document:\n- Header clearly shows "DIAN" and "Formulario del Registro Único Tributario"\n- Form number 14280290057\n- Tax identification number (NIT): 900686567-2\n- Structured sections for identification, location, economic activities, and tax responsibilities\n- Tax classification codes and responsibilities sections\n- Company information: MOLIAGRO NEIVA S.A.S.\n- Located in Colombia, Huila department, Neiva\n\nThis is clearly a RUT (Registro Único Tributario) document issued by DIAN for tax registration purposes.\n\n```json\n{\n  "category": "RUT",\n  "text": "DIAN\\nFormulario del Registro Único Tributario\\nHoja Principal\\n001\\n2. Con

In [59]:
response_body

{'id': 'msg_bdrk_01P6rCCT2u41btaPCKEFKxfo',
 'type': 'message',
 'role': 'assistant',
 'model': 'claude-sonnet-4-20250514',
 'content': [{'type': 'text',
   'text': '```json\n{\n  "category": "ACC",\n  "text": "EL SUSCRITO REVISOR FISCAL DE LA SOCIEDAD COTEL S.A.S\\nNIT .800.122.811-2\\n\\nCERTIFICA QUE:\\n\\nDe acuerdo con el Acta No. 083 de la Asamblea General Ordinaria de Accionistas de Cotel S.A.S de fecha marzo 29 de 2022, la nueva composición accionaria quedó registrada así:\\n\\nAccionista | Documento Identificación | No. de Acciones | % de Participación\\nJames Herney Quijano Morán | 16.758.096 | 5.740.000.000 | 82%\\nMaría del Pilar Zuluaga Marín | 66.901.361 | 700.000.000 | 10%\\nNicolle Dayana Quijano Zuluaga | 1.107.089.788 | 210.000.000 | 3%\\nJuan Pablo Quijano Zuluaga | 1.144.109.912 | 210.000.000 | 3%\\nCotel S.A.S | 800.122.811-2 | 140.000.000 | 2%\\nTotal | | 7.000.000.000 | 100%\\n\\nLa nueva composición accionaria fue registrada en los libros contables, así mismo fu

In [51]:
response_body

{'id': 'msg_bdrk_01MaL6oMAJGeeKVystxignF7',
 'type': 'message',
 'role': 'assistant',
 'model': 'claude-sonnet-4-20250514',
 'content': [{'type': 'text',
   'text': 'Looking at this document, I can see it contains a Colombian identification card (cédula de ciudadanía) with personal information.\n\n**TEXT EXTRACTION:**\nThe document shows:\n- "REPUBLICA DE COLOMBIA"\n- "IDENTIFICACION PERSONAL"\n- "CEDULA DE CIUDADANIA"\n- Personal details including name "SANCHEZ DUARTE" and "DARIO"\n- ID number "3.357.420"\n- Birth information "01-AGO-1962"\n- Location "YONDO - CASABE"\n- "YONDO (ANTIOQUIA)"\n- Physical characteristics "1.75" (height), "O+" (blood type), "M" (sex)\n- Issue date "13-ABR-1991 YONDO"\n- Various security elements including fingerprint and barcode\n\n**DOCUMENT FUNCTION ANALYSIS:**\nThis is clearly a personal identification document issued by the Colombian government for individual citizen identification purposes.\n\n**CATEGORY CLASSIFICATION:**\nThis document fits perfectl

In [45]:
response_body

{'id': 'msg_bdrk_01C1rTaqoX4ceQDfwvdEjCoZ',
 'type': 'message',
 'role': 'assistant',
 'model': 'claude-sonnet-4-20250514',
 'content': [{'type': 'text',
   'text': 'Looking at this document, I need to extract all the text and analyze its function and content.\n\n**TEXT EXTRACTION:**\nThe document contains:\n- Header: "EL SUSCRITO REVISOR FISCAL DE LA SOCIEDAD COTEL S.A.S NIT .800.122.811-2"\n- "CERTIFICA QUE:"\n- Reference to "Acta No. 063 de la Asamblea General Ordinaria de Accionistas de Cotel S.A.S de fecha marzo 29 de 2022"\n- A detailed table showing shareholding composition with columns for: Accionista, Documento Identificación, No. de Acciones, % de Participación\n- Five shareholders listed with their ownership percentages (82%, 10%, 3%, 3%, 2%)\n- Total shares: 7,000,000,000 (100%)\n- Statement about registration in accounting books\n- Certification date and signature of fiscal revisor\n\n**DOCUMENT FUNCTION ANALYSIS:**\nThis document\'s main purpose is to certify the sharehol

In [39]:
response_body

{'id': 'msg_bdrk_013DC76kUo8Db7DBUdJd9TWB',
 'type': 'message',
 'role': 'assistant',
 'model': 'claude-sonnet-4-20250514',
 'content': [{'type': 'text',
   'text': 'Looking at this document, I need to extract the text and analyze its function.\n\n**TEXT EXTRACTION:**\nThe document contains:\n- Header: "EL SUSCRITO REVISOR FISCAL DE LA SOCIEDAD COTEL S.A.S NIT .800.122.811-2"\n- "CERTIFICA QUE:"\n- Reference to "Acta No. 083 de la Asamblea General Ordinaria de Accionistas de Cotel S.A.S de fecha marzo 29 de 2022"\n- A table showing shareholder composition ("nueva composición accionaria") with columns for:\n  - Accionista (Shareholder)\n  - Documento Identificación (ID Document)\n  - No. de Acciones (Number of Shares)  \n  - % de Participación (Participation Percentage)\n- Lists 5 shareholders with their respective shareholdings and percentages (82%, 10%, 3%, 3%, 2%)\n- Total shares: 7,000,000,000 (100%)\n- Signature of fiscal auditor\n\n**DOCUMENT FUNCTION ANALYSIS:**\nThis document\'s

In [69]:
model_id = "us.anthropic.claude-sonnet-4-20250514-v1:0"


In [70]:
system_message = [
	{
		"text": "You are a helpful assistant."
	}
]

In [78]:
with open("259_CA_2020-02-29.pdf", "rb") as pdf_file:
    pdf_bytes = pdf_file.read()


In [79]:
messages = [
    {
        "role": "user",
        "content": [
            {
                "document": {
                    "name": "ACC Report",
                    "format": "pdf",
                    "source": {
                        "bytes": pdf_bytes  # Raw bytes, not base64!
                    },
                    "citations": {"enabled": True}  # REQUIRED for Claude's PDF support in Converse!
                }
            },
            {
                "text": "What visual elements (charts, graphs, diagrams) are present in this document? Describe what they show."
            }
        ]
    }
]


In [80]:
response = bedrock_client.converse(
    modelId="us.anthropic.claude-sonnet-4-20250514-v1:0",
    messages=messages,
    inferenceConfig={
        "maxTokens": 500,
        "temperature": 0.5
    }
)

ParamValidationError: Parameter validation failed:
Unknown parameter in messages[0].content[0].document: "citations", must be one of: format, name, source

In [73]:
response = client.converse(
    modelId="us.anthropic.claude-sonnet-4-20250514-v1:0",
    messages=messages,
    inferenceConfig={
        "maxTokens": 500,
        "temperature": 0.5
    }
)

ParamValidationError: Parameter validation failed:
Unknown parameter in messages[0].content[0].document: "citations", must be one of: format, name, source

In [51]:
messages = [
	{
		'role': 'user', 
		'content': [
			{
				"document": {
        			"name": 'documento',
        			"format": "pdf",
        			"source": {"bytes": base64_data},
        			"citations": {            
           			 "enabled": True
       							 }
    				},
				'text': "Analyze the pdf"
			}
		]
	}
]

In [64]:
messages = [
    {
        "role": "user",
        "content": [
            {"text": "Analyze the pdf"},
            {
				"document": {
        			"name": 'documento',
        			"format": "pdf",
        			"source": {"bytes": base64_data},
        			"citations": {            
           			 "enabled": True
       							 },
    				}
            },
        ],
    }
]

In [65]:
response = client.converse(
	modelId=model_id,
	messages=messages,
	system=system_message,
	inferenceConfig={
		"maxTokens": 8192,
		"temperature": 0.7,
		"topP": 1,
	}
)

ParamValidationError: Parameter validation failed:
Unknown parameter in messages[0].content[1].document: "citations", must be one of: format, name, source

In [61]:
print(json.dumps(response, indent=4))

{
    "ResponseMetadata": {
        "RequestId": "4ec5a1bd-2faa-49f2-ad48-226a777aa560",
        "HTTPStatusCode": 200,
        "HTTPHeaders": {
            "date": "Sun, 24 Aug 2025 05:03:28 GMT",
            "content-type": "application/json",
            "content-length": "1131",
            "connection": "keep-alive",
            "x-amzn-requestid": "4ec5a1bd-2faa-49f2-ad48-226a777aa560"
        },
        "RetryAttempts": 0
    },
    "output": {
        "message": {
            "role": "assistant",
            "content": [
                {
                    "text": "I can see that you've referenced a PDF document called \"documento.pdf\", but the document content appears to be empty in what was provided to me. The document section shows:\n\n```\n<document index=\"1\">\n<source>documento.pdf</source>\n<document_content></document_content>\n</document>\n```\n\nThere's no actual content between the `<document_content>` tags for me to analyze.\n\nTo help you analyze the PDF, I wou

In [62]:
response = client.converse(
	modelId=model_id,
	messages=messages,
	system=system_message,
	inferenceConfig={
		"maxTokens": 8192,
		"temperature": 1, # must be 1
		 #"topP": 1, # must be unset
	},
	additionalModelRequestFields={
		'thinking': {
			"type": "enabled",
			"budget_tokens": 2000
		},
	}
)

In [57]:
response = client.converse(
	modelId=model_id,
	messages=messages,
	system=system_message,
	inferenceConfig={
		"maxTokens": 8000,
		"temperature": 1, # must be 1
		"topP": 0.95, # must be unset
	},
	additionalModelRequestFields={
		'thinking': {
			"type": "enabled",
			"budget_tokens": 2000
		}
	}
)

In [63]:
print(json.dumps(response, indent=4))

{
    "ResponseMetadata": {
        "RequestId": "728f2cdb-4b47-4176-9909-5ed69771cc22",
        "HTTPStatusCode": 200,
        "HTTPHeaders": {
            "date": "Sun, 24 Aug 2025 05:04:02 GMT",
            "content-type": "application/json",
            "content-length": "2744",
            "connection": "keep-alive",
            "x-amzn-requestid": "728f2cdb-4b47-4176-9909-5ed69771cc22"
        },
        "RetryAttempts": 0
    },
    "output": {
        "message": {
            "role": "assistant",
            "content": [
                {
                    "reasoningContent": {
                        "reasoningText": {
                            "text": "I can see that there's a document reference with the filename \"documento.pdf\" but the document_content field is empty. This means I don't have access to the actual content of the PDF file to analyze.\n\nI should let the user know that I cannot see the content of the PDF and ask them to either:\n1. Provide the PDF content 