In [17]:
import requests
import json

OpenRouter provides several PDF processing engines:

"mistral-ocr": Best for scanned documents or PDFs with images ($2 per 1,000 pages)  
"pdf-text": Best for well-structured PDFs with clear text content (Free)  
"native": Only available for models that support file input natively (charged as input tokens)

If you don’t explicitly specify an engine, OpenRouter first tries the model’s native file processing capabilities.
If those are not available, it falls back to the "mistral-ocr" engine.

To select an engine, use the plugin configuration:

In [20]:
import base64
import json
import os
from pathlib import Path

import requests

In [None]:
# File-parser plugin configuration; the "engine" value picks how PDFs are processed
# ("mistral-ocr", "pdf-text", or "native").
pdf_parser_settings = {"engine": "native"}
file_parser_plugin = {"id": "file-parser", "pdf": pdf_parser_settings}
plugins = [file_parser_plugin]

In [None]:
# First, encode and send the PDF
def encode_pdf_to_base64(pdf_path):
    """Return the contents of the file at ``pdf_path`` as a base64 text string.

    The file is read in binary mode, base64-encoded, and the resulting bytes
    are decoded as UTF-8 so the value can be embedded in a JSON payload.
    """
    with open(pdf_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

# Chat-completions endpoint of the OpenRouter API.
url = "https://openrouter.ai/api/v1/chat/completions"

# Read the API key from the environment instead of embedding it in the notebook,
# so the secret never ends up in saved cells or version control.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "").strip()
if not OPENROUTER_API_KEY:
    print("Warning: OPENROUTER_API_KEY is not set; requests will be sent without credentials.")

headers = {
    "Authorization": f"Bearer {OPENROUTER_API_KEY}",
    "Content-Type": "application/json"
}

# Load the source document and wrap its base64 payload in a PDF data URL
pdf_path = "PDF6.pdf"
base64_pdf = encode_pdf_to_base64(pdf_path)
data_url = "data:application/pdf;base64," + base64_pdf

# Instruction text sent alongside the PDF. Adjacent literals are concatenated by
# Python into a single string; each piece ends exactly where the next begins.
EXTRACTION_PROMPT = (
    "This PDF contains details about job openings. "
    "Extract the following information in a structured JSON format. "
    "If the document lists multiple job openings, treat each one separately. "
    "Do NOT combine or mix information across different jobs. "
    "Display each job as a separate object in a list, in the order they appear in the PDF.\n\n"
    "Do NOT separate job postings based on caste, category, or reservation type (e.g., SC/ST/OBC/EWS/UR). "
    "If a job includes reservation breakdowns, include those details under 'Reservation details' within the same job object.\n\n"
    "For each job, extract:\n"
    "- Company name\n"
    "- Job title\n"
    "- Number of openings (if mentioned)\n"
    "- Reservation details (if applicable)\n"
    "- Location\n"
    "- Qualifications required\n"
    "- Skills required\n"
    "- Age limit (if mentioned)\n"
    "- Salary or compensation details\n"
    "- Application deadline\n"
    "- Mode of application (online/offline, email, etc.)\n"
    "- Contact details (if any)\n\n"
    "If any section is missing, use \"not mentioned\".\n\n"
    "Return only a clean JSON array of job objects. "
    "Each object must represent a single job posting. "
    "Do not include any additional explanation, summary, or text outside of the JSON output."
)

# Initial request: one user turn carrying the instructions and the PDF itself
prompt_part = {"type": "text", "text": EXTRACTION_PROMPT}
pdf_part = {
    "type": "file",
    "file": {"filename": "document.pdf", "file_data": data_url},
}
messages = [{"role": "user", "content": [prompt_part, pdf_part]}]

# Request body for the chat-completions call.
# "plugins" carries the file-parser configuration defined earlier in the notebook;
# without it the engine selection in `plugins` is never sent to the API.
payload = {
    "model": "openai/gpt-4o",
    "messages": messages,
    "plugins": plugins
}

# Send the request. A timeout keeps the cell from hanging indefinitely if the
# server never answers; the value is generous because PDF processing can be slow.
response = requests.post(url, headers=headers, json=payload, timeout=300)

# Fail here with the status code and body instead of letting an HTTP error
# surface later as a misleading "No choices found" message.
if not response.ok:
    raise RuntimeError(
        f"OpenRouter request failed with HTTP {response.status_code}: {response.text[:500]}"
    )

response_data = response.json()

# Capture any annotations attached to the first choice's message;
# file_annotations stays None when there are no choices or no annotations.
file_annotations = None
returned_choices = response_data.get("choices")
if returned_choices and len(returned_choices) > 0:
    first_message = returned_choices[0]["message"]
    file_annotations = first_message["annotations"] if "annotations" in first_message else None

In [25]:
# --- Extract and print the actual text content ---
choices = response_data.get("choices")
if choices:
    # Only the first choice is inspected; its message content holds the model's reply.
    message_content = choices[0]["message"].get("content")
    if message_content:
        print("Extracted Information:")
        print(message_content)
    else:
        print("No text content found in the response.")
else:
    print("No choices found in the response.")
    # If the response body carries an "error" entry, show it so the reason
    # for the missing choices is visible instead of silently dropped.
    api_error = response_data.get("error")
    if api_error:
        print(f"API error: {api_error}")

Extracted Information:
```json
[
    {
        "Company name": "Indian Overseas Bank",
        "Job title": "Local Bank Officer (Tamil Nadu)",
        "Number of openings": "260",
        "Reservation details": {
            "SC": "39",
            "ST": "19",
            "OBC": "70",
            "EWS": "26",
            "UR (GEN)": "106",
            "PwBD": {
                "VI": "2",
                "HI": "2",
                "OC": "3",
                "ID": "3"
            }
        },
        "Location": "Tamil Nadu",
        "Qualifications required": "A Degree (Graduation) in any discipline from a University recognized by the Govt. Of India",
        "Skills required": "Proficiency in Tamil (reading, writing, speaking); Valid Mark-sheet/Degree Certificate with percentage of marks",
        "Age limit": "Min - 20, Max - 30",
        "Salary or compensation details": "Basic: 48480-2000/7-62480-2340/2-67160-2680/7-85920",
        "Application deadline": "31.05.2025",
        "Mode

In [None]:
# # Follow-up request that passes the stored annotations back with the conversation
# # (the PDF file part is still included in the first user message below)
# if file_annotations:
#     follow_up_messages = [
#         {
#             "role": "user",
#             "content": [
#                 {
#                     "type": "text",
#                     "text": "What are the main points in this document?"
#                 },
#                 {
#                     "type": "file",
#                     "file": {
#                         "filename": "document.pdf",
#                         "file_data": data_url
#                     }
#                 }
#             ]
#         },
#         {
#             "role": "assistant",
#             "content": "The document contains information about...",
#             "annotations": file_annotations
#         },
#         {
#             "role": "user",
#             "content": "Can you elaborate on the second point?"
#         }
#     ]
#     follow_up_payload = {
#         "model": "google/gemma-3-27b-it",
#         "messages": follow_up_messages
#     }
#     follow_up_response = requests.post(url, headers=headers, json=follow_up_payload)
#     print(follow_up_response.json())