In [2]:
import os, re, json, base64
from pathlib import Path
import fitz  # PyMuPDF
from typing import List, Optional, Dict, Any

from mistralai import Mistral

def _create_mistral_client() -> Mistral:
    api_key = os.getenv("MISTRAL_API_KEY")
    if not api_key:
        raise EnvironmentError("Defina MISTRAL_API_KEY en el entorno")
    return Mistral(api_key=api_key, timeout_ms=300000)

client = _create_mistral_client()

import base64

def encode_pdf(pdf_path):
    """Encode the pdf to base64."""
    try:
        with open(pdf_path, "rb") as pdf_file:
            return base64.b64encode(pdf_file.read()).decode('utf-8')
    except FileNotFoundError:
        print(f"Error: The file {pdf_path} was not found.")
        return None
    except Exception as e:  # Added general exception handling
        print(f"Error: {e}")
        return None


from pydantic import BaseModel, Field
from enum import Enum
from mistralai.extra import response_format_from_pydantic_model

class ImageType(str, Enum):
    GRAPH = "graph"
    TEXT = "text"
    TABLE = "table"
    IMAGE = "image"

class Image(BaseModel):
    image_type: ImageType = Field(..., description="The type of the image. Must be one of 'graph', 'text', 'table' or 'image'.")
    description: str = Field(..., description="A description of the image.")

class Document(BaseModel):
    language: str = Field(..., description="The language of the document in ISO 639-1 code format (e.g., 'en', 'fr').")
    summary: str = Field(..., description="A summary of the document.")
    authors: list[str] = Field(..., description="A list of authors who contributed to the document.")

from src.config.models.set_10 import Set10ExtractionModel

def process_ocr_mistral(pdf_path, extraction_model):
    # Getting the base64 string
    base64_pdf = encode_pdf(pdf_path)

    # Call the OCR API
    pdf_response = client.ocr.process(
        model="mistral-ocr-latest",
        pages=list(range(8)), # Document Annotations has a limit of 8 pages, we recommend spliting your documents when using it; bbox annotations does not have the same limit
        document={
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{base64_pdf}"
        },
        document_annotation_format=response_format_from_pydantic_model(extraction_model),
        include_image_base64=False # We are not interested on retrieving the bbox images in this example, only their annotations
    )

    # Convert response to JSON format
    return pdf_response


In [5]:
# 1) Rasteriza
pdf_path_10 = "C:/Users/Ivan/Documents/VALORACION/ESTABILIDAD_FASE_MOVIL/DATA_CROMATOGRAFICA_ESTABILIDAD_FASE_MOVIL/ESTABILIDAD FM T1 V1.pdf"
pdf_path_8 = "C:/Users/Ivan/Documents/VALORACION/ESTABILIDAD_SOLUCIONES/REPORTE_ESTABILIDAD_SOLUCIONES/REPORTE_LIMS_ESTABILIDAD_SOLUCIONES.pdf"
pdf_path_7 = "C:/Users/Ivan/Documents/VALORACION/PRECISION_INTERMEDIA/REPORTE_PRECISION_INTERMEDIA/REPORTE_LIMS_PRECISION_INTERMEDIA.pdf"

from src.config.models.set_10 import Set10ExtractionModel
from src.config.models.set_8 import Set8ExtractionModel
from src.config.models.set_7 import Set7ExtractionModel

ocr_result = process_ocr_mistral(pdf_path_7, Set7ExtractionModel)


print(json.loads(ocr_result.document_annotations))


AttributeError: 'OCRResponse' object has no attribute 'document_annotations'