# Mistral OCR Cookbook

Modified from the Mistral Cookbook [Notebook](https://colab.research.google.com/github/mistralai/cookbook/blob/main/mistral/ocr/structured_ocr.ipynb#scrollTo=po7Cukllt8za).

---

## OCR Exploration and Structured Outputs
In this cookbook, we will explore the basics of OCR and leverage it together with existing models to achieve structured outputs fueled by our OCR model.

You may want to do this when current vision models are not powerful enough on their own: pairing them with the OCR model strengthens their text-reading ability and yields better structured data extraction.

---

### Used
- Mistral OCR
- Pixtral 12B & Ministral 8B


### Setup
First, let's install `mistralai` and download the required files.

In [None]:
%pip -q install mistralai

: 

Load the Mistral API key from Google Colab Secrets.

In [None]:
import os
from google.colab import userdata

# Copy the `MISTRAL_API_KEY` Colab secret into the process environment so the
# client cell below can read it from `os.environ`.
os.environ['MISTRAL_API_KEY'] = userdata.get('MISTRAL_API_KEY')


We can now set up our client. You can create an API key on our [Plateforme](https://console.mistral.ai/api-keys/).

In [None]:
from mistralai import Mistral


# Shared API client used by every OCR and chat call in this notebook.
# Indexing os.environ raises KeyError if MISTRAL_API_KEY has not been set.
client = Mistral(api_key=os.environ['MISTRAL_API_KEY'])

There are two types of files you can apply OCR to:
- PDF files, either uploaded or from URLs...

In [None]:
from pathlib import Path

# PDF to run OCR on; the relative path is resolved against the current working directory.
pdf_file = Path("upsc_textbook.pdf")
# Fail fast, before any upload is attempted, if the file is not present.
assert pdf_file.is_file()

In [None]:
from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
import json

# Step 1: upload the raw PDF bytes to Mistral, tagged for OCR use.
uploaded_file = client.files.upload(
    file=dict(file_name=pdf_file.stem, content=pdf_file.read_bytes()),
    purpose="ocr",
)

# Step 2: request a signed URL for the uploaded file.
# NOTE(review): expiry=1 is presumably expressed in hours — confirm against the SDK docs.
signed_url = client.files.get_signed_url(
    file_id=uploaded_file.id,
    expiry=1,
)

# Step 3: run OCR on the document behind the signed URL, asking for the page
# images to be returned as base64 alongside the markdown.
pdf_response = client.ocr.process(
    document=DocumentURLChunk(document_url=signed_url.url),
    model="mistral-ocr-latest",
    include_image_base64=True,
)

# Step 4: round-trip the response through JSON to get a plain dict and a
# pretty-printed string for inspection.
response_dict = json.loads(pdf_response.json())
json_string = json.dumps(response_dict, indent=4)

# print(json_string)

*The OCR model can output interleaved text and images (set `include_image_base64=True` to have the images returned as base64). We can view the result with the following:*

In [None]:
from mistralai.models import OCRResponse
from IPython.display import Markdown, display

def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """Swap each ``![name](name)`` image placeholder for ``![name](<value>)``.

    Args:
        markdown_str: Markdown text containing image placeholders.
        images_dict: Maps an image name to the string that should become the
            link target (the callers in this notebook pass base64 image data).

    Returns:
        The markdown with every matching placeholder rewritten; text without
        a matching placeholder is returned unchanged.
    """
    rendered = markdown_str
    for image_name in images_dict:
        placeholder = f"![{image_name}]({image_name})"
        embedded = f"![{image_name}]({images_dict[image_name]})"
        rendered = rendered.replace(placeholder, embedded)
    return rendered

def get_combined_markdown(ocr_response: OCRResponse) -> str:
    """Join the markdown of every page in ``ocr_response``, embedding its images.

    Args:
        ocr_response: OCR result whose ``pages`` each expose ``markdown`` and
            ``images`` (every image having ``id`` and ``image_base64``).

    Returns:
        The per-page markdown, with image placeholders replaced by their
        base64 payloads, separated by blank lines.
    """
    markdowns: list[str] = []
    # Iterate the argument, not the notebook-global `pdf_response`, so the
    # function works for whichever response the caller passes in.
    for page in ocr_response.pages:
        image_data = {img.id: img.image_base64 for img in page.images}
        markdowns.append(replace_images_in_markdown(page.markdown, image_data))

    return "\n\n".join(markdowns)

# Render the combined markdown of the PDF OCR result inline in the notebook.
display(Markdown(get_combined_markdown(pdf_response)))


In [None]:
import os # Import the os module for path operations (optional but good practice)
from mistralai.models import OCRResponse
from IPython.display import Markdown, display

def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """Rewrite ``![name](name)`` placeholders so they point at the mapped value.

    Args:
        markdown_str: Markdown text containing image placeholders.
        images_dict: Maps an image name to the replacement link target. The
            value is inserted verbatim, so it must already be in a form a
            Markdown renderer accepts (typically a ``data:image/...`` URL).

    Returns:
        The markdown with every matching placeholder replaced.
    """
    result = markdown_str
    for name, target in images_dict.items():
        old_link = f"![{name}]({name})"
        new_link = f"![{name}]({target})"
        result = result.replace(old_link, new_link)
    return result

def get_combined_markdown(ocr_response: OCRResponse) -> str:
    """Combine the markdown of all pages of an OCR response, embedding images.

    Pages with no markdown are skipped. Images missing either an id or a
    base64 payload are ignored, so their placeholders stay untouched.

    Args:
        ocr_response: OCR result whose ``pages`` expose ``markdown`` and ``images``.

    Returns:
        The rendered pages joined by blank lines.
    """
    rendered_pages: list[str] = []
    for page in ocr_response.pages:
        if not page.markdown:
            continue
        # `page.images` may be empty or None; treat both as "no images".
        embedded = {
            img.id: img.image_base64
            for img in (page.images or ())
            if img.id and img.image_base64
        }
        rendered_pages.append(replace_images_in_markdown(page.markdown, embedded))

    return "\n\n".join(rendered_pages)

def save_markdown_to_file(markdown_content: str, filename: str = "output.md"):
    """Write ``markdown_content`` to ``filename`` as UTF-8, reporting the outcome.

    This is a best-effort helper: failures are printed rather than raised, so
    the function always returns ``None``.

    Args:
        markdown_content: Text to write.
        filename: Destination path; an existing file is overwritten.
    """
    try:
        with open(filename, 'w', encoding='utf-8') as handle:
            handle.write(markdown_content)
        print(f"Successfully saved Markdown to '{filename}'")
    except OSError as e:  # IOError is an alias of OSError in Python 3
        print(f"Error saving Markdown to file '{filename}': {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# --- Main execution part ---

# `pdf_response` is the OCR result produced by `client.ocr.process(...)` for the
# PDF in an earlier cell; pass a different OCR response here to export another
# document.

# 1. Generate the combined markdown string
combined_markdown_output = get_combined_markdown(pdf_response) # Pass your OCRResponse object

# 2. Save the combined markdown to a file
output_filename = "generated_report.md" # Choose your desired filename
save_markdown_to_file(combined_markdown_output, output_filename)

# 3. Display the markdown in the IPython environment (optional)
# display(Markdown(combined_markdown_output))

## For single Image files...

In [None]:
# Single image to run OCR on; the relative path is resolved against the current working directory.
image_file = Path("rd-test-img.png")
# Fail fast if the image is not present.
assert image_file.is_file()

In [None]:
import base64
import mimetypes

# Label the data URL with the file's actual media type (the sample file is a
# PNG) instead of always claiming JPEG; fall back to the previous hard-coded
# value when the type cannot be guessed from the file name.
mime_type = mimetypes.guess_type(image_file.name)[0] or "image/jpeg"

# Embed the image bytes directly in a base64 data URL so no upload step is needed.
encoded = base64.b64encode(image_file.read_bytes()).decode()
base64_data_url = f"data:{mime_type};base64,{encoded}"

# Run OCR on the inline image.
image_response = client.ocr.process(document=ImageURLChunk(image_url=base64_data_url), model="mistral-ocr-latest")

# Round-trip through JSON to pretty-print the full response.
response_dict = json.loads(image_response.json())
json_string = json.dumps(response_dict, indent=4)
print(json_string)

## Combining Pixtral for structure with the OCR

We want to be able to extract structured data from these files. For this, we will make use of `pixtral-12b-latest` and support it with our OCR model for better, high-quality answers.

In [None]:
# Markdown text the OCR model produced for the first page of the image response.
image_ocr_markdown = image_response.pages[0].markdown

# Send both the image and its OCR text to the vision model and require a JSON
# object back.
chat_response = client.chat.complete(
    model="pixtral-12b-latest",
    messages=[
        dict(
            role="user",
            content=[
                ImageURLChunk(image_url=base64_data_url),
                TextChunk(
                    text=(
                        "This is image's OCR in markdown:\n"
                        f"<BEGIN_IMAGE_OCR>\n{image_ocr_markdown}\n<END_IMAGE_OCR>.\n"
                        "Convert this into a sensible structured json response. "
                        "The output should be strictly be json with no extra commentary"
                    )
                ),
            ],
        ),
    ],
    response_format={"type": "json_object"},
    temperature=0,
)

# Parse the model's reply and pretty-print it.
response_dict = json.loads(chat_response.choices[0].message.content)
json_string = json.dumps(response_dict, indent=4)
print(json_string)

## Passing the OCR output into an LLM

Note: We are leveraging a model already capable of vision tasks. However, we could also use text-only models for the structured output.

In [None]:
# Markdown text the OCR model produced for the first page of the image response.
image_ocr_markdown = image_response.pages[0].markdown

# Text-only variant: the model receives just the OCR markdown, not the image.
chat_response = client.chat.complete(
    model="ministral-8b-latest",
    messages=[
        dict(
            role="user",
            content=(
                "This is image's OCR in markdown:\n"
                f"<BEGIN_IMAGE_OCR>\n{image_ocr_markdown}\n<END_IMAGE_OCR>.\n"
                "Convert this into a sensible structured json response. "
                "The output should be strictly be json with no extra commentary"
            ),
        ),
    ],
    response_format={"type": "json_object"},
    temperature=0,
)

# Parse the model's reply and pretty-print it.
response_dict = json.loads(chat_response.choices[0].message.content)
json_string = json.dumps(response_dict, indent=4)
print(json_string)

### All Together
Let's design a simple function that takes an `image_path` file and returns a JSON structured output in a specific format. In this case, we arbitrarily decided we wanted an output respecting the following:

```python
class StructuredOCR:
    file_name: str  # can be any string
    topics: list[str]  # must be a list of strings
    languages: list[Language]  # a list of languages
    ocr_contents: dict  # any dictionary, can be freely defined by the model
```

We will make use of [custom structured outputs](https://docs.mistral.ai/capabilities/structured-output/custom_structured_output/) as well as `pycountry` for the languages.

In [None]:
!pip install pycountry

In [None]:
from enum import Enum
from pathlib import Path
from pydantic import BaseModel
import base64
import pycountry

# Map each pycountry language's two-letter `alpha_2` code to its name, skipping
# entries that have no `alpha_2` attribute.
languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}

class LanguageMeta(type(Enum)):
    """Enum metaclass that pre-populates the class namespace from ``languages``.

    For every name in the module-level ``languages`` mapping, an entry is added
    whose key is the upper-cased name with spaces turned into underscores and
    whose value is the original name.
    """

    def __new__(metacls, cls, bases, classdict):
        for language_name in languages.values():
            member_name = language_name.upper().replace(' ', '_')
            classdict[member_name] = language_name
        return super().__new__(metacls, cls, bases, classdict)

# Enum of language names. The body is intentionally empty: LanguageMeta adds the
# entries to the class namespace when the class is created.
class Language(Enum, metaclass=LanguageMeta):
    pass

# Target schema for the structured OCR output requested from the chat model.
class StructuredOCR(BaseModel):
    file_name: str  # can be any string
    topics: list[str]  # must be a list of strings
    languages: list[Language]  # a list of Language enum values
    ocr_contents: dict  # any dictionary; its layout is left to the model

# Print the model's JSON schema for inspection.
# NOTE(review): `schema_json()` is the pydantic v1-style name; under pydantic v2
# it is presumably deprecated in favour of `model_json_schema()` — confirm
# against the installed pydantic version.
print(StructuredOCR.schema_json())

def structured_ocr(image_path: str) -> StructuredOCR:
    """Run OCR on an image and have a vision model structure the result.

    The image is sent inline as a base64 data URL, first to the
    ``mistral-ocr-latest`` model and then, together with the first page's OCR
    markdown, to ``pixtral-12b-latest`` with ``StructuredOCR`` as the
    response format.

    Args:
        image_path: Path to the image file to process.

    Returns:
        The ``parsed`` attribute of the first chat choice's message
        (presumably a ``StructuredOCR`` instance — confirm against the SDK).

    Raises:
        AssertionError: If ``image_path`` does not point to an existing file.
    """
    import mimetypes  # local import keeps this cell self-contained

    image_file = Path(image_path)
    assert image_file.is_file(), "The provided image path does not exist."

    # Label the data URL with the file's actual media type rather than always
    # claiming JPEG; keep JPEG as the fallback when the type cannot be guessed.
    mime_type, _ = mimetypes.guess_type(image_file.name)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Read and encode the image file
    encoded_image = base64.b64encode(image_file.read_bytes()).decode()
    base64_data_url = f"data:{mime_type};base64,{encoded_image}"

    # Process the image using OCR
    image_response = client.ocr.process(document=ImageURLChunk(image_url=base64_data_url), model="mistral-ocr-latest")
    image_ocr_markdown = image_response.pages[0].markdown

    # Parse the OCR result into a structured JSON response
    chat_response = client.chat.parse(
        model="pixtral-12b-latest",
        messages=[
            {
                "role": "user",
                "content": [
                    ImageURLChunk(image_url=base64_data_url),
                    TextChunk(text=(
                        "This is the image's OCR in markdown:\n"
                        f"<BEGIN_IMAGE_OCR>\n{image_ocr_markdown}\n<END_IMAGE_OCR>.\n"
                        "Convert this into a structured JSON response with the OCR contents in a sensible dictionnary."
                    ))
                ],
            },
        ],
        response_format=StructuredOCR,
        temperature=0
    )

    return chat_response.choices[0].message.parsed

We can now extract structured output from any image parsed with our OCR model.

In [None]:
from PIL import Image

# Image to preview; the relative path is resolved against the current working directory.
image_path = "rd-test-img.png"

image = Image.open(image_path)
# A bare expression on the last line of a cell makes the notebook render the image.
image

In [None]:
# Run the full OCR + structuring pipeline on the sample image.
image_path = "rd-test-img.png"
structured_response = structured_ocr(image_path)

# Round-trip through JSON to get a plain dict and pretty-print it.
response_dict = json.loads(structured_response.json())
json_string = json.dumps(response_dict, indent=4)
print(json_string)

In [None]:
# Second sample image. This is an absolute path under /content (presumably the
# Colab working directory — adjust when running elsewhere).
image_path = "/content/thai_learning.png"
image = Image.open(image_path)
# A bare expression on the last line of a cell makes the notebook render the image.
image

In [None]:
# Run the pipeline on whichever `image_path` was set by the preceding cell.
structured_response = structured_ocr(image_path)

# Round-trip through JSON to get a plain dict and pretty-print it; `response_dict`
# is reused by the formatting cell below.
response_dict = json.loads(structured_response.json())
json_string = json.dumps(response_dict, indent=4)
print(json_string)

In [None]:
def format_thai_dictionary(thai_dict):
    """Render the structured OCR dictionary of the Thai sample as text tables.

    The title and each section title are emitted as Markdown headings.
    Sections titled ``"Thai Consonants"`` or ``"Vowels"`` additionally get a
    fixed-width table of their ``content`` items; other sections get only
    their heading.

    The input is model-generated, so the function is tolerant of its shape:
    missing or ``None`` values render as empty cells, non-string values are
    converted with ``str()``, and sections or items that are not dictionaries
    are skipped.

    Column padding counts characters, not display cells, so rows containing
    combining marks may not line up visually in every font.

    Args:
        thai_dict (dict): Dictionary containing Thai language data under the
            ``"ocr_contents"`` key (``"title"`` and ``"sections"``).

    Returns:
        str: The formatted text, lines joined with newlines.
    """
    # (item key, column header, column width) for each recognised section title.
    layouts = {
        "Thai Consonants": (
            ("consonant", "Consonant", 15),
            ("pronunciation", "Pronunciation", 20),
            ("thai_character", "Thai Character", 20),
            ("meaning", "Meaning", 15),
            ("translation", "Translation", 15),
        ),
        "Vowels": (
            ("vowel", "Vowel", 15),
            ("thai_character", "Thai Character", 20),
            ("meaning", "Meaning", 15),
            ("translation", "Translation", 15),
        ),
    }

    def as_text(value):
        # None has no width-aware __format__, so normalise everything to str first.
        return "" if value is None else str(value)

    contents = thai_dict.get("ocr_contents") or {}
    output = [f"# {as_text(contents.get('title'))}", ""]

    for section in contents.get("sections") or []:
        if not isinstance(section, dict):
            continue
        section_title = as_text(section.get("title"))
        output.extend((f"## {section_title}", ""))

        columns = layouts.get(section_title)
        if columns is not None:
            output.append("".join(f"{header:<{width}}" for _, header, width in columns))
            # The rule is as wide as the columns combined (85 and 65 characters).
            output.append("-" * sum(width for _, _, width in columns))
            for item in section.get("content") or []:
                if not isinstance(item, dict):
                    continue
                output.append(
                    "".join(f"{as_text(item.get(key)):<{width}}" for key, _, width in columns)
                )

        output.append("")  # Add empty line after each section

    return "\n".join(output)

# Example usage

# `response_dict` is the structured OCR result computed in an earlier cell.
formatted_output = format_thai_dictionary(response_dict)
print(formatted_output)