# Layout Detection +  Translation

In [1]:
import csv
import json
import fitz  # PyMuPDF
import ocrmypdf
import os
from pathlib import Path
import time
import logging
from deep_translator import GoogleTranslator
from langdetect import detect, DetectorFactory
from PyPDF2 import PdfReader, PdfWriter, Transformation
import copy
import string
import os
import shutil

In [2]:
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("ocr_recovery.log"),
        logging.StreamHandler()
    ]
)

In [3]:
def find_max_csv_field_size():
    """Find the maximum CSV field size limit using binary search"""
    max_int = 2147483647  # 2^31-1
    min_int = 1024
    
    while min_int < max_int:
        try:
            mid = (min_int + max_int + 1) // 2
            csv.field_size_limit(mid)
            min_int = mid
        except OverflowError:
            max_int = mid - 1
    
    return min_int

In [4]:
# Safely set maximum CSV field size limit
csv.field_size_limit(find_max_csv_field_size())

2147483647

In [5]:
def int_to_rgb(color_int):
    """Convert integer color to RGB tuple."""
    if color_int < 0:
        color_int = color_int & 0xFFFFFFFF

    a = (color_int >> 24) & 0xFF
    r = (color_int >> 16) & 0xFF
    g = (color_int >> 8) & 0xFF
    b = color_int & 0xFF

    if (a == 0):
        a = 255

    return [r, g, b, a]

In [6]:
def normalize_spaced_text(text):
    """
    Normalize text with excessive spacing between characters,
    commonly found in headers like "F I N A N C I A L  S T A T E M E N T S" or ""
    """
    # Check if text has consistent spacing pattern (every character followed by space)
    if len(text) > 3 and all(text[i] == ' ' for i in range(1, len(text), 2)):
        # Join characters by removing spaces
        return ''.join(text[i] for i in range(0, len(text), 2))
    
    # Check if text has spaces between all characters
    if len(text) > 3 and ' ' in text:
        # Count spaces vs non-spaces
        spaces = text.count(' ')
        non_spaces = len(text) - spaces
        
        # If the ratio of spaces to characters is high (e.g., spaces >= characters)
        if spaces >= non_spaces - 1:
            text_split = text.split(' ')
            text_split = [' ' if char == '' else char for char in text_split]
            return ''.join(text_split)
    
    # Return original if no patterns match
    return text

In [7]:
def apply_ocr_to_pdf(input_path, output_dir):
    """Apply OCR to a PDF file using OCRmyPDF"""
    # Create OCR file in data/test/PDF_ocr directory
    output_path = output_dir / f"{input_path.stem}.ocr.pdf"

    try:
        languages = [
            "chi_sim",  # Simplified Chinese
            "chi_tra",  # Traditional Chinese
            "vie",      # Vietnamese
            "eng",      # English
            "jpn",      # Japanese
            "kor",      # Korean
            "fra",      # French
            "deu",      # German
            "spa",      # Spanish
            "rus"       # Russian
        ]

        # Run OCR with multiple language support
        ocrmypdf.ocr(
            input_path,
            output_path,
            language="+".join(languages),
            deskew=True,
            clean=False,
            optimize=0,
            output_type='pdf',  # Changed from 'pdfa' to 'pdf' to retain original color space
            skip_text=True,
            progress_bar=True,
            color_conversion_strategy='UseDeviceIndependentColor',
        )
        logging.info(f"OCR completed: {output_path}")
        return output_path
    except Exception as e:
        logging.error(f"OCR error: {str(e)}")
        return None

In [8]:
def map_language_code_for_deep_translator(lang_code):
    mapping = {
        "zh-cn": "zh-CN",
        "zh-hans": "zh-CN",
        "zh-tw": "zh-TW",
        'zh-hant': 'zh-TW',
        'zh': 'zh-CN',     # Default Chinese to Simplified
        'jw': 'jv',        # Javanese
        'iw': 'he',        # Hebrew
        'in': 'id',        # Indonesian
        'ceb': 'tl',       # Adjust Cebuano to use Tagalog
    }

    return mapping.get(lang_code, lang_code)

In [9]:
def batch_translate_text(texts_with_langs, target='vi', batch_size=25, delay=0):
    """
    Translate a batch of texts with rate limiting.
    
    Args:
        texts: List of texts to translate
        source: Source language code
        target: Target language code
        batch_size: Number of texts to translate in one batch
        delay: Delay between batches in seconds
        
    Returns:
        List of translated texts
    """
    results = [""] * len(texts_with_langs)

    # Group text by detected source language
    lang_groups = {}
    for text, lang, orig_idx in texts_with_langs:
        if not lang in lang_groups:
            lang_groups[lang] = []
        lang_groups[lang].append((text, orig_idx))

    for source_lang, texts_with_indices in lang_groups.items():
        if source_lang == target:
            for text, orig_idx in texts_with_indices:
                results[orig_idx] = text
            continue

        texts = [t[0] for t in texts_with_indices]
        indices = [t[1] for t in texts_with_indices]

        # Create translator for this language
        translator = GoogleTranslator(source=source_lang, target=target)

        translated_batch = []

        for i in range(0, len(texts), batch_size):
            batch = texts[i:min(i + batch_size, len(texts))]
            
            # Process each text in the current batch
            batch_results = []
            for text in batch:
                try:    
                    translated = translator.translate(text)
                    batch_results.append(translated)
                    
                except Exception as e:
                    logging.warning(f"Translation error: {str(e)[:100]}...")
                    # Return original text on error
                    batch_results.append(text)
                    
                    # Handle rate limiting - increase delay and reduce batch size
                    if "429" in str(e) or "too many requests" in str(e).lower():
                        logging.info(f"Rate limit hit. Increasing delay to {delay*2}s and reducing batch size.")
                        delay *= 2
                        batch_size = max(1, batch_size // 2)
                        time.sleep(5)  # Additional pause after hitting rate limit
            
            translated_batch.extend(batch_results)
            
            # Add delay between batches
            if i + batch_size < len(texts):
                time.sleep(delay)

        # Put translated texts back in their original positions
        for translated_text, orig_idx in zip(translated_batch, indices):
            results[orig_idx] = translated_text
            
    return results

In [10]:
def translate_cells(cells, target='vi'):
    """
    Translate text in cells from source language to target language.
    
    Args:
        cells: List of cell dictionaries with text
        source: Source language code
        target: Target language code
        
    Returns:
        List of cell dictionaries with translated text
    """
    # Extract all texts and detect languages
    texts_with_langs = []
    for i, cell in enumerate(cells):
        if cell.get("text"):
            try:
                # Detect language for each text
                lang = detect(cell["text"])
                # Map language code for deep_translator
                mapped_lang = map_language_code_for_deep_translator(lang)
                # Store original language in cell
                texts_with_langs.append((cell["text"], mapped_lang, i))
            except Exception as e:
                logging.warning(f"Language detection error: {str(e)[:100]}... Using 'en' as fallback.")
                texts_with_langs.append((cell["text"], "en", i))
    
    logging.info(f"Translating {len(texts_with_langs)} text segments to {target}...")

    lang_counts = {}
    for _, lang, _ in texts_with_langs:
        lang_counts[lang] = lang_counts.get(lang, 0) + 1
    
    logging.info("Detected languages:")
    for lang, count in lang_counts.items():
        logging.info(f"  - {lang}: {count} segments")
    
    # Perform batch translation
    translated_texts = batch_translate_text(texts_with_langs, target)
    
    # Map translated texts back to cells
    text_index = 0
    for cell in cells:
        if cell.get("text"):
            cell["text_vi"] = translated_texts[text_index]
            text_index += 1
    
    return cells

In [11]:
cell = [ {'x': 72.000732421875,
  'y': 470.55364990234375,
  'width': 388.0931396484375,
  'height': 11.9671630859375,
  'text': 'due to the high thermal excitations in the hot and very dilute nuclear matter (i.e. µ',#'Since the new nuclear matter turns to be dominated by the colorless U (1)',
  'font': {'color': [0, 0, 0, 255], 'name': 'CMR12', 'size': 11},
  'text_vi': 'Since the new nuclear matter turns to be dominated by the colorless U (1)'}]

In [12]:
translate_cells(cell)

2025-05-08 15:09:31,107 [INFO] Translating 1 text segments to vi...
2025-05-08 15:09:31,108 [INFO] Detected languages:
2025-05-08 15:09:31,109 [INFO]   - en: 1 segments


[{'x': 72.000732421875,
  'y': 470.55364990234375,
  'width': 388.0931396484375,
  'height': 11.9671630859375,
  'text': 'due to the high thermal excitations in the hot and very dilute nuclear matter (i.e. µ',
  'font': {'color': [0, 0, 0, 255], 'name': 'CMR12', 'size': 11},
  'text_vi': 'Do sự kích thích nhiệt cao trong chất hạt nhân nóng và rất loãng (tức là'}]

In [13]:
def clean_text(text):
    """Clean text by removing/replacing non-printable characters"""
    if not isinstance(text, str):
        return text
        
    # Replace common problematic Unicode characters
    replacements = {
        '\u0000': '',  # NULL
        '\u0001': '',  # START OF HEADING
        '\u0002': '',  # START OF TEXT
        '\u0003': '',  # END OF TEXT
        '\u0004': '',  # END OF TRANSMISSION
        '\u0005': '',  # ENQUIRY
        '\u0006': '',  # ACKNOWLEDGE
        '\u0007': '',  # BELL
        '\u0014': '',  # DEVICE CONTROL FOUR
        '\u0015': '',  # NEGATIVE ACKNOWLEDGE
        '\ufffd': '',  # REPLACEMENT CHARACTER (�)
        '\u200b': '',  # ZERO WIDTH SPACE
        '\u200e': '',  # LEFT-TO-RIGHT MARK
        '\u200f': '',  # RIGHT-TO-LEFT MARK
        '\ufeff': '',  # ZERO WIDTH NO-BREAK SPACE
    }
    
    # Apply replacements
    for char, replacement in replacements.items():
        text = text.replace(char, replacement)
    
    # Filter out any remaining control characters
    return ''.join(char for char in text if ord(char) >= 32 or char in '\n\r\t')

In [14]:
def extract_pdf_info(pdf_path):
    """Extract text and formatting information from a PDF file"""
    doc = fitz.open(pdf_path)
    cells = []
    
    try:
        for page_num, page in enumerate(doc, start=1):
            blocks = page.get_text("dict")["blocks"]
            for block in blocks:
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        span["text"] = clean_text(span["text"]).strip()
                        # Skip empty spans
                        if not span["text"]:
                            continue

                        # Normalize text with excessive spacing
                        normalized_text = normalize_spaced_text(span["text"])

                        cell = {
                            "x": span["bbox"][0],
                            "y": span["bbox"][1],
                            "width": span["bbox"][2] - span["bbox"][0],     # width
                            "height": span["bbox"][3] - span["bbox"][1],    # height
                            "text": normalized_text,
                            "font": {
                                "color": int_to_rgb(span["color"]),
                                "name": span["font"],
                                "size": int(span["size"]),
                            },
                            "text_vi": normalized_text  # Will be translated later
                        }
                        cells.append(cell)
    except Exception as e:
        logging.error(f"Error extracting text from {pdf_path}: {str(e)}")
    finally:
        doc.close()
    
    # Add translation step
    if cells:
        try:
            logging.info(f"Translating {len(cells)} cells to Vietnamese...")
            cells = translate_cells(cells, target='vi')
            logging.info(f"Translation complete for {len(cells)} cells")
        except Exception as e:
            logging.error(f"Translation error: {str(e)}")
            # Continue with untranslated text

    # Final null check before returning
    for cell in cells:
        if cell.get("text_vi") is None:
            cell["text_vi"] = cell.get("text", "")  # Use original or empty string

    return {"cells": cells}

In [15]:
def extract_pdf_cells(pdf_path):
    """Extract text and formatting information from a PDF file without translation."""
    doc = fitz.open(pdf_path)
    cells = []
    
    try:
        for page_num, page in enumerate(doc, start=1):
            blocks = page.get_text("dict")["blocks"]
            for block in blocks:
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        # Clean and normalize text
                        span_text = clean_text(span["text"]).strip()
                        # Skip empty spans
                        if not span_text:
                            continue

                        # Normalize text with excessive spacing
                        normalized_text = normalize_spaced_text(span_text)

                        cell = {
                            "x": span["bbox"][0],
                            "y": span["bbox"][1],
                            "width": span["bbox"][2] - span["bbox"][0],     # width
                            "height": span["bbox"][3] - span["bbox"][1],    # height
                            "text": normalized_text,
                            "font": {
                                "color": int_to_rgb(span["color"]),
                                "name": span["font"],
                                "size": int(span["size"]),
                            },
                            "text_vi": normalized_text  # Set to "vietnamese" as requested
                        }
                        cells.append(cell)
    except Exception as e:
        logging.error(f"Error extracting text from {pdf_path}: {str(e)}")
    finally:
        doc.close()

    # Final null check for text fields
    for cell in cells:
        if cell.get("text") is None:
            cell["text"] = ""
        if cell.get("text_vi") is None:
            cell["text_vi"] = cell["text"]

    return {'cells' : cells}

In [16]:
import unicodedata

def normalize_text(text):
    """
    Normalize text to standardize Unicode characters for mathematical and special characters.
    
    Args:
        text (str): Input text to normalize.
    
    Returns:
        str: Normalized text with standardized characters and cleaned formatting.
    """
    # Configure logging for warnings about unmapped characters
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[logging.StreamHandler()]
    )

    # Define a mapping for problematic Unicode characters to their preferred forms
    unicode_mapping = {
        '\u00B5': '\u03BC',  # Micro Sign (µ) → Greek Small Letter Mu (μ)
        '\u2243': '\u2248',  # Asymptotically Equal To (≃) → Almost Equal To (≈)
        '\u2245': '\u2248',  # Approximately Equal To (≅) → Almost Equal To (≈)
        '\u2212': '\u002D',  # Minus Sign (−) → Hyphen-Minus (-)
        '\u2013': '\u002D',  # En Dash (–) → Hyphen-Minus (-)
        '\u2014': '\u002D',  # Em Dash (—) → Hyphen-Minus (-)
        '\u00A0': '\u0020',  # Non-Breaking Space → Regular Space
        '\u200B': '',        # Zero-Width Space → Remove
        '\uFEFF': '',        # Zero-Width No-Break Space (BOM) → Remove
    }

    # Apply NFKC normalization to handle decomposed forms and compatibility characters
    text = unicodedata.normalize('NFKC', text)

    # Apply custom Unicode mappings
    for source_char, target_char in unicode_mapping.items():
        if source_char in text:
            logging.info(f"Replacing {source_char} (U+{ord(source_char):04X}) with {target_char} (U+{ord(target_char):04X})")
            text = text.replace(source_char, target_char)

    # Check for unmapped special characters and log warnings
    special_chars = set(c for c in text if ord(c) > 127 and c not in unicode_mapping.values())
    if special_chars:
        logging.warning(f"Found unmapped special characters: {special_chars}. Consider adding to unicode_mapping.")

    # Remove non-printable control characters (except spaces)
    text = ''.join(c for c in text if c.isprintable() or c.isspace())

    # Normalize multiple spaces, tabs, or newlines to a single space
    text = ' '.join(text.split())

    return text

In [17]:
def text_visual_length(text, font_size = 11 ,font_name = 'NotoSansMath', font_file_path = './NotoSansMath-Regular.ttf'):

    '''
    font_file_path="./NotoSansMath-Regular.ttf"
    font_name = 'NotoSansMath'
    '''

    font = fitz.Font(fontname=font_name, fontfile=font_file_path)

    # Measure and shrink font to fit the box
    text_width = font.text_length(text, fontsize=font_size)

    return text_width

In [18]:
def document_translation(pdf_path, font_file_path="Roboto.ttf"):
    """
    Extract text from a PDF using OCR, save to CSV, and visualize text in a new PDF.
    
    Args:
        pdf_path (str): Path to the input PDF file.
        font_file_path (str): Path to the custom font file (default: './Roboto.ttf').
    """
    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[
            logging.FileHandler("document_translation.log"),
            logging.StreamHandler()
        ]
    )

    # Extract file ID from the PDF path (without extension)
    file_id = os.path.splitext(os.path.basename(pdf_path))[0]

    # Create temporary working folder
    working_folder = f"./{file_id}"
    if os.path.exists(working_folder):
        shutil.rmtree(working_folder)
        logging.info(f"Deleted existing working folder: {working_folder}")
    os.makedirs(working_folder, exist_ok=True)

    # Copy the input PDF to the working folder
    new_pdf_path = os.path.join(working_folder, os.path.basename(pdf_path))
    shutil.copy(pdf_path, new_pdf_path)

    # Define file paths
    ocr_pdf_path = os.path.join(working_folder, f"{file_id}.ocr.pdf")
    csv_file_path = os.path.join(working_folder, f"{file_id}.csv")
    output_pdf_path = f"./translation_{file_id}.pdf"

    # Step 1: Apply OCR to the PDF
    logging.info(f"Applying OCR to {new_pdf_path}...")
    ocr_result = apply_ocr_to_pdf(Path(new_pdf_path), Path(working_folder))
    if not ocr_result:
        logging.error("OCR failed. Exiting.")
        shutil.rmtree(working_folder)
        return
    logging.info(f"OCR PDF saved to {ocr_result}")

    # Step 2: Extract cells using extract_pdf_cells
    logging.info(f"Extracting cells from {ocr_result}...")
    cells = extract_pdf_cells(ocr_result)["cells"]
    
    if not cells:
        logging.error("No cells extracted from PDF. Exiting.")
        shutil.rmtree(working_folder)
        return
    logging.info(f"Extracted {len(cells)} cells")

    # Step 3: Save extracted cells to CSV
    csv_data = [{
        "id": file_id,
        "solution": json.dumps(cells, ensure_ascii=False)
    }]
    try:
        with open(csv_file_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=["id", "solution"])
            writer.writeheader()
            writer.writerows(csv_data)
        logging.info(f"Saved {len(cells)} cells to {csv_file_path}")
    except Exception as e:
        logging.error(f"Error saving CSV: {str(e)}")
        shutil.rmtree(working_folder)
        return

    # Step 4: Open the OCR PDF for modification
    try:
        doc = fitz.open(ocr_pdf_path)
        doc_size = fitz.open(pdf_path)  # Open original PDF to get page size
        page = doc[0]  # Modify the first page
        page_size = doc_size[0]
        page_rect = page_size.rect
        page_width = page_rect.width
        page_height = page_rect.height
        logging.info(f"Opened {ocr_pdf_path}, size: {page_width} x {page_height}")
        doc_size.close()
    except Exception as e:
        logging.error(f"Error opening PDF: {str(e)}")
        shutil.rmtree(working_folder)
        return

    # Step 5: Read the CSV and visualize cells in the PDF
    try:
        with open(csv_file_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            data = list(reader)
        logging.info(f"Read {len(data)} rows from {csv_file_path}")
    except Exception as e:
        logging.error(f"Error reading CSV: {str(e)}")
        doc.close()
        shutil.rmtree(working_folder)
        return

    for item in data:
        try:
            ocr_data = json.loads(item['solution'])
            logging.info(f"Processing item with id: {item['id']}, {len(ocr_data)} entries")
            
            for entry in ocr_data:
                x = entry["x"]
                y = entry["y"]
                width = entry["width"]
                height = entry["height"]
                text = normalize_text(entry["text"]).strip()
                text_vi = entry["text_vi"]  # Placeholder "vietnamese" from extract_pdf_cells
                # Optionally use original text if visualization requires it
                # text_vi = entry["text"]  # Uncomment to use original text instead
                font_size = entry["font"]["size"]
                font_name = 'Roboto'
                
                #text_vi = '= 3, có khả năng vật chất Hagedorn trực giao trải qua bậc ba'

                # width = text_visual_length(text) # No,we are override, this rectangele will smaller than the initial word

                # Define the rectangle for the text
                rect = fitz.Rect(x, y, x + width, y + height)

                # Cover the original text with a white rectangle
                #page.draw_rect(rect, color=[1, 1, 1], fill=[1, 1, 1])
                page.draw_rect(rect, color=[0, 0, 0], fill=[1, 1, 1])

                # Initialize the font object from file
                font = fitz.Font(fontname=font_name, fontfile=font_file_path)

                font_size = 20

                # Measure and shrink font to fit the box
                text_width = font.text_length(text_vi, fontsize=font_size)
                #print(text,':',text_width)
                while text_width > width and font_size > 1:
                     font_size -= 1
                     text_width = font.text_length(text_vi, fontsize=font_size)

                # Center the text vertically in the rectangle
                y_text_pos = y  + (height)

                # Insert the text
                page.insert_text(
                    (x, y_text_pos),
                    text_vi,
                    fontsize=font_size,
                    fontname=font_name,
                    fontfile=font_file_path,
                    encoding='utf-16',
                    fill_opacity=1,
                    stroke_opacity=1,
                    border_width=1
                )
                logging.info(f"Visualized text at ({x}, {y}) with '{text}'")
        except json.JSONDecodeError as e:
            logging.error(f"Error parsing 'solution' for id {item['id']}: {str(e)}")
        except Exception as e:
            logging.error(f"Error processing entry: {str(e)}")

    # Step 6: Save the modified PDF
    try:
        doc.save(output_pdf_path)
        doc.close()
        logging.info(f"PDF modified and saved as {output_pdf_path}")
    except Exception as e:
        logging.error(f"Error saving PDF: {str(e)}")
        doc.close()
        shutil.rmtree(working_folder)
        return

    # Step 7: Clean up working folder
    try:
        shutil.rmtree(working_folder)
        logging.info(f"Deleted temporary folder: {working_folder}")
    except Exception as e:
        logging.error(f"Error deleting working folder: {str(e)}")

In [19]:
document_translation('Math_notation - Copy.pdf')

2025-05-08 15:10:43,755 [INFO] Applying OCR to ./Math_notation - Copy\Math_notation - Copy.pdf...


2025-05-08 15:10:45,775 [INFO] skipping all processing on this page


2025-05-08 15:10:45,798 [INFO] Postprocessing...


2025-05-08 15:10:45,909 [INFO] Image optimization ratio: 1.00 savings: 0.0%
2025-05-08 15:10:45,911 [INFO] Total file size ratio: 1.10 savings: 8.7%
2025-05-08 15:10:45,953 [INFO] OCR completed: Math_notation - Copy\Math_notation - Copy.ocr.pdf
2025-05-08 15:10:45,953 [INFO] OCR PDF saved to Math_notation - Copy\Math_notation - Copy.ocr.pdf
2025-05-08 15:10:45,954 [INFO] Extracting cells from Math_notation - Copy\Math_notation - Copy.ocr.pdf...
2025-05-08 15:10:45,974 [INFO] Extracted 82 cells
2025-05-08 15:10:45,979 [INFO] Saved 82 cells to ./Math_notation - Copy\Math_notation - Copy.csv
2025-05-08 15:10:45,983 [INFO] Opened ./Math_notation - Copy\Math_notation - Copy.ocr.pdf, size: 612.0 x 792.0
2025-05-08 15:10:46,022 [INFO] Read 1 rows from ./Math_notation - Copy\Math_notation - Copy.csv
2025-05-08 15:10:46,032 [INFO] Processing item with id: Math_notation - Copy, 82 entries
2025-05-08 15:10:46,139 [INFO] Visualized text at (72.0, 72.99362182617188) with 'dominated by the unitary H

# Math Equation Detection

## Detection

In [20]:
import os, glob, gc, shutil, yaml
from IPython.display import clear_output
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import torch
from ultralytics import YOLO
from pdf2image import convert_from_path
import fitz  # PyMuPDF
import os

In [21]:
IMAGE_SIZE = (2048, 1447)
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 0.75
FONT_THICKNESS = 2
BORDER_THICKNESS = 2

RANDOM_STATE = 42
INPUT_SIZE = 1024
N_EPOCHS = 15
PATIENCE = 5
BATCH_SIZE = 4
CACHE_DATA = True
DEVICES = 1

In [22]:
best_weights = "best.pt"
best_model = YOLO(best_weights)

In [23]:
def pdf_to_jpg_with_sizes(pdf_path, output_folder, dpi=300):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Convert PDF to images
    images = convert_from_path(pdf_path, dpi=dpi)

    # Get PDF page sizes using PyMuPDF
    pdf_doc = fitz.open(pdf_path)

    # Use first page for size.txt (assuming all pages same size)
    first_image = images[0]
    first_page = pdf_doc[0]

    # Get sizes
    jpg_size = first_image.size  # (width, height) in pixels
    pdf_size = (first_page.rect.width, first_page.rect.height)  # (width, height) in points

    # Write to size.txt
    size_txt_path = os.path.join(output_folder, 'size.txt')
    with open(size_txt_path, 'w') as f:
        f.write(f"{jpg_size}\n")
        f.write(f"{pdf_size}\n")
    print(f"Saved size.txt at {size_txt_path}")

    # Save JPGs and print sizes
    for i, (image, page) in enumerate(zip(images, pdf_doc)):
        jpg_path = os.path.join(output_folder, f'{output_folder}.jpg')
        image.save(jpg_path, 'JPEG')

        print(f'Page {i+1}: PDF size = {pdf_size[0]} x {pdf_size[1]} pt, JPG size = {jpg_size[0]} x {jpg_size[1]} px')
        print(f'Saved: {jpg_path}')

In [24]:
def scale_box_to_pdf(jpg_box, jpg_size, pdf_size):
    x1, y1, x2, y2 = jpg_box
    jpg_width, jpg_height = jpg_size
    pdf_width, pdf_height = pdf_size

    scale_x = pdf_width / jpg_width
    scale_y = pdf_height / jpg_height

    scaled_x1 = x1 * scale_x
    scaled_y1 = y1 * scale_y
    scaled_x2 = x2 * scale_x
    scaled_y2 = y2 * scale_y

    return [scaled_x1, scaled_y1, scaled_x2, scaled_y2]

In [25]:
def generate_pdf_coordinates(image_folder):
    """
    Generate pdf_coor.txt with PDF-scaled coordinates from index.txt and size.txt.

    Args:
        image_folder (str): Folder containing index.txt, size.txt, and images.
    """
    import ast

    # Load size.txt
    size_path = os.path.join(image_folder, 'size.txt')
    with open(size_path, 'r') as f:
        jpg_size = ast.literal_eval(f.readline().strip())
        pdf_size = ast.literal_eval(f.readline().strip())

    # Load index.txt
    index_path = os.path.join(image_folder, 'index.txt')
    with open(index_path, 'r') as f:
        lines = f.readlines()

    pdf_coordinates = []

    for line in lines:
        parts = line.strip().split()
        if len(parts) != 5:
            continue  # skip bad lines

        box_id, x1, y1, x2, y2 = parts
        x1, y1, x2, y2 = map(float, [x1, y1, x2, y2])

        # Convert coordinates to PDF space
        scaled_box = scale_box_to_pdf([x1, y1, x2, y2], jpg_size, pdf_size)

        # Format line for output
        pdf_line = f"{box_id} {scaled_box[0]:.4f} {scaled_box[1]:.4f} {scaled_box[2]:.4f} {scaled_box[3]:.4f}"
        pdf_coordinates.append(pdf_line)

    # Save to pdf_coor.txt
    pdf_coor_path = os.path.join(image_folder, 'pdf_coor.txt')
    with open(pdf_coor_path, 'w') as f:
        for line in pdf_coordinates:
            f.write(line + '\n')

    print(f"PDF coordinates saved at: {pdf_coor_path}")

In [26]:
def crop_and_normalize_all(root_folder):
    """
    Process all .jpg and .txt pairs in root folder, crop boxes, rename txt files,
    and save normalized data with float coordinates.

    Args:
        root_folder (str): Root folder containing .jpg and .txt files.
    """
    # Create images folder
    images_folder = os.path.join(root_folder, 'images')
    os.makedirs(images_folder, exist_ok=True)

    # Find all jpg files
    jpg_files = glob.glob(os.path.join(root_folder, '*.jpg'))

    for jpg_path in jpg_files:
        base_name = os.path.splitext(os.path.basename(jpg_path))[0]
        txt_path = os.path.join(root_folder, f'{base_name}.txt')

        if not os.path.exists(txt_path):
            print(f'Skipping {base_name}: no matching txt file.')
            continue

        # Load image
        image = cv2.imread(jpg_path)
        h_img, w_img = image.shape[:2]

        # Read txt file
        with open(txt_path, 'r') as f:
            lines = f.readlines()

        normalized_lines = []

        for i, line in enumerate(lines):
            parts = line.strip().split()
            if len(parts) != 5:
                continue  # skip bad lines

            box_id = i + 1
            x1, y1, x2, y2 = map(float, parts[1:])

            # Apply ±5 adjustment
            y1_adj = y1 - 5
            y2_adj = y2 + 5

            # Clamp coordinates within image boundaries
            x1_clamped = max(0.0, x1)
            y1_clamped = max(0.0, y1_adj)
            x2_clamped = min(float(w_img), x2)
            y2_clamped = min(float(h_img), y2_adj)

            # Crop image using int for pixel slicing
            crop = image[int(y1_clamped):int(y2_clamped), int(x1_clamped):int(x2_clamped)]

            # Save as images/id.jpg (only id, no prefix)
            crop_filename = f'{box_id}.jpg'
            crop_path = os.path.join(images_folder, crop_filename)
            cv2.imwrite(crop_path, crop)
            print(f'Saved: {crop_path}')

            # Save normalized line with float precision (4 decimal places)
            normalized_line = f"{box_id} {x1_clamped:.4f} {y1_clamped:.4f} {x2_clamped:.4f} {y2_clamped:.4f}"
            normalized_lines.append(normalized_line)

        # # Rename original txt → conf.txt (no prefix)
        # conf_txt_path = os.path.join(root_folder, 'conf.txt')
        # os.rename(txt_path, conf_txt_path)
        # print(f'Renamed {txt_path} → {conf_txt_path}')

        # Save normalized txt as index.txt (no prefix)
        index_txt_path = os.path.join(root_folder, 'index.txt')
        with open(index_txt_path, 'w') as f:
            for line in normalized_lines:
                f.write(line + '\n')

        generate_pdf_coordinates(root_folder)

        print(f'Index txt saved at: {index_txt_path}')

In [28]:
name_root = 'Math_notation - Copy'

In [29]:
pdf_to_jpg_with_sizes(name_root + '.pdf', name_root)

Saved size.txt at Math_notation - Copy\size.txt
Page 1: PDF size = 612.0 x 792.0 pt, JPG size = 2550 x 3300 px
Saved: Math_notation - Copy\Math_notation - Copy.jpg


In [30]:
PREDICTIONS_ROOT = './predictions'

In [31]:
with torch.no_grad():
    predictions = best_model.predict(
        source= './' + name_root,
        conf=0.65,
        iou=0.75,
        stream=True
    )

In [32]:
test_images = []

for prediction in predictions:
    if len(prediction.boxes.xyxy):
        name = prediction.path.split("/")[-1].split(".")[0]
        boxes = prediction.boxes.xyxy.cpu().numpy()
        scores = prediction.boxes.conf.cpu().numpy()
        
        test_images += [name]
        label_path = os.path.join(PREDICTIONS_ROOT, name + ".txt")
        
        with open(label_path, "w+") as f:
            for score, box in zip(scores, boxes):
                text = f"{score:0.4f} {' '.join(box.astype(str))}"
                f.write(text)
                f.write("\n")

clear_output()

In [33]:
crop_and_normalize_all(name_root)

Saved: Math_notation - Copy\images\1.jpg
Saved: Math_notation - Copy\images\2.jpg
Saved: Math_notation - Copy\images\3.jpg
Saved: Math_notation - Copy\images\4.jpg
Saved: Math_notation - Copy\images\5.jpg
Saved: Math_notation - Copy\images\6.jpg
Saved: Math_notation - Copy\images\7.jpg
Saved: Math_notation - Copy\images\8.jpg
Saved: Math_notation - Copy\images\9.jpg
Saved: Math_notation - Copy\images\10.jpg
Saved: Math_notation - Copy\images\11.jpg
PDF coordinates saved at: Math_notation - Copy\pdf_coor.txt
Index txt saved at: Math_notation - Copy\index.txt


## Visualization

In [34]:
import fitz  # PyMuPDF
from PIL import Image
import os
import ast  # To safely parse the tuple from size.txt
from PyPDF2 import PdfReader, PdfWriter, Transformation
import copy

In [35]:
def scale_box_to_pdf(jpg_box, jpg_size, pdf_size):
    x1, y1, x2, y2 = jpg_box
    jpg_width, jpg_height = jpg_size
    pdf_width, pdf_height = pdf_size

    scale_x = pdf_width / jpg_width
    scale_y = pdf_height / jpg_height

    scaled_x1 = x1 * scale_x
    scaled_y1 = y1 * scale_y
    scaled_x2 = x2 * scale_x
    scaled_y2 = y2 * scale_y

    return [scaled_x1, scaled_y1, scaled_x2, scaled_y2]

In [36]:
def insert_images(pdf_path, image_folder):
    """
    Insert cropped images back into the original PDF at scaled coordinates.

    Args:
        pdf_path (str): Path to the input PDF.
        image_folder (str): Folder containing index.txt, size.txt, and cropped images folder ('images').
    """
    # Load size.txt
    size_path = os.path.join(image_folder, 'size.txt')
    with open(size_path, 'r') as f:
        jpg_size = ast.literal_eval(f.readline().strip())
        pdf_size = ast.literal_eval(f.readline().strip())

    # Load index.txt
    index_path = os.path.join(image_folder, 'index.txt')
    with open(index_path, 'r') as f:
        lines = f.readlines()

    # Open PDF
    pdf_doc = fitz.open(pdf_path)
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_pdf = f"{pdf_name}_insert_images.pdf"

    # Use first page (or extend later if needed)
    page = pdf_doc[0]

    for line in lines:
        parts = line.strip().split()
        if len(parts) != 5:
            continue

        img_id, x1, y1, x2, y2 = parts
        img_id = int(img_id)
        x1, y1, x2, y2 = map(float, [x1, y1, x2, y2])

        img_path = os.path.join(image_folder, f"images/{img_id}.jpg")
        if not os.path.exists(img_path):
            print(f"Warning: Image {img_path} not found, skipping.")
            continue

        # Scale coordinates
        scaled_box = scale_box_to_pdf([x1, y1, x2, y2], jpg_size, pdf_size)
        rect = fitz.Rect(scaled_box)

        # Insert image
        page.insert_image(rect, filename=img_path)

    # Save PDF
    pdf_doc.save(output_pdf)
    pdf_doc.close()
    print(f"Saved output PDF as {output_pdf}")

In [37]:
def scale_pdf(input_path, output_path, target_width=1025, target_height=1025):
    """
    Scale a PDF to the target dimensions ensuring both page size and content are scaled.

    Args:
        input_path (str): Path to the input PDF file
        output_path (str): Path where the scaled PDF will be saved
        target_width (int): Target width in pixels (default: 1025)
        target_height (int): Target height in pixels (default: 1025)
    """
    # Read the original PDF
    reader = PdfReader(input_path)
    writer = PdfWriter()

    # Convert target dimensions from pixels to points (72 points = 1 inch)
    # Assuming 72 DPI resolution
    target_width_pts = target_width
    target_height_pts = target_height

    # Process each page
    for page_num in range(len(reader.pages)):
        # Get the original page
        original_page = reader.pages[page_num]

        # Get original page dimensions
        mediabox = original_page.mediabox
        orig_width = float(mediabox.width)
        orig_height = float(mediabox.height)

        # Calculate scaling factors
        width_scale = target_width_pts / orig_width
        height_scale = target_height_pts / orig_height

        # Create a copy of the page to work with
        page = copy.deepcopy(original_page)

        # Apply scaling transformation to the content
        transform = Transformation().scale(width_scale, height_scale)
        page.add_transformation(transform)

        # Update the mediabox to the new dimensions
        # PyPDF2 uses a coordinate system with (0,0) at the bottom left
        page.mediabox.lower_left = (0, 0)
        page.mediabox.upper_right = (target_width_pts, target_height_pts)

        # Also update cropbox and trimbox if they exist
        if "/CropBox" in page:
            page.cropbox.lower_left = (0, 0)
            page.cropbox.upper_right = (target_width_pts, target_height_pts)

        if "/TrimBox" in page:
            page.trimbox.lower_left = (0, 0)
            page.trimbox.upper_right = (target_width_pts, target_height_pts)

        if "/ArtBox" in page:
            page.artbox.lower_left = (0, 0)
            page.artbox.upper_right = (target_width_pts, target_height_pts)

        if "/BleedBox" in page:
            page.bleedbox.lower_left = (0, 0)
            page.bleedbox.upper_right = (target_width_pts, target_height_pts)

        # Add the scaled page to the output PDF
        writer.add_page(page)

    # Write the result to the output file
    with open(output_path, "wb") as output_file:
        writer.write(output_file)

    print(f"PDF scaled successfully to {target_width}x{target_height}.")
    print(f"Both page dimensions and content have been scaled. Saved to {output_path}")

In [38]:
def scale_pdf_from_folder(pdf_name, folder_name):
    """
    Read target dimensions from size.txt, check current PDF size, and rescale if needed.
    Save scaled PDF in current directory.
    """
    size_file_path = os.path.join(folder_name, 'size.txt')
    pdf_input_path = pdf_name
    pdf_output_path = f"{os.path.splitext(pdf_name)[0]}_scale.pdf"

    # Read target dimensions
    with open(size_file_path, 'r') as f:
        lines = f.readlines()
        if len(lines) < 2:
            raise ValueError("size.txt does not contain at least two lines.")
        size_line = lines[1].strip()
        try:
            target_width, target_height = eval(size_line)
            if not (isinstance(target_width, (int, float)) and isinstance(target_height, (int, float))):
                raise ValueError("Size values must be numbers.")
        except Exception as e:
            raise ValueError(f"Invalid size tuple in size.txt: {size_line}") from e

    # Check current PDF size (first page)
    reader = PdfReader(pdf_input_path)
    first_page = reader.pages[0]
    orig_width = float(first_page.mediabox.width)
    orig_height = float(first_page.mediabox.height)

    # Compare with target size (allow tiny tolerance)
    tolerance = 0.01
    if abs(orig_width - target_width) < tolerance and abs(orig_height - target_height) < tolerance:
        print(f"No scaling needed. PDF already matches target size {target_width}x{target_height}.")
    else:
        scale_pdf(pdf_input_path, pdf_output_path, target_width, target_height)

In [43]:
def insert_images(pdf_path, image_folder):
    """
    Insert cropped images back into the PDF, rescaling the PDF first if needed.

    Args:
        pdf_path (str): Path to the input PDF.
        image_folder (str): Folder containing pdf_coor.txt, size.txt, and cropped images folder ('images').
    """
    import ast
    import os
    from PyPDF2 import PdfReader
    import fitz

    # Load size.txt
    size_path = os.path.join(image_folder, 'size.txt')
    with open(size_path, 'r') as f:
        jpg_size = ast.literal_eval(f.readline().strip())
        pdf_size = ast.literal_eval(f.readline().strip())
    target_width, target_height = pdf_size

    # Check current PDF size
    reader = PdfReader(pdf_path)
    first_page = reader.pages[0]
    orig_width = float(first_page.mediabox.width)
    orig_height = float(first_page.mediabox.height)

    tolerance = 0.01
    if abs(orig_width - target_width) < tolerance and abs(orig_height - target_height) < tolerance:
        scaled_pdf_path = pdf_path
        print("PDF size matches target. Proceeding to insert images.")
    else:
        print("PDF size does not match target. Rescaling PDF first...")
        folder_name = image_folder
        pdf_name = os.path.basename(pdf_path)
        scale_pdf_from_folder(pdf_name, folder_name)
        scaled_pdf_path = f"{os.path.splitext(pdf_name)[0]}_scale.pdf"

    # Load pdf_coor.txt
    pdf_coor_path = os.path.join(image_folder, 'pdf_coor.txt')
    if not os.path.exists(pdf_coor_path):
        raise FileNotFoundError(f"pdf_coor.txt not found in folder: {image_folder}")

    with open(pdf_coor_path, 'r') as f:
        lines = f.readlines()

    # Open scaled PDF
    pdf_doc = fitz.open(scaled_pdf_path)
    pdf_name = os.path.splitext(os.path.basename(scaled_pdf_path))[0]
    output_pdf = f"{pdf_name}_insert_images.pdf"

    # Use first page (or extend later if needed)
    page = pdf_doc[0]

    for line in lines:
        parts = line.strip().split()
        if len(parts) != 5:
            continue

        img_id, x1, y1, x2, y2 = parts
        img_id = int(img_id)
        x1, y1, x2, y2 = map(float, [x1, y1, x2, y2])

        img_path = os.path.join(image_folder, f"images/{img_id}.jpg")
        if not os.path.exists(img_path):
            print(f"Warning: Image {img_path} not found, skipping.")
            continue

        rect = fitz.Rect(x1 , y1 , x2 , y2 )

        print(rect)

        # Insert image
        page.insert_image(rect, filename=img_path)
        #page.draw_rect(rect, color=(1, 0, 0), fill=None, width=1)

    # Save PDF
    pdf_doc.save(output_pdf)
    pdf_doc.close()
    print(f"Saved output PDF as {output_pdf}")


In [44]:
insert_images('translation_Math_notation - Copy.pdf', 'Math_notation - Copy')

PDF size matches target. Proceeding to insert images.
Rect(408.9157, 426.7901, 444.9264, 442.3542)
Rect(436.7488, 468.8775, 472.039, 484.2936)
Rect(231.5094, 238.413, 267.4838, 254.2285)
Rect(273.5326, 175.9192, 308.594, 191.234)
Rect(217.7661, 323.063, 261.6181, 337.4784)
Rect(443.7224, 239.3031, 490.2527, 253.9552)
Rect(263.7803, 344.3717, 304.2116, 358.4625)
Rect(72.212, 365.2914, 112.7351, 379.6377)
Rect(134.7101, 136.0891, 168.9302, 148.2758)
Rect(169.0868, 641.018, 191.3845, 650.917)
Rect(492.7863, 449.6114, 530.1297, 462.7044)
Saved output PDF as translation_Math_notation - Copy_insert_images.pdf
