In [None]:
%%capture
# Environment setup: install Unsloth and the packages it is used with.
# The %%capture magic on the first line keeps the installers' output out of the notebook.
import os
# Colab is detected by searching for "COLAB_" in the concatenation of all
# environment-variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # --no-deps installs only the named packages without resolving their dependencies
    # (presumably so pip does not replace Colab's preinstalled stack — TODO confirm).
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    # These packages are installed normally, i.e. together with their dependencies.
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth
    # Final pass with dependency resolution enabled for the core libraries.
    # NOTE(review): none of these are version-pinned — confirm this cannot upgrade
    # torch away from the build the pinned xformers above expects.
    !pip install torch transformers peft accelerate bitsandbytes


In [None]:
!unzip /content/Gltn-20250724T181254Z-1-001.zip

In [None]:
# BATCH OCR-to-PDF SCRIPT (with Font Fixes & Batching)
# Transcribes every image in a folder with a vision-language model, processing the
# images in batches, and writes one PDF page per successfully transcribed image.

# Step 1: Install necessary libraries (PDF and Fonts)
# fpdf2 provides the `fpdf` module imported in Step 2; -q reduces pip's output.
!pip install fpdf2 -q
# The DejaVu font is essential for handling a wide range of characters in the PDF.
# Step 6 loads its .ttf files from /usr/share/fonts/truetype/dejavu/.
!apt-get install -y fonts-dejavu-core -q

# Step 2: Import all necessary libraries
import torch
from unsloth import FastLanguageModel
from transformers import AutoProcessor
from PIL import Image
from fpdf import FPDF
import os
import gc
import time

# Step 3: Configuration — model checkpoint, input/output locations, and tuning knobs

# Hugging Face checkpoint used for both the model and its processor
base_model_name = "allenai/olmOCR-7B-0725"

# Folder the page images are read from, and the path the resulting PDF is written to
image_folder, output_pdf_path = "/content/Gltn/", "/content/ocr_output.pdf"

# RESIZE_MAX_SIZE: longest image side (pixels) allowed before an image is downscaled
# BATCH_SIZE: number of images sent through the model per generate() call
RESIZE_MAX_SIZE, BATCH_SIZE = 1344, 5

# --- Model Loading ---
# Load the 4-bit model through Unsloth, then the Hugging Face processor for the
# same checkpoint (the processor prepares the text + image inputs in Step 5).
print("--- Loading Model and Processor ---")

model_load_options = dict(
    model_name=base_model_name,
    max_seq_length=4096,
    dtype=None,  # presumably lets Unsloth choose the dtype — TODO confirm
    load_in_4bit=True,
    device_map="auto",
)
# from_pretrained returns a pair; only its first element (the model) is used by this script.
model, _ = FastLanguageModel.from_pretrained(**model_load_options)

processor = AutoProcessor.from_pretrained(
    base_model_name,
    trust_remote_code=True,
)

print("--- Model Ready ---")

# Step 4: Collect the image files to OCR, sorted by filename
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
try:
    folder_entries = os.listdir(image_folder)
except FileNotFoundError:
    # Missing folder: report it and leave an empty work list so later steps are skipped.
    print(f"ERROR: The folder '{image_folder}' was not found.")
    image_files = []
else:
    # Keep only entries whose (case-insensitive) name ends with a known image extension.
    image_files = [name for name in folder_entries if name.lower().endswith(image_extensions)]
    image_files.sort()
    if image_files:
        print(f"\nFound {len(image_files)} images to process in batches of {BATCH_SIZE}.")
    else:
        print(f"\nWARNING: No image files found in '{image_folder}'.")

# Step 5: Loop through images IN BATCHES with resizing and memory cleanup
MAX_NEW_TOKENS = 1024  # upper bound on the number of tokens generated per image


def _load_image_for_ocr(image_path, max_side):
    """Open an image, convert it to RGB, and downscale it if it is too large.

    Args:
        image_path: Path of the image file to open.
        max_side: Largest allowed width/height in pixels. An image whose longest
            side exceeds this is shrunk, keeping its aspect ratio, so that the
            longest side equals ``max_side``.

    Returns:
        A new RGB ``PIL.Image.Image`` that no longer depends on the open file.

    Raises:
        Any exception raised by Pillow while opening, converting or resizing
        the file; the caller decides how to report it.
    """
    with Image.open(image_path) as img:
        # Convert before resizing: Pillow resizes palette ("P") and bilevel ("1")
        # images with NEAREST regardless of the requested filter, so LANCZOS only
        # takes effect once the data is RGB.
        rgb_image = img.convert("RGB")

    width, height = rgb_image.size
    if max(width, height) <= max_side:
        return rgb_image

    if width > height:
        new_width = max_side
        new_height = int(height * (new_width / width))
    else:
        new_height = max_side
        new_width = int(width * (new_height / height))
    # Clamp to 1 px so an extreme aspect ratio cannot yield a zero-sized image.
    new_size = (max(1, new_width), max(1, new_height))
    return rgb_image.resize(new_size, Image.Resampling.LANCZOS)


ocr_results = []
if image_files:
    # Decoder-only models need LEFT padding for batched generation; with right padding
    # the pad tokens sit between a shorter prompt and the text generated for it.
    processor.tokenizer.padding_side = "left"

    # Prepare the prompt once, as it's the same for all images
    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Please transcribe the text in this image."}]}]
    prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Process the images in chunks of BATCH_SIZE
    for i in range(0, len(image_files), BATCH_SIZE):
        batch_filenames = image_files[i:i + BATCH_SIZE]
        # Filenames and images are collected side by side so that each transcription is
        # attributed to the right file even when some files in the batch fail to load.
        loaded_filenames = []
        batch_images = []

        print(f"\n--- Processing Batch {i//BATCH_SIZE + 1} ({len(batch_filenames)} images) ---")

        # Prepare all images for the current batch
        for filename in batch_filenames:
            image_path = os.path.join(image_folder, filename)
            try:
                batch_images.append(_load_image_for_ocr(image_path, RESIZE_MAX_SIZE))
                loaded_filenames.append(filename)
            except Exception as e:
                # Best effort: report the bad file and carry on with the rest of the batch.
                print(f"  > FAILED to load or resize {filename}. Error: {e}")

        if not batch_images:
            print("  > Skipping batch due to image loading errors.")
            continue

        # Pre-bind every name released in `finally`, so the cleanup cannot raise
        # NameError (and hide the real error) when an earlier step fails.
        inputs = output_ids = generated_ids = transcriptions = None

        # Process the entire batch in one go
        try:
            inputs = processor(
                text=[prompt] * len(batch_images),
                images=batch_images,
                return_tensors="pt",
                padding=True
            ).to(model.device)

            # Use inference_mode for better performance
            with torch.inference_mode():
                output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

            # generate() returns the (padded) prompt followed by the new tokens. Decoding
            # only the new tokens removes the need to search the text for an "assistant"
            # marker, which mis-cuts when the marker is absent or appears in the page text.
            prompt_length = inputs["input_ids"].shape[1]
            generated_ids = output_ids[:, prompt_length:]
            transcriptions = processor.batch_decode(generated_ids, skip_special_tokens=True)

            # Store the result for each successfully loaded image in the batch
            for filename, transcription in zip(loaded_filenames, transcriptions):
                ocr_results.append({'filename': filename, 'text': transcription.strip()})
            print("  > Success for batch.")

        except Exception as e:
            print(f"  > FAILED to process batch. Error: {e}")
        finally:
            # Clean up memory after each batch
            del inputs, output_ids, generated_ids, transcriptions, batch_images
            gc.collect()
            torch.cuda.empty_cache()

# Step 6: Create the PDF — one page per transcribed image
if not ocr_results:
    print("\nNo text was extracted. PDF will not be created.")
else:
    print(f"\n--- Creating PDF from {len(ocr_results)} successful transcriptions ---")
    pdf = FPDF()

    # Add both regular and bold font styles from the installed DejaVu font
    pdf.add_font('DejaVu', '', '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf')
    pdf.add_font('DejaVu', 'B', '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf')

    for result in ocr_results:
        pdf.add_page()

        # Page header: the source filename, bold and centred.
        pdf.set_font('DejaVu', 'B', 16)
        # new_x/new_y replace the deprecated `ln=1` argument: after the cell, move to
        # the left margin of the next line.
        pdf.cell(0, 10, f"Source: {result['filename']}", align='C', new_x="LMARGIN", new_y="NEXT")
        pdf.ln(5)

        # Page body: the transcription, wrapped across the full page width.
        pdf.set_font('DejaVu', '', 12)
        pdf.multi_cell(0, 5, result['text'])

    pdf.output(output_pdf_path)
    print(f"\n✅ Success! PDF created at: {output_pdf_path}")
    print("You can download it from the 'Files' panel on the left.")