In [60]:
import os
from whenever import LocalDateTime
from pdf2image import convert_from_path
from surya.ocr import run_ocr
from surya.model.detection.model import (
    load_model as load_det_model,
    load_processor as load_det_processor,
)
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor

# Surya configuration: OCR language hints plus the detection and recognition
# models/processors that ocr_all_pages() reads as module-level globals.
langs = ["it"]

# Loaded one per line, in the same order as before: detection processor,
# detection model, recognition model, recognition processor.
det_processor = load_det_processor()
det_model = load_det_model()
rec_model = load_rec_model()
rec_processor = load_rec_processor()


def read_files():
    """List the PDF files found in the ``order_docs`` directory.

    ``os.listdir`` returns entries in arbitrary order and includes every kind
    of entry, so the listing is filtered and sorted here to keep downstream
    processing reproducible and to avoid handing non-PDF entries to the
    filename parser.

    Returns:
        Sorted list of file names (not full paths) of the regular files in
        ``order_docs`` whose name ends in ``.pdf`` (case-insensitive).
    """
    folder = "order_docs"
    return sorted(
        name
        for name in os.listdir(folder)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(folder, name))
    )


def format_filename(filename):
    """Parse order number and date from a file name, renaming the file if needed.

    A compliant name has the form ``<ordernum>_<YYYY-MM-DD>.pdf``. Any other
    name is parsed as space-separated tokens in which the second token ends in
    ``-<ordernum>`` and the last token is ``<DD-MM-YYYY>.pdf``; that file is
    then renamed inside ``order_docs`` to the compliant form.

    Args:
        filename: Name (not path) of a file inside ``order_docs``.

    Returns:
        Tuple ``(ordernum, orderdate)``: the order number as ``int`` and the
        date object produced by ``LocalDateTime.strptime(...).date()``.

    Raises:
        ValueError: if the order-number part is not an integer.
        IndexError: if a non-compliant name contains no space-separated
            second token.
    """
    parts = filename.split("_")
    # Inspect the file being processed. The previous check looked at the
    # global ``filenames[0]``, so every file was judged by the first file's name.
    # "YYYY-MM-DD.pdf" is exactly 14 characters long.
    if len(parts) == 2 and len(parts[1]) == 14:
        ordernum = int(parts[0])
        orderdate = LocalDateTime.strptime(
            parts[1].split(".")[0], "%Y-%m-%d"
        ).date()
    else:
        tokens = filename.split(" ")
        ordernum = int(tokens[1].split("-")[-1])
        orderdate = LocalDateTime.strptime(
            tokens[-1].split(".")[0], "%d-%m-%Y"
        ).date()
        new_filename = f"{ordernum}_{orderdate}.pdf"
        os.rename(
            os.path.join("order_docs", filename),
            os.path.join("order_docs", new_filename),
        )
    return ordernum, orderdate


def convert_pdf_to_images(filename):
    """Render the given PDF from ``order_docs`` via ``convert_from_path``.

    Args:
        filename: Name (not path) of the PDF inside ``order_docs``.

    Returns:
        Whatever ``convert_from_path`` returns for that file (the page images).
    """
    pdf_path = f"order_docs/{filename}"
    page_images = convert_from_path(pdf_path)
    return page_images


def ocr_all_pages(images):
    """OCR every page image of one order and flatten the recognised lines.

    Each image is passed to ``run_ocr`` on its own. Every text line of the
    result is dumped to a dict and tagged with a ``"page"`` key holding the
    zero-based position of its image in ``images``.

    Args:
        images: Iterable of page images belonging to one order.

    Returns:
        A single flat list of line dicts covering all pages, in page order.
    """
    tagged_lines = []
    for page_index, page_image in enumerate(images):
        prediction = run_ocr(
            [page_image], [langs], det_model, det_processor, rec_model, rec_processor
        )[0]
        for text_line in prediction.text_lines:
            record = text_line.model_dump()
            record["page"] = page_index
            tagged_lines.append(record)
    return tagged_lines


def extract_order_rif(order_ocr):
    """Extract the order reference ("riferimento") number from OCR lines.

    Lines are scanned in the order given. For each line whose text contains
    "rif" (case-insensitive), the text is split on "." and the first piece
    that parses as an integer is returned. Lines mentioning "rif" without a
    numeric piece are skipped and the scan continues.

    Args:
        order_ocr: Iterable of dicts, each with a ``"text"`` string.

    Returns:
        The reference number as ``int``, or ``None`` if no line yields one.
    """
    for line in order_ocr:
        text = line["text"]
        if "rif" not in text.lower():
            continue
        for piece in text.split("."):
            try:
                return int(piece.strip())
            except ValueError:
                # Not a number; only this expected failure is swallowed so
                # that KeyboardInterrupt and real bugs still propagate.
                continue
    return None


def _item_code_from_text(text):
    """Join the first three digit-bearing tokens of ``text`` (split on spaces and dots)."""
    tokens = text.replace(" ", ".").split(".")
    digit_tokens = [
        token for token in tokens if any(char.isdigit() for char in token)
    ]
    return "".join(digit_tokens[:3]).strip()


def _ordered_qty_from_row(row_texts):
    """Return the integer part of the first quantity-looking text in a row."""
    # NOTE(review): the "contains a comma and is exactly 5 characters" rule is
    # kept from the original heuristic; a value such as "12,000" would be
    # skipped by it - confirm against real order layouts.
    candidates = [
        text.strip()
        for text in row_texts
        if "," in text and len(text.strip()) == 5
    ]
    if not candidates:
        raise IndexError(
            "no quantity candidate (5 characters containing ',') found on item row"
        )
    # Parse everything before the decimal comma. Taking only the first
    # character would turn "12,00" into 1.
    return int(candidates[0].split(",")[0])


def extract_ordered_items(order_ocr):
    """Extract item codes and their ordered quantities from OCR lines.

    A line is treated as an order item when the first three digit-bearing
    tokens of its text (split on spaces and dots) join into a 9-character
    code that contains none of "/", ",", "-" or "MM". The quantity for each
    item is taken from the first text on the same page whose bounding box
    vertically spans the item's centre line, contains a comma and is
    5 characters long after stripping.

    Args:
        order_ocr: List of dicts with ``"text"``, ``"bbox"`` and ``"page"``
            keys. ``bbox[1]`` and ``bbox[-1]`` are used as the top and bottom
            y coordinates.

    Returns:
        List of dicts with keys ``"item_code"``, ``"coordinates"``, ``"page"``
        and ``"ordered_qty"`` (``int``).

    Raises:
        IndexError: if an item row has no quantity candidate.
        ValueError: if the quantity's integer part is not numeric.
    """
    excluded_fragments = ("/", ",", "-", "MM")

    order_details = []
    for line in order_ocr:
        item_code = _item_code_from_text(line["text"])
        if len(item_code) == 9 and not any(
            fragment in item_code for fragment in excluded_fragments
        ):
            order_details.append(
                {
                    "item_code": item_code,
                    "coordinates": line["bbox"],
                    "page": line["page"],
                }
            )

    for order in order_details:
        y_top = order["coordinates"][1]
        y_bottom = order["coordinates"][-1]
        # Horizontal centre line of the item row; every text box crossing it
        # on the same page is considered part of that row.
        center_y_line = (y_top + y_bottom) / 2
        row_texts = [
            line["text"]
            for line in order_ocr
            if line["bbox"][1] < center_y_line < line["bbox"][-1]
            and line["page"] == order["page"]
        ]
        order["ordered_qty"] = _ordered_qty_from_row(row_texts)

    return order_details


# file preprocessing
filenames = read_files()
orders = []
for filename in filenames:
    ordernum, orderdate = format_filename(filename)
    # format_filename() renames non-compliant files on disk, so the original
    # name may no longer exist. In that case open the renamed file, whose name
    # is rebuilt exactly as format_filename() builds it.
    pdf_name = filename
    if not os.path.exists(os.path.join("order_docs", pdf_name)):
        pdf_name = f"{ordernum}_{orderdate}.pdf"
    orders.append(
        {
            "order_number": ordernum,
            "order_date": orderdate,
            "images": convert_pdf_to_images(pdf_name),
        }
    )

# every entry in orders now carries its page images and is ready for OCR
for order in orders:
    order_ocr = ocr_all_pages(order["images"])
    order["order_rif"] = extract_order_rif(order_ocr)
    order["details"] = extract_ordered_items(order_ocr)

Loaded detection model vikp/surya_det3 on device cuda with dtype torch.float16
Loaded recognition model vikp/surya_rec2 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  5.15it/s]
Recognizing Text: 100%|██████████| 1/1 [00:00<00:00,  1.16it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  4.95it/s]
Recognizing Text: 100%|██████████| 1/1 [00:01<00:00,  1.12s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  5.11it/s]
Recognizing Text: 100%|██████████| 1/1 [00:00<00:00,  1.06it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  5.04it/s]
Recognizing Text: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  5.36it/s]
Recognizing Text: 100%|██████████| 1/1 [00:00<00:00,  1.09it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  5.41it/s]
Recognizing Text: 100%|██████████| 1/1 [00:00<00:00,  1.07it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  4.88it/s]
Recognizing Text: 100%|██████████| 1/1 [00:01<00:00,  1.04s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  5.05it/s]
Recognizing Text: 100%|██████████| 1/1 [

In [61]:
# Display the first processed order as a spot check of the extracted fields.
orders[0]

{'order_number': 243339,
 'order_date': Date(2024-07-18),
 'images': [<PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1653x2337>,
  <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1653x2337>,
  <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1653x2337>,
  <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1653x2337>,
  <PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1653x2337>],
 'order_rif': 305,
 'details': [{'item_code': '173151163',
   'coordinates': [333.0, 756.0, 603.0, 772.0],
   'page': 1,
   'ordered_qty': 2},
  {'item_code': '173691500',
   'coordinates': [333.0, 819.0, 585.0, 836.0],
   'page': 1,
   'ordered_qty': 2},
  {'item_code': '173719500',
   'coordinates': [333.0, 882.0, 577.0, 899.0],
   'page': 1,
   'ordered_qty': 2},
  {'item_code': '103822588',
   'coordinates': [334.0, 977.0, 625.0, 993.0],
   'page': 1,
   'ordered_qty': 2},
  {'item_code': '103883500',
   'coordinates': [334.0, 1041.0, 575.0, 1057.0],
   'page': 1,
   'ordered_qty': 2},