In [281]:
import os
from whenever import LocalDateTime
from pdf2image import convert_from_path
from surya.ocr import run_ocr
from surya.model.detection.model import (
    load_model as load_det_model,
    load_processor as load_det_processor,
)
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor

# set up Surya: OCR language list plus the detection / recognition
# models and processors that are later handed to run_ocr
langs = ["it"]
det_processor = load_det_processor()
det_model = load_det_model()
rec_model = load_rec_model()
rec_processor = load_rec_processor()


def read_files():
    """Return the names of all entries in the ``order_docs`` directory."""
    return os.listdir("order_docs")


def format_filename(filename):
    """Parse order number and date from a filename, renaming it if non-compliant.

    A compliant name has exactly two ``_``-separated parts, the second being
    14 characters long (``YYYY-MM-DD.pdf``). Any other name is parsed with the
    space/dash based legacy layout and the file inside ``order_docs`` is
    renamed to ``<ordernum>_<orderdate>.pdf``.

    Args:
        filename: Name of a file inside the ``order_docs`` directory.

    Returns:
        A ``(ordernum, orderdate)`` tuple: the order number as ``int`` and the
        date object produced by ``LocalDateTime.strptime(...).date()``.

    Raises:
        ValueError: If the number or date portion cannot be parsed.
        IndexError: If a non-compliant name contains no space-separated
            second part.
        OSError: If the rename fails.
    """
    parts = filename.split("_")
    # Inspect the filename that was passed in, not the module-level
    # ``filenames`` list, so every file is judged on its own name.
    if len(parts) == 2 and len(parts[1]) == 14:
        # date.pdf is 14 characters and there are two parts to the filename - all good
        ordernum = int(parts[0])
        orderdate = LocalDateTime.strptime(
            parts[1].split(".")[0], "%Y-%m-%d"
        ).date()
    else:
        # Legacy layout: the second space-separated token ends in "-<ordernum>",
        # and the last token is "<dd-mm-YYYY>.<ext>".
        ordernum = int(filename.split(" ")[1].split("-")[-1])
        orderdate = LocalDateTime.strptime(
            filename.split(" ")[-1].split(".")[0], "%d-%m-%Y"
        ).date()
        new_filename = f"{ordernum}_{orderdate}.pdf"
        os.rename(
            os.path.join("order_docs", filename),
            os.path.join("order_docs", new_filename),
        )
    return ordernum, orderdate


def convert_pdf_to_images(filename):
    """Render the pdf ``order_docs/<filename>`` via ``convert_from_path``.

    Returns whatever ``convert_from_path`` yields for that path (intended to
    be one PIL image per pdf page).
    """
    pdf_path = f"order_docs/{filename}"
    return convert_from_path(pdf_path)


def ocr_all_pages(images):
    """OCR every page image of an order and return all text lines in one list.

    Each image is passed to ``run_ocr`` on its own (as a one-element batch,
    using the module-level ``langs`` and Surya models); the ``text_lines`` of
    the single result are appended in page order.
    """
    collected_lines = []
    for page_image in images:
        page_result = run_ocr(
            [page_image], [langs], det_model, det_processor, rec_model, rec_processor
        )[0]
        collected_lines.extend(page_result.text_lines)
    return collected_lines


def extract_order_rif(order_ocr):
    """Extract the order reference ("riferimento") number from OCR lines.

    Scans the lines in order. For each line whose text contains "rif"
    (case-insensitive), the text is split on "." and the first piece that
    parses as an integer (after stripping whitespace) is returned. A matching
    line with no integer piece is skipped and the scan continues.

    Args:
        order_ocr: Iterable of objects exposing a ``text`` string attribute.

    Returns:
        The reference number as ``int``, or ``None`` when no line yields one.
    """
    for line in order_ocr:
        if "rif" not in line.text.lower():
            continue
        for piece in line.text.split("."):
            try:
                return int(piece.strip())
            except ValueError:
                # Not a number (e.g. the "Rif" label itself); try the next piece.
                continue
    return None


def extract_ordered_items(order_ocr):
    """Find candidate order codes in OCR lines together with their bboxes.

    Returns a ``(codes, positions)`` tuple of parallel lists: the 9-character
    code strings and the ``bbox`` of the line each one was found on.
    """
    # A candidate containing any of these fragments is not treated as a code.
    excluded_fragments = ["/", ",", "-", "MM"]

    def collect_codes_and_boxes(ocr_lines):
        """Build each line's candidate code and keep the ones that qualify."""
        found_codes = []
        found_boxes = []
        for ocr_line in ocr_lines:
            # Treat spaces like dots so both act as token separators.
            tokens = ocr_line.text.replace(" ", ".").split(".")
            digit_tokens = [
                token for token in tokens if any(c.isdigit() for c in token)
            ]
            # The candidate is the first three digit-bearing tokens glued together.
            candidate = "".join(digit_tokens[:3]).strip()
            if len(candidate) != 9:
                continue
            if any(fragment in candidate for fragment in excluded_fragments):
                continue
            found_codes.append(candidate)
            found_boxes.append(ocr_line.bbox)
        return found_codes, found_boxes

    def lookup_quantity(ocr_lines, code_box):
        """Placeholder: meant to find the closest digit-bearing bbox to the right of the code bbox."""
        return None

    codes, positions = collect_codes_and_boxes(order_ocr)
    for code_box in positions:
        # Quantity lookup is not implemented yet; the result is currently unused.
        lookup_quantity(order_ocr, code_box)
    return codes, positions


# file preprocessing: normalise every filename in order_docs and render
# each pdf into page images
filenames = read_files()
orders = []
for filename in filenames:
    ordernum, orderdate = format_filename(filename)
    # format_filename may have renamed a non-compliant file on disk to
    # "<ordernum>_<orderdate>.pdf"; open whichever name exists now instead of
    # the stale one from the directory listing.
    current_name = filename
    if not os.path.exists(os.path.join("order_docs", current_name)):
        current_name = f"{ordernum}_{orderdate}.pdf"
    orders.append(
        {
            "order_number": ordernum,
            "order_date": orderdate,
            "images": convert_pdf_to_images(current_name),
        }
    )

# orders dict is now ready for OCR: attach the reference number, the order
# codes and the bboxes of the lines the codes were found on
for order in orders:
    order_ocr = ocr_all_pages(order["images"])
    order["order_rif"] = extract_order_rif(order_ocr)
    order["ordered_items"], order["text_positions"] = extract_ordered_items(order_ocr)

Loaded detection model vikp/surya_det3 on device cuda with dtype torch.float16
Loaded recognition model vikp/surya_rec2 on device cuda with dtype torch.float16


Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  5.39it/s]
Recognizing Text: 100%|██████████| 1/1 [00:00<00:00,  1.16it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  5.20it/s]
Recognizing Text: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  5.35it/s]
Recognizing Text: 100%|██████████| 1/1 [00:00<00:00,  1.06it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  5.21it/s]
Recognizing Text: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  5.27it/s]
Recognizing Text: 100%|██████████| 1/1 [00:00<00:00,  1.07it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  5.23it/s]
Recognizing Text: 100%|██████████| 1/1 [00:00<00:00,  1.00it/s]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  5.29it/s]
Recognizing Text: 100%|██████████| 1/1 [00:01<00:00,  1.06s/it]
Detecting bboxes: 100%|██████████| 1/1 [00:00<00:00,  4.65it/s]
Recognizing Text: 100%|██████████| 1/1 [