# The Main OCR Pipeline

### Imports & Config

In [1]:
from ocr_pipeline import PDFtoExcelOCR
import matplotlib.pyplot as plt
import cv2
from PIL import Image
import os

# Runtime configuration. Every value can be overridden through an environment
# variable so the notebook also runs on machines where the input file or the
# external tools live elsewhere; the fallbacks are the original hard-coded
# values, so behaviour is unchanged when no variable is set.
PDF_PATH = os.environ.get("OCR_PDF_PATH", r"./document.pdf")
OUTPUT_DIR = os.environ.get("OCR_OUTPUT_DIR", r"./output")
TESSERACT_CMD = os.environ.get(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)
# NOTE: the constant is spelled "POPLER" (single P). The name is kept as-is
# because the cell that constructs PDFtoExcelOCR refers to it by this name.
POPLER_BIN_PATH = os.environ.get("POPPLER_BIN_PATH", r"C:\poppler\bin")

  from .autonotebook import tqdm as notebook_tqdm


### Convert PDF to Images & Visualize

In [2]:
# Gather the constructor arguments from the config constants in one mapping so
# the instantiation itself stays a single short line.
ocr_kwargs = {
    "pdf_path": PDF_PATH,
    "output_dir": OUTPUT_DIR,
    "tesseract_cmd": TESSERACT_CMD,
    "poppler_path": POPLER_BIN_PATH,
}
ocr = PDFtoExcelOCR(**ocr_kwargs)

# Convert the PDF; the returned collection is kept for later cells and its
# length is reported as the number of converted pages.
image_paths = ocr.convert_pdf_to_images()
print(f"✅ Converted {len(image_paths)} pages to images.")


[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_doc_ori), the model files will be automatically downloaded and saved in C:\Users\DZKAM\.paddlex\official_models.[0m
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 2996.64it/s]
[32mCreating model: ('UVDoc', None)[0m
[33mThe model(UVDoc) is not supported to run in MKLDNN mode! Using `paddle` instead![0m
[32mUsing official model (UVDoc), the model files will be automatically downloaded and saved in C:\Users\DZKAM\.paddlex\official_models.[0m
Fetching 6 files: 100%|██████████| 6/6 [00:00<?, ?it/s]
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_textline_ori), the model files will be automatically downloaded and saved in C:\Users\DZKAM\.paddlex\official_models.[0m
Fetching 6 files: 100%|██████████| 6/6 [00:00<?, ?it/s]
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mUsing official model (PP-OCRv5_server_det), the m

✅ Converted 2 pages to images.


### Preview First Page

In [3]:
# Optional preview, currently disabled: uncomment the lines below to open the
# first entry of image_paths with PIL and display it with matplotlib.
# first_image = Image.open(image_paths[0])
# plt.figure(figsize=(10, 12))
# plt.imshow(first_image)
# plt.axis("off")
# plt.title("First Page of PDF")
# plt.show()


### Preprocessing + OCR + Live Preview

In [4]:
# Run the full pipeline on the converted pages and report how many content
# blocks it returned.
content_blocks = ocr.run_pipeline()
block_total = len(content_blocks)
print(f"✅ Extracted {block_total} content blocks.")



📄 Processing page 1/2
📐 Detected 2 table(s)

📄 Processing page 2/2
📐 Detected 2 table(s)
✅ Extracted 28 content blocks.


In [5]:
# Optional debugging view, currently disabled: uncomment the lines below to
# preprocess the first image, run ocr.detect_tables() on it, and draw a red
# rectangle (BGR (0, 0, 255)) around every returned (x1, y1, x2, y2) box.
# img = cv2.imread(image_paths[0])
# pre = ocr.preprocess_image(image_paths[0])
# tables = ocr.detect_tables(pre)

# for (x1, y1, x2, y2) in tables:
#     cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2)

# plt.figure(figsize=(10, 10))
# plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
# plt.title("Detected Tables")
# plt.axis("off")
# plt.show()


### Export to text file

In [6]:
# Build the destination path under OUTPUT_DIR, hand the extracted blocks to the
# text exporter, then report where the file was requested to be written.
text_output_path = os.path.join(OUTPUT_DIR, "extracted_content.txt")
ocr.export_to_txt(content_blocks, text_output_path)
print("✅ Text content saved to:", text_output_path)

✅ Text content saved to: ./output\extracted_content.txt


### Export to PDF file

In [7]:
# Build the destination path under OUTPUT_DIR, hand the extracted blocks to the
# structured PDF exporter, then report where the file was requested to be written.
pdf_output_path = os.path.join(OUTPUT_DIR, "extracted_content.pdf")
ocr.export_structured_to_pdf(content_blocks, pdf_output_path)
print("✅ PDF content saved to:", pdf_output_path)

✅ PDF content saved to: ./output\extracted_content.pdf
