In [1]:
import os
from pathlib import Path
import shutil
import time
import logging
import PyPDF2
from pdf2image import convert_from_path
import pytesseract
from datetime import datetime

# Folder, relative to the current working directory, that receives the PDFs.
DOCUMENT_DIR = "documents"
Path(DOCUMENT_DIR).mkdir(parents=True, exist_ok=True)

# Absolute location of the PDF that this script ingests.
SOURCE_FILE = r"D:\Softoo\ML Task\AI_Engine\Trasnformer model.pdf"

# Fail fast, before any processing starts, when the input file is absent.
source_is_missing = not os.path.isfile(SOURCE_FILE)
if source_is_missing:
    raise FileNotFoundError(f"File not found: {SOURCE_FILE}")

# Log bare messages (no level/timestamp prefix) at INFO and above.
logging.basicConfig(format="%(message)s", level=logging.INFO)


def ensure_pdf(src_path, dest_dir=None):
    """Move a file into the document directory and return its new path.

    Args:
        src_path: Path of the file to relocate.
        dest_dir: Directory to move the file into. When omitted, the
            module-level ``DOCUMENT_DIR`` is used. The directory is created
            if it does not exist yet.

    Returns:
        The resolved path of the file inside the destination directory, as
        a string. If ``src_path`` already points at that location, nothing
        is moved and the same resolved path is returned.

    Raises:
        OSError: Propagated from ``shutil.move`` (for example when the
            source file does not exist).
    """
    src = Path(src_path)
    target_dir = Path(DOCUMENT_DIR if dest_dir is None else dest_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    dest = target_dir / src.name

    # The file is already where it should be: nothing to move.
    if src.resolve() == dest.resolve():
        return str(dest.resolve())

    if dest.exists():
        # Never clobber an existing document. A timestamp alone is not
        # unique (two moves can happen within the same second), so keep
        # appending a counter until the candidate name is actually free.
        stamp = int(time.time())
        candidate = dest.with_name(f"{dest.stem}_{stamp}{dest.suffix}")
        counter = 1
        while candidate.exists():
            candidate = dest.with_name(
                f"{dest.stem}_{stamp}_{counter}{dest.suffix}"
            )
            counter += 1
        dest = candidate

    shutil.move(str(src), str(dest))
    return str(dest.resolve())


def iter_pdf_pages(pdf_path, use_ocr=True):
    """Lazily yield one record per page of a PDF file.

    Text is taken from the PDF's embedded text layer. When a page yields no
    text (empty or whitespace only) and ``use_ocr`` is true, that single page
    is rendered to an image and passed through Tesseract instead.

    Args:
        pdf_path: Path of the PDF file to read.
        use_ocr: Whether to fall back to OCR for pages without extractable
            text.

    Yields:
        A dict with two keys: ``"page_content"`` (the page text) and
        ``"metadata"`` (file name, resolved file path stored under the
        ``"directory"`` key, the literal file type ``"pdf"`` and the 1-based
        page number).
    """
    # These values are the same for every page, so compute them once.
    source = Path(pdf_path)
    file_name = source.name
    resolved_path = str(source.resolve())

    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for i, page in enumerate(reader.pages, start=1):
            # extract_text() may return None; normalise to a string.
            text = page.extract_text() or ""

            if use_ocr and not text.strip():
                logging.info("Page %d seems image-based, applying OCR...", i)
                # Render only the current page rather than the whole file.
                images = convert_from_path(pdf_path, first_page=i, last_page=i)
                if images:
                    text = pytesseract.image_to_string(images[0])
                else:
                    # Nothing was rendered: keep the (empty) extracted text
                    # instead of failing with an opaque IndexError.
                    logging.warning(
                        "Page %d could not be rendered for OCR; "
                        "keeping empty text.",
                        i,
                    )

            yield {
                "page_content": text,
                "metadata": {
                    "file_name": file_name,
                    "directory": resolved_path,
                    "file_type": "pdf",
                    "page": i,
                },
            }


def load_pdf_document(pdf_path, use_ocr=True):
    """Read every page of a PDF eagerly and return the records as a list.

    Args:
        pdf_path: Path of the PDF file to read.
        use_ocr: Forwarded to ``iter_pdf_pages``.

    Returns:
        A list holding every record produced by ``iter_pdf_pages``, in page
        order.
    """
    page_stream = iter_pdf_pages(pdf_path, use_ocr=use_ocr)
    return [*page_stream]


if __name__ == "__main__":
    # Relocate the source PDF first, then parse it from its new location.
    pdf_path = ensure_pdf(SOURCE_FILE)
    docs = load_pdf_document(pdf_path)

    # The timestamp is taken after loading, so it marks the end of processing.
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Assemble the whole report up front and emit it in one go.
    summary_lines = [
        "\n================ PDF DOCUMENT SUMMARY ================\n",
        f"Document Processed on: {now}",
        f"File Name: {Path(pdf_path).name}",
        f"Directory: {pdf_path}",
        f"Total Pages: {len(docs)}",
        "Status: Document Ready for processing",
        "\n====================================================\n",
    ]
    print(*summary_lines, sep="\n")




Document Processed on: 2026-01-07 13:39:10
File Name: Trasnformer model.pdf
Directory: D:\Softoo\ML Task\AI_Engine\documents\Trasnformer model.pdf
Total Pages: 11
Status: Document Ready for processing


