In [2]:
import os
from pathlib import Path
import shutil
import time
import logging
import PyPDF2
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
from datetime import datetime
import re
import pytesseract

DOCUMENT_DIR = "documents"
os.makedirs(DOCUMENT_DIR, exist_ok=True)

SAVE_TEXT = True
POPPLER_PATH = r"C:\Program Files\poppler-25.12.0\Library\bin"  
logging.basicConfig(level=logging.INFO, format="%(message)s")



def ensure_pdf(src_path):
    src = Path(src_path)
    dest_dir = Path(DOCUMENT_DIR)
    dest_dir.mkdir(parents=True, exist_ok=True)
    dest = dest_dir / src.name

    if src.resolve() == dest.resolve():
        return str(dest.resolve())

    if dest.exists():
        dest = dest.with_name(f"{dest.stem}_{int(time.time())}{dest.suffix}")

    shutil.move(str(src), str(dest))
    logging.info(f"Moved file to: {dest}")
    return str(dest.resolve())



def iter_pdf_pages(pdf_path, use_ocr=True):
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            total_pages = len(reader.pages)
    except Exception as e:
        logging.error(f"Failed to read PDF {pdf_path}: {e}")
        return []

    for i, page in enumerate(reader.pages, start=1):
        text = page.extract_text() or ""

        # If page seems image-based, apply OCR
        if use_ocr and not text.strip():
            logging.info(f"Page {i}/{total_pages} seems image-based, applying OCR...")
            try:
                images = convert_from_path(
                    pdf_path,
                    first_page=i,
                    last_page=i,
                    dpi=150,
                    poppler_path=POPPLER_PATH  # <-- explicit Poppler path
                )
                image = images[0].convert("L")  # grayscale
                text = pytesseract.image_to_string(image)
            except Exception as e:
                logging.warning(f"OCR failed on page {i}: {e}")
                text = ""

        # Clean text
        text = re.sub(r"\s+", " ", text).strip()

        yield {
            "page_content": text,
            "metadata": {
                "file_name": Path(pdf_path).name,
                "directory": str(Path(pdf_path).resolve()),
                "file_type": "pdf",
                "page": i
            }
        }


# -----------------------------
# LOAD PDF DOCUMENT
# -----------------------------
def load_pdf_document(pdf_path, use_ocr=True):
    pages = list(iter_pdf_pages(pdf_path, use_ocr=use_ocr))

    # Optionally save text to file
    if SAVE_TEXT and pages:
        txt_file = Path(DOCUMENT_DIR) / f"{Path(pdf_path).stem}.txt"
        try:
            with open(txt_file, "w", encoding="utf-8") as f:
                for page in pages:
                    f.write(page["page_content"] + "\n")
            logging.info(f"Saved extracted text to: {txt_file}")
        except Exception as e:
            logging.warning(f"Failed to save text file: {e}")

    return pages


# -----------------------------
# MAIN FUNCTION
# -----------------------------
if __name__ == "__main__":
    # Single PDF source
    SOURCE_FILE = r"D:\Softoo\ML Task\AI_Engine\documents\Trasnformer model.pdf"

    # Check if file exists
    if not Path(SOURCE_FILE).exists():
        logging.error(f"PDF file not found: {SOURCE_FILE}")
        exit()

    pdf_files = [SOURCE_FILE]

    all_docs = []

    for pdf_path in pdf_files:
        pdf_path = ensure_pdf(pdf_path)
        docs = load_pdf_document(pdf_path)
        all_docs.extend(docs)

        # Summary per document
        now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        logging.info("\n================ PDF DOCUMENT SUMMARY ================\n")
        logging.info(f"Document Processed on: {now}")
        logging.info(f"File Name: {Path(pdf_path).name}")
        logging.info(f"Directory: {pdf_path}")
        logging.info(f"Total Pages: {len(docs)}")
        logging.info("Status: Document Ready for processing")
        logging.info("\n====================================================\n")

    logging.info(f"Total documents processed: {len(pdf_files)}")
    logging.info(f"Total pages extracted: {len(all_docs)}")

Page 1/11 seems image-based, applying OCR...


Page 2/11 seems image-based, applying OCR...
Page 3/11 seems image-based, applying OCR...
Page 4/11 seems image-based, applying OCR...
Page 5/11 seems image-based, applying OCR...
Page 6/11 seems image-based, applying OCR...
Page 7/11 seems image-based, applying OCR...
Page 8/11 seems image-based, applying OCR...
Page 9/11 seems image-based, applying OCR...
Page 10/11 seems image-based, applying OCR...
Page 11/11 seems image-based, applying OCR...
Saved extracted text to: documents\Trasnformer model.txt


Document Processed on: 2026-01-07 15:17:42
File Name: Trasnformer model.pdf
Directory: D:\Softoo\ML Task\AI_Engine\documents\Trasnformer model.pdf
Total Pages: 11
Status: Document Ready for processing


Total documents processed: 1
Total pages extracted: 11


In [5]:
import os
from pathlib import Path
import shutil
import time
import logging
import PyPDF2
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
from datetime import datetime
import re

# -----------------------------
# CONFIG
# -----------------------------
DOCUMENT_DIR = "documents"
os.makedirs(DOCUMENT_DIR, exist_ok=True)

# Optional: Save extracted OCR text for future reuse
SAVE_TEXT = True

# Path to Poppler (raw string to avoid backslash issues)
POPPLER_PATH = r"C:\Program Files\poppler-25.12.0\Library\bin"  # <-- Update to your Poppler bin path

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(message)s")


# -----------------------------
# ENSURE PDF IN DOCUMENT FOLDER
# -----------------------------
def ensure_pdf(src_path):
    src = Path(src_path)
    dest_dir = Path(DOCUMENT_DIR)
    dest_dir.mkdir(parents=True, exist_ok=True)
    dest = dest_dir / src.name

    if src.resolve() == dest.resolve():
        return str(dest.resolve())

    if dest.exists():
        dest = dest.with_name(f"{dest.stem}_{int(time.time())}{dest.suffix}")

    shutil.move(str(src), str(dest))
    logging.info(f"Moved file to: {dest}")
    return str(dest.resolve())

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"


# -----------------------------
# ITERATE PDF PAGES
# -----------------------------
def iter_pdf_pages(pdf_path, use_ocr=True):
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            total_pages = len(reader.pages)
    except Exception as e:
        logging.error(f"Failed to read PDF {pdf_path}: {e}")
        return []

    for i, page in enumerate(reader.pages, start=1):
        text = page.extract_text() or ""

        # If page seems image-based, apply OCR
        if use_ocr and not text.strip():
            logging.info(f"Page {i}/{total_pages} seems image-based, applying OCR...")
            try:
                images = convert_from_path(
                    pdf_path,
                    first_page=i,
                    last_page=i,
                    dpi=150,
                    poppler_path=POPPLER_PATH  # <-- explicit Poppler path
                )
                image = images[0].convert("L")  # grayscale
                text = pytesseract.image_to_string(image)
            except Exception as e:
                logging.warning(f"OCR failed on page {i}: {e}")
                text = ""

        # Clean text
        text = re.sub(r"\s+", " ", text).strip()

        yield {
            "page_content": text,
            "metadata": {
                "file_name": Path(pdf_path).name,
                "directory": str(Path(pdf_path).resolve()),
                "file_type": "pdf",
                "page": i
            }
        }



def load_pdf_document(pdf_path, use_ocr=True):
    pages = list(iter_pdf_pages(pdf_path, use_ocr=use_ocr))

    # Optionally save text to file
    if SAVE_TEXT and pages:
        txt_file = Path(DOCUMENT_DIR) / f"{Path(pdf_path).stem}.txt"
        try:
            with open(txt_file, "w", encoding="utf-8") as f:
                for page in pages:
                    f.write(page["page_content"] + "\n")
            logging.info(f"Saved extracted text to: {txt_file}")
        except Exception as e:
            logging.warning(f"Failed to save text file: {e}")

    return pages



if __name__ == "__main__":
    
    SOURCE_FILE = r"C:\Users\ckabe\Downloads\Fall 2025_STA301_2.docx"

    if not Path(SOURCE_FILE).exists():
        logging.error(f"PDF file not found: {SOURCE_FILE}")
        exit()

    pdf_files = [SOURCE_FILE]

    all_docs = []

    for pdf_path in pdf_files:
        pdf_path = ensure_pdf(pdf_path)
        docs = load_pdf_document(pdf_path)
        all_docs.extend(docs)

        now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        logging.info("\n================ PDF DOCUMENT SUMMARY ================\n")
        logging.info(f"Document Processed on: {now}")
        logging.info(f"File Name: {Path(pdf_path).name}")
        logging.info(f"Directory: {pdf_path}")
        logging.info(f"Total Pages: {len(docs)}")
        logging.info("Status: Document Ready for processing")
        logging.info("\n====================================================\n")

    

Moved file to: documents\Fall 2025_STA301_2.docx


Failed to read PDF D:\Softoo\ML Task\AI_Engine\documents\Fall 2025_STA301_2.docx: EOF marker not found


Document Processed on: 2026-01-07 15:21:15
File Name: Fall 2025_STA301_2.docx
Directory: D:\Softoo\ML Task\AI_Engine\documents\Fall 2025_STA301_2.docx
Total Pages: 0
Status: Document Ready for processing




In [3]:
import os
from pathlib import Path
import shutil
import logging
import PyPDF2
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import re
from docx2pdf import convert as docx2pdf_convert
import sys
from datetime import datetime
import pandas as pd

DOCUMENT_DIR = "documents"
os.makedirs(DOCUMENT_DIR, exist_ok=True)

SAVE_TEXT = True
POPPLER_PATH = r"C:\Program Files\poppler-25.12.0\Library\bin"
logging.basicConfig(level=logging.INFO, format="%(message)s")

def ensure_file(src_path):
    src = Path(src_path)
    if not src.exists():
        logging.error(f"File not found: {src}")
        exit()
    dest = Path(DOCUMENT_DIR) / src.name
    if dest.exists():
        dest = dest.with_name(f"{dest.stem}_{int(datetime.now().timestamp())}{dest.suffix}")
    shutil.copy(str(src), str(dest))
    logging.info(f"Copied file to: {dest.name}")
    return dest

def convert_to_pdf(file_path):
    file_path = Path(file_path)
    if file_path.suffix.lower() == ".pdf":
        return file_path
    if file_path.suffix.lower() == ".docx":
        old_stdout = sys.stdout
        sys.stdout = open(os.devnull, "w")
        docx2pdf_convert(str(file_path), str(file_path.parent))
        sys.stdout.close()
        sys.stdout = old_stdout
        pdf_file = file_path.with_suffix(".pdf")
        logging.info(f"Converted {file_path.name} to PDF: {pdf_file.name}")
        return pdf_file
    if file_path.suffix.lower() == ".txt":
        from fpdf import FPDF
        pdf_file = file_path.with_suffix(".pdf")
        pdf = FPDF()
        pdf.add_page()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.set_font("Arial", size=12)
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                pdf.multi_cell(0, 5, line.strip())
        pdf.output(str(pdf_file))
        logging.info(f"Converted {file_path.name} to PDF: {pdf_file.name}")
        return pdf_file
    logging.error(f"Unsupported file type: {file_path.suffix}")
    exit()

def iter_pdf_pages(pdf_path, use_ocr=True):
    reader = PyPDF2.PdfReader(str(pdf_path))
    for i, page in enumerate(reader.pages, start=1):
        text = page.extract_text() or ""
        if use_ocr and not text.strip():
            try:
                images = convert_from_path(str(pdf_path), first_page=i, last_page=i, dpi=150, poppler_path=POPPLER_PATH)
                image = images[0].convert("L")
                text = pytesseract.image_to_string(image)
            except Exception as e:
                logging.warning(f"OCR failed on page {i}: {e}")
                text = ""
        text = re.sub(r"\s+", " ", text).strip()
        yield {
            "page_content": text,
            "metadata": {
                "file_name": pdf_path.name,
                "directory": str(pdf_path.resolve()),
                "file_type": "pdf",
                "page": i
            }
        }

def load_pdf_document(pdf_path, use_ocr=True):
    pages = list(iter_pdf_pages(pdf_path, use_ocr=use_ocr))
    if SAVE_TEXT and pages:
        txt_file = Path(DOCUMENT_DIR) / f"{pdf_path.stem}.txt"
        try:
            with open(txt_file, "w", encoding="utf-8") as f:
                for page in pages:
                    f.write(page["page_content"] + "\n")
            logging.info(f"Saved extracted text to: {txt_file.name}")
        except Exception as e:
            logging.warning(f"Failed to save text file: {e}")
    return pages

def load_csv_document(csv_path):
    df = pd.read_csv(csv_path)
    pages = []
    for i, row in df.iterrows():
        text = " | ".join([f"{col}: {val}" for col, val in row.items()])
        pages.append({
            "page_content": text,
            "metadata": {
                "file_name": csv_path.name,
                "directory": str(csv_path.resolve()),
                "file_type": "csv",
                "row": i + 1
            }
        })
    if SAVE_TEXT and pages:
        txt_file = Path(DOCUMENT_DIR) / f"{csv_path.stem}.txt"
        try:
            with open(txt_file, "w", encoding="utf-8") as f:
                for page in pages:
                    f.write(page["page_content"] + "\n")
            logging.info(f"Saved extracted text to: {txt_file.name}")
        except Exception as e:
            logging.warning(f"Failed to save text file: {e}")
    return pages

if __name__ == "__main__":
    SOURCE_FILE = r"C:\Users\ckabe\Downloads\Arms Imports Per Country.csv"
    file_in_docs = ensure_file(SOURCE_FILE)
    if file_in_docs.suffix.lower() == ".csv":
        docs = load_csv_document(file_in_docs)
    else:
        pdf_path = convert_to_pdf(file_in_docs)
        docs = load_pdf_document(pdf_path)
    logging.info("\n================ DOCUMENT SUMMARY ================\n")
    logging.info(f"Document Processed on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    logging.info(f"File Name: {file_in_docs.name}")
    logging.info(f"Directory: {file_in_docs.resolve()}")
    logging.info(f"Total Pages/Rows: {len(docs)}")
    logging.info("Status: Document Ready for processing")
    logging.info("\n====================================================\n")


Copied file to: Arms Imports Per Country_1767783349.csv


Saved extracted text to: Arms Imports Per Country_1767783349.txt


Document Processed on: 2026-01-07 15:55:49
File Name: Arms Imports Per Country_1767783349.csv
Directory: D:\Softoo\ML Task\AI_Engine\documents\Arms Imports Per Country_1767783349.csv
Total Pages/Rows: 261
Status: Document Ready for processing


