In [1]:
import os
from pathlib import Path
import shutil
import logging
import PyPDF2
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import re
from docx2pdf import convert as docx2pdf_convert
import sys
from datetime import datetime
import pandas as pd

# Working directory that receives the copied input file and the extracted-text
# dumps written by the loaders below; created on import if it is missing.
DOCUMENT_DIR = "documents"
os.makedirs(DOCUMENT_DIR, exist_ok=True)

# When true, the loaders also write the extracted text to DOCUMENT_DIR as .txt.
SAVE_TEXT = True
# Poppler binaries directory handed to pdf2image for the OCR fallback.
# NOTE(review): machine-specific Windows path — adjust per installation.
POPPLER_PATH = r"C:\Program Files\poppler-25.12.0\Library\bin"
# Message-only log format: no level or timestamp prefix on emitted lines.
logging.basicConfig(level=logging.INFO, format="%(message)s")

def ensure_file(src_path):
    """Copy *src_path* into ``DOCUMENT_DIR`` and return the path of the copy.

    If a file with the same name already exists in ``DOCUMENT_DIR`` the copy
    is given a ``_<unix-timestamp>`` suffix (plus a counter if that name is
    taken too) so that no earlier copy is overwritten.

    Args:
        src_path: Location of the file to import (``str`` or ``Path``).

    Returns:
        ``Path`` of the copied file inside ``DOCUMENT_DIR``.

    Raises:
        SystemExit: With status 1 when *src_path* is not an existing file.
    """
    src = Path(src_path)
    if not src.is_file():
        logging.error(f"File not found: {src}")
        # sys.exit instead of the interactive-only exit() helper, and a
        # non-zero status so the failure is visible to the caller/shell.
        sys.exit(1)

    dest = Path(DOCUMENT_DIR) / src.name
    if dest.exists():
        stamp = int(datetime.now().timestamp())
        candidate = dest.with_name(f"{dest.stem}_{stamp}{dest.suffix}")
        # The timestamp has one-second resolution; add a counter so two
        # imports within the same second do not clobber each other.
        counter = 1
        while candidate.exists():
            candidate = dest.with_name(
                f"{dest.stem}_{stamp}_{counter}{dest.suffix}"
            )
            counter += 1
        dest = candidate

    shutil.copy(str(src), str(dest))
    logging.info(f"Copied file to: {dest.name}")
    return dest

def convert_to_pdf(file_path):
    """Return a PDF version of *file_path*, converting it when necessary.

    ``.pdf`` inputs are returned unchanged. ``.docx`` inputs are converted
    with docx2pdf and ``.txt`` inputs are rendered line by line with FPDF;
    in both cases the PDF is written next to the input with the same stem.

    Args:
        file_path: Input document (``str`` or ``Path``).

    Returns:
        ``Path`` of the PDF file.

    Raises:
        SystemExit: With status 1 when the file extension is not supported.
    """
    file_path = Path(file_path)
    suffix = file_path.suffix.lower()

    if suffix == ".pdf":
        return file_path

    if suffix not in (".docx", ".txt"):
        logging.error(f"Unsupported file type: {file_path.suffix}")
        # sys.exit instead of the interactive-only exit() helper, with a
        # non-zero status because this is an error path.
        sys.exit(1)

    pdf_file = file_path.with_suffix(".pdf")

    if suffix == ".docx":
        from contextlib import redirect_stdout

        # Silence the converter's stdout output. The context managers restore
        # sys.stdout and close the devnull handle even if conversion raises.
        with open(os.devnull, "w") as devnull, redirect_stdout(devnull):
            docx2pdf_convert(str(file_path), str(file_path.parent))
    else:
        from fpdf import FPDF

        pdf = FPDF()
        pdf.add_page()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.set_font("Arial", size=12)
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                # Surrounding whitespace (including the newline) is dropped;
                # multi_cell wraps the remaining text to the page width.
                pdf.multi_cell(0, 5, line.strip())
        pdf.output(str(pdf_file))

    logging.info(f"Converted {file_path.name} to PDF: {pdf_file.name}")
    return pdf_file

def iter_pdf_pages(pdf_path, use_ocr=True):
    """Yield one text record per page of the PDF at *pdf_path*.

    Text is taken from the PDF's text layer. When a page yields no text and
    *use_ocr* is true, the page is rasterized at 150 dpi, converted to
    grayscale and passed to Tesseract. OCR is best-effort: any failure is
    logged as a warning and the page is emitted with empty content.

    Args:
        pdf_path: PDF file to read (``str`` or ``Path``).
        use_ocr: Whether to fall back to OCR for pages without extractable text.

    Yields:
        dict with ``page_content`` (whitespace-collapsed text) and
        ``metadata`` (``file_name``, ``directory``, ``file_type``, ``page``
        with 1-based page numbers).
    """
    # Accept plain strings too, consistent with convert_to_pdf.
    pdf_path = Path(pdf_path)
    reader = PyPDF2.PdfReader(str(pdf_path))

    # Loop-invariant values, computed once instead of once per page.
    file_name = pdf_path.name
    resolved_path = str(pdf_path.resolve())
    whitespace = re.compile(r"\s+")

    for i, page in enumerate(reader.pages, start=1):
        text = page.extract_text() or ""
        if use_ocr and not text.strip():
            try:
                images = convert_from_path(
                    str(pdf_path),
                    first_page=i,
                    last_page=i,
                    dpi=150,
                    poppler_path=POPPLER_PATH,
                )
                image = images[0].convert("L")
                text = pytesseract.image_to_string(image)
            except Exception as e:
                # Deliberately broad: a failed OCR pass must not abort the
                # remaining pages.
                logging.warning(f"OCR failed on page {i}: {e}")
                text = ""
        text = whitespace.sub(" ", text).strip()
        yield {
            "page_content": text,
            "metadata": {
                "file_name": file_name,
                # Despite the key name this is the resolved path of the file
                # itself, not of its parent directory.
                "directory": resolved_path,
                "file_type": "pdf",
                "page": i,
            },
        }

def load_pdf_document(pdf_path, use_ocr=True):
    """Load every page of a PDF and optionally dump the text to disk.

    When ``SAVE_TEXT`` is true and at least one page was read, the page texts
    are written one per line to ``DOCUMENT_DIR/<pdf stem>.txt``. Saving is
    best-effort: a failure is logged as a warning and the pages are still
    returned.

    Args:
        pdf_path: PDF file to read (``str`` or ``Path``).
        use_ocr: Forwarded to ``iter_pdf_pages``.

    Returns:
        list of the page records produced by ``iter_pdf_pages``.
    """
    # Accept plain strings too, consistent with convert_to_pdf.
    pdf_path = Path(pdf_path)
    pages = list(iter_pdf_pages(pdf_path, use_ocr=use_ocr))

    if SAVE_TEXT and pages:
        # NOTE(review): for a .txt input converted by convert_to_pdf this is
        # the same path as the copy made by ensure_file, so that copy is
        # overwritten with the whitespace-collapsed text — confirm intended.
        txt_file = Path(DOCUMENT_DIR) / f"{pdf_path.stem}.txt"
        try:
            with open(txt_file, "w", encoding="utf-8") as f:
                f.writelines(page["page_content"] + "\n" for page in pages)
            logging.info(f"Saved extracted text to: {txt_file.name}")
        except Exception as e:
            logging.warning(f"Failed to save text file: {e}")
    return pages

def load_csv_document(csv_path):
    """Load a CSV file as one text record per data row.

    Each row is rendered as ``"col: value | col: value | ..."``. When
    ``SAVE_TEXT`` is true and the file has at least one row, the rendered
    rows are also written one per line to ``DOCUMENT_DIR/<csv stem>.txt``.
    Saving is best-effort: a failure is logged as a warning and the records
    are still returned.

    Args:
        csv_path: CSV file to read (``str`` or ``Path``).

    Returns:
        list of dicts with ``page_content`` (the rendered row) and
        ``metadata`` (``file_name``, ``directory``, ``file_type``, ``row``
        with 1-based row numbers).
    """
    # Accept plain strings too, consistent with convert_to_pdf.
    csv_path = Path(csv_path)
    df = pd.read_csv(csv_path)

    file_name = csv_path.name
    resolved_path = str(csv_path.resolve())
    columns = list(df.columns)

    pages = []
    # itertuples keeps each column's own dtype. iterrows() builds one Series
    # per row, which upcasts integers to floats whenever a float column is
    # present (so 2020 would be rendered as "2020.0").
    for row_number, values in enumerate(
        df.itertuples(index=False, name=None), start=1
    ):
        text = " | ".join(f"{col}: {val}" for col, val in zip(columns, values))
        pages.append({
            "page_content": text,
            "metadata": {
                "file_name": file_name,
                # Despite the key name this is the resolved path of the file
                # itself, not of its parent directory.
                "directory": resolved_path,
                "file_type": "csv",
                "row": row_number,
            },
        })

    if SAVE_TEXT and pages:
        txt_file = Path(DOCUMENT_DIR) / f"{csv_path.stem}.txt"
        try:
            with open(txt_file, "w", encoding="utf-8") as f:
                f.writelines(page["page_content"] + "\n" for page in pages)
            logging.info(f"Saved extracted text to: {txt_file.name}")
        except Exception as e:
            logging.warning(f"Failed to save text file: {e}")
    return pages

if __name__ == "__main__":
    # Script entry point: import one hard-coded source file into the document
    # directory, load it with the loader matching its extension, then log a
    # short summary of what was processed.
    SOURCE_FILE = r"C:\Users\ckabe\Downloads\Arms Imports Per Country.csv"
    file_in_docs = ensure_file(SOURCE_FILE)

    # CSV files are read row by row; everything else goes through the PDF
    # pipeline (converted to PDF first when needed).
    if file_in_docs.suffix.lower() != ".csv":
        pdf_path = convert_to_pdf(file_in_docs)
        docs = load_pdf_document(pdf_path)
    else:
        docs = load_csv_document(file_in_docs)

    logging.info("\n================ DOCUMENT SUMMARY ================\n")
    processed_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    for summary_line in (
        f"Document Processed on: {processed_at}",
        f"File Name: {file_in_docs.name}",
        f"Directory: {file_in_docs.resolve()}",
        f"Total Pages/Rows: {len(docs)}",
        "Status: Document Ready for processing",
    ):
        logging.info(summary_line)
    logging.info("\n====================================================\n")


  from .autonotebook import tqdm as notebook_tqdm
Copied file to: Arms Imports Per Country.csv
Saved extracted text to: Arms Imports Per Country.txt


Document Processed on: 2026-01-07 16:07:22
File Name: Arms Imports Per Country.csv
Directory: D:\Softoo\ML Task\AI_Engine\documents\Arms Imports Per Country.csv
Total Pages/Rows: 261
Status: Document Ready for processing


