In [4]:
import os
import re
import pdfplumber
from operator import itemgetter

# Patterns are compiled once because clean_text() runs for every text line
# and every table cell.

# Dates such as "Feb 23, 2025" or the partial "Feb 23," (optional trailing
# comma). Only real month abbreviations are accepted so that ordinary
# "word + number" text (e.g. "USH 12", "Box 7") is left untouched.
_DATE_RE = re.compile(
    r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sept?|Oct|Nov|Dec)'
    r' \d{1,2}(?:, \d{4})?\b,?',
    re.IGNORECASE,
)
# "TMP ID: 7863"
_TMP_ID_RE = re.compile(r'TMP ID:\s*\d+')
# "Page #", "Page #3" or "Page 2". A negative lookahead is used instead of a
# trailing \b because \b can never match right after a bare "#".
_PAGE_MARKER_RE = re.compile(r'\bPage\s*(?:#\d*|\d+)(?!\w)')
# A lone year with stray punctuation, e.g. ", 2025"
_YEAR_ONLY_RE = re.compile(r'^[, ]*\d{4}[\s,]*$')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """
    Clean a text string by removing unwanted elements.

    Removed, in this order:
    - Dates in the format "Feb 23, 2025" or partial dates like "Feb 23,"
      (including any trailing comma). Only month abbreviations are treated
      as dates; other three-letter words followed by a number are kept.
    - TMP IDs like "TMP ID: 7863"
    - Page markers like "Page #", "Page #3" or "Page 2"
    - A string that is only an isolated "P"
    - A string that is only a year with stray punctuation (e.g. ", 2025")
    - Extra whitespace (runs are collapsed to one space, ends are trimmed)

    Parameters:
        text: The string to clean. Falsy values (None, "") are returned
            unchanged.

    Returns:
        The cleaned string, or the original value when it was falsy.
    """
    if not text:
        return text

    text = _DATE_RE.sub('', text)
    text = _TMP_ID_RE.sub('', text)
    text = _PAGE_MARKER_RE.sub('', text)

    # A cell holding nothing but "P" is treated as noise.
    if text.strip() == "P":
        text = ""
    # What is left of a date whose day/month part was removed or lived in
    # another cell, e.g. ", 2025".
    if _YEAR_ONLY_RE.match(text):
        text = ""

    return _WHITESPACE_RE.sub(' ', text).strip()

def check_bboxes(word, table_bbox):
    """
    Tell whether a word lies strictly inside a table's bounding box.

    The word is read through its 'x0', 'top', 'x1' and 'bottom' keys and
    compared against table_bbox indexed as (x0, top, x1, bottom). All four
    comparisons are strict, so a word touching an edge is not inside.
    """
    w_x0, w_top, w_x1, w_bottom = (
        word['x0'], word['top'], word['x1'], word['bottom']
    )
    # Upper-left corner must be past the table's upper-left corner.
    if not (w_x0 > table_bbox[0] and w_top > table_bbox[1]):
        return False
    # Lower-right corner must stop before the table's lower-right corner.
    return w_x1 < table_bbox[2] and w_bottom < table_bbox[3]

def extract_clean_text(page):
    """
    Extract one page as cleaned text lines and Markdown tables.

    Tables found on the page are rendered with format_table_to_md(); words
    that are not inside any table bounding box are grouped into lines by
    their 'top' coordinate (tolerance 5) and passed through clean_text().
    Lines and tables are emitted in the order of the clusters returned by
    pdfplumber.utils.cluster_objects.

    Parameters:
        page: A page object providing find_tables() and extract_words()
            (presumably a pdfplumber page -- confirm against the caller).

    Returns:
        The non-empty elements joined with newlines; "" if nothing is left.
    """
    elements = []

    # Extract tables and their bounding boxes
    tables = page.find_tables()
    table_bboxes = [table.bbox for table in tables]
    # Save table data along with their top coordinate for clustering
    table_data = [{'table': table.extract(), 'top': table.bbox[1]} for table in tables]

    # Extract words excluding those inside tables
    words = page.extract_words()
    non_table_words = [word for word in words if not any(
        check_bboxes(word, table_bbox) for table_bbox in table_bboxes
    )]

    # Cluster text and tables based on their vertical position with a tolerance
    clusters = pdfplumber.utils.cluster_objects(
        non_table_words + table_data, itemgetter('top'), tolerance=5
    )

    for cluster in clusters:
        # A cluster can mix words and tables when a table starts within the
        # tolerance of a text line, and can hold several tables that share
        # the same top. Handle every item by its own kind instead of
        # classifying the whole cluster by its first item.
        line_words = [item['text'] for item in cluster if 'text' in item]
        if line_words:
            cleaned = clean_text(' '.join(line_words))
            if cleaned:
                elements.append(cleaned)

        for item in cluster:
            if 'table' in item:
                table_md = format_table_to_md(item['table'])
                if table_md:
                    elements.append(table_md)

    return "\n".join(elements)

def format_table_to_md(table):
    """
    Format a table as Markdown.

    The first row is used as the header and is followed by a "---"
    separator row; every remaining row becomes a body row. Each cell is
    cleaned with clean_text() to remove dates, TMP IDs, and page markers.

    Parameters:
        table: A list of rows, each row a list of cell values. Cells may be
            None or empty; such cells are rendered as empty strings.

    Returns:
        The Markdown table without a trailing newline, or "" when the table
        is empty.
    """
    if not table:
        return ""

    def _render_cell(cell):
        # Empty/None cells must become "" -- clean_text() hands None back
        # unchanged, and str.join() rejects None.
        if not cell:
            return ""
        # A literal pipe would be read as a column separator, so escape it.
        return clean_text(cell).replace("|", "\\|")

    def _render_row(cells):
        return "| " + " | ".join(cells) + " |"

    header = [_render_cell(cell) for cell in table[0]]
    lines = [_render_row(header), _render_row(["---"] * len(header))]
    lines.extend(
        _render_row([_render_cell(cell) for cell in row]) for row in table[1:]
    )

    return "\n".join(lines).strip()

def PDF_to_MD(file_path, Output_MD_Dir):
    """
    Convert one PDF file into a Markdown (.md) file.

    Every page is run through extract_clean_text(); pages that yield no
    content are skipped and each remaining page is followed by a blank
    line. The result is written as UTF-8 to Output_MD_Dir under the PDF's
    base name with the extension replaced by ".md", and a "Processed: ..."
    line is printed.
    """
    with pdfplumber.open(file_path) as pdf:
        page_texts = (extract_clean_text(page) for page in pdf.pages)
        # Only non-empty pages contribute; each one ends with a blank line.
        markdown = "".join(f"{text}\n\n" for text in page_texts if text)

        # Derive the output file name and path from the input file name.
        stem, _ext = os.path.splitext(os.path.basename(file_path))
        file_name = stem + ".md"
        output_path = os.path.join(Output_MD_Dir, file_name)

        with open(output_path, "w", encoding="utf-8") as md_file:
            md_file.write(markdown)

        print(f"Processed: {file_name}")

def loop_dir(PDF_Dir, Output_MD_Dir):
    """
    Convert every PDF file under a directory tree to a Markdown file.

    Walks PDF_Dir and its subdirectories and calls PDF_to_MD() for each
    file whose name ends in ".pdf" (case-insensitive). Other files are
    skipped, so stray non-PDF files -- or previously generated .md files
    when the output directory sits inside PDF_Dir -- do not abort the run.

    Parameters:
        PDF_Dir: Root directory to search for PDF files.
        Output_MD_Dir: Directory receiving the .md files; created
            (including parents) if it does not exist.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(Output_MD_Dir, exist_ok=True)

    for root, _dirs, files in os.walk(PDF_Dir):
        for file in files:
            if not file.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(root, file)
            PDF_to_MD(file_path, Output_MD_Dir)

if __name__ == '__main__':
    # Input directory holding the PDF files, and the output directory that
    # receives the generated Markdown files.
    PDF_Dir, Output_MD_Dir = "testdir", "test1"
    loop_dir(PDF_Dir=PDF_Dir, Output_MD_Dir=Output_MD_Dir)


Processed: WisTMP 7863 (4430-21-00) (1).md
