# Notebook cell In [2]:
import pdfplumber
import os
import pdfplumber.utils

# def extract_clean_text(page):
#     """   
#     Extracts and cleans text from a PDF page.

#     The function removes:
#     - Page numbers (e.g., "Page 2")
#     - TMP-related metadata (e.g., "TMP ID: 7863", "TMP Form Version")
#     - Dates in formats like "Feb 23, 2025"
#     - Empty lines for better readability

#     The function also:
#     - Extracts tables to MD format tables (it's raw text before)
#     - Extracts titles and subtitles to MD format headers

#     Parameters:
#     page (pdfplumber.page.Page): A single page from a PDF document.

#     Returns:
#     str: Cleaned text extracted from the PDF page.
#     """
#     cleaned_text = ''

#     # If the page contains a table, both raw_text and table_data include that table's content:
#     # raw_text holds it as plain text, while table_data holds it as a list.
#     table_data = page.extract_tables() 
#     first_word_of_table = None

#     if table_data: # not empty
#         for row in table_data[0]:  # Iterate over the first table's rows
#             if row and row[0].strip():  # Ensure it's not empty
#                 first_word_of_table = row[0].strip()
#                 break  # Stop at the first non-empty value
#     print(table_data)
#     print("\nFIST WORD OF TABLE:")
#     print(first_word_of_table)
#     print("\n\n\n\n\n")

#     raw_text = page.extract_text() ## WisDOT Transportation Management Plan\nTMP Form Version 2.0\nTMP ID: 7863 - Approved 60%\nFeb 23, 2025

#     if raw_text:
#         list_of_text = raw_text.split("\n") ## ['WisDOT Transportation Management Plan', 'TMP Form Version 2.0', 'TMP ID: 7863 - Approved 60%'...]
#     else:
#         raw_text = [] 
    


#     if not (raw_text or table_data): # If the page is empty, return an empty string
#         return cleaned_text




#     return cleaned_text






# def PDF_to_MD(file_path, Output_MD_Dir):
#     with pdfplumber.open(file_path) as pdf:
#         MD_content = ''

#         for page in pdf.pages:
#             cleaned_text = extract_clean_text(page)

#             # If cleaned_text is not empty, add it to the MD_content. (Avoid empty lines)
#             if cleaned_text: 
#                 MD_content += cleaned_text


#             # !!!!!!!!!!!!!!testing purpose!!!!!!!!!!!!!!REMOVE LATER!!!!!!!!!!!!!!!!!!!!!!!!!!!!testing purpose!!!!!!!!!!!!!!REMOVE LATER!!!!!!!!!!!!!!!!!!!!!!!!!!!!testing purpose!!!!!!!!!!!!!!REMOVE LATER!!!!!!!!!!!!!!
#             # break 

#         # print(MD_content+'\n\n\n\n\n\n')

#         file_name = os.path.basename(file_path) ## WisTMP 7863 (4430-21-00) (1).pdf
#         file_name = os.path.splitext(file_name)[0] + '.md' ## WisTMP 7863 (4430-21-00) (1).md

#         Output_MD_Dir = os.path.join(Output_MD_Dir, file_name) # Full output path
#         with open(Output_MD_Dir, "w", encoding="utf-8") as md_file:
#             md_file.write(MD_content)


# code from https://stackoverflow.com/questions/71612119/how-to-extract-texts-and-tables-pdfplumber
# --- start ---
from operator import itemgetter

def check_bboxes(word, table_bbox):
    """
    Tell whether a word lies strictly inside a table's bounding box.

    Parameters:
    word (dict): Mapping with the keys 'x0', 'top', 'x1' and 'bottom'.
    table_bbox (sequence): Box indexed as (x0, top, x1, bottom).

    Returns:
    bool: True only when every edge of the word is strictly within the box;
    a word that merely touches an edge of the box counts as outside.
    """
    word_x0, word_top, word_x1, word_bottom = (
        word['x0'], word['top'], word['x1'], word['bottom']
    )
    return (
        table_bbox[0] < word_x0
        and table_bbox[1] < word_top
        and word_x1 < table_bbox[2]
        and word_bottom < table_bbox[3]
    )

def extract_clean_text(page):
    """
    Convert one PDF page to Markdown text, keeping tables in reading order.

    Tables found on the page are rendered with format_table_to_md; words that
    lie inside a table's bounding box are dropped from the running text so the
    table content is not emitted twice. Words and tables are then grouped by
    their 'top' coordinate and emitted from the top of the page downwards.

    Parameters:
    page (pdfplumber.page.Page): A single page from a PDF document.

    Returns:
    str: The page content as Markdown, or an empty string when the page has
    neither text nor tables.
    """
    # Extract tables and their bounding boxes
    tables = page.find_tables()
    table_bboxes = [table.bbox for table in tables]
    table_items = [
        {'table': table.extract(), 'top': table.bbox[1]} for table in tables
    ]

    # Extract text words excluding those inside tables
    non_table_words = [
        word for word in page.extract_words()
        if not any(check_bboxes(word, bbox) for bbox in table_bboxes)
    ]

    # Cluster text and tables together based on their Y-coordinate
    clusters = pdfplumber.utils.cluster_objects(
        non_table_words + table_items, itemgetter('top'), tolerance=5)

    elements = []
    for cluster in clusters:
        # A cluster can hold words AND tables (a table whose top edge is
        # within the tolerance of a text line) or several tables, so every
        # item is handled by its own kind instead of assuming the whole
        # cluster matches its first item.
        line_words = [item['text'] for item in cluster if 'text' in item]
        if line_words:
            elements.append(' '.join(line_words))

        for item in cluster:
            if 'table' not in item:
                continue
            table_md = format_table_to_md(item['table'])
            if not table_md:
                continue
            # Surround each table with a blank line so adjacent text lines
            # are not absorbed into the table when the Markdown is rendered.
            if elements and elements[-1] != "":
                elements.append("")
            elements.append(table_md)
            elements.append("")

    # Drop the blank line left behind by a table at the very end of the page.
    return "\n".join(elements).rstrip("\n")

def _md_cell(value):
    """Return one table cell as text that is safe inside a Markdown row."""
    if value is None:
        # Empty cells may be reported as None; render them as blank.
        return ""
    text = str(value)
    # A Markdown table row must stay on one line.
    text = text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
    # An unescaped pipe would be read as a column separator.
    return text.replace("|", "\\|")


def format_table_to_md(table):
    """
    Formats a table as Markdown.

    The first row is used as the header row, followed by a '---' separator
    row and one line per remaining row.

    Parameters:
    table (list): Rows of the table, each row a list of cell values. Cells
        may be None (rendered as empty), and an empty row is rendered as a
        row of blank cells as wide as the header.

    Returns:
    str: The Markdown table without leading or trailing newlines, or an
    empty string when the table has no rows.
    """
    if not table:
        return ""

    def _md_row(cells):
        return "| " + " | ".join(_md_cell(cell) for cell in cells) + " |"

    header = table[0] or []
    width = len(header)

    lines = [
        _md_row(header),
        "| " + " | ".join(["---"] * width) + " |",
    ]
    for row in table[1:]:
        lines.append(_md_row(row if row else [" "] * width))

    return "\n".join(lines)
# --- end ---






def PDF_to_MD(file_path, Output_MD_Dir):
    """
    Extracts and saves Markdown from a PDF while preserving table order.

    Every page is converted with extract_clean_text; pages that yield no
    content are skipped. The result is written to Output_MD_Dir under the
    PDF's base name with an '.md' extension, and a "Processed: ..." line is
    printed.

    Parameters:
    file_path (str): Path of the PDF file to convert.
    Output_MD_Dir (str): Existing directory that receives the Markdown file.
    """
    with pdfplumber.open(file_path) as pdf:
        page_texts = [extract_clean_text(page) for page in pdf.pages]

    # Separate pages with a blank line; without a separator the last line of
    # one page would be fused with the first line of the next page.
    MD_content = "\n\n".join(text for text in page_texts if text)

    file_name = os.path.splitext(os.path.basename(file_path))[0] + ".md"
    output_path = os.path.join(Output_MD_Dir, file_name)

    with open(output_path, "w", encoding="utf-8") as md_file:
        md_file.write(MD_content)

    print(f"Processed: {file_name}")




def loop_dir(PDF_Dir, Output_MD_Dir):
    """
    Convert every PDF found under PDF_Dir (recursively) to Markdown.

    Parameters:
    PDF_Dir (str): Directory that is walked for PDF files.
    Output_MD_Dir (str): Directory that receives the Markdown files; it is
        created (including parents) when it does not exist yet.

    Note: PDF_to_MD names each output after the PDF's base name only, so
    PDFs with the same name in different subdirectories write to the same
    output file.
    """
    os.makedirs(Output_MD_Dir, exist_ok=True)

    # os.walk() yields (root, dirs, files) tuples: the path of the current
    # directory, its subdirectory names and its file names.
    for root, _dirs, files in os.walk(PDF_Dir):
        ## root: PDF_Files, dirs: [], files: ['WisTMP 7863 (4430-21-00) (1).pdf' ...

        for file in files:
            # Only PDFs can be opened by the converter; skip anything else
            # (e.g. previously generated .md files or OS metadata files).
            if not file.lower().endswith(".pdf"):
                continue

            file_path = os.path.join(root, file)
            PDF_to_MD(file_path, Output_MD_Dir)


if __name__ == '__main__':
    # Currently pointed at the small test directories. Alternative
    # (disabled) settings: PDF_Dir = "PDF_Files",
    # Output_MD_Dir = "Output_markdown".
    PDF_Dir, Output_MD_Dir = "testdir", "test1"
    loop_dir(PDF_Dir, Output_MD_Dir)



# Output: Processed: WisTMP 7863 (4430-21-00) (1).md
