In [None]:
import os
import pdfplumber
import re


# Sample article used to try out the extraction functions below.
# These are raw strings, so every backslash is typed exactly once; doubling
# them inside r"..." would put two literal backslashes between path components.
# NOTE(review): machine-specific absolute path - adjust for your environment.
TEST_PDF_PATH = r"C:\Users\User\Desktop\test_path\2025 Journal article Toward expert-level medical question answering with LLMs.pdf"

# Folder that receives the extracted text file(s).
OUTPUT_PATH = r"C:\Users\User\Desktop\test_path"

In [4]:
def pdf_to_text_file(pdf_file_path, output_file_path):
    """
    Dump the text of every page of a PDF into a single UTF-8 text file.

    Each page becomes a section that starts with a ``--- Page N ---`` header
    (N counts from 1). A page whose extraction result is empty gets a
    placeholder line instead, so the page numbering in the output has no gaps.
    Sections are separated from each other by a blank line.

    :param pdf_file_path: Path of the PDF file to read.
    :param output_file_path: Path of the text file to write; it is opened in
        ``'w'`` mode, so an existing file is overwritten.
    :raises FileNotFoundError: If ``pdf_file_path`` is not an existing file.
    """
    # Fail early with a clear message before handing the path to pdfplumber.
    if not os.path.isfile(pdf_file_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_file_path}")

    sections = []
    with pdfplumber.open(pdf_file_path) as document:
        for number, page in enumerate(document.pages, start=1):
            raw = page.extract_text()

            if not raw:
                # Falsy extraction result: record a placeholder for this page.
                sections.append(f"--- Page {number} ---\n[No text found on this page]\n")
                continue

            # Clean-up pipeline, applied in this order:
            #   1. trim the page and turn Windows line endings into "\n"
            #   2. squeeze any run of blank lines into one paragraph break
            #   3. squeeze runs of spaces/tabs into a single space
            cleaned = re.sub(r'\r\n', '\n', raw.strip())
            cleaned = re.sub(r'\n\s*\n+', '\n\n', cleaned)
            cleaned = re.sub(r'[ \t]+', ' ', cleaned)

            sections.append(f"--- Page {number} ---\n{cleaned}\n")

    # The output file is only written after the PDF has been fully read.
    with open(output_file_path, 'w', encoding='utf-8') as out_file:
        out_file.write("\n\n".join(sections))

In [5]:
# Try the converter on the sample article; the result is written to
# "output.txt" inside OUTPUT_PATH.
pdf_to_text_file(
    pdf_file_path=TEST_PDF_PATH,
    output_file_path=os.path.join(OUTPUT_PATH, "output.txt"),
)

In [None]:
def extract_pdf_alternative(pdf_input_path, pdf_output_path):
    """
    Placeholder for an alternative PDF extraction routine (not implemented yet).

    The function previously had a signature but no body, which is a syntax
    error and prevented the cell from running. Until the real logic is
    written it raises explicitly so accidental calls fail loudly.

    :param pdf_input_path: Presumably the path of the PDF to read - TODO confirm.
    :param pdf_output_path: Presumably the path the extracted output is
        written to - TODO confirm.
    :raises NotImplementedError: Always, until an implementation is added.
    """
    # TODO: implement the alternative extraction strategy.
    raise NotImplementedError("extract_pdf_alternative is not implemented yet")

In [None]:
def convert_folder(folder_path, output_folder):
    """
    Convert every PDF directly inside a folder to a text file.

    Each ``<name>.pdf`` found in ``folder_path`` is passed to
    ``pdf_to_text_file`` and written to ``<output_folder>/<name>.txt``.
    Sub-folders are not searched.

    :param folder_path: Folder that contains the PDF files to convert.
    :param output_folder: Folder that receives the ``.txt`` files; it is
        created (including missing parents) if it does not exist yet.
    """
    # Create the destination up front so writing cannot fail on a missing folder.
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order does not depend on the file system.
    for article in sorted(os.listdir(folder_path)):
        pdf_path = os.path.join(folder_path, article)
        stem, extension = os.path.splitext(article)

        # Accept ".pdf" in any letter case; skip directories that merely
        # carry a ".pdf" suffix, since they cannot be opened as a PDF.
        if extension.lower() != ".pdf" or not os.path.isfile(pdf_path):
            continue

        # pdf_to_text_file expects the path of the output *file*, so derive
        # one per article instead of handing it the output folder itself.
        pdf_to_text_file(pdf_path, os.path.join(output_folder, stem + ".txt"))