In [6]:
import os
import io
import pdfplumber
import fitz
from PIL import Image
from tabula import read_pdf
import pandas as pd

def _append_page_text_and_tables(pdf, text_parts, separated_tables):
    """Append the text and tabula tables of every page of one PDF, in page order."""
    with pdfplumber.open(pdf) as pdf_file:
        for page in pdf_file.pages:
            page_text = page.extract_text()
            if page_text:
                text_parts.append(page_text + "\n")  # newline after each page's text

            # tabula is queried one page at a time so each page's tables land
            # directly after that page's text in the combined output.
            tables = read_pdf(pdf, pages=page.page_number, multiple_tables=True)
            for df in tables:
                csv_data = df.to_csv(index=False)
                text_parts.append(csv_data + "\n")  # newline after each table
                separated_tables.append(csv_data)


def _save_embedded_images(pdf, figures_folder):
    """Save every image embedded in one PDF into ``figures_folder``."""
    pdf_stem = os.path.splitext(os.path.basename(pdf))[0]
    # The context manager closes the document even if an image fails to save.
    with fitz.open(pdf) as pdf_document:
        for page_index in range(len(pdf_document)):
            page = pdf_document[page_index]
            for image_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                base_image = pdf_document.extract_image(xref)
                image_ext = base_image["ext"]
                # File names keep the original scheme: zero-based page index,
                # one-based image index.
                image_name = f"{pdf_stem}_page_{page_index}_image_{image_index}.{image_ext}"
                image_path = os.path.join(figures_folder, image_name)
                with Image.open(io.BytesIO(base_image["image"])) as pil_image:
                    pil_image.save(image_path)


def extract_text_and_tables_and_images_from_pdfs(pdf_docs):
    """
    Extract text, tables, and images from multiple PDF files.

    Page text is read with pdfplumber, tables with tabula's ``read_pdf``, and
    embedded images with fitz. Images are written to a ``figures`` folder in
    the current working directory (created if missing).

    Each PDF is processed independently: if one file fails, the error is
    printed, whatever was already extracted from it is kept, and processing
    continues with the next file.

    Args:
    - pdf_docs (list of str): List of file paths to the PDF files.

    Returns:
    - tuple: ``(extracted_text, separated_tables)`` where ``extracted_text`` is
      a single string holding every page's text followed by that page's tables
      as CSV, and ``separated_tables`` is a list with the CSV string of each
      table.
    """
    text_parts = []
    separated_tables = []

    # Create the figures folder if it doesn't exist (no check-then-create race).
    figures_folder = "figures"
    os.makedirs(figures_folder, exist_ok=True)

    try:
        pdf_paths = list(pdf_docs)
    except TypeError as e:
        # Keep the "never raises on bad input" behaviour for non-iterables.
        print(f"Failed to process PDF files: {e}")
        return "", separated_tables

    for pdf in pdf_paths:
        try:
            _append_page_text_and_tables(pdf, text_parts, separated_tables)
            _save_embedded_images(pdf, figures_folder)
        except Exception as e:
            # Best-effort: report the failing file and move on to the next one.
            print(f"Failed to process PDF file {pdf}: {e}")

    return "".join(text_parts), separated_tables


In [7]:
# Example usage: extract text, tables, and images from two PDF files.
pdf_docs = [
    "MRI_Maintenance_Documents/16026suppl.pdf",
    "MRI_Maintenance_Documents/PrismaUserGuide.pdf",
]
text, separated_tables = extract_text_and_tables_and_images_from_pdfs(pdf_docs)

  df[c] = pd.to_numeric(df[c], errors="ignore")


In [8]:
# Show the combined page text and CSV table data returned by the extraction call.
print(text)

T1MES Phantom
THE T1MES PHANTOM USER MANUAL
CONTACTS FOR ADVICE:
Dr Peter Gatehouse Prof James C Moon
p.gatehouse@rbht.nhs.uk j.moon@ucl.ac.uk
Cardiac MRI Department, Barts Heart Center,
Royal Brompton Hospital, West Smithfield,
Sydney Street, London, UK EC1A 7BE
London, UK SW3 6NP
Dr Gaby Captur
capturgaby@hotmail.co.uk
Barts Heart Center,
West Smithfield,
London, UK EC1A 7BE
Manufacturer: European Authorised Representative:
Resonance Health, Medical Device & QA Services Ltd.
278 Stirling Highway, Spring Court, Spring Road,
Claremont, Australia WA 6010 Hale, Cheshire, UK WA14 2UQ
support@resonancehealth.com
ADDITIONAL MATERIALS NEEDED TO SCAN THE PHANTOM
1. A firm support to elevate the T1MES bottle to isocenter (for approximate heights
please see the specific Siemens/Philips/General Electric (GE) Appendix scanner
instructions but these may vary depending on couch options).
2. A double folded blanket to separate the phantom from the anterior coil.
3. For GE centers only – a ‘Chicken H

In [3]:
import csv
import io
import os

import pdfplumber
from PIL import Image

def _table_to_csv(table):
    """Render one table (a list of rows of cells) as CSV text without a trailing newline."""
    buffer = io.StringIO()
    # csv.writer quotes cells containing commas, quotes, or newlines and writes
    # None as an empty field, so the result is valid CSV.
    csv.writer(buffer, lineterminator="\n").writerows(table)
    return buffer.getvalue()[:-1]  # drop the terminator after the last row


def extract_text_and_tables_from_pdfs(pdf_docs):
    """
    Extract text and tables from multiple PDF files.

    Each page's text is followed by that page's tables (from pdfplumber's
    ``extract_tables``) rendered as CSV. Each PDF is processed independently:
    if one file fails, the error is printed, whatever was already extracted
    from it is kept, and processing continues with the next file.

    Args:
    - pdf_docs (list of str): List of file paths to the PDF files.

    Returns:
    - str: The extracted page text and CSV table data of all PDFs, concatenated
      in document and page order.
    """
    text_parts = []

    try:
        pdf_paths = list(pdf_docs)
    except TypeError as e:
        # Keep the "never raises on bad input" behaviour for non-iterables.
        print(f"Failed to process PDF files: {e}")
        return ""

    for pdf in pdf_paths:
        try:
            with pdfplumber.open(pdf) as pdf_file:
                for page in pdf_file.pages:
                    # Extract text from the current page
                    page_text = page.extract_text()
                    if page_text:
                        text_parts.append(page_text + "\n")  # newline after each page's text

                    # Extract tables from the current page, if any
                    for table in page.extract_tables() or []:
                        text_parts.append(_table_to_csv(table) + "\n")  # newline after each table
        except Exception as e:
            # Best-effort: report the failing file and move on to the next one.
            print(f"Failed to process PDF file {pdf}: {e}")

    return "".join(text_parts)

In [5]:
# Example usage: extract text and tables from two PDF files and show the result.
pdf_docs = [  # List of PDF file paths
    "MRI_Maintenance_Documents/16026suppl.pdf",
    "MRI_Maintenance_Documents/PrismaUserGuide.pdf",
]
extracted_text = extract_text_and_tables_from_pdfs(pdf_docs)
print(extracted_text)

T1MES Phantom
THE T1MES PHANTOM USER MANUAL
CONTACTS FOR ADVICE:
Dr Peter Gatehouse Prof James C Moon
p.gatehouse@rbht.nhs.uk j.moon@ucl.ac.uk
Cardiac MRI Department, Barts Heart Center,
Royal Brompton Hospital, West Smithfield,
Sydney Street, London, UK EC1A 7BE
London, UK SW3 6NP
Dr Gaby Captur
capturgaby@hotmail.co.uk
Barts Heart Center,
West Smithfield,
London, UK EC1A 7BE
Manufacturer: European Authorised Representative:
Resonance Health, Medical Device & QA Services Ltd.
278 Stirling Highway, Spring Court, Spring Road,
Claremont, Australia WA 6010 Hale, Cheshire, UK WA14 2UQ
support@resonancehealth.com
ADDITIONAL MATERIALS NEEDED TO SCAN THE PHANTOM
1. A firm support to elevate the T1MES bottle to isocenter (for approximate heights
please see the specific Siemens/Philips/General Electric (GE) Appendix scanner
instructions but these may vary depending on couch options).
2. A double folded blanket to separate the phantom from the anterior coil.
3. For GE centers only – a ‘Chicken H