In [None]:
! pip install tabula-py
! pip install tabulate

In [None]:
from tabula import read_pdf
from tabulate import tabulate
import numpy as np
import os

# Path to the PDF whose tables are extracted
file_path = os.path.join("data", "visual instruction tuning.pdf")

# Read tables from every page; with multiple_tables=True the result is a list
# of DataFrames, one per detected table (not one per page)
dfs = read_pdf(file_path, pages="all", multiple_tables=True)

# Report how many tables were found
print("Number of tables extracted:", len(dfs))

# Loop through each DataFrame and print it using tabulate
for df in dfs:
    # Blank out missing cells on a copy so the frames in `dfs` are not mutated
    table = df.replace(np.nan, '')
    data = table.values.tolist()  # Rows as a list of lists
    headers = table.columns.tolist()  # Column headers
    print(tabulate(data, headers=headers, tablefmt='psql'))  # psql-style grid


In [None]:
from tabula import read_pdf
import numpy as np
import pdfplumber

def extract_text_and_tables_from_pdfs(pdf_docs):
    """
    Extract text and tables from multiple PDF files.

    Text is read page by page with pdfplumber; tables are read with tabula's
    ``read_pdf``, called once per PDF for all pages. A PDF that raises during
    processing is reported with ``print`` and skipped, and the remaining PDFs
    are still processed.

    Args:
    - pdf_docs (list of str): List of file paths to the PDF files.

    Returns:
    - tuple: ``(extracted_text, all_tables)``. ``extracted_text`` is the text
      of every non-empty page joined by newlines; ``all_tables`` is a list of
      pandas.DataFrame with NaN cells replaced by empty strings.
    """
    page_texts = []
    all_tables = []

    for pdf in pdf_docs:
        # Per-PDF error handling: one unreadable file must not discard the rest
        try:
            with pdfplumber.open(pdf) as pdf_file:
                for page in pdf_file.pages:
                    page_text = page.extract_text()
                    if page_text:
                        page_texts.append(page_text)

            # A single call for all pages instead of one call per page
            tables = read_pdf(pdf, pages="all", multiple_tables=True)

            # Replace NaN with '' on copies rather than mutating tabula's frames
            all_tables.extend(df.replace(np.nan, '') for df in tables)
        except Exception as e:
            print(f"Failed to read PDF file {pdf!r}: {e}")

    # Newline separator keeps the last word of a page from fusing with the
    # first word of the next page
    return "\n".join(page_texts), all_tables

def separate_tables(all_tables):
    """
    Collect every table from the input into a new list.

    Args:
    - all_tables (list of pandas.DataFrame): Tables to collect.

    Returns:
    - list of pandas.DataFrame: A new list holding the same table objects,
      in the same order.
    """
    # Shallow copy: the list is new, the elements are the original objects
    return [table for table in all_tables]

# Example usage: extract text and tables, then copy the tables into their own list
pdf_docs = [
    "MRI_Maintenance_Documents/16026suppl.pdf",
    "MRI_Maintenance_Documents/PrismaUserGuide.pdf",
]  # Replace with the list of uploaded file paths
text, tables = extract_text_and_tables_from_pdfs(pdf_docs)
separated_tables = separate_tables(tables)


In [None]:
# Show the plain-text repr of the list of extracted tables
print(tables)

In [None]:
! pip install PyMuPDF Pillow

In [None]:
import os
import fitz 
import io 
from PIL import Image 

# File path you want to extract images from
file = "MRI_Maintenance_Documents/PrismaUserGuide.pdf"

# Create a figures folder if it doesn't exist
figures_folder = "figures"
os.makedirs(figures_folder, exist_ok=True)

# Open the file; the context manager closes the document once extraction ends
with fitz.open(file) as pdf_file:
    # Iterate over PDF pages (page_index is 0-based)
    for page_index in range(len(pdf_file)):
        page = pdf_file[page_index]

        # Get the images on the page
        images = page.get_images(full=True)

        # Report the number of images found on this page
        if images:
            print(f"[+] Found a total of {len(images)} images in page {page_index}")
        else:
            print("[!] No images found on page", page_index)

        for image_index, img in enumerate(images, start=1):
            # The first entry of each image record is its XREF
            xref = img[0]

            # Extract the encoded image bytes and the matching file extension
            base_image = pdf_file.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            # Write the bytes as extracted instead of decoding and re-encoding
            # through PIL: that round trip re-compresses lossy formats and
            # fails for formats PIL cannot write
            image_name = f"page_{page_index}_image_{image_index}.{image_ext}"
            image_path = os.path.join(figures_folder, image_name)
            with open(image_path, "wb") as image_file:
                image_file.write(image_bytes)
            print(f"Saved image '{image_name}' to '{figures_folder}' folder")


In [None]:
import os
import fitz 
import io 
from PIL import Image 
from tabula import read_pdf
import numpy as np
import pdfplumber

def extract_text_and_tables_and_images_from_pdfs(pdf_docs):
    """
    Extract text, tables, and images from multiple PDF files.

    Text is read page by page with pdfplumber, tables with tabula's
    ``read_pdf`` (one call per PDF, all pages), and embedded images with
    PyMuPDF (``fitz``). Each image is written to the ``figures`` folder
    (created if missing) as
    ``<pdf stem>_page_<0-based page index>_image_<1-based image index>.<ext>``.

    A PDF that raises during processing is reported with ``print`` and
    skipped; the remaining PDFs are still processed.

    Args:
    - pdf_docs (list of str): List of file paths to the PDF files.

    Returns:
    - tuple: ``(extracted_text, all_tables, separated_tables)``.
      ``extracted_text`` is the text of every non-empty page joined by
      newlines; ``all_tables`` is a list of pandas.DataFrame with NaN cells
      replaced by empty strings; ``separated_tables`` is a distinct list
      holding the same DataFrames, each exactly once.
    """
    page_texts = []
    all_tables = []

    # Create a figures folder if it doesn't exist
    figures_folder = "figures"
    os.makedirs(figures_folder, exist_ok=True)

    for pdf in pdf_docs:
        # Per-PDF error handling: one unreadable file must not discard the rest
        try:
            # Text via pdfplumber
            with pdfplumber.open(pdf) as pdf_file:
                for page in pdf_file.pages:
                    page_text = page.extract_text()
                    if page_text:
                        page_texts.append(page_text)

            # Tables via tabula: a single call for all pages
            tables = read_pdf(pdf, pages="all", multiple_tables=True)
            # Replace NaN with '' on copies rather than mutating tabula's frames
            all_tables.extend(df.replace(np.nan, '') for df in tables)

            # Images via fitz; the context manager closes the document
            pdf_stem = os.path.splitext(os.path.basename(pdf))[0]
            with fitz.open(pdf) as pdf_document:
                for page_index in range(len(pdf_document)):
                    images = pdf_document[page_index].get_images(full=True)
                    for image_index, img in enumerate(images, start=1):
                        # img[0] is the image XREF
                        base_image = pdf_document.extract_image(img[0])
                        image_name = f"{pdf_stem}_page_{page_index}_image_{image_index}.{base_image['ext']}"
                        image_path = os.path.join(figures_folder, image_name)
                        # Write the bytes as extracted; re-encoding through PIL
                        # re-compresses lossy formats and can fail on formats
                        # PIL cannot write
                        with open(image_path, "wb") as image_file:
                            image_file.write(base_image["image"])
        except Exception as e:
            print(f"Failed to process PDF file {pdf!r}: {e}")

    # Build the separated list once, after all PDFs: copying the cumulative
    # all_tables inside the per-PDF loop duplicated tables of earlier PDFs
    separated_tables = list(all_tables)

    # Newline separator keeps text of adjacent pages from fusing together
    return "\n".join(page_texts), all_tables, separated_tables

# Example usage: extract text, tables and images from the two PDFs below
pdf_docs = [
    "MRI_Maintenance_Documents/16026suppl.pdf",
    "MRI_Maintenance_Documents/PrismaUserGuide.pdf",
]
text, tables, separated_tables = extract_text_and_tables_and_images_from_pdfs(pdf_docs)


In [None]:
# Print each separated table, followed by a line holding a single space
for table in separated_tables:
    print(table, " ", sep="\n")

In [None]:
# Show the text extracted from the PDFs
print(text)

In [None]:
# NOTE(review): a function with this same name is also defined in an earlier
# cell; when the notebook is run top to bottom, this later definition is the
# one in effect from here on.
def extract_text_and_tables_and_images_from_pdfs(pdf_docs):
    """
    Extract text, tables, and images from multiple PDF files.

    Text is read page by page with pdfplumber, tables with tabula's
    ``read_pdf`` (one call per PDF, all pages), and embedded images with
    PyMuPDF (``fitz``). Each image is written to the ``figures`` folder
    (created if missing) as
    ``<pdf stem>_page_<0-based page index>_image_<1-based image index>.<ext>``.

    A PDF that raises during processing is reported with ``print`` and
    skipped; the remaining PDFs are still processed.

    Args:
    - pdf_docs (list of str): List of file paths to the PDF files.

    Returns:
    - tuple: ``(extracted_text, all_tables, separated_tables)``.
      ``extracted_text`` is the text of every non-empty page joined by
      newlines; ``all_tables`` is a list of pandas.DataFrame with NaN cells
      replaced by empty strings; ``separated_tables`` is a distinct list
      holding the same DataFrames, each exactly once.
    """
    page_texts = []
    all_tables = []

    # Create a figures folder if it doesn't exist
    figures_folder = "figures"
    os.makedirs(figures_folder, exist_ok=True)

    for pdf in pdf_docs:
        # Per-PDF error handling: one unreadable file must not discard the rest
        try:
            # Text via pdfplumber
            with pdfplumber.open(pdf) as pdf_file:
                for page in pdf_file.pages:
                    page_text = page.extract_text()
                    if page_text:
                        page_texts.append(page_text)

            # Tables via tabula: a single call for all pages
            tables = read_pdf(pdf, pages="all", multiple_tables=True)
            # Replace NaN with '' on copies rather than mutating tabula's frames
            all_tables.extend(df.replace(np.nan, '') for df in tables)

            # Images via fitz; the context manager closes the document
            pdf_stem = os.path.splitext(os.path.basename(pdf))[0]
            with fitz.open(pdf) as pdf_document:
                for page_index in range(len(pdf_document)):
                    images = pdf_document[page_index].get_images(full=True)
                    for image_index, img in enumerate(images, start=1):
                        # img[0] is the image XREF
                        base_image = pdf_document.extract_image(img[0])
                        image_name = f"{pdf_stem}_page_{page_index}_image_{image_index}.{base_image['ext']}"
                        image_path = os.path.join(figures_folder, image_name)
                        # Write the bytes as extracted; re-encoding through PIL
                        # re-compresses lossy formats and can fail on formats
                        # PIL cannot write
                        with open(image_path, "wb") as image_file:
                            image_file.write(base_image["image"])
        except Exception as e:
            print(f"Failed to process PDF file {pdf!r}: {e}")

    # Build the separated list once, after all PDFs: copying the cumulative
    # all_tables inside the per-PDF loop duplicated tables of earlier PDFs
    separated_tables = list(all_tables)

    # Newline separator keeps text of adjacent pages from fusing together
    return "\n".join(page_texts), all_tables, separated_tables


In [None]:
# Example usage: extract text, tables and images from the two PDFs below
pdf_docs = [
    "MRI_Maintenance_Documents/16026suppl.pdf",
    "MRI_Maintenance_Documents/PrismaUserGuide.pdf",
]
text, tables, separated_tables = extract_text_and_tables_and_images_from_pdfs(pdf_docs)

In [None]:
# Print each extracted table, followed by one blank line
for table in tables:
    print(table, end="\n\n")