# PDF Conversion in Jupyter Notebook

Welcome to this Jupyter Notebook! In this notebook, we will demonstrate how to create a PDF converter using Python libraries. The converter will handle the conversion of PDF files and provide a simple user interface for ease of use.

- Developed by Kao Panboonyuen

In [14]:
import fitz  # PyMuPDF
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from PyPDF2 import PdfReader, PdfWriter
import io
import tempfile
import os

## PDF Conversion Function

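This section contains the code for converting PDF files. Each page is rendered to an image with PyMuPDF (`fitz`), wrapped in a single-page PDF with ReportLab, and the resulting pages are assembled and written out with the `PyPDF2` library.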

In [15]:
def optimize_pdf(input_pdf_path, output_pdf_path, dpi=300):
    """Rasterize every page of a PDF and save the result as an image-only PDF.

    Each page is rendered to a PNG with PyMuPDF at ``dpi`` dots per inch,
    drawn onto a single-page ReportLab canvas, and appended to a PyPDF2
    writer that is finally written to ``output_pdf_path``.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path the rebuilt PDF is written to.
        dpi: Rendering resolution of the page images (72 means zoom factor 1).
    """
    zoom = dpi / 72  # PDF user space is 72 units per inch
    output = PdfWriter()

    pdf_document = fitz.open(input_pdf_path)
    try:
        for page in pdf_document:
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))

            # Keep the output page at the source page's size in points. Using
            # the pixel dimensions here would inflate the page by dpi / 72 and
            # leave the effective resolution at 72 dpi regardless of `dpi`.
            page_width, page_height = page.rect.width, page.rect.height

            # Only reserve a unique path here; the handle is closed before the
            # image is written so the file is not opened twice at once.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp_img_file:
                img_path = temp_img_file.name
            try:
                pix.save(img_path)

                # Convert image to PDF page using ReportLab
                packet = io.BytesIO()
                can = canvas.Canvas(packet, pagesize=(page_width, page_height))
                can.drawImage(img_path, 0, 0, width=page_width, height=page_height)
                can.save()
            finally:
                # Remove the temporary image even if rendering or drawing failed
                os.remove(img_path)

            # Merge image PDF into the new PDF
            packet.seek(0)
            output.add_page(PdfReader(packet).pages[0])
    finally:
        # Release the source document whether or not the conversion succeeded
        pdf_document.close()

    # Write the optimized PDF to output
    with open(output_pdf_path, "wb") as f:
        output.write(f)

    print(f"PDF optimized and saved to {output_pdf_path}")

def main(input_pdf_path, output_pdf_path):
    """Entry point: run the PDF optimization with the default resolution."""
    optimize_pdf(
        input_pdf_path=input_pdf_path,
        output_pdf_path=output_pdf_path,
    )

## Interactive User Interface

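In this section, we provide the input and output file paths and trigger the PDF conversion process. The cells below set the paths directly in code; an interactive user interface built with IPython widgets could be used to enter the file paths and start the conversion through a graphical interface instead.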

In [16]:
# Source paper and the destination of its converted copy.
input_pdf_path = 'paper/Panboonyuen_REG_Refined_Generalized_Focal_Loss.pdf'
output_pdf_path = 'Panboonyuen_REG_Refined_Generalized_Focal_Loss_toArxiv.pdf'

main(input_pdf_path=input_pdf_path, output_pdf_path=output_pdf_path)

PDF optimized and saved to Panboonyuen_REG_Refined_Generalized_Focal_Loss_toArxiv.pdf


In [18]:
import fitz  # PyMuPDF
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from PyPDF2 import PdfReader, PdfWriter
import io
import tempfile
import os

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF concatenated in page order.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A single string made of each page's ``get_text()`` output, with
        nothing inserted between pages. Empty when no page yields text.
    """
    pdf_document = fitz.open(pdf_path)
    try:
        # join() avoids rebuilding the accumulated string once per page
        return "".join(page.get_text() for page in pdf_document)
    finally:
        # Release the document even if text extraction raised
        pdf_document.close()

def optimize_pdf(input_pdf_path, output_pdf_path, dpi=300):
    """Rasterize every page of a PDF and save the result as an image-only PDF.

    Each page is rendered to a PNG with PyMuPDF at ``dpi`` dots per inch,
    drawn onto a single-page ReportLab canvas, and appended to a PyPDF2
    writer that is finally written to ``output_pdf_path``.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path the rebuilt PDF is written to.
        dpi: Rendering resolution of the page images (72 means zoom factor 1).
    """
    zoom = dpi / 72  # PDF user space is 72 units per inch
    output = PdfWriter()

    pdf_document = fitz.open(input_pdf_path)
    try:
        for page in pdf_document:
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))

            # Keep the output page at the source page's size in points. Using
            # the pixel dimensions here would inflate the page by dpi / 72 and
            # leave the effective resolution at 72 dpi regardless of `dpi`.
            page_width, page_height = page.rect.width, page.rect.height

            # Only reserve a unique path here; the handle is closed before the
            # image is written so the file is not opened twice at once.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp_img_file:
                img_path = temp_img_file.name
            try:
                pix.save(img_path)

                # Convert image to PDF page using ReportLab
                packet = io.BytesIO()
                can = canvas.Canvas(packet, pagesize=(page_width, page_height))
                can.drawImage(img_path, 0, 0, width=page_width, height=page_height)
                can.save()
            finally:
                # Remove the temporary image even if rendering or drawing failed
                os.remove(img_path)

            # Merge image PDF into the new PDF
            packet.seek(0)
            output.add_page(PdfReader(packet).pages[0])
    finally:
        # Release the source document whether or not the conversion succeeded
        pdf_document.close()

    # Write the optimized PDF to output
    with open(output_pdf_path, "wb") as f:
        output.write(f)

    print(f"PDF optimized and saved to {output_pdf_path}")

def generate_summary(text):
    """Describe the outcome of text extraction as a human-readable message.

    Returns the short success line when ``text`` contains anything besides
    whitespace; otherwise returns the multi-paragraph "on-hold" notice.
    """
    # Guard clause: any non-whitespace content counts as a successful extraction.
    if text.strip() != "":
        return "Text extraction successful."

    on_hold_notice = (
        "Unable to extract text from PDF\n\n"
        "Additional details on each of these conditions are included below:\n\n"
        "Unable to extract text from PDF: Your article is currently in \"on-hold\" status "
        "because the output from pdf to text conversion was too small to allow further "
        "processing. This might be due to submission of a scanned document, or some "
        "technical error with your submission.\n\n"
        "You may resubmit your paper once you have addressed all the issues listed above. "
        "If you are not able to resolve these matters, or feel you are receiving this "
        "warning in error, please contact help@arxiv.org, quoting submission identifier "
        "submit/5852672, to request additional assistance."
    )
    return on_hold_notice

def main(input_pdf_path, output_pdf_path):
    """Check that the input PDF yields text, report it, then optimize the PDF.

    The summary is always printed; optimization is skipped when the summary
    is the "Unable to extract text from PDF" notice.
    """
    summary = generate_summary(extract_text_from_pdf(input_pdf_path))
    print(summary)

    # Stop here when the summary reports that no text could be extracted.
    if "Unable to extract text from PDF" in summary:
        return

    optimize_pdf(input_pdf_path, output_pdf_path)

# Source paper and the destination of its converted copy.
input_pdf_path = 'paper/Panboonyuen_REG_Refined_Generalized_Focal_Loss.pdf'
output_pdf_path = 'Panboonyuen_REG_Refined_Generalized_Focal_Loss_toArxiv.pdf'

main(input_pdf_path=input_pdf_path, output_pdf_path=output_pdf_path)

In [19]:
import fitz  # PyMuPDF
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from PyPDF2 import PdfReader, PdfWriter
import io
import tempfile
import os

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The ``get_text()`` output of every page concatenated in page order,
        or ``None`` when opening or reading the document raised; the error
        is printed rather than propagated.
    """
    try:
        pdf_document = fitz.open(pdf_path)
        try:
            # join() avoids rebuilding the accumulated string once per page
            return "".join(page.get_text() for page in pdf_document)
        finally:
            # Release the document on success and on failure alike
            pdf_document.close()
    except Exception as e:
        # Best-effort by design: callers treat None as "extraction failed"
        print(f"Error extracting text from PDF: {e}")
        return None

def optimize_pdf(input_pdf_path, output_pdf_path, dpi=300):
    """Optimize the PDF by converting pages to images and back to PDF.

    Each page is rendered to a PNG with PyMuPDF at ``dpi`` dots per inch,
    drawn onto a single-page ReportLab canvas, and appended to a PyPDF2
    writer that is finally written to ``output_pdf_path``. Any exception is
    caught and printed instead of being raised.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path the rebuilt PDF is written to.
        dpi: Rendering resolution of the page images (72 means zoom factor 1).
    """
    try:
        zoom = dpi / 72  # PDF user space is 72 units per inch
        output = PdfWriter()

        pdf_document = fitz.open(input_pdf_path)
        try:
            for page in pdf_document:
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))

                # Keep the output page at the source page's size in points.
                # Using the pixel dimensions here would inflate the page by
                # dpi / 72 and leave the effective resolution at 72 dpi.
                page_width, page_height = page.rect.width, page.rect.height

                # Only reserve a unique path here; the handle is closed before
                # the image is written so the file is not opened twice at once.
                with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp_img_file:
                    img_path = temp_img_file.name
                try:
                    pix.save(img_path)

                    # Convert image to PDF page using ReportLab
                    packet = io.BytesIO()
                    can = canvas.Canvas(packet, pagesize=(page_width, page_height))
                    can.drawImage(img_path, 0, 0, width=page_width, height=page_height)
                    can.save()
                finally:
                    # Remove the temporary image even if rendering or drawing failed
                    os.remove(img_path)

                # Merge image PDF into the new PDF
                packet.seek(0)
                output.add_page(PdfReader(packet).pages[0])
        finally:
            # Release the source document whether or not the conversion succeeded
            pdf_document.close()

        # Write the optimized PDF to output
        with open(output_pdf_path, "wb") as f:
            output.write(f)

        print(f"PDF optimized and saved to {output_pdf_path}")
    except Exception as e:
        # Best-effort by design: report the failure without raising
        print(f"Error optimizing PDF: {e}")

def generate_summary(text):
    """Turn the text-extraction result into a human-readable status message.

    ``None`` yields the "on-hold" notice, an empty or whitespace-only string
    yields the "empty or too small" notice, and any other text yields the
    short success line.
    """
    # Happy path first: there is text and it is not just whitespace.
    if text is not None and text.strip():
        return "Text extraction successful."

    if text is None:
        return (
            "Unable to extract text from PDF\n\n"
            "Additional details on each of these conditions are included below:\n\n"
            "Unable to extract text from PDF: Your article is currently in \"on-hold\" status "
            "because the output from pdf to text conversion was too small to allow further "
            "processing. This might be due to submission of a scanned document, or some "
            "technical error with your submission.\n\n"
            "You may resubmit your paper once you have addressed all the issues listed above. "
            "If you are not able to resolve these matters, or feel you are receiving this "
            "warning in error, please contact help@arxiv.org, quoting submission identifier "
            "submit/5852672, to request additional assistance."
        )

    # Remaining case: a string that is empty or whitespace-only.
    return (
        "Unable to extract text from PDF\n\n"
        "Additional details on each of these conditions are included below:\n\n"
        "Unable to extract text from PDF: The extracted text is empty or too small. "
        "This might be due to a scanned document or other issues with the submission.\n\n"
        "You may resubmit your paper once you have addressed all the issues listed above. "
        "If you are not able to resolve these matters, or feel you are receiving this "
        "warning in error, please contact help@arxiv.org, quoting submission identifier "
        "submit/5852672, to request additional assistance."
    )

def main(input_pdf_path, output_pdf_path):
    """Extract text, print the resulting summary, and optimize the PDF if text was found.

    Optimization is skipped when the summary is one of the
    "Unable to extract text from PDF" notices.
    """
    summary = generate_summary(extract_text_from_pdf(input_pdf_path))
    print(summary)

    # Stop here when the summary reports that no text could be extracted.
    if "Unable to extract text from PDF" in summary:
        return

    optimize_pdf(input_pdf_path, output_pdf_path)

# Source paper and the destination of its converted copy.
input_pdf_path = 'paper/Panboonyuen_REG_Refined_Generalized_Focal_Loss.pdf'
output_pdf_path = 'Panboonyuen_REG_Refined_Generalized_Focal_Loss_toArxiv.pdf'

main(input_pdf_path=input_pdf_path, output_pdf_path=output_pdf_path)

Text extraction successful.
PDF optimized and saved to Panboonyuen_REG_Refined_Generalized_Focal_Loss_toArxiv.pdf
