# In [33]:  (notebook cell marker; kept as a comment so the file parses as R)
# 20250702_pdf_10_errors_fix.R

# This script (1) generates a sample PDF with 10 common structure errors,
# then (2) detects and attempts to correct them using R.

# ─── 0. Setup ──────────────────────────────────────────────────────────────

# Install needed packages if not already installed:
# install.packages(c("pdftools", "qpdf", "magick", "stringr"))

library(pdftools)  # for text extraction
library(qpdf)      # for rotation and metadata
library(magick)    # for stamping overlays
library(stringr)   # for string detection

# Filenames (relative to the current working directory)
sample_pdf <- "20250702_sample_errors_R.pdf"   # deliberately flawed input
fixed_pdf  <- "20250702_fixed_10_errors_R.pdf" # corrected output

# ─── 1. Generate sample PDF ───────────────────────────────────────────────
# Draw a 12-page US-Letter document with base graphics, injecting one
# structural defect on selected pages. Every text() call uses the default
# plot.new() coordinate system, i.e. (0, 0) bottom-left to (1, 1) top-right.
n_pages <- 12L

pdf(sample_pdf, width = 8.5, height = 11)
for (i in seq_len(n_pages)) {
  # Each call to plot.new() starts a fresh page on the PDF device.
  plot.new()

  # Error 6: page 7 is left completely blank.
  if (i == 7) {
    next
  }

  # Error 7 (landscape page 8) is NOT simulated: the base PDF device cannot
  # change page size mid-stream. Doing so would require rendering that page to
  # a separate landscape PDF and merging it in afterwards.

  # Errors 1 & 2: no header on pages 2 and 5.
  # Error 10: page 10 gets a lowercase header *instead of* the correct one
  # (drawing both on top of each other would make the page text ambiguous).
  if (i == 10) {
    text(0.5, 0.95, "report title", cex = 1.5, font = 2)
  } else if (!(i %in% c(2, 5))) {
    text(0.5, 0.95, "Report Title", cex = 1.5, font = 2)
  }

  # Errors 3 & 4: no footer on pages 3 and 6.
  if (!(i %in% c(3, 6))) {
    text(0.1, 0.05, paste("Footer - Page", i), cex = 0.7, font = 3)
  }

  # Error 5: no page number on page 4.
  if (i != 4) {
    text(0.9, 0.05, as.character(i), cex = 1)
  }

  # Error 9: body text on page 9 uses a smaller font than the other pages.
  body_cex <- if (i == 9) 0.7 else 1.0

  # Body content
  text(0.5, 0.85, paste("Content of page", i), cex = body_cex)
}
dev.off()

# ─── 2. Detect and correct errors ──────────────────────────────────────────

# 2.1 Extract text per page (one string per page, in page order)
txt_pages <- pdf_text(sample_pdf)

# Split the document into one PDF per page inside a scratch directory.
# pdf_split() treats `output` as a filename *prefix* (printf-style patterns
# such as "%03d" are not expanded) and returns the paths it actually wrote,
# so those returned paths are used instead of guessing the file names.
tmp_dir <- tempfile("pdf_pages_")
dir.create(tmp_dir)
page_files <- qpdf::pdf_split(
  input  = sample_pdf,
  output = file.path(tmp_dir, "page")
)
stopifnot(length(page_files) == length(txt_pages))

# Rasterize a single-page PDF once, draw every requested stamp on it and write
# it back in place.
#   path   : single-page PDF to modify (overwritten).
#   stamps : list of lists with `label`, `x`, `y` (offset from the top-left
#            corner, in PDF points), `size` (font size in points) and `font`.
#   dpi    : rasterization resolution; offsets and sizes are scaled from
#            points (72 per inch) to pixels at this resolution.
# Returns `path` invisibly.
stamp_page <- function(path, stamps, dpi = 300) {
  px_per_pt <- dpi / 72
  img <- image_read_pdf(path, density = dpi)
  for (s in stamps) {
    img <- image_annotate(
      img, s$label,
      location = sprintf("+%.0f+%.0f", s$x * px_per_pt, s$y * px_per_pt),
      size     = s$size * px_per_pt,
      font     = s$font
    )
  }
  image_write(img, path = path, format = "pdf")
  invisible(path)
}

# One slot per input page; slots of dropped (blank) pages stay NA.
fixed_pages <- rep(NA_character_, length(txt_pages))

for (idx in seq_along(txt_pages)) {
  page_txt <- txt_pages[[idx]]
  infile   <- page_files[[idx]]

  # 2.2 Remove blank pages (error 6)
  if (!nzchar(str_trim(page_txt))) {
    next
  }

  # 2.3 Rotate landscape pages back to portrait (error 7).
  # `infile` holds exactly one page, so it is always addressed as page 1.
  page_dim <- pdf_pagesize(infile)
  if (page_dim$width[1] > page_dim$height[1]) {
    rotated <- tempfile(tmpdir = tmp_dir, fileext = ".pdf")
    qpdf::pdf_rotate_pages(input = infile, pages = 1, angle = 90, output = rotated)
    infile <- rotated
  }

  # Collect every overlay this page needs so it is rasterized only once.
  stamps <- list()

  # 2.4 Header missing or wrong case (errors 1, 2, 10). The match is
  # case-sensitive on purpose. Note the stamp is drawn on top of the page; an
  # existing lowercase header is not erased.
  if (!str_detect(page_txt, fixed("Report Title"))) {
    stamps[[length(stamps) + 1]] <- list(
      label = "Report Title", x = 72, y = 50, size = 20, font = "Helvetica-Bold"
    )
  }

  # 2.5 Footer missing (errors 3, 4)
  footer_txt <- sprintf("Footer - Page %d", idx)
  if (!str_detect(page_txt, fixed(footer_txt))) {
    stamps[[length(stamps) + 1]] <- list(
      label = footer_txt, x = 72, y = 750, size = 12, font = "Helvetica-Oblique"
    )
  }

  # 2.6 Page number missing (error 5). The number also occurs inside
  # "Footer - Page N" and "Content of page N", so only a standalone occurrence
  # that is not directly preceded by "page " counts as the page number.
  page_no_rx <- regex(sprintf("(?<!page )\\b%d\\b", idx), ignore_case = TRUE)
  if (!str_detect(page_txt, page_no_rx)) {
    stamps[[length(stamps) + 1]] <- list(
      label = as.character(idx), x = 500, y = 750, size = 20, font = "Helvetica"
    )
  }

  if (length(stamps) > 0) {
    stamp_page(infile, stamps)
  }

  fixed_pages[idx] <- infile
}

# 2.7 Combine fixed pages (blank pages were skipped and are dropped here)
fixed_pages <- fixed_pages[!is.na(fixed_pages)]
if (length(fixed_pages) == 0) {
  stop("No non-blank pages found in ", sample_pdf, call. = FALSE)
}
qpdf::pdf_combine(input = fixed_pages, output = fixed_pdf)

# The per-page scratch files are no longer needed once the output is written.
unlink(tmp_dir, recursive = TRUE)

# 2.8 Set metadata (error 8).
# The qpdf R package has no function for writing the document info dictionary,
# so this step is best-effort: it uses the external `exiftool` program when it
# is on the PATH and otherwise leaves the metadata unchanged with a warning.
exiftool <- unname(Sys.which("exiftool"))
if (nzchar(exiftool)) {
  status <- system2(exiftool, c(
    "-overwrite_original",
    shQuote("-Title=Corrected Report"),
    shQuote("-Author=Data Science Team"),
    shQuote("-Subject=PDF Structure Errors Fixed"),
    shQuote(fixed_pdf)
  ))
  if (!identical(status, 0L)) {
    warning("exiftool exited with status ", status,
            "; PDF metadata may not have been updated.", call. = FALSE)
  }
} else {
  warning("exiftool not found on PATH; PDF metadata (error 8) was not updated.",
          call. = FALSE)
}

message("Fixed PDF written to: ", fixed_pdf)

# Note:
# - Changing orientation mid-stream (page 8) in the native R PDF device is not supported;
#   requires creating a separate landscape PDF and merging manually.
# - Overlaying text on existing PDF pages is done here via **magick**, which rasterizes pages—
#   for vector retention, consider a specialized stamping tool or system qpdf commands.


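# Output recorded from a notebook run of this cell (kept verbatim for reference):
#
#   ERROR: Error in normalizePath(path.expand(path), winslash, mustWork): path[1]="C:\Users\jenny\AppData\Local\Temp\RtmpMnajms\pdf_pages_1e844e15577a/page_001.pdf": El sistema no puede encontrar el archivo especificado
#
# (Spanish: "The system cannot find the file specified.")
# Cause: qpdf::pdf_split() treats `output` as a filename prefix and does not
# expand "%03d", so no file named page_001.pdf is ever created.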
