Skip to content

Commit

Permalink
Remove qpdf.merge
Browse files Browse the repository at this point in the history
We no longer need to merge pages this way. Much of this functionality
existed to implement page splitting without hitting the open-file limit
(ulimit); that limitation will be fixed in qpdf > 8.0.2. The tests were
also expensive to run.

Also remove pytest-timeout since it breaks the Linux build.
  • Loading branch information
James R. Barlow committed Jun 23, 2018
1 parent 54e74f8 commit 94150f4
Show file tree
Hide file tree
Showing 4 changed files with 0 additions and 200 deletions.
121 changes: 0 additions & 121 deletions src/ocrmypdf/exec/qpdf.py
Expand Up @@ -95,131 +95,10 @@ def repair(input_file, output_file, log):
raise SubprocessOutputError() from e


def get_npages(input_file, log):
    """Return the number of pages in input_file, as reported by qpdf.

    Runs ``qpdf --show-npages``.  If qpdf exits with code 2 and reports
    that the file does not exist, logs the error and raises
    InputFileError; any other qpdf failure is re-raised as
    CalledProcessError.
    """
    try:
        pages = run(
            ['qpdf', '--show-npages', input_file],
            universal_newlines=True, check=True, stdout=PIPE, stderr=STDOUT)
    except CalledProcessError as e:
        # str.find() returns -1 (truthy) on no match, so the original
        # `e.output.find('No such file')` condition was always true for
        # returncode 2; use `in` to actually test the message.
        if e.returncode == 2 and 'No such file' in e.output:
            log.error(e.output)
            raise InputFileError() from e
        # The original fell through here, hitting `return int(pages)` with
        # `pages` unbound (NameError).  Re-raise the real error instead.
        raise
    # NOTE(review): assumes `run` is a wrapper returning qpdf's stdout text;
    # subprocess.run would return a CompletedProcess here -- confirm.
    return int(pages)


def split_pages(input_file, work_folder, npages):
    """Split multipage PDF into individual pages.
    Incredibly enough, this multiple process approach is about 70 times
    faster than using Ghostscript.
    """
    total = int(npages)
    for page_index in range(total):
        pageno = page_index + 1  # qpdf page numbers are 1-based
        output_path = os.path.join(
            work_folder, '{0:06d}.page.pdf'.format(pageno))
        run(
            ['qpdf', input_file,
             '--pages', input_file, '{0}'.format(pageno), '--',
             output_path],
            check=True)


def extract_page(input_file, output_file, pageno):
    """Extract one page from input_file into output_file using qpdf.

    pageno is zero-based; it is converted to qpdf's one-based numbering.
    """
    page_arg = '{0}'.format(pageno + 1)
    args_qpdf = ['qpdf', input_file,
                 '--pages', input_file, page_arg, '--',
                 output_file]
    run(args_qpdf, check=True)


def _merge_inner(input_files, output_file, min_version=None, log=None):
    """Merge the list of input files (all filenames) into the output file.
    The input files may contain one or more pages.

    A qpdf exit code of 3 accompanied by the known-benign 'unknown token'
    warning (qpdf issue #165) is logged and ignored; any other qpdf
    failure propagates as CalledProcessError.
    """

    # Single page 'merges' should still be attempted so that the same error
    # checking is applied to the single page case

    version_arg = ['--min-version={}'.format(min_version)] \
        if min_version else []

    if log is None:
        # The logging module exposes warning() at module level, so it can
        # stand in for a logger object.
        import logging as log

    args_qpdf = [
        'qpdf'
    ] + version_arg + [
        input_files[0], '--pages'
    ] + input_files + ['--', output_file]

    try:
        run(args_qpdf, check=True, stderr=PIPE, universal_newlines=True)
    except CalledProcessError as e:
        # str.find() returns -1 (truthy) when the substring is absent, so
        # the original condition whitelisted *any* stderr on returncode 3;
        # use `in` to match only the intended messages.
        # Only whitelist the 'unknown token' problem (decimal/string issue)
        # qpdf issue #165
        if e.returncode == 3 and \
                'unknown token while reading object' in e.stderr and \
                'operation succeeded' in e.stderr:
            log.warning('qpdf found and fixed errors: ' + e.stderr)
            return
        raise  # plain re-raise preserves the traceback; `raise e from e` was redundant


def merge(input_files, output_file, min_version=None, log=None, max_files=None):
    """Merge the list of input files (all filenames) into the output file.
    The input files may contain one or more pages.

    Merging is done in rounds of at most ``max_files`` inputs at a time so
    that qpdf never needs more open file descriptors than the process is
    allowed.  Intermediate merge files are written alongside the output
    file and the final result is linked to ``output_file``.
    """
    # qpdf requires that every file that contributes to the output has a file
    # descriptor that remains open. That means, given our approach of one
    # intermediate PDF per page, we can practically hit the limit on the
    # number of file descriptors.

    if log is None:
        # Bug fix: log.debug below was called unconditionally, so passing
        # log=None crashed.  Fall back to the logging module, which exposes
        # debug() at module level, matching _merge_inner's behavior.
        import logging as log

    if max_files is None or max_files < 2:
        # Find out how many open file descriptors we can get away with
        ulimits = resource.getrlimit(resource.RLIMIT_NOFILE)
        max_open_files = ulimits[0]  # soft limit
        max_files = max_open_files // 2  # Conservative guess

    # We'll write things alongside the output file
    output_dir = os.path.dirname(output_file)

    import random
    import string

    def randstr():
        # Random suffix avoids collisions between intermediate merge files
        return ''.join(random.sample(string.ascii_lowercase, 6))

    # How many files to grab at once, merging all their contents
    step_size = max_files

    workqueue = input_files.copy()
    counter = 1
    next_workqueue = []
    while len(workqueue) > 1 or len(next_workqueue) > 0:
        # Take n files out of the queue
        n = min(step_size, len(workqueue))
        job = workqueue[0:n]
        del workqueue[0:n]
        log.debug('merging ' + repr(job))

        # Merge them into 1 file, which will contain n^depth pages
        merge_file = os.path.join(
            output_dir, "merge-{:06d}-{}.pdf".format(counter, randstr()))
        counter += 1
        _merge_inner(job, merge_file, min_version=min_version, log=log)

        # Queue the intermediate result for the next round
        next_workqueue.append(merge_file)
        log.debug('next_workqueue ' + repr(next_workqueue))

        # If we're out of things to do in this queue, move on to the next
        # queue. On the counter-th pass of the workqueue we can chew through
        # (step_size)**N pages, so on most systems the second pass finishes
        # the job.
        if len(workqueue) == 0:
            workqueue = next_workqueue
            next_workqueue = []

    # Exactly one file remains: link it into place as the final output
    re_symlink(workqueue.pop(), output_file, log)
8 changes: 0 additions & 8 deletions src/ocrmypdf/pipeline.py
Expand Up @@ -887,14 +887,6 @@ def build_pipeline(options, work_folder, log, context):
os.path.join(work_folder, '*.marker.pdf'),
extras=[log, context])

# task_split_pages = main_pipeline.transform(
# task_func=split_page,
# input=task_pre_split_pages,
# filter=suffix('.presplit.pdf'),
# output='.page.pdf',
# output_dir=work_folder,
# extras=[log, context])

task_ocr_or_skip = main_pipeline.split(
ocr_or_skip,
task_marker_pages,
Expand Down
1 change: 0 additions & 1 deletion test_requirements.txt
Expand Up @@ -2,7 +2,6 @@ pytest >= 3.2
pytest-helpers-namespace
pytest-xdist
pytest-cov
pytest-timeout
python-xmp-toolkit # requires apt-get install libexempi3
# or brew install exempi
PyPDF2 >= 1.26.0
Expand Down
70 changes: 0 additions & 70 deletions tests/test_qpdf.py

This file was deleted.

0 comments on commit 94150f4

Please sign in to comment.