Skip to content

Commit

Permalink
Remove qpdf.merge
Browse files Browse the repository at this point in the history
We no longer need to merge pages this way. Much of this functionality
existed to implement page splitting without hitting the open-file limit
(ulimit); that limitation will be fixed in qpdf > 8.0.2. The tests were
also expensive to run.

Also remove pytest-timeout since it breaks the Linux build.
  • Loading branch information
James R. Barlow committed Jun 23, 2018
1 parent 54e74f8 commit 94150f4
Show file tree
Hide file tree
Showing 4 changed files with 0 additions and 200 deletions.
121 changes: 0 additions & 121 deletions src/ocrmypdf/exec/qpdf.py
Expand Up @@ -95,131 +95,10 @@ def repair(input_file, output_file, log):
raise SubprocessOutputError() from e


def get_npages(input_file, log):
    """Return the number of pages in input_file, as reported by qpdf.

    Runs ``qpdf --show-npages``.  If qpdf exits with code 2 and reports
    that the file does not exist, logs the error and raises
    InputFileError; any other qpdf failure is re-raised as
    CalledProcessError.
    """
    try:
        pages = run(
            ['qpdf', '--show-npages', input_file],
            universal_newlines=True, check=True, stdout=PIPE, stderr=STDOUT)
    except CalledProcessError as e:
        # str.find() returns -1 (truthy) on no match, so the original
        # `e.output.find('No such file')` condition was always true for
        # returncode 2; use `in` to actually test the message.
        if e.returncode == 2 and 'No such file' in e.output:
            log.error(e.output)
            raise InputFileError() from e
        # The original fell through here, hitting `return int(pages)` with
        # `pages` unbound (NameError).  Re-raise the real error instead.
        raise
    # NOTE(review): assumes `run` is a wrapper returning qpdf's stdout text;
    # subprocess.run would return a CompletedProcess here -- confirm.
    return int(pages)


def split_pages(input_file, work_folder, npages):
    """Split multipage PDF into individual pages.
    Incredibly enough, this multiple process approach is about 70 times
    faster than using Ghostscript.
    """
    total = int(npages)
    for page_index in range(total):
        pageno = page_index + 1  # qpdf page numbers are 1-based
        output_path = os.path.join(
            work_folder, '{0:06d}.page.pdf'.format(pageno))
        run(
            ['qpdf', input_file,
             '--pages', input_file, '{0}'.format(pageno), '--',
             output_path],
            check=True)


def extract_page(input_file, output_file, pageno):
    """Extract one page from input_file into output_file using qpdf.

    pageno is zero-based; it is converted to qpdf's one-based numbering.
    """
    page_arg = '{0}'.format(pageno + 1)
    args_qpdf = ['qpdf', input_file,
                 '--pages', input_file, page_arg, '--',
                 output_file]
    run(args_qpdf, check=True)


def _merge_inner(input_files, output_file, min_version=None, log=None):
    """Merge the list of input files (all filenames) into the output file.
    The input files may contain one or more pages.

    A qpdf exit code of 3 accompanied by the known-benign 'unknown token'
    warning (qpdf issue #165) is logged and ignored; any other qpdf
    failure propagates as CalledProcessError.
    """

    # Single page 'merges' should still be attempted so that the same error
    # checking is applied to the single page case

    version_arg = ['--min-version={}'.format(min_version)] \
        if min_version else []

    if log is None:
        # The logging module exposes warning() at module level, so it can
        # stand in for a logger object.
        import logging as log

    args_qpdf = [
        'qpdf'
    ] + version_arg + [
        input_files[0], '--pages'
    ] + input_files + ['--', output_file]

    try:
        run(args_qpdf, check=True, stderr=PIPE, universal_newlines=True)
    except CalledProcessError as e:
        # str.find() returns -1 (truthy) when the substring is absent, so
        # the original condition whitelisted *any* stderr on returncode 3;
        # use `in` to match only the intended messages.
        # Only whitelist the 'unknown token' problem (decimal/string issue)
        # qpdf issue #165
        if e.returncode == 3 and \
                'unknown token while reading object' in e.stderr and \
                'operation succeeded' in e.stderr:
            log.warning('qpdf found and fixed errors: ' + e.stderr)
            return
        raise  # plain re-raise preserves the traceback; `raise e from e` was redundant


def merge(input_files, output_file, min_version=None, log=None, max_files=None):
    """Merge the list of input files (all filenames) into the output file.
    The input files may contain one or more pages.

    Merging is done in rounds of at most ``max_files`` inputs at a time so
    that qpdf never needs more open file descriptors than the process is
    allowed.  Intermediate merge files are written alongside the output
    file and the final result is linked to ``output_file``.
    """
    # qpdf requires that every file that contributes to the output has a file
    # descriptor that remains open. That means, given our approach of one
    # intermediate PDF per page, we can practically hit the limit on the
    # number of file descriptors.

    if log is None:
        # Bug fix: log.debug below was called unconditionally, so passing
        # log=None crashed.  Fall back to the logging module, which exposes
        # debug() at module level, matching _merge_inner's behavior.
        import logging as log

    if max_files is None or max_files < 2:
        # Find out how many open file descriptors we can get away with
        ulimits = resource.getrlimit(resource.RLIMIT_NOFILE)
        max_open_files = ulimits[0]  # soft limit
        max_files = max_open_files // 2  # Conservative guess

    # We'll write things alongside the output file
    output_dir = os.path.dirname(output_file)

    import random
    import string

    def randstr():
        # Random suffix avoids collisions between intermediate merge files
        return ''.join(random.sample(string.ascii_lowercase, 6))

    # How many files to grab at once, merging all their contents
    step_size = max_files

    workqueue = input_files.copy()
    counter = 1
    next_workqueue = []
    while len(workqueue) > 1 or len(next_workqueue) > 0:
        # Take n files out of the queue
        n = min(step_size, len(workqueue))
        job = workqueue[0:n]
        del workqueue[0:n]
        log.debug('merging ' + repr(job))

        # Merge them into 1 file, which will contain n^depth pages
        merge_file = os.path.join(
            output_dir, "merge-{:06d}-{}.pdf".format(counter, randstr()))
        counter += 1
        _merge_inner(job, merge_file, min_version=min_version, log=log)

        # Queue the intermediate result for the next round
        next_workqueue.append(merge_file)
        log.debug('next_workqueue ' + repr(next_workqueue))

        # If we're out of things to do in this queue, move on to the next
        # queue. On the counter-th pass of the workqueue we can chew through
        # (step_size)**N pages, so on most systems the second pass finishes
        # the job.
        if len(workqueue) == 0:
            workqueue = next_workqueue
            next_workqueue = []

    # Exactly one file remains: link it into place as the final output
    re_symlink(workqueue.pop(), output_file, log)
8 changes: 0 additions & 8 deletions src/ocrmypdf/pipeline.py
Expand Up @@ -887,14 +887,6 @@ def build_pipeline(options, work_folder, log, context):
os.path.join(work_folder, '*.marker.pdf'),
extras=[log, context])

# task_split_pages = main_pipeline.transform(
# task_func=split_page,
# input=task_pre_split_pages,
# filter=suffix('.presplit.pdf'),
# output='.page.pdf',
# output_dir=work_folder,
# extras=[log, context])

task_ocr_or_skip = main_pipeline.split(
ocr_or_skip,
task_marker_pages,
Expand Down
1 change: 0 additions & 1 deletion test_requirements.txt
Expand Up @@ -2,7 +2,6 @@ pytest >= 3.2
pytest-helpers-namespace
pytest-xdist
pytest-cov
pytest-timeout
python-xmp-toolkit # requires apt-get install libexempi3
# or brew install exempi
PyPDF2 >= 1.26.0
Expand Down
70 changes: 0 additions & 70 deletions tests/test_qpdf.py

This file was deleted.

0 comments on commit 94150f4

Please sign in to comment.