Skip to content
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
87 lines (72 sloc) 3.72 KB
Use imagemagick to convert all pfds to a sequence of thumbnail images
requires: sudo apt-get install imagemagick
import os
import time
import shutil
from subprocess import Popen
from utils import Config
# make sure imagemagick is installed
if not shutil.which('convert'): # shutil.which needs Python 3.3+
print("ERROR: you don\'t have imagemagick installed. Install it first before calling this script")
# create if necessary the directories we're using for processing and output
pdf_dir = os.path.join('data', 'pdf')
if not os.path.exists(Config.thumbs_dir): os.makedirs(Config.thumbs_dir)
if not os.path.exists(Config.tmp_dir): os.makedirs(Config.tmp_dir)
# fetch all pdf filenames in the pdf directory
files_in_pdf_dir = os.listdir(pdf_dir)
pdf_files = [x for x in files_in_pdf_dir if x.endswith('.pdf')] # filter to just pdfs, just in case
# iterate over all pdf files and create the thumbnails
for i,p in enumerate(pdf_files):
pdf_path = os.path.join(pdf_dir, p)
thumb_path = os.path.join(Config.thumbs_dir, p + '.jpg')
if os.path.isfile(thumb_path):
print("skipping %s, thumbnail already exists." % (pdf_path, ))
print("%d/%d processing %s" % (i, len(pdf_files), p))
# take first 8 pages of the pdf ([0-7]), since 9th page are references
# tile them horizontally, use JPEG compression 80, trim the borders for each image
#cmd = "montage %s[0-7] -mode Concatenate -tile x1 -quality 80 -resize x230 -trim %s" % (pdf_path, "thumbs/" + f + ".jpg")
#print "EXEC: " + cmd
# nvm, below using a roundabout alternative that is worse and requires temporary files, yuck!
# but i found that it succeeds more often. I can't remember wha thappened anymore but I remember
# that the version above, while more elegant, had some problem with it on some pdfs. I think.
# erase previous intermediate files thumb-*.png in the tmp directory
if os.path.isfile(os.path.join(Config.tmp_dir, 'thumb-0.png')):
for i in range(8):
f = os.path.join(Config.tmp_dir, 'thumb-%d.png' % (i,))
f2= os.path.join(Config.tmp_dir, 'thumbbuf-%d.png' % (i,))
if os.path.isfile(f):
cmd = 'mv %s %s' % (f, f2)
# okay originally I was going to issue an rm call, but I am too terrified of
# running scripted rm queries, so what we will do is instead issue a "mv" call
# to rename the files. That's a bit safer, right? We have to do this because if
# some papers are shorter than 8 pages, then results from previous paper will
# "leek" over to this result, through the intermediate files.
# spawn async. convert can unfortunately enter an infinite loop, have to handle this.
# this command will generate 8 independent images thumb-0.png ... thumb-7.png of the thumbnails
pp = Popen(['convert', '%s[0-7]' % (pdf_path, ), '-thumbnail', 'x156', os.path.join(Config.tmp_dir, 'thumb.png')])
t0 = time.time()
while time.time() - t0 < 20: # give it 15 seconds deadline
ret = pp.poll()
if not (ret is None):
# process terminated
ret = pp.poll()
if ret is None:
print("convert command did not terminate in 20 seconds, terminating.")
pp.terminate() # give up
if not os.path.isfile(os.path.join(Config.tmp_dir, 'thumb-0.png')):
# failed to render pdf, replace with missing image
missing_thumb_path = os.path.join('static', 'missing.jpg')
os.system('cp %s %s' % (missing_thumb_path, thumb_path))
print("could not render pdf, creating a missing image placeholder")
cmd = "montage -mode concatenate -quality 80 -tile x1 %s %s" % (os.path.join(Config.tmp_dir, 'thumb-*.png'), thumb_path)
time.sleep(0.01) # silly way for allowing for ctrl+c termination
You can’t perform that action at this time.