Skip to content

Commit

Permalink
test: Replace PyPDF2 with pdfplumber
Browse files Browse the repository at this point in the history
  • Loading branch information
kesara committed Jun 28, 2022
1 parent 005b007 commit a2e7564
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 73 deletions.
4 changes: 2 additions & 2 deletions bin/walkpdf
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright The IETF Trust 2018, All Rights Reserved

Expand All @@ -16,7 +16,7 @@ for filename in sys.argv[1:]:
print('Could not find "%s"' % filename)
print('File: %s' % filename)
doc = pyobj(filename)
with io.open(filename+'.json', 'bw') as j:
with io.open(filename+'.json', 'w') as j:
json.dump(doc, j, indent=2)
print('Wrote: %s' % j.name)
with io.open(filename+'.xml', 'w', encoding='utf-8') as x:
Expand Down
2 changes: 1 addition & 1 deletion configtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
for (pname, mname) in [
('decorator', 'decorator'),
('dict2xml', 'dict2xml'),
('PyPDF2', 'PyPDF2'),
('pdfplumber', 'pdfplumber'),
]:
try:
sys.stderr.write(" '%s'...\n" % pname)
Expand Down
5 changes: 3 additions & 2 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ allowlist_externals =

deps =
-rrequirements.txt
typing-extensions
decorator
dict2xml==1.7
pypdf2<1.27.0
pdfplumber
dict2xml
weasyprint==55.0
86 changes: 18 additions & 68 deletions xml2rfc/walkpdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,86 +9,36 @@
import json
import lxml
import os
import PyPDF2 as pypdf2
import pdfplumber
import sys

def walk(obj, seen):
    """Recursively convert a PyPDF2 object tree into plain Python values.

    Returns a ``(dobj, iobj)`` tuple: ``dobj`` is the converted value of
    *obj* itself, and ``iobj`` is a flat list of the converted indirect
    objects reached while walking it.  *seen* is a set of
    ``(idnum, generation)`` pairs, updated in place, that stops an indirect
    object from being expanded more than once.

    Raises RuntimeError for any object type not handled below.
    """
    # NOTE(review): this block was recovered from a listing that had lost its
    # indentation; the nesting below is reconstructed -- confirm against the
    # repository history, in particular the IndirectObject branch.
    dobj = {} # Direct objects
    iobj = [] # Indirect objects
    if hasattr(obj, 'keys'):
        # Mapping-like object: convert every value, dropping the leading
        # '/' from each key.
        for key in obj.keys():
            k = key[1:] if key.startswith('/') else key
            d, i = walk(obj[key], seen)
            dobj[k] = d
            iobj += i
        # Objects that can extract text also get it under a 'text' key.
        if hasattr(obj, 'extractText'):
            dobj['text'] = obj.extractText()
    elif isinstance(obj, pypdf2.generic.ArrayObject):
        # Arrays become lists of converted elements.
        dobj = []
        for o in obj:
            d, i = walk(o, seen)
            dobj.append(d)
            iobj += i
    elif isinstance(obj, pypdf2.generic.BooleanObject):
        dobj = obj.value
    elif isinstance(obj, pypdf2.generic.NameObject):
        dobj = str(obj)
    elif isinstance(obj, pypdf2.generic.NumberObject):
        dobj = int(obj)
    elif isinstance(obj, pypdf2.generic.FloatObject):
        dobj = float(obj)
    elif isinstance(obj, pypdf2.generic.IndirectObject):
        # By default an indirect reference is represented by its string form;
        # the referenced object is expanded only the first time it is met.
        dobj = str(obj)
        if (obj.idnum, obj.generation) not in seen:
            seen.add((obj.idnum, obj.generation))
            d, i = walk(obj.getObject(), seen)
            if isinstance(d, dict):
                # Tag the expanded dictionary with the reference it came from.
                d['IdNum'] = obj.idnum
                d['Generation'] = obj.generation
            else:
                # Non-dictionary targets replace the reference string inline.
                dobj = d
            # NOTE(review): presumably these two statements run for both
            # branches above -- verify the original nesting.
            iobj += i
            iobj.append(d)
    elif isinstance(obj, pypdf2.generic.TextStringObject):
        dobj = str(obj)
    else:
        raise RuntimeError("Unexpected object type: %s" % type(obj))

    # Record any object that carries its own id, whatever branch handled it.
    if hasattr(obj, 'idnum'):
        seen.add((obj.idnum, obj.generation))

    return dobj, iobj
def get_fonts(page):
    """Return the distinct font names used by the characters on *page*.

    *page* is expected to expose ``objects`` (a mapping keyed by object
    type) and ``chars`` (an iterable of mappings with a ``'fontname'``
    entry), as pdfplumber pages do.  Names are returned in order of first
    appearance, with any subset prefix removed and ``-`` replaced by a
    space.  A page without ``'char'`` objects yields an empty list.
    """
    fonts = []
    if 'char' in page.objects:
        for obj in page.chars:
            # pdfplumber presents subset font names like
            # `ROTXYT+Noto-Sans-Cherokee`.  Fonts that are not subset have no
            # `TAG+` prefix, so fall back to the whole name instead of
            # raising IndexError on the missing second field.
            parts = obj['fontname'].split('+')
            basename = parts[1] if len(parts) > 1 else parts[0]
            fontname = basename.replace('-', ' ')
            if fontname not in fonts:
                fonts.append(fontname)
    return fonts

def pyobj(filename=None, bytes=None):
    """Read a PDF with pdfplumber and return a plain-dict summary of it.

    The PDF is taken from *bytes* when that is given and non-empty,
    otherwise from the file named *filename*.

    The result starts from the document metadata reported by pdfplumber and
    adds a ``'Page'`` entry: a list with one dict per page holding the
    page's extracted ``'text'`` and its ``'FontFamily'`` list (see
    ``get_fonts``).  The PyPDF2-era ``'IndirectObject'`` entry is no longer
    produced, since pdfplumber does not expose the raw object graph.
    """
    # `bytes` shadows the builtin, but it is part of the keyword interface
    # callers use, so the name is kept.
    pdffile = io.BytesIO(bytes) if bytes else io.open(filename, 'br')
    try:
        # Closing the reader releases pdfplumber's own resources; the
        # `finally` below guarantees the input stream is closed even when
        # parsing or text extraction fails.
        with pdfplumber.open(pdffile) as reader:
            # Copy so that adding 'Page' does not modify the reader's own
            # metadata mapping.
            doc = dict(reader.metadata)
            doc['Page'] = [
                {
                    'text': page.extract_text(),
                    'FontFamily': get_fonts(page),
                }
                for page in reader.pages
            ]
    finally:
        pdffile.close()
    return doc

def xmltext(filename=None, obj=None, bytes=None):
    """Return the XML text for a PDF summary, wrapped in a ``Document`` element.

    When *obj* is None the summary is first built by ``pyobj`` from
    *filename* / *bytes*; otherwise *obj* is serialized as given.
    """
    document = pyobj(filename=filename, bytes=bytes) if obj is None else obj
    return dict2xml.dict2xml(document, wrap="Document")

def xmldoc(filename, text=None, bytes=None):
Expand All @@ -110,4 +60,4 @@ def main():
print('Wrote: %s' % x.name)

if __name__ == "__main__":
main()
main()

0 comments on commit a2e7564

Please sign in to comment.