/
sanitize_pdf
executable file
·65 lines (58 loc) · 2.51 KB
/
sanitize_pdf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/python3
# jbin - Joe's miscellaneous scripts, tools and configs
# sanitize_pdf: Remove sensitive information from a PDF
# Copyright (C) 2022-2022 Johannes Bauer
#
# This file is part of jbin.
#
# jbin is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; this program is ONLY licensed under
# version 3 of the License, later versions are explicitly excluded.
#
# jbin is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with jbin. If not, see <http://www.gnu.org/licenses/>.
#
# Johannes Bauer <JohannesBauer@gmx.de>
import sys
import subprocess
import re
import os
from FriendlyArgumentParser import FriendlyArgumentParser
class Sanitizer():
    """Reset the "/M" modification timestamps of PDF files in place.

    Each file is uncompressed with pdftk, every "/M (D:...)" date entry that
    occupies a line of its own is replaced by a fixed epoch timestamp, and the
    result is recompressed with pdftk and written back over the input file.
    """

    # A full-line "/M" entry holding a PDF date string with second resolution.
    # The timezone suffix is optional: it may be "Z" or a "+"/"-" offset, and
    # the minutes part and the trailing apostrophe of the offset may be
    # absent. Matching only "+HH'mm'" would leave timestamps written west of
    # UTC (or in UTC "Z" notation) untouched.
    _REGEX = re.compile(r"^/M \(D:\d{14}(?:[+\-Z](?:\d{2}'(?:\d{2}'?)?)?)?\)$", flags = re.MULTILINE)

    # Fixed entry that every matched timestamp is replaced with.
    _REPLACEMENT = "/M (D:19700101000000+00'00)"

    def __init__(self, args):
        """Store the parsed command line arguments.

        Args:
            args: Argument namespace; only its integer ``verbose`` attribute
                is read by this class.
        """
        self._args = args

    @classmethod
    def _scrub_timestamps(cls, text):
        """Return *text* with every matching "/M" date entry set to the epoch."""
        return cls._REGEX.sub(cls._REPLACEMENT, text)

    def _do_sanitize(self, pdf_filename):
        """Sanitize a single PDF and overwrite it with the result.

        Args:
            pdf_filename: Path of the PDF file to rewrite in place.

        Raises:
            subprocess.CalledProcessError: If either pdftk invocation exits
                with a non-zero status. The file is only opened for writing
                after both invocations succeeded.
        """
        if self._args.verbose >= 1:
            print(pdf_filename)
        uncompressed_pdf = subprocess.check_output([ "pdftk", pdf_filename, "output", "-", "uncompress" ])
        # latin1 maps every byte to exactly one code point, so binary data
        # survives the decode/encode round trip unchanged.
        text = uncompressed_pdf.decode("latin1")
        text = self._scrub_timestamps(text)
        uncompressed_pdf = text.encode("latin1")
        compressed_pdf = subprocess.check_output([ "pdftk", "-", "output", "-", "compress" ], input = uncompressed_pdf)
        with open(pdf_filename, "wb") as f:
            f.write(compressed_pdf)

    def do_sanitize(self, pdf_filename):
        """Sanitize a single PDF, reporting pdftk failures instead of raising.

        A failing pdftk call is printed to stderr and otherwise ignored so
        that a caller processing many files can continue with the next one.

        Args:
            pdf_filename: Path of the PDF file to rewrite in place.
        """
        try:
            self._do_sanitize(pdf_filename)
        except subprocess.CalledProcessError as e:
            print(f"Error when processing {pdf_filename}: {str(e)}", file = sys.stderr)
# Command line entry point: parse the arguments, then sanitize either the one
# given file or every "*.pdf" file (case-insensitive) below the given directory.
parser = FriendlyArgumentParser(description = "Remove annotation timestamps from PDFs.")
parser.add_argument("-v", "--verbose", action = "count", default = 0, help = "Increases verbosity. Can be specified multiple times to increase.")
parser.add_argument("filename", metavar = "file/path", help = "Filename or directory name to traverse")
args = parser.parse_args(sys.argv[1:])

sanitizer = Sanitizer(args)
if not os.path.isdir(args.filename):
    # Anything that is not a directory is handed to the sanitizer as given.
    sanitizer.do_sanitize(args.filename)
else:
    # Walk the whole tree; only the file names of each directory are needed.
    for (dirpath, _dirnames, entries) in os.walk(args.filename):
        for entry in entries:
            if not entry.lower().endswith(".pdf"):
                continue
            sanitizer.do_sanitize(f"{dirpath}/{entry}")