sanitize_pdf

#!/usr/bin/python3
#	jbin - Joe's miscellaneous scripts, tools and configs
#	sanitize_pdf: Remove sensitive information from a PDF
#	Copyright (C) 2022-2022 Johannes Bauer
#
#	This file is part of jbin.
#
#	jbin is free software; you can redistribute it and/or modify
#	it under the terms of the GNU General Public License as published by
#	the Free Software Foundation; this program is ONLY licensed under
#	version 3 of the License, later versions are explicitly excluded.
#
#	jbin is distributed in the hope that it will be useful,
#	but WITHOUT ANY WARRANTY; without even the implied warranty of
#	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#	GNU General Public License for more details.
#
#	You should have received a copy of the GNU General Public License
#	along with jbin. If not, see <http://www.gnu.org/licenses/>.
#
#	Johannes Bauer <JohannesBauer@gmx.de>

import sys
import subprocess
import re
import os
from FriendlyArgumentParser import FriendlyArgumentParser

class Sanitizer():
	"""Resets "/M" modification timestamps inside PDF files.

	Each file is uncompressed with pdftk, every line consisting solely of a
	"/M (D:...)" date entry is overwritten with a fixed epoch timestamp, and
	the result is recompressed with pdftk and written back over the
	original file.
	"""

	# A whole line of the form "/M (D:YYYYMMDDHHmmSS+HH'mm')". Both positive
	# and negative UTC offsets are accepted.
	_REGEX = re.compile(r"^/M \(D:\d{14}[+-]\d{2}'\d{2}'\)$", flags = re.MULTILINE)

	# Fixed timestamp written in place of every match. It has exactly the
	# same length as any text matched by _REGEX (including the closing
	# apostrophe after the minutes offset), so the substitution never shifts
	# the byte offsets of the data that follows it in the PDF.
	_REPLACEMENT = "/M (D:19700101000000+00'00')"

	def __init__(self, args):
		"""Stores the parsed command line arguments.

		Only the "verbose" attribute of args is read by this class.
		"""
		self._args = args

	@classmethod
	def _sanitize_text(cls, text):
		"""Returns text with every "/M" timestamp line replaced by the epoch.

		The returned string always has the same length as the input.
		"""
		return cls._REGEX.sub(cls._REPLACEMENT, text)

	def _do_sanitize(self, pdf_filename):
		"""Sanitizes a single PDF file in place.

		Raises subprocess.CalledProcessError if either pdftk invocation
		exits with a non-zero status; in that case the file has not been
		opened for writing yet.
		"""
		if self._args.verbose >= 1:
			print(pdf_filename)
		uncompressed_pdf = subprocess.check_output([ "pdftk", pdf_filename, "output", "-", "uncompress" ])
		# latin1 maps every byte value to exactly one code point, so the
		# decode/encode round trip leaves all binary data untouched.
		text = self._sanitize_text(uncompressed_pdf.decode("latin1"))
		uncompressed_pdf = text.encode("latin1")
		compressed_pdf = subprocess.check_output([ "pdftk", "-", "output", "-", "compress" ], input = uncompressed_pdf)
		with open(pdf_filename, "wb") as f:
			f.write(compressed_pdf)

	def do_sanitize(self, pdf_filename):
		"""Sanitizes a single PDF file, reporting pdftk failures on stderr.

		A failing pdftk call is reported and swallowed so that a caller
		iterating over many files can continue with the next one.
		"""
		try:
			self._do_sanitize(pdf_filename)
		except subprocess.CalledProcessError as e:
			print(f"Error when processing {pdf_filename}: {e}", file = sys.stderr)

# Command line entry point: sanitize either a single PDF file or every
# "*.pdf" file (matched case-insensitively) found below a directory.
# NOTE(review): FriendlyArgumentParser is project-local; it is presumably
# argparse-compatible -- confirm against its definition.
parser = FriendlyArgumentParser(description = "Remove annotation timestamps from PDFs.")
parser.add_argument("-v", "--verbose", action = "count", default = 0, help = "Increases verbosity. Can be specified multiple times to increase.")
parser.add_argument("filename", metavar = "file/path", help = "Filename or directory name to traverse")
args = parser.parse_args(sys.argv[1:])

sanitizer = Sanitizer(args)
if os.path.isdir(args.filename):
	for (basedir, _subdirs, filenames) in os.walk(args.filename):
		for filename in filenames:
			if filename.lower().endswith(".pdf"):
				# os.path.join avoids a doubled separator when the given
				# directory name already ends in one.
				full_filename = os.path.join(basedir, filename)
				sanitizer.do_sanitize(full_filename)
else:
	# Anything that is not a directory is handed to the sanitizer as a
	# single file.
	sanitizer.do_sanitize(args.filename)