In [1]:
# code adapted from: https://pythontips.com/2016/02/25/ocr-on-pdf-files-using-python/

from wand.image import Image
from PIL import Image as PI  # import as PI to avoid name conflicts

import pyocr
import pyocr.builders

import io
import os

In [2]:
# Pick the OCR backend. Fail with a readable message rather than a bare
# IndexError when pyocr cannot find any installed tool.
available_tools = pyocr.get_available_tools()
if not available_tools:
    raise RuntimeError("pyocr found no OCR tool; install one (e.g. Tesseract) and retry")
tool = available_tools[0]

# The first reported language is not guaranteed to be English, so ask for
# 'eng' explicitly and only fall back to the first entry when it is missing.
available_langs = tool.get_available_languages()
if not available_langs:
    raise RuntimeError("the selected OCR tool reports no installed languages")
lang = 'eng' if 'eng' in available_langs else available_langs[0]

In [3]:
def convert_file(filename):
    """Run OCR over every page of a PDF.

    The PDF is rasterised at 300 dpi, converted to JPEG, and each page is
    passed to the module-level OCR ``tool`` using the module-level ``lang``.

    Parameters
    ----------
    filename : str
        Path of the PDF to read.

    Returns
    -------
    list of str
        The recognised text, one entry per page, in page order.
    """
    final_text = []  # OCR text, one entry per page

    # pyocr needs image files to work, so the PDF is converted to images first.
    # The Wand images are opened as context managers so they are released as
    # soon as each one is no longer needed.
    with Image(filename=filename, resolution=300) as image_pdf:
        # convert() returns a new image whose sequence has one JPEG per page
        with image_pdf.convert('jpeg') as image_jpeg:
            for page in image_jpeg.sequence:
                with Image(image=page) as img_page:
                    page_blob = img_page.make_blob('jpeg')

                # A new builder is created for every page on purpose: a builder
                # may accumulate text between calls, so it is not shared.
                with PI.open(io.BytesIO(page_blob)) as pil_page:
                    txt = tool.image_to_string(
                        pil_page,
                        lang=lang,
                        builder=pyocr.builders.TextBuilder()
                    )
                final_text.append(txt)

    return final_text

In [4]:
mypath = 'Porter'

# Keep only the PDFs: the conversion loop writes its .txt results into this
# same folder, so an unfiltered listing would feed them back into OCR on a
# re-run. Sorting makes the order independent of what os.listdir returns.
porterfiles = sorted(
    name for name in os.listdir(mypath)
    if os.path.isfile(os.path.join(mypath, name)) and name.lower().endswith('.pdf')
)
porterfiles

['Porter - 1.pdf',
 'Porter - 2.pdf',
 'Porter - 3.pdf',
 'Porter - 4.pdf',
 'Porter - 5.pdf']

In [5]:
# OCR each Porter PDF and save the text next to it as <same name>.txt
for pdf_name in porterfiles:
    # os.path.join, to match how the listing cell builds its paths
    infile = os.path.join(mypath, pdf_name)

    file_name, file_ext = os.path.splitext(pdf_name)
    outfile = os.path.join(mypath, file_name + '.txt')

    outtext = convert_file(infile)
    with open(outfile, 'w') as out:
        for item in outtext:
            # Non-ASCII characters are silently dropped here; each page is
            # followed by a blank line.
            out.write("%s\n\n" % item.encode("ascii", errors="ignore").decode())

In [6]:
mypath = 'Hughes'

# Keep only the PDFs: the conversion loop writes its .txt results into this
# same folder, so an unfiltered listing would pick them up on a re-run.
# Sorting makes the order independent of what os.listdir returns.
hughesfiles = sorted(
    name for name in os.listdir(mypath)
    if os.path.isfile(os.path.join(mypath, name)) and name.lower().endswith('.pdf')
)
hughesfiles

['1 (p. 1-9).pdf',
 '10 (back pg).pdf',
 '11 (p. 41-45).pdf',
 '12 (p. 46-50).pdf',
 '13 (extra pg).pdf',
 '2 (3 back pg).pdf',
 '3 (p. 10-16+).pdf',
 '4 (back pg).pdf',
 '5 (p. 17-22).pdf',
 '6 (back pg).pdf',
 '7 (p. 23-31).pdf',
 '8 (back pg).pdf',
 '9 (p. 32-40).pdf',
 'Gravestone of Susannah Hughes - Meaford Cemetery.pdf',
 'Hughes Coat of Arms.pdf',
 'Hughes History (by Minnie Hegadorn).pdf',
 'Mortgage burning.pdf',
 'Obituary - Uncle Jack Hughes.pdf',
 'Robert Pettapiece.pdf']

In [7]:
# Files to leave out of the OCR run. These are the five entries that used to
# be removed by list position (13, 14, 16, 17 and 18). Naming them keeps this
# cell correct if the folder listing changes and makes it safe to run twice.
todelete = {
    'Gravestone of Susannah Hughes - Meaford Cemetery.pdf',
    'Hughes Coat of Arms.pdf',
    'Mortgage burning.pdf',
    'Obituary - Uncle Jack Hughes.pdf',
    'Robert Pettapiece.pdf',
}

hughesfiles = [name for name in hughesfiles if name not in todelete]

hughesfiles

['1 (p. 1-9).pdf',
 '10 (back pg).pdf',
 '11 (p. 41-45).pdf',
 '12 (p. 46-50).pdf',
 '13 (extra pg).pdf',
 '2 (3 back pg).pdf',
 '3 (p. 10-16+).pdf',
 '4 (back pg).pdf',
 '5 (p. 17-22).pdf',
 '6 (back pg).pdf',
 '7 (p. 23-31).pdf',
 '8 (back pg).pdf',
 '9 (p. 32-40).pdf',
 'Hughes History (by Minnie Hegadorn).pdf']

In [8]:
# OCR each remaining Hughes PDF and save the text next to it as <same name>.txt
for pdf_name in hughesfiles:
    # os.path.join, to match how the listing cell builds its paths
    infile = os.path.join(mypath, pdf_name)

    file_name, file_ext = os.path.splitext(pdf_name)
    outfile = os.path.join(mypath, file_name + '.txt')

    outtext = convert_file(infile)
    with open(outfile, 'w') as out:
        for item in outtext:
            # Non-ASCII characters are silently dropped here; each page is
            # followed by a blank line.
            out.write("%s\n\n" % item.encode("ascii", errors="ignore").decode())