Contains a pipeline for reading in image files, extracting relevant data, and outputting a dataframe.

Download installer for tesseract for Windows:

https://github.com/tesseract-ocr/tessdoc/blob/main/Installation.md

https://github.com/UB-Mannheim/tesseract/wiki

In [2]:
#import packages for ocr
from PIL import Image
import pytesseract
#set location of tesseract executable
pytesseract.pytesseract.tesseract_cmd = r'C:\Users\mayerlm\Saved Games\Desktop\tesseract\tesseract.exe'
import cv2
from matplotlib import pyplot as plt
%matplotlib inline
import pdfplumber

#import packages for data manipulation
import numpy as np
import pandas as pd
import argparse
import sys
import re
import os

In [2]:
#set functions

#check if filepath exists and return valid filepath
# (based on function by Sara Jones)
def check_path(filepath):
    """Return ``filepath`` unchanged if it exists; otherwise report and exit.

    Args:
        filepath (str): path to check with ``os.path.exists``.

    Returns:
        str: the same ``filepath`` that was passed in.

    Raises:
        SystemExit: when the path does not exist (after printing a message).
    """
    # Guard clause: bail out first so the success path is the last line.
    if not os.path.exists(filepath):
        print("{} does not exist.".format(filepath))
        sys.exit()
    return filepath

#search for dates with regular expressions 
#for now, pieces of this typed into pipeline, but could use separate function instead if needed
def short_dates(card_text):
    """Find short numeric dates in a block of text.

    A match is three runs of digits separated by ``-``, ``/`` or ``|``
    (delimiters may be mixed), e.g. ``01/02/21``, ``2021-03-04``, ``5|6|2021``.

    Args:
        card_text (str): text to search (e.g. OCR output from a card image).

    Returns:
        list: a one-element list wrapping the list of matched date strings,
        i.e. ``[[match1, match2, ...]]``. The nested shape is kept so existing
        callers see the same structure as before.
    """
    # Raw string: '\d' and '\/' in a normal string literal are invalid escape
    # sequences and trigger a SyntaxWarning on recent Python versions.
    found = re.findall(r'(\d+[-\/|]\d+[-\/|]\d+)', card_text)
    return [found]
#this date format catches 00/00/00, 00-00-00, 00|00|00, mixed delimiters, four digit years, different m/d/y orders



In [3]:
#make output dataframe with image file names + extracted contents
def make_df(imagepath):
    """Build a dataframe of the data extracted from each file in a folder.

    Every file in ``imagepath`` produces exactly one row, whether or not any
    data could be pulled from it, so the columns always stay aligned.

    Args:
        imagepath (str): path to the folder with the image files.

    Returns:
        pandas.DataFrame: one row per file, with the columns
            filename     - file name as listed in the folder
            hhs_id       - text before the first "_" in the file name
            valid_doc    - "yes" if the OCR text contains
                           "COVID-19 Vaccination", "no" if it does not,
                           "na" if the file could not be read
            manufacturer - always "na" for now (extraction not implemented)
            vax_dates    - list of date strings found on a valid card,
                           otherwise "na"
            flags        - "manual_review_required" if the file could not be
                           read (including PDFs, which are not parsed yet),
                           otherwise ""

    Raises:
        SystemExit: when ``imagepath`` is not a directory (after printing a
            message).
    """
    if not os.path.isdir(imagepath):
        print("Not valid directory.")
        sys.exit()

    not_available = "na"
    review_flag = "manual_review_required"
    # Catches 00/00/00, 00-00-00, 00|00|00, mixed delimiters, four digit years.
    date_pattern = re.compile(r'(\d+[-\/|]\d+[-\/|]\d+)')

    rows = []  # one (filename, id, valid, manuf, dates, flag) tuple per file
    total_valid = 0

    # Sorted so the row order is reproducible; os.listdir order is arbitrary.
    for file in sorted(os.listdir(imagepath)):
        hhs_id = file.split('_', 1)[0]  # change based on actual delim
        valid = not_available
        manuf = not_available
        dates = not_available
        flag = ""

        text = None
        if file.lower().endswith('.pdf'):
            # TODO: hand PDFs to a pdf reader; until then they need a human.
            pass
        else:
            try:
                # Read the individual file, not the folder itself.
                img = cv2.imread(os.path.join(imagepath, file))
                # cv2.imread signals an unreadable file by returning None
                # rather than raising, so check before running OCR.
                if img is not None:
                    text = pytesseract.image_to_string(img)
            except (cv2.error, pytesseract.TesseractError):
                # Image could not be decoded or OCR failed on it. A missing
                # tesseract executable is deliberately NOT caught here: that
                # is a setup problem, not a per-file problem.
                text = None

        if text is None:
            # If the file can't be read, flag it for manual review.
            flag = review_flag
        elif "COVID-19 Vaccination" in text:
            # For now just look for "COVID-19 Vaccination"; later this could
            # match against a list of potential strings.
            valid = "yes"
            total_valid += 1
            # TODO: extract manufacturer text if it matches a list of
            # manufacturers.
            dates = date_pattern.findall(text)
        else:
            valid = "no"

        rows.append((file, hhs_id, valid, manuf, dates, flag))

    print("Number of files in directory: {}". format(len(rows)))
    print("Number of files with at least a valid card: {}". format(total_valid))

    # convert to dataframe
    df = pd.DataFrame(
        rows,
        columns=["filename", "hhs_id", "valid_doc", "manufacturer", "vax_dates", "flags"],
    )

    return df
                        
        
        

'C:/Users/mayerlm/Saved Games/Desktop/text-recognition-ocr-python/images/'

In [None]:
#how to call/execute?