In [30]:
import numpy as np
import os
import cv2
import pytesseract
import json
import os.path

try:
    from PIL import Image
except ImportError:
    import Image
from PIL import Image
from pdf2image import convert_from_path 

PDF_file = "samples/Lakeside - Hedges Used Cars Inc_sales.pdf"

In [31]:
pages = convert_from_path(PDF_file, dpi=300) 

In [32]:
image_counter = 1

In [33]:
pdfbaseFolder = PDF_file.replace("samples/","").replace(".pdf","")
if not os.path.exists(pdfbaseFolder):
    os.makedirs(pdfbaseFolder)  
        
for page in pages: 
  
    # Declaring filename for each page of PDF as JPG 
    # For each page, filename will be: 
    # PDF page 1 -> page_1.jpg 
    # PDF page 2 -> page_2.jpg 
    # PDF page 3 -> page_3.jpg 
    # .... 
    # PDF page n -> page_n.jpg 
    filename = "page_"+str(image_counter)+".jpg"
      
    # Save the image of the page in system 
    page.save(pdfbaseFolder+"/"+filename, 'JPEG') 
    
    # Increment the counter to update filename 
    image_counter = image_counter + 1
    

In [34]:
def sort_contours(cnts, method="left-to-right"):
	# initialize the reverse flag and sort index
	reverse = False
	i = 0
 
	# handle if we need to sort in reverse
	if method == "right-to-left" or method == "bottom-to-top":
		reverse = True
 
	# handle if we are sorting against the y-coordinate rather than
	# the x-coordinate of the bounding box
	if method == "top-to-bottom" or method == "bottom-to-top":
		i = 1
 
	# construct the list of bounding boxes and sort them from top to
	# bottom
	boundingBoxes = [cv2.boundingRect(c) for c in cnts]
	(cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
		key=lambda b:b[1][i], reverse=reverse))
 
	# return the list of sorted contours and bounding boxes
	return (cnts, boundingBoxes)
    

In [35]:
def cropImage(filename,contours,cv2,img):
    baseFolder =filename.replace(".jpg","")+"/cropped"
    if not os.path.exists(baseFolder):
        os.makedirs(baseFolder)   
    idx = 0
    for c in contours:
        # Returns the location and width,height for every contour
        x, y, w, h = cv2.boundingRect(c)

        # If the box height is greater then 20, widht is >80, then only save it as a box in "cropped/" folder.
        #if (w > 20 and h > 20) and w > 3*h:
        if (w > 30 and h > 30):
            idx += 1
            new_img = img[y:y+h, x:x+w]
            cv2.imwrite(baseFolder+'/'+str(idx) + '.png', new_img)
        
    

In [36]:
def pagesRead(filename):
    img = cv2.imread(filename, 0)
    
    # Thresholding the image
    (thresh, img_bin) = cv2.threshold(img, 128, 255,cv2.THRESH_BINARY|     cv2.THRESH_OTSU)
    # Invert the image
    img_bin = 255-img_bin 
    cv2.imwrite(filename+"_bin.jpg",img_bin)
    # Defining a kernel length
    kernel_length = np.array(img).shape[1]//80

    # A verticle kernel of (1 X kernel_length), which will detect all the verticle lines from the image.
    verticle_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_length))
    # A horizontal kernel of (kernel_length X 1), which will help to detect all the horizontal line from the image.
    hori_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_length, 1))
    # A kernel of (3 X 3) ones.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    # Morphological operation to detect vertical lines from an image
    img_temp1 = cv2.erode(img_bin, verticle_kernel, iterations=3)
    verticle_lines_img = cv2.dilate(img_temp1, verticle_kernel, iterations=3)
    cv2.imwrite(filename+"_verticle_lines.jpg",verticle_lines_img)
    # Morphological operation to detect horizontal lines from an image
    img_temp2 = cv2.erode(img_bin, hori_kernel, iterations=3)
    horizontal_lines_img = cv2.dilate(img_temp2, hori_kernel, iterations=3)
    cv2.imwrite(filename+"_horizontal_lines.jpg",horizontal_lines_img)
    # Weighting parameters, this will decide the quantity of an image to be added to make a new image.
    alpha = 0.5
    beta = 1.0 - alpha
    # This function helps to add two image with specific weight parameter to get a third image as summation of two image.
    img_final_bin = cv2.addWeighted(verticle_lines_img, alpha, horizontal_lines_img, beta, 0.0)
    cv2.imwrite(filename+"_weighted.jpg", img_final_bin)
    img_final_bin = cv2.erode(~img_final_bin, kernel, iterations=2)
    cv2.imwrite(filename+"erode.jpg", img_final_bin)
    #img_final_bin = cv2.equalizeHist(img_final_bin)
    (thresh, img_final_bin) = cv2.threshold(img_final_bin, 128,255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    cv2.imwrite(filename+"_final_bin.jpg",img_final_bin)
    # Find contours for image, which will detect all the boxes
    #im2, contours, hierarchy = cv2.findContours(img_final_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    contours, hierarchy = cv2.findContours(img_final_bin,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)
    image_cropped_contoured_drawn = cv2.drawContours(img.copy(), contours, -1, (0, 255, 0), 3)
    cv2.imwrite(filename+"_template_masked_contoured_exploration.jpg", image_cropped_contoured_drawn)
    # Sort all the contours by top to bottom.
    (contours, boundingBoxes) = sort_contours(contours, method="top-to-bottom")
    cropImage(filename,contours,cv2,img)

In [37]:
for image_name in range(image_counter-1):
    imagepage= pdfbaseFolder+"/"+"page_"+str(image_name+1)+".jpg"
    pagesRead(imagepage) 

In [38]:
def extractData(basefolder):
    if not os.path.exists(basefolder+'/extracted'):
        os.makedirs(basefolder+'/extracted')   
    for filename in os.listdir(basefolder+'/cropped'): 
        if 'png' in filename:
            try:   
                #print(basefolder+":"+filename)
                #if filename in ['177.'+'png'] and 'page_1' in basefolder:
                #   print('inside >>'+filename)
                #    textData = pytesseract.image_to_string(cv2.imread(basefolder+'/cropped/'+filename),config="-c tessedit_char_whitelist=_x --psm 6")
                #else:
                try:     
                    chckconfig = identifyAcordFormAndPage(basefolder+"/extracted/1.txt")
                except IOError as e:   
                    print('IOError - file not found')
                    textData = pytesseract.image_to_string(cv2.imread(basefolder+'/cropped/1.png'))
                    textfileName = '1.txt'
                    first_textfile = open(basefolder+'/extracted/'+textfileName, "w") 
                    first_textfile.write(textData) 
                    first_textfile.close()
                    chckconfig = identifyAcordFormAndPage(basefolder+"/extracted/1.txt")
                 
               
                #print('config>'+chckconfig)
                checkboxes = loadCheckboxConfiguration(chckconfig)
                #print('checkboxes >'+checkboxes)
                #print('filename >'+filename)
                if filename.replace('.png','') in checkboxes.split(','):
                    #print('inside filename')
                    textData = pytesseract.image_to_string(cv2.imread(basefolder+'/cropped/'+filename),config="-c tessedit_char_whitelist=x --psm 6")
                else:
                    textData = pytesseract.image_to_string(cv2.imread(basefolder+'/cropped/'+filename))
                
                textfileName = filename.replace('png','txt')
                file = open(basefolder+'/extracted/'+textfileName, "w") 
                file.write(textData) 
                file.close()
            except Exception as e:
                print('failed to extract: '+ str(e))

In [39]:

configuration = "configuration/acord-configuration.json"
replacementConfiguration = "configuration/replacement-configuration.json"
outputConfiguration = "configuration/output-configuration.json"
checkboxConfiguration = "configuration/checkbox-configuration.json"
folder_delimiter = ">"
output = {}

def identifyAcordFormAndPage(filename):
    data = {} 
    f = open(filename, "r")
    filecontent = f.read()
    if 'ACORD 128' in filecontent:
        data['fileType'] = 'ACORD 128'
        identifyPageNumber(filecontent,data)
    elif 'ACORD 125' in filecontent:
        data['fileType'] = 'ACORD 125'
        identifyPageNumber(filecontent,data)
        identifyAcordVersion(filecontent,data)
    else:
        data.fileType = 'unknown'
    #{"fileType": "ACORD 125", "pagenumber": "1", "version": "2009"}
    #construct the configuration name
    config = (data["fileType"]+" "+data["version"]+" "+data["pagenumber"]).replace(" ","_")
    #print(config)
    return config

def identifyPageNumber(filecontent,data):
    if 'Page 1' in filecontent:
            data['pagenumber'] = '1'  
    elif 'Page 2' in filecontent:
        data['pagenumber'] = '2' 
    elif 'Page 3' in filecontent:
        data['pagenumber'] = '3' 
    elif 'Page 4' in filecontent:
        data['pagenumber'] = '4' 
    elif 'Page 5' in filecontent:
        data['pagenumber'] = '5' 
    elif 'Page 6' in filecontent:
        data['pagenumber'] = '6' 

def identifyAcordVersion(filecontent,data):
    if '2015/12' in filecontent:
        data['version'] = '2015'
    elif '2009/08' in filecontent:
        data['version'] = '2009'
    elif '2016/03'in filecontent:
        data['version'] = '2016'
    else:
        data['version'] = 'unknown'

In [40]:
def extractContent(config,extractedFiles,output):
    attributes,replacement,outputmapper = loadConfiguration(config)
    for id, value in attributes.items():
        #output["PAGE_"+config[-1:]+"_"+id]=getContentByKey(id,attributes,replacement,extractedFiles)
        outputAttribute = outputmapper[id] if id in outputmapper else "PAGE_"+config[-1:]+"_"+id;
        op = outputAttribute.split(folder_delimiter)
        outputAttribute_length = len(op)
        #print("length:"+str(outputAttribute_length))
        if outputAttribute_length == 1:
            output[op[0]] = getContentByKey(id,attributes,replacement,extractedFiles)
        elif  outputAttribute_length == 2:
            output[op[0]] = output[op[0]] if op[0] in  output else {}
            output[op[0]][op[1]] = getContentByKey(id,attributes,replacement,extractedFiles)
        elif  outputAttribute_length == 3:
            output[op[0]] = output[op[0]] if op[0] in  output else {}
            output[op[0]][op[1]] = output[op[0]][op[1]]  if op[1] in  output[op[0]] else {}
            output[op[0]][op[1]][op[2]] = getContentByKey(id,attributes,replacement,extractedFiles)
        elif  outputAttribute_length == 4:
            output[op[0]] = output[op[0]] if op[0] in  output else {}
            output[op[0]][op[1]] = output[op[0]][op[1]]  if op[1] in  output[op[0]] else {}
            output[op[0]][op[1]][op[2]] = output[op[0]][op[1]][op[2]] if op[2] in output[op[0]][op[1]] else {}
            output[op[0]][op[1]][op[2]][op[3]] = getContentByKey(id,attributes,replacement,extractedFiles)
        
      
#identifyAcordFormAndPage('Acord 125 Comm_Filled_7th March_V2/page_2/extracted/1.txt');
def identifyandExtractData(baseFolder):
    output = {}
    dirs = [d for d in os.listdir(baseFolder) if os.path.isdir(os.path.join(baseFolder, d))]
    for foldername in dirs:
        config = identifyAcordFormAndPage(baseFolder+"/"+foldername+"/extracted/1.txt")
        extractContent(config,baseFolder+"/"+foldername+"/extracted",output)
    print(output)

In [41]:
def loadConfiguration(fileformat):
    #print('loadConfiguration:'+fileformat)
    #print('configuration:'+configuration)
    with open(configuration) as json_file:  
        data = json.load(json_file)
        attributes= data[fileformat] if fileformat in data else {}
    with open(replacementConfiguration) as json_file: 
        data = json.load(json_file)
        replacements = data[fileformat] if fileformat in data else {}
    with open(outputConfiguration) as json_file: 
        data = json.load(json_file)
        outputConf = data[fileformat] if fileformat in data else {}
        return attributes,replacements,outputConf
    
def loadCheckboxConfiguration(fileformat):
    with open(checkboxConfiguration) as json_file:  
        data = json.load(json_file)
        attributes= data[fileformat] if fileformat in data else ""
    return attributes
    

In [42]:
def getContentByKey(key,attributes,replacement,extractedFiles):
    #print("key:"+key)
    
    configurationFile = attributes[key] if key in attributes else ""
    replacementText = replacement[key] if key in replacement else ""
    #print("configurationFile:"+configurationFile)
    #print("replacementText:"+replacementText)
    
    f = open(extractedFiles+"/"+configurationFile+'.txt', "r")
    filecontent = f.read()
    if any(ext in replacementText for ext in [":"]) and  replacementText in filecontent:
        newfilecontent = filecontent.split(replacementText)[1].replace("\n"," ").strip()
    else:
        #print("else replace text")
        newfilecontent = filecontent.replace(replacementText,"").replace("\n"," ").strip()
    return str(newfilecontent)


In [43]:
for image_name in range(image_counter-1):
    pagedetails= pdfbaseFolder+"/page_"+str(image_name+1)
    extractData(pagedetails)

IOError - file not found
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'

failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict' object has no attribute 'fileType'
failed to extract: 'dict

In [44]:
identifyandExtractData(PDF_file.replace('.pdf','').replace('samples/',''))

AttributeError: 'dict' object has no attribute 'fileType'