In [None]:
# AnalyseResolutions. Tuomo Toljamo (King's College London; DiXiT) at the Huygens ING (KNAW), 2016.
#
# This PhDWare code-sketch was part of a pilot exploring the use and usefulness of image data and visual information in
# the digital opening of an archival series, the Resolutions of the States General 1576‒1796.
#
# DiXiT (Digital Scholarly Editions Initial Training Network) has been funded from the People Programme (Marie Curie Actions) 
# of the European Union's Seventh Framework Programme FP7/2007-2013/ under REA grant agreement n° 317436. 

import IPython.core.display as ipy
import sys
import os
import traceback
from SpreadLevel import SauvolaBinarise
from ColumnLevel import *
from DocumentImageAnalysis import *
from DocumentImageUnderstanding import *  
from XMLFactory import XMLDocument
import Levers
import HelperFunctions as fu
from HelperFunctions import debug
from TextProcessing import processDatelineText

# Have Jupyter reload imported modules automatically, so that edits to them are picked up before each execution.
%load_ext autoreload
%autoreload 2

In [None]:
def process():
    """Run the analysis pipeline on the image named by ``Levers.fileName``.

    Stages, in order:

    * image level: load the original and check that it is landscape;
    * spread level: binarise, remove scan borders, extract the pages;
    * page level: mask, detect and correct skew, find and OCR the dateline,
      find the column divider and extract the columns;
    * column level: crop, find initial capitals and vertical spaces, segment
      into physical sections, OCR them and queue the results into the XML
      document;
    * finally: overlay the rendered columns on the original image and write an
      HTML testing page.

    Configuration is read from the ``Levers`` module (``dir_in``, ``fileName``,
    ``saveDir``, ``kludgeWin32bit``, ``debugFlag``); ``Levers.imageIter`` is
    reset to 0. Output goes to the module-level ``document`` object, which must
    have been created before this function is called.

    Pages that are not recognised as body text are noted in ``document`` and
    skipped. Returns ``None``.
    """
    pagesOutList = []
    columnsOutList = []
    facsPage = fu.Page()
    Levers.imageIter = 0

    # File name without the extension; used as the stem of every output name.
    baseName = Levers.fileName.replace(".jpg", "")

    # IMAGE-LEVEL: LOADING THE ORIGINAL
    original = Facsimile(fu.loadImage(Levers.dir_in, Levers.fileName), 'original')
    original.save(baseName)

    # IMAGE-LEVEL: ANALYSE INPUT - Let's check if the image is landscape as expected.
    imgShape = original.getImage().shape
    if imgShape[0] > imgShape[1]:
        document.flush()
        print ("Skipping this image. It's not a spread.")
        document.addElement("facs", {"src":Levers.fileName})
        document.addElement("note", {"type":"processing"}, "Image skipped (reason: not a two-page spread).")
        return

    # SPREAD-LEVEL: BINARISE
    if Levers.kludgeWin32bit == True:
        binarised = SauvolaBinariseQuadrants(original)
    else:
        binarised = SauvolaBinarise(original, 'gaussian')

    # SPREAD-LEVEL: REMOVE SCAN BORDERS
    remBorders = cropBorderShadows(binarised, 'remBorders')

    # SPREAD-LEVEL: EXTRACT PAGES
    pages = extractPages(remBorders)
    # Work on a copy: entries of pagesOutList are blanked for skipped pages and
    # that must not overwrite the entries of `pages` itself.
    pagesOutList = list(pages)

    for i in range(len(pages)):
        pageNumber = str(i + 1)  # pages are numbered from 1 in all output names
        nameString = baseName + "_page" + pageNumber

        # PAGE-LEVEL: APPLY HORIZONTAL MASK
        maskedPage = applyMask(pages[i], nameString+'_masked')

        # PAGE-LEVEL: CLEAN BORDERS
        maskedPage = cleanBorders(maskedPage)

        # PAGE-LEVEL: ANALYSE SKEW DETECTION AREA
        skewDetectionArea = analyseSkewDetectionArea(maskedPage)
        if skewDetectionArea is None:
            document.flush()
            if i == 0:
                document.addElement("facs", {"src":Levers.fileName})
            document.addElement("pb", {"n":baseName+"#page"+pageNumber})
            document.addElement("note", {"type":"processing"}, "Page skipped (reason: not body text).")
            pagesOutList[i] = None
            continue  # Breaking for the next page.

        # PAGE-LEVEL: DETECT SKEW BASED ON DIVIDER
        # Apply vertical mask to the horizontally masked image.
        doubleMaskedPage = applyVerticalMask(maskedPage, 'doubleMaskedPage')

        # PAGE-LEVEL: FIND DIVIDER
        dividerCC = findDividingLine(doubleMaskedPage)
        clusterRend = renderFacsCC(doubleMaskedPage, dividerCC, 'renderedDivider', (0,0))
        clusterRend.save()
        divAngle = calculateDividerAngle(dividerCC)

        if divAngle is None:
            # Let's detect skew according to the older, projection-profile -based method.
            angle = detectSkew(maskedPage, skewDetectionArea)
        else:
            angle = divAngle*(-1)

        if abs(angle) < 0.15:
            # the angle is not significant, let's do nothing
            angle = 0

        # PAGE-LEVEL: ROTATE IMAGE IF NEEDED
        # From here on, workMasked / workPage are the images every later stage
        # operates on: the rotated versions if a rotation was applied, the
        # unrotated ones otherwise.
        isRotated = (angle != 0)
        if isRotated:
            nameString = nameString + "_rot" + str(angle)
            workMasked = rotateImage(maskedPage, angle, nameString+'_rotatedMaskedPage')
            workPage = rotateImage(pages[i], angle, nameString+'_rotatedPageImg')
        else:
            workMasked = maskedPage
            workPage = pages[i]

        # PAGE-LEVEL: FIND DATELINE
        dateLine, datelineComponentList = findDateLine(workMasked, nameString+'_dateLine')

        # NOTE(review): unlike the skew-area branch above, the skip branches
        # below do not all emit a "facs" element for the first page and use a
        # bare page number for "pb"; kept as is, confirm whether intended.
        if dateLine is None:
            document.flush()
            document.addElement("pb", {"n":pageNumber})
            document.addElement("note", {"type":"processing"}, "Page skipped (reason: not recognised as body text).")
            print ("A dateline was not found: this page is not part of Resolutions body. Skipping to next page.")
            pagesOutList[i] = None
            continue  # Breaking for the next page.

        # PAGE-LEVEL: FIND DATELINE COMPONENTS
        dlistsorted = sortCCListHorizontally(datelineComponentList)
        datelineComponentClusters = findDatelineClusters(dlistsorted)
        dateLineClusters = renderFacsCCClusters(workMasked, datelineComponentClusters, nameString+'_dateLineClusters', (0,0))

        if Levers.debugFlag:
            # Use the 1-based page number, consistent with every other output name.
            dateLineClusters.save(baseName+"_page"+pageNumber, "datelinePile")

        # PAGE-LEVEL: ANALYSE DATELINE - CHECK CLUSTERING
        if len(datelineComponentClusters) == 2:
            print ("According to dateline clustering, this page is part of the index. Skipping to next page.")
            document.flush()
            if i == 0:
                document.addElement("facs", {"src":Levers.fileName})

            document.addElement("pb", {"n":pageNumber})
            document.addElement("note", {"type":"processing"}, Levers.fileName+"#page"+pageNumber+": Page skipped (reason: part of the index).")
            pagesOutList[i] = None
            continue  # Breaking for the next page.
        if len(datelineComponentClusters) < 2 or len(datelineComponentClusters) > 3:
            document.flush()
            document.addElement("pb", {"n":pageNumber})
            document.addElement("note", {"type":"processing"}, "Page skipped (reason: not recognised as body text).")
            print ("According to dateline clustering, this page is not Resolutions body. Skipping to next page.")
            pagesOutList[i] = None
            continue  # Breaking for the next page.

        print ("A dateline was found. Continuing processing.")

        # PAGE-LEVEL: OCR DATELINE
        datelineUnmasked = unmaskFacsimile(workPage, dateLine, "unmaskedDateline")
        saveTesseractImage(datelineUnmasked.getImage(), "page"+pageNumber+"_dateline")
        datelineText = Tesseract("page"+pageNumber+"_dateline", "dateline")
        datelineText = processDatelineText(datelineText)

        # PAGE-LEVEL: FIND DIVIDER AREA
        columnDividerArea = spliceDividerArea(workMasked, dateLine, getCentreCluster(workMasked, datelineComponentClusters))

        # PAGE-LEVEL: APPLY VERTICAL MASK TO DIVIDER AREA
        maskedColumnDividerArea = applyVerticalMask(columnDividerArea, nameString+'_verticallyMaskedSplice')

        # PAGE-LEVEL: FIND DIVIDER FROM THE MASKED AREA
        dividerCC = findDividingLine(maskedColumnDividerArea)
        if Levers.debugFlag:
            clusterRend = renderFacsCC(maskedColumnDividerArea, dividerCC, nameString+'_renderedDivider', (0,0))

        # PAGE-LEVEL: ANALYSE DIVIDER
        (columnMin, columnMax) = analyseDivider(dividerCC, workMasked, columnDividerArea)

        # COLUMN-LEVEL: EXTRACT COLUMNS
        cols, (processedLeft, processedRight) = extractColumns(workMasked, columnDividerArea, dividerCC, columnMin, columnMax, workPage)

        for j in range(len(cols)):
            nameString2 = nameString + "_col" + str(j+1)

            # Removing catchwords (only done for the second column).
            if j == 1:
                cols[j] = removeCatchwords(cols[j])

            cleanedImage = dia.cleanBorders2(cols[j].getImage())

            # COLUMN-LEVEL: FIND BORDER-MOST CONNECTED COMPONENTS IN COLUMN
            # Each search gets its own copy of the cleaned image.
            leftMostCC = findBorderMostConnectedComponents(cleanedImage.copy(), 'left')
            rightMostCC = findBorderMostConnectedComponents(cleanedImage.copy(), 'right')
            bottomMostCC = findBorderMostConnectedComponents(cleanedImage.copy(), 'bottom')
            if Levers.debugFlag:
                borderClusters = renderFacsCCClusters(cols[j], [leftMostCC, rightMostCC, bottomMostCC], nameString2+'_borderClusters', (0,0))

            # COLUMN-LEVEL: REMOVE EXTRA WHITESPACE FROM COLUMNS
            colImgCropped = removeExtraWhitespace(cols[j], j, leftMostCC, rightMostCC, bottomMostCC)

            # COLUMN-LEVEL: FIND LARGE INITIAL CAPITALS
            # Column 0 is unmasked against the left processed page image,
            # column 1 against the right one.
            if j == 0:
                colImgUnmasked = unmaskFacsimile(processedLeft, colImgCropped, nameString2+"_ImgCroppedUnmasked")
            elif j == 1:
                colImgUnmasked = unmaskFacsimile(processedRight, colImgCropped, nameString2+"_ImgCroppedUnmasked")

            initialCandidates = findInitialCapitalCandidates(colImgUnmasked)
            initialCapitalCCList = [candidate.getInitialCC() for candidate in initialCandidates]
            renderInitialCandidates = renderFacsCCList(colImgUnmasked, initialCapitalCCList, nameString2+'_initialCapitalCandidates', (0,0))

            # COLUMN-LEVEL: FIND VERTICAL SPACE CANDIDATES
            spaceCandidates = findVerticalSpaceCandidates(colImgUnmasked)

            # COLUMN-LEVEL: FIND PHYSICAL SECTIONS
            physicalSections, validatedInitials, validatedSpaces = segmentColumn(colImgUnmasked, initialCandidates, spaceCandidates, i, j)

            # render
            validatedInitialsCCList = [initial.getInitialCC() for initial in validatedInitials]
            renderValidatedInitials = renderFacsCCList(colImgUnmasked, validatedInitialsCCList, nameString2+'_validatedInitialCapitals', (0,0))

            physicalSections = analyseSections(physicalSections)
            physicalSections = ocrSections(physicalSections)

            elementQueue = buildElementQueue(physicalSections, i, j, datelineText[0])
            processQueue(elementQueue, i, j, document)

            # note: render only after spaces have been validated.
            renderSpacesAndInitials = renderSections(renderValidatedInitials, validatedSpaces, nameString2+'_renderValidatedSpaces', (127,0,0))
            renderSpacesAndInitials.save(baseName+"_page"+pageNumber+"_col"+str(j+1))
            columnsOutList.append(renderSpacesAndInitials)

            # error: column text not processed at this point, only when the XML is saved.
            facsPage.addPhysSections(i*2+j, physicalSections)

    # VISUALISE THE PRODUCTS ON TOP OF THE ORIGINAL FACSIMILE
    # The first overlay is drawn on the original image, each later one on the
    # result of the previous overlay.
    outputImage = None
    for k, columnRender in enumerate(columnsOutList):
        if k == 0:
            outputImage = overlayRegion(original.getImage(), columnRender)
        else:
            outputImage = overlayRegion(outputImage, columnRender)

    if outputImage is not None:
        finishedName = baseName + "_finishedProcessing"
        theEnd = Facsimile(outputImage, finishedName)
        theEnd.save()
        theEnd.save(finishedName, "finishedProcessing")
        theEnd.save(finishedName)

        document.saveDocument("results/"+(Levers.saveDir)+"/", "document.xml")

        # GENERATING HTML TESTING PAGES
        html = fu.HTMLTestingPage("results/"+str(Levers.saveDir)+"/testingPage.html", Levers.fileName)
        html.addOriginal(baseName+".png")
        for i in range(len(pagesOutList)):
            if pagesOutList[i] is not None:
                # Every processed page is expected to have exactly two columns.
                for j in (0, 1):
                    sectionIndex = i*2+j
                    facsPage.addColText(sectionIndex, getColumnText(facsPage.getPhysSections(sectionIndex)))
                    html.addColumn("Page " + str(i+1) + ", Col " + str(j+1),
                                   baseName+"_"+"page"+str(i+1)+"_col"+str(j+1)+".png",
                                   facsPage.getColText(sectionIndex))

        html.addOriginal(baseName+"_finishedProcessing.png")
        html.close()
       
    
# Main: configure input/output locations, then run process() on every image found.

Levers.dir_in = "images/"
Levers.dir_out = "results/"
Levers.file_ext = "jpg"

fileList = fu.getFiles(Levers.dir_in, Levers.file_ext)

# The XML document that process() writes into (it is read there as a global).
document = XMLDocument()

for currentFile in fileList:
    Levers.fileName = currentFile
    Levers.saveDir = Levers.fileName.replace(".jpg", "")
    logPrefix = "[" + currentFile + "]: "

    debug(logPrefix + "Starting to process " + Levers.fileName + ".")
    print("Starting to process " + Levers.fileName + ".")
    try:
        process()
    except Exception:
        # Report the failure, record it in the document and carry on with the next image.
        print (traceback.format_exc())
        document.flush()
        debug(logPrefix + "Failed to process " + Levers.fileName + ".")
        document.addElement("note", {"type":"processing"}, "Failed to process "+Levers.fileName+".")


# Final flush and save; the target directory is that of the last file processed.
document.flush()
document.saveDocument("results/"+(Levers.saveDir)+"/", "document.xml")