# Tree Alignment

As shown in [1], prior to detecting data fields we need to identify aligned regions across pages that look like potential data values. The paper uses DEPTA [2], but we are unsure if this is usable for PDF document too so let's explore it here.

## 1. Load pdf and build a tag tree out of it.

In [1]:
# Tree with defaultdict
# https://gist.github.com/hrldcpr/2012250 but implemented by
# extending a dict object:
# http://stackoverflow.com/questions/6780952/how-to-change-behavior-of-dict-for-an-instance
class Tree(dict):
    """Implementation of perl's autovivification feature."""
    def __missing__(self, key):
        value = self[key] = type(self)()
        return value

test = Tree()
test['left']['value'] = 'leftval'
test['right']['value'] = 'rightval'
test['right']['right1']['value'] = 'testval'
print(test)

{'right': {'right1': {'value': 'testval'}, 'value': 'rightval'}, 'left': {'value': 'leftval'}}


In [15]:
import os, sys, inspect
base_path = os.path.realpath(
    os.path.abspath(
        os.path.join(
            os.path.split(
                inspect.getfile(
                    inspect.currentframe()
                )
            )[0],
            '..'
        )
    )
)
sys.path.append(base_path)

class ListTable(list):
    """ Overridden list class which takes a 2-dimensional list of 
        the form [[1,2,3],[4,5,6]], and renders an HTML Table in 
        IPython Notebook. """
    
    def _repr_html_(self):
        html = ["<table>"]
        for row in self:
            html.append("<tr>")
            
            for col in row:
                html.append("<td>{0}</td>".format(col))
            
            html.append("</tr>")
        html.append("</table>")
        return ''.join(html).decode('utf-8')

from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFConverter
from pdfminer.pdfpage import PDFPage

from pdfminer.layout import LTPage, LTContainer, LTAnno
from pdfminer.layout import LTImage, LTChar
from pdfminer.layout import LTTextBox, LTTextLine

from pdfminer.image import ImageWriter

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
    
table = ListTable()
table.append([
    'type',
    'text',
    'x',
    'x1',
    'y',
    'y1',
    'page',
    'textbox_id',
    'textline_id'
])
tree = Tree()
types = set()
last_type = ''
class CustomPDFConverter(PDFConverter):
    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
                 showpageno=False, imagewriter=None):
        PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
        self.showpageno = showpageno
        self.imagewriter = imagewriter
        self.current_total_height = 0
        return

    def write(self, message):
        self.outfp.write(message)

    def receive_layout(self, ltpage):
        """Stuff to do when layout received.

        Inherited from :class:`pdfminer.converter.PDFConverter`.

        Args:
            ltpage: :class:`pdfminer.layout.LTPage` object.
        """
        def render(item, page, page_width, current_total_height):
            """What to do when rendering certain :class:`pdfminer.layout.LT.*` objects.
            
            item.bbox has a set of (x0, y0, x1, y1)
            which we will store as part of our features.

            Important notes on features:
            - In pdf, (0,0) coordinate is positioned at bottom left, and each page has its own
              coordinates. To make it easier for learning and users to label, we combine all the
              pages into one huge, continuous plane, with position (0,0) on top left. This way
              users don't have to specify page number to create blocks, and learner can learn contents
              that span across multiple pages i.e. large paragraph of texts.

            Facts about LTTextBox:
            - It contains list of texts contained in LTTextLine... object,
              instead of single characters. Values of texts can be gathered with
              object.current_value. LTTextLine... contains LTChar.
            - LTTextLineHorizontal, LTTextLineVertical, or any child of LTTextBox does
              not have index, but we create our own index on the go (textline_id), so later
              we can trace back which LTText does an LTChar belong to.
            - LTTextBox does not seem to contain another LTTextBox, nor does it
              contain characters directly (at least in the examples so far).
            - Index can be gained from property `index` e.g. `item.index` (we will
              think about how to get this element from index later).

            Facts about LTImage:
            - object.srcsize are different from width and height calculated from bbox.
            - Belongs to no textbox.
            """
            global last_type
            if isinstance(item, LTPage):
                page = ltpage.pageid             
            if isinstance(item, LTContainer):
                for child in item:
                    render(child, page, page_width, current_total_height)
            if isinstance(item, LTTextBox):
                table.append([item.__class__.__name__])
                for textline_id, textline in enumerate(item):
                    element = textline
                    text = element.get_text().encode(self.codec, 'ignore')
                    tree[page][item.index][textline_id]['value'] = text
                    table.append([
                        element.__class__.__name__,
                        text,
                        element.bbox[0],
                        element.bbox[1],
                        element.bbox[2],
                        element.bbox[3],
                        page,
                        item.index,
                        textline_id
                    ])
            else:
                if not (isinstance(item, LTTextLine) or isinstance(item, LTChar)) and item.__class__.__name__ != last_type:
                    table.append([item.__class__.__name__])
            if isinstance(item, LTAnno):
                table.append(['-', item.get_text().encode(self.codec, 'ignore')])
            last_type = item.__class__.__name__


        self.current_total_height += ltpage.height
        render(ltpage, ltpage.pageid, ltpage.width, self.current_total_height)
        return

In [16]:
outfp = sys.stdout
pagenos = set()
pdf_path = 'pdfs/final/test.pdf'
f = open(pdf_path, 'rb')
text = f.read()

rsrcmgr = PDFResourceManager(caching=True)
codec = 'utf-8'
# imagewriter = ImageWriter('images')

laparams = LAParams()
device = CustomPDFConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
interpreter.debug = False

pagenos = set()
text_io = StringIO(text)

for page in PDFPage.get_pages(text_io, pagenos):
    interpreter.process_page(page)
device.close()
print("types:", types)
table

('types:', set([]))


0,1,2,3,4,5,6,7,8
type,text,x,x1,y,y1,page,textbox_id,textline_id
LTAnno,,,,,,,,
-,,,,,,,,
LTTextBoxHorizontal,,,,,,,,
LTTextLineHorizontal,,585.36,757.97022,588.54744,777.44436,1,0,0
LTAnno,,,,,,,,
-,,,,,,,,
LTAnno,,,,,,,,
-,,,,,,,,
LTAnno,,,,,,,,


In [None]:
tree