In [42]:
#pip install pdfminer.six


from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextBoxHorizontal, LTTextLine, LTTextLineHorizontal, LTChar
import os
import configparser

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

class PdfExtractor:
    """Extract per-character layout data from a PDF and show it via Spark.

    The PDF is parsed with pdfminer; the ``LTChar`` objects found on one page
    are loaded into a Spark DataFrame and printed as a table.
    """

    def __init__(self):
        """Load ``retail.ini`` and obtain the Spark session / SQL context."""
        self.config = configparser.ConfigParser()
        # ConfigParser.read() ignores files it cannot open, so a missing
        # 'retail.ini' leaves the configuration empty rather than raising.
        self.config.read('retail.ini')

        self.sparkSession = (
            SparkSession
            .builder
            .appName("encans")
            .getOrCreate()
        )

        self.sqlContext = SQLContext(self.sparkSession)

        # File logging is currently disabled; the previous setup is kept here
        # for reference (it also needs `logging` and `datetime` imports):
        #
        # self.logger = logging.getLogger('Retail')
        # logfile = self.config.get('dir', 'datadir') + 'indeed_' + str(datetime.now().date()) + '.log'
        # hdlr = logging.FileHandler(logfile)
        # formatter = logging.Formatter('%(asctime)s -- %(funcName)s -- %(levelname)s -- %(message)s')
        # hdlr.setFormatter(formatter)
        # self.logger.addHandler(hdlr)
        # self.logger.setLevel(logging.INFO)

    ##
    ## Helper functions
    ##

    def flatten(self, lst):
        """Flatten one level of nesting.

        Args:
            lst: An iterable of iterables.

        Returns:
            A new list holding the items of every inner iterable, in order.
        """
        return [subelem for elem in lst for subelem in elem]

    def extract_characters(self, element):
        """Recursively collect the individual characters of a layout element.

        Args:
            element: An ``LTChar``, a pdfminer text box / text line, or a
                list of such elements.

        Returns:
            A flat list of ``LTChar`` objects in traversal order. Any other
            kind of element contributes nothing (empty list).
        """
        # isinstance() accepts a tuple of types, so no explicit loop is needed.
        TEXT_ELEMENTS = (LTTextBox, LTTextBoxHorizontal, LTTextLine, LTTextLineHorizontal)

        if isinstance(element, LTChar):
            return [element]

        # Text containers and plain lists are both walked child by child.
        if isinstance(element, TEXT_ELEMENTS) or isinstance(element, list):
            return self.flatten([self.extract_characters(child) for child in element])

        return []

    def extract_layout_by_page(self, pdf_path):
        """Extract one layout object per page from a PDF file.

        Slightly modified from
        https://euske.github.io/pdfminer/programming.html

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            A list with the aggregated layout of each page, in page order.

        Raises:
            PDFTextExtractionNotAllowed: If the document forbids extraction.
        """
        # Imported here because the module-level imports do not provide this
        # name; without it the raise below would fail with a NameError.
        from pdfminer.pdfdocument import PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        layouts = []
        try:
            # The context manager closes the file even when parsing fails.
            with open(pdf_path, 'rb') as fp:
                document = PDFDocument(PDFParser(fp))

                if not document.is_extractable:
                    raise PDFTextExtractionNotAllowed(
                        "Text extraction is not allowed for %r" % (pdf_path,))

                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
                    layouts.append(device.get_result())
        finally:
            device.close()

        return layouts

    def main(self, pdf_path="catalogue_internet_qc_2018-04-28_v2018-04-13.pdf",
             page_index=19):
        """Show the characters of one PDF page as a Spark DataFrame.

        Args:
            pdf_path: PDF file to read. Defaults to the example catalogue.
            page_index: Index into the list of page layouts (0-based,
                negative values count from the end). Defaults to 19.

        Raises:
            IndexError: If ``page_index`` is outside the document's pages.
        """
        page_layouts = self.extract_layout_by_page(pdf_path)
        nb_pages = len(page_layouts)
        if not -nb_pages <= page_index < nb_pages:
            raise IndexError(
                "page_index %d is out of range: %r has %d page(s)"
                % (page_index, pdf_path, nb_pages))
        current_page = page_layouts[page_index]

        # Keep only the horizontal text boxes; other layout elements
        # (rectangles, figures, ...) are ignored.
        texts = [e for e in current_page if isinstance(e, LTTextBoxHorizontal)]

        # Break the text boxes down into their individual characters.
        charactersList = self.extract_characters(texts)
        if not charactersList:
            # Spark cannot infer a schema from an empty list, so stop here.
            print("No text characters found on page index %d" % page_index)
            return

        # Spark infers one column per attribute of the character objects.
        charactersDF = self.sparkSession.createDataFrame(charactersList)

        #charactersDF.printSchema()
        charactersDF.show(100)

In [43]:
# Build the extractor (reads retail.ini and obtains a Spark session), then run
# the demo, which prints up to 100 extracted character rows as a table.
pdf = PdfExtractor()
pdf.main()

+-----+------------------+--------------------+-------------------+------------------+--------------------+------------------+-------+------------------+------------------+------------------+-----------------+-----------------+
|_text|               adv|                bbox|           fontname|            height|              matrix|              size|upright|             width|                x0|                x1|               y0|               y1|
+-----+------------------+--------------------+-------------------+------------------+--------------------+------------------+-------+------------------+------------------+------------------+-----------------+-----------------+
|    E|             0.667|[113.28,758.99165...|HLGGMH+Arial-BoldMT| 17.15922640000008|[11.9827,0.0,0.0,...| 17.15922640000008|   true|7.9924608999999975|            113.28|       121.2724609|      758.9916503|      776.1508767|
|    n|             0.611|[121.2844436,758....|HLGGMH+Arial-BoldMT| 17.15922640000008|[1