In [None]:
# default_exp core

In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Core

> Basic PDF operations available to fastagger.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import os
from pdf2image import convert_from_path

In [None]:
#export

class AlreadyImportedError(Exception):
    """Raised by `Pdf.load` when the pdf's output folder already exists."""

class InvalidFilePath(Exception):
    """Raised by `Pdf` when the given path is not an existing file."""

class InvalidFileExtension(Exception):
    """Raised by `Pdf` when the given file path does not end with '.pdf'."""

class Pdf:
    """A pdf file on disk whose pages can be exported as PNG images.

    Attributes:
        path: path to the pdf file, exactly as passed to the constructor.
        name: base name of the file, extension included (e.g. ``01.pdf``).
        steamName: base name of the file without its extension (e.g. ``01``).
            It names both the output sub-folder and the page images.
    """

    def __init__(self, path):
        """Validate `path` and derive the file's name parts.

        path: string path to an existing ``.pdf`` file.

        Raises:
            InvalidFilePath: `path` is not an existing file.
            InvalidFileExtension: `path` does not end with ``.pdf``
                (the check is case-sensitive).
        """
        if self.checkPath(path) != 'file':
            raise InvalidFilePath('Invalid file path', path)
        if not path.endswith('.pdf'):
            raise InvalidFileExtension('File must be pdf', path)
        self.path = path
        self.name = os.path.basename(self.path)
        self.steamName = os.path.splitext(self.name)[0]

    def load(self, outputFolder, dpi=200, grayscale=True, size=(600, 846)):
        """Imports pdf pages to folder

        Every page is saved as
        ``<outputFolder>/<steamName>/<steamName>_<n>.png`` with ``n``
        starting at 1.

        outputFolder: pages image folder destiny (parent of the per-pdf folder)
        dpi: image quality, default 200
        grayscale: filter to grayscale, default True
        size: page image size, default (600, 846)

        Returns:
            ``{steamName: {'path': <per-pdf folder>, 'pages': {1: 0, ..., n: 0}}}``
            with one ``pages`` entry per converted page, each mapped to 0.

        Raises:
            AlreadyImportedError: the per-pdf folder already exists.
            Exception: the conversion, the folder creation or saving a page
                failed; the underlying error is chained as ``__cause__``.
        """
        # Plain '/' concatenation is kept (rather than os.path.join) so the
        # 'path' value returned in the metadata stays exactly as before.
        outputFolder = outputFolder + '/' + self.steamName

        if os.path.exists(outputFolder):
            raise AlreadyImportedError('Pdf already imported: ', self.path)

        try:
            # Forward the caller's rendering options; they used to be
            # silently replaced by the hard-coded defaults.
            pages = convert_from_path(self.path, dpi=dpi, grayscale=grayscale, size=size)
        except Exception as exc:
            raise Exception("Could not convert from path: ", self.path) from exc

        metadata = {
            self.steamName: {
                'path': outputFolder,
                'pages': {number: 0 for number in range(1, len(pages) + 1)},
            }
        }

        try:
            os.mkdir(outputFolder)
        except OSError as exc:
            raise Exception("Could not create folder: ", outputFolder) from exc

        for number, page in enumerate(pages, start=1):
            target = outputFolder + '/' + self.steamName + '_' + str(number) + '.png'
            try:
                page.save(target, 'PNG')
            except OSError as exc:
                raise Exception("Could not save file:", target) from exc
        return metadata

    @staticmethod
    def bulkLoad(path, outputFolder, dpi=200, grayscale=True, size=(600, 846)):
        """Imports the pages of every pdf found under a folder

        The folder is walked recursively. Any file that cannot be loaded
        (not a pdf, already imported, conversion error, ...) is skipped.

        path: string path of the folder to search for pdf files
        outputFolder: pages image folder destiny
        dpi: image quality, default 200
        grayscale: filter to grayscale, default True
        size: page image size, default (600, 846)

        Returns:
            The merged metadata dictionaries returned by `load` for each
            successfully imported pdf (empty when nothing was imported).

        Raises:
            Exception: `path` is not an existing folder.
        """
        if Pdf.checkPath(path) != 'folder':
            raise Exception('Invalid folder')

        filesMetadata = {}
        for root, _dirNames, fileNames in os.walk(path):
            for fileName in fileNames:
                try:
                    pdf = Pdf(os.path.join(root, fileName))
                    filesMetadata.update(pdf.load(outputFolder, dpi, grayscale, size))
                except Exception:
                    # Deliberate best effort: one bad file must not abort
                    # the whole import.
                    continue
        return filesMetadata

    @staticmethod
    def checkPath(path):
        """Classify `path` as ``"folder"``, ``"file"`` or ``None`` (neither)."""
        if os.path.isdir(path):
            return "folder"
        if os.path.isfile(path):
            return "file"
        return None

Let's test the `Pdf` class.

In [None]:
# Smoke test for `Pdf`: build an instance from a sample pdf, import its pages
# and compare the returned metadata with the expected value.
# Note '__file__' is a quoted string literal here, not the module variable, so
# the path is resolved relative to the current working directory.
APP_ROOT = os.path.dirname(os.path.realpath('__file__'))
p = Pdf(APP_ROOT+'/resources/pdfs/01.pdf')
assert isinstance(p, Pdf)

# Remove any output left by a previous run; `load` raises AlreadyImportedError
# when the target folder already exists.
!rm -rf ./resources/pdfs/01
metadata = p.load('./resources/pdfs')
# Expected metadata: a single page, mapped to 0.
obj = {'01': {'path': './resources/pdfs/01', 'pages': {1: 0}}}
assert metadata == obj
# Clean up the generated page images.
!rm -rf ./resources/pdfs/01

Let's test the `checkPath` static method

In [None]:
# `checkPath` must tell an existing file, an existing folder and a missing
# path apart. The paths are resolved against the current working directory.
file = os.path.abspath('LICENSE')
folder = os.path.abspath('docs')
error = os.path.abspath('doesNotExist')
assert Pdf.checkPath(file) == "file"
assert Pdf.checkPath(folder) == "folder"
# Identity check: None is a singleton, so `is None` is the idiomatic test.
assert Pdf.checkPath(error) is None

In [None]:
# Test for `Pdf.bulkLoad`: import every pdf under ./resources/pdfs, writing the
# page images next to the source files.
# NOTE(review): `path` is assigned but the call below repeats the literal
# instead of using it.
path = './resources/pdfs'
# Remove output of previous runs so the pdfs are not skipped as already imported.
!rm -rf ./resources/pdfs/01
!rm -rf ./resources/pdfs/02
metadata = Pdf.bulkLoad('./resources/pdfs', './resources/pdfs')
# Expected metadata: one entry per pdf, each page number mapped to 0.
obj = {'02': {'path': './resources/pdfs/02',
  'pages': {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0}},
 '01': {'path': './resources/pdfs/01', 'pages': {1: 0}}}
# Clean up before asserting so the generated folders are removed even when the
# assertion fails.
!rm -rf ./resources/pdfs/01
!rm -rf ./resources/pdfs/02
assert metadata == obj