In [None]:
# default_exp core

In [None]:
#hide 
%load_ext autoreload
%autoreload 2

# Core

> Basic pdf operations available to fastagger.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import os
from pdf2image import convert_from_path

In [None]:
#export

class AlreadyImportedError(Exception):
    """Raised by `Pdf.load` when the output folder for the pdf already exists."""

class InvalidFilePath(Exception):
    """Raised by `Pdf.__init__` when the given path is not an existing file."""

class InvalidFolderPath(Exception):
    """Raised by `Pdf.bulkLoad` when the given path is not an existing folder."""

class InvalidFileExtension(Exception):
    """Raised by `Pdf.__init__` when the file path does not end with `.pdf`."""

class Pdf:
    """Entity that deals with pdf

    Wraps one pdf file on disk and exports its pages as PNG images.

    Instance attributes (set by `__init__`):
      path      -- the path passed to the constructor
      name      -- file name including the extension
      steamName -- file name without the extension; used as the output
                   sub-folder name, the PNG file prefix and the metadata key
    """
    def __init__(self, path):
        """Validate `path` and derive the file names from it.

        Raises:
            InvalidFilePath: `path` is not an existing file.
            InvalidFileExtension: `path` does not end with '.pdf'.
        """
        if self.getPathType(path) != 'file':
            raise InvalidFilePath('Invalid file path', path)
        if not path.endswith('.pdf'):
            raise InvalidFileExtension('File must be pdf', path)
        self.path = path
        self.name = os.path.basename(self.path)
        self.steamName = os.path.splitext(self.name)[0]

    def load(self, outputFolder, dpi=200, grayscale=True, size=(600, 846)):
        """Loads pdf images pages to output folder

        Every page is written as
        `<outputFolder>/<steamName>/<steamName>_<n>.png`, with `n` starting at 1.

        Args:
            outputFolder: folder in which the `<steamName>` sub-folder is
                created; it must already exist because the sub-folder is made
                with `os.mkdir`.
            dpi, grayscale, size: forwarded to `convert_from_path`.

        Returns:
            `{steamName: {'path': <sub-folder>, 'pages': {1: 0, ..., n: 0}}}`,
            i.e. every page number mapped to 0.

        Raises:
            AlreadyImportedError: the sub-folder already exists.
            Exception: the conversion, the folder creation or a page save
                failed; the original error is chained as `__cause__`.
        """
        outputFolder = outputFolder + '/' + self.steamName

        if os.path.exists(outputFolder):
            raise AlreadyImportedError('Pdf already imported: ', self.path)

        try:
            # Forward the caller's rendering options instead of fixed values.
            pages = convert_from_path(self.path, dpi=dpi, grayscale=grayscale, size=size)
        except Exception as err:
            # `except Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate untouched.
            raise Exception("Could not convert from path: ", self.path) from err

        try:
            os.mkdir(outputFolder)
        except OSError as err:
            raise Exception("Could not create folder: ", outputFolder) from err

        for pageNumber, page in enumerate(pages, start=1):
            pagePath = outputFolder + '/' + self.steamName + '_' + str(pageNumber) + '.png'
            try:
                page.save(pagePath, 'PNG')
            except OSError as err:
                raise Exception("Could not save file:", pagePath) from err

        return {
            self.steamName: {
                'path': outputFolder,
                'pages': {pageNumber: 0 for pageNumber in range(1, len(pages) + 1)},
            }
        }

    @staticmethod
    def bulkLoad(path, outputFolder, dpi=200, grayscale=True, size=(600, 846)):
        """From every pdf in the path folder, loads their image pages to the output folder

        The folder is walked recursively with `os.walk`. Files that cannot be
        loaded are skipped silently.

        Args:
            path: folder to search for pdf files.
            outputFolder, dpi, grayscale, size: passed to `Pdf.load` for each file.

        Returns:
            The merged metadata dictionaries of every pdf that was loaded.

        Raises:
            InvalidFolderPath: `path` is not an existing folder.
        """
        if Pdf.getPathType(path) != 'folder':
            raise InvalidFolderPath('Invalid folder')

        filesMetadata = {}
        for root, _dirNames, fileNames in os.walk(path):
            for fileName in fileNames:
                try:
                    pdf = Pdf(os.path.join(root, fileName))
                    filesMetadata.update(pdf.load(outputFolder, dpi, grayscale, size))
                except Exception:
                    # Best effort: non-pdf files, already imported pdfs and
                    # files that fail to convert must not abort the batch.
                    continue
        return filesMetadata

    @staticmethod
    def getPathType(path):
        """Get the kind of the supplied path

        Returns "folder" for an existing directory, "file" for an existing
        file and None when the path is neither.
        """
        if os.path.isdir(path):
            return "folder"
        if os.path.isfile(path):
            return "file"
        return None

Let's test the `Pdf` class.

In [None]:
# '__file__' is quoted, so it is a plain relative name: realpath() resolves it
# against the current working directory and dirname() yields that directory.
APP_ROOT = os.path.dirname(os.path.realpath('__file__'))
samplePdfPath = APP_ROOT + '/resources/pdfs/01.pdf'
p = Pdf(samplePdfPath)
assert isinstance(p, Pdf)

Now, it's time to `load` pdf images into the output folder

In [None]:
# Render the signature and docstring of Pdf.load as documentation output.
show_doc(Pdf.load)

<h4 id="Pdf.load" class="doc_header"><code>Pdf.load</code><a href="__main__.py#L27" class="source_link" style="float:right">[source]</a></h4>

> <code>Pdf.load</code>(**`outputFolder`**, **`dpi`**=*`200`*, **`grayscale`**=*`True`*, **`size`**=*`(600, 846)`*)

Loads pdf images pages to output folder
        

In [None]:
!rm -rf ./resources/pdfs/01
metadata = p.load('./resources/pdfs')
!rm -rf ./resources/pdfs/01
obj = {'01': {'path': './resources/pdfs/01', 'pages': {1: 0}}}
assert metadata == obj

Let's check how to import a whole folder with `bulkLoad`

In [None]:
# Render the signature and docstring of Pdf.bulkLoad as documentation output.
show_doc(Pdf.bulkLoad)

<h4 id="Pdf.bulkLoad" class="doc_header"><code>Pdf.bulkLoad</code><a href="__main__.py#L56" class="source_link" style="float:right">[source]</a></h4>

> <code>Pdf.bulkLoad</code>(**`path`**, **`outputFolder`**, **`dpi`**=*`200`*, **`grayscale`**=*`True`*, **`size`**=*`(600, 846)`*)

From every pdf in the path folder, loads theirs image pages to the output folder
        

In [None]:
path = './resources/pdfs'
!rm -rf ./resources/pdfs/01
!rm -rf ./resources/pdfs/02
metadata = Pdf.bulkLoad('./resources/pdfs', './resources/pdfs')
obj = {'02': {'path': './resources/pdfs/02',
  'pages': {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0}},
 '01': {'path': './resources/pdfs/01', 'pages': {1: 0}}}
!rm -rf ./resources/pdfs/01
!rm -rf ./resources/pdfs/02
assert metadata == obj

In [None]:
# Render the signature and docstring of Pdf.getPathType as documentation output.
show_doc(Pdf.getPathType)

<h4 id="Pdf.getPathType" class="doc_header"><code>Pdf.getPathType</code><a href="__main__.py#L73" class="source_link" style="float:right">[source]</a></h4>

> <code>Pdf.getPathType</code>(**`path`**)

Get the kind of the supplied path
        

Let's test the `getPathType` static method

In [None]:
file = os.path.abspath('LICENSE')
folder = os.path.abspath('docs')
error = os.path.abspath('doesNotExist')
# Each sample path paired with the kind getPathType is expected to report.
expectedKinds = ((file, "file"), (folder, "folder"), (error, None))
for samplePath, expectedKind in expectedKinds:
    assert Pdf.getPathType(samplePath) == expectedKind