In [2]:
# default_exp pdf_core

# Pdf Core

> Basic pdf operations available to fastagger.

In [3]:
#hide
from nbdev.showdoc import *

In [6]:
import os
from pdf2image import convert_from_path

In [52]:
def checkPath(path):
    if os.path.isdir(path):  
        return "folder"
    elif os.path.isfile(path):  
        return "file"  
    else:
        return None

In [54]:
file = os.path.abspath('LICENSE')
folder = os.path.abspath('docs')
error = ('doesNotExist')
assert checkPath(file) ==  "file" 
assert checkPath(folder) == "folder"
assert checkPath(error) == None

In [100]:
#export
def importPDFs(path, outputFolder, dpi=200, grayscale=True, size=(600, 846)):
    """imports pdf pages to folder
    path: string path for the file or folder
    recursive: boolean choice for traversing the path folder searching for pdf files
    outputFolder: pages image folder destiny
    dpi: image quality, default 200
    grayscale: filter to grayscale, default True
    size: page image size, default (600, 846)
    """
    if checkPath(path) == 'folder':
        for root, d_names, f_names in os.walk(path):
            for f in f_names:
                if f.endswith('.pdf'):
                    try:
                        importPDF(os.path.join(root, f), outputFolder, dpi, grayscale, size)
                    except AlreadyImportedError as e:
                        print(e)

In [103]:
#hide
importPDFs('/Users/fb/Code/src/github.com/nbdev/fastagger', './')

('Pdf already imported: ', '0003230-12.2010.4.01.3500_2.pdf')


In [102]:
class Error(Exception):
    pass

class AlreadyImportedError(Error):
    pass

def importPDF(filePath, outputFolder, dpi=200, grayscale=True, size=(600, 846)):
    """Imports pdf pages to folder
    filePath: tring path for the file to be imported
    outputFolder: pages image folder destiny
    dpi: image quality, default 200
    grayscale: filter to grayscale, default True
    size: page image size, default (600, 846)
    """
    if checkPath(filePath) == 'file':
        fileName = os.path.basename(filePath)
        if fileName.endswith('.pdf'):
            fileNameBase = os.path.splitext(fileName)[0]
            outputFolder = outputFolder + fileNameBase
            if os.path.exists(outputFolder):
                raise AlreadyImportedError('Pdf already imported: ',fileName)                      
            metadata = {fileNameBase: {}}
            try:
                pages = convert_from_path(filePath, dpi=200, grayscale=True, size=(600, 846))
            except:
                raise Exception("Could not convert from path: ", filePath)                      

            metadata[fileNameBase]['path'] = outputFolder
            metadata[fileNameBase]['pages'] = dict([(x,0) for x in range(1,len(pages)+1)])
            
            try:
                os.mkdir(outputFolder)
            except OSError:
                raise Exception("Could not create folder: ", outputFolder)
                
            for idx, page in enumerate(pages):
                try:
                    page.save(outputFolder+'/'+fileNameBase+'_'+str(idx+1)+'.png', 'PNG')
                except OSError:
                    raise Exception("Could not save file:", outputFolder+'/'+fileNameBase+'_'+str(idx+1)+'.png')            
            return metadata
        else:
            raise Exception('File must be pdf: ', fileName) 
    else:
        raise Exception('Invalid file path', filePath)

In [104]:
#hide
importPDF('0003230-12.2010.4.01.3500_2.pdf','./') 

AlreadyImportedError: ('Pdf already imported: ', '0003230-12.2010.4.01.3500_2.pdf')