In [1]:
import json
import os


In [2]:
from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer  
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle   
from reportlab.lib.enums import TA_CENTER,TA_JUSTIFY       
from reportlab.pdfbase import pdfmetrics      
from reportlab.pdfbase.ttfonts import TTFont   
from reportlab.lib.fonts import addMapping
from reportlab.lib.pagesizes import letter, A4
from reportlab.pdfgen import canvas

In [3]:
def get_document_pages(filename, encoding='utf-8'):
    '''
    Read a JSON document from disk and return the pages of its first input.

    filename : path of the JSON file to read
    encoding : text encoding used to open the file (default 'utf-8')

    Returns the value stored under inputs[0]['pages'] of the parsed JSON.
    '''
    with open(filename, mode='r', encoding=encoding) as handle:
        document = json.loads(handle.read())
    first_input = document['inputs'][0]
    return first_input['pages']

def vertices_to_boundingbox(vertices):
    '''
    Convert a list of corner vertices into a (left, top, width, height) tuple.

    vertices : sequence of at least four mappings with 'x' and 'y' keys.
               Only the first and third entries are used for the result:
               the first gives left/top, and width/height are the x/y
               differences between the third and the first.
    '''
    # All four corners are indexed, so fewer than four entries raises IndexError.
    first_corner, _, third_corner, _ = (vertices[i] for i in range(4))
    left = first_corner['x']
    top = first_corner['y']
    width = third_corner['x'] - left
    height = third_corner['y'] - top
    return (left, top, width, height)


In [4]:
def _region_is(region, class_name, required_keys=('class', 'regions')):
    '''
    Tell whether `region` carries every required key and the given 'class' value.
    '''
    present = region.keys()
    return all(key in present for key in required_keys) and region['class'] == class_name


def get_page_paragraphs_lines(page):
    '''
    Collect paragraph and line text, with bounding boxes, from one page.

    The page is walked as page['regions'] -> 'PARA' regions -> 'LINE' regions
    -> 'WORD' regions. A region is only considered when it has both a 'class'
    and a 'regions' key (words additionally need a 'text' key).

    Returns a tuple (page_paragraphs, page_lines). Both are lists of dicts
    with the keys 'boundingBox' (left, top, width, height) and 'text'.
    Line text is the words joined by single spaces; paragraph text is its
    lines, each terminated by a newline.
    '''
    paragraphs = []
    text_lines = []

    if 'regions' not in page.keys():
        return paragraphs, text_lines

    for para in page['regions']:
        if not _region_is(para, 'PARA'):
            continue

        para_chunks = []
        for line in para['regions']:
            if not _region_is(line, 'LINE'):
                continue

            words = [
                word['text']
                for word in line['regions']
                if _region_is(word, 'WORD', ('class', 'regions', 'text'))
            ]
            joined = ' '.join(words)
            para_chunks.append(joined + '\n')
            text_lines.append({
                'boundingBox': vertices_to_boundingbox(line['boundingBox']['vertices']),
                'text': joined,
            })

        paragraphs.append({
            'boundingBox': vertices_to_boundingbox(para['boundingBox']['vertices']),
            'text': ''.join(para_chunks),
        })

    return paragraphs, text_lines

def get_page_dimensions(page):
    '''
    Return the (width, height) of a page taken from its bounding box vertices.
    '''
    page_box = vertices_to_boundingbox(page['boundingBox']['vertices'])
    # page_box is (left, top, width, height); only the size part is needed.
    width, height = page_box[2], page_box[3]
    return width, height

def draw_line_text(page_canvas, x, y, text, word_space=1.75, horizontal_scale=105, font_name=None, font_size=8):
    '''
    Draw a single line of text on the canvas at the given position.

    page_canvas      : canvas providing beginText() and drawText()
    x, y             : text origin passed to the text object
    text             : the line to draw
    word_space       : extra spacing between words
    horizontal_scale : horizontal scaling of the text (100 means unscaled)
    font_name        : name of a registered font; None falls back to 'Helvetica'
    font_size        : font size passed to setFont
    '''
    if font_name is None:
        # setFont needs a real font name; 'Helvetica' is one of the standard
        # PDF fonts reportlab ships, so the declared default stays usable.
        font_name = 'Helvetica'

    txtobj = page_canvas.beginText()
    txtobj.setTextOrigin(x, y)
    # Honour the caller's values; these were previously hard-coded, which
    # silently ignored both arguments.
    txtobj.setWordSpace(word_space)
    txtobj.setHorizScale(horizontal_scale)
    txtobj.setFont(font_name, font_size)
    txtobj.textLine(text=text)
    page_canvas.drawText(txtobj)
    
def create_pdf(pages, pdf_filepath, font_name, scale_factor=4, font_size=30):
    '''
    Render the text lines of every page into one PDF file.

    The first page's width & height (divided by scale_factor) are used as the
    canvas size for the whole document, and the first page's height is also
    the reference for flipping every line's y coordinate.

    pages        : list of page dicts; pages[0] must exist (IndexError otherwise)
    pdf_filepath : path of the PDF to write
    font_name    : name of an already registered font
    scale_factor : divisor applied to page size, coordinates and font size
    font_size    : font size in source-page units, before scaling (default 30,
                   the value that used to be hard-coded)
    '''
    w, h             = get_page_dimensions(pages[0])
    c                = canvas.Canvas(pdf_filepath, pagesize=(w/scale_factor, h/scale_factor))
    scaled_font_size = font_size/scale_factor

    for page in pages:
        _, lines = get_page_paragraphs_lines(page)

        for line in lines:
            x, y, _, _ = line['boundingBox']
            # The line's top is measured down from the page height; presumably the
            # source uses a top-left origin while the PDF canvas is bottom-left
            # (TODO confirm against the input format).
            y          = h - y
            draw_line_text(c, x/scale_factor, y/scale_factor, line['text'], 1.75, 105, font_name, scaled_font_size)
        c.showPage()
    c.save()

def load_font(font_name='arial-unicode-ms', font_dir=None):
    '''
    Register the TrueType font `<font_name>.ttf` with reportlab under font_name.

    font_name : name to register the font as; also the file's base name
    font_dir  : directory containing the .ttf file. When None, the bare file
                name is handed to TTFont and locating the file is left to
                reportlab (previously None raised TypeError in os.path.join).
    '''
    font_file = font_name + '.ttf'
    if font_dir is not None:
        font_file = os.path.join(font_dir, font_file)
    pdfmetrics.registerFont(TTFont(font_name, font_file))

# directory setup

In [5]:
# Every path is derived from the parent of the current working directory.
WORKING_DIR = os.path.split(os.getcwd())[0]

# <WORKING_DIR>/font holds the .ttf files, <WORKING_DIR>/data the documents.
FONT_DIR, DATA_DIR = (os.path.join(WORKING_DIR, leaf) for leaf in ('font', 'data'))

# Input JSON files and generated PDFs live in separate sub-directories of data.
DATA_INPUT_DIR, DATA_OUTPUT_DIR = (os.path.join(DATA_DIR, leaf) for leaf in ('input', 'output'))

# loading vernacular font

In [6]:
# Document to convert: the JSON is read from DATA_INPUT_DIR and the PDF is
# written to DATA_OUTPUT_DIR.
filename         = '6.tamil.json'
input_filepath   = os.path.join(DATA_INPUT_DIR, filename)
# '.pdf' is appended to the full file name, so the result is '6.tamil.json.pdf'.
output_filepath  = os.path.join(DATA_OUTPUT_DIR, filename + '.pdf')

In [7]:
# Parse the input JSON and keep the page list of its first input.
pages  = get_document_pages(input_filepath)
print('total number of pages in document %d' % (len(pages)))

total number of pages in document 3


In [8]:
# Register the font from FONT_DIR first; create_pdf refers to it by the same name.
load_font('arial-unicode-ms', FONT_DIR)
# Render all pages with a scale factor of 4.
create_pdf(pages, output_filepath, 'arial-unicode-ms', 4)
print('%s created' % (output_filepath))

/Users/kd/Workspace/python/PDF/REPORTLAB/ExportPDF/data/output/6.tamil.json.pdf created


In [6]:
# NOTE(review): this cell carries an earlier execution count (In [6]) than the
# cells around it. It performs the same registration as
# load_font('arial-unicode-ms', FONT_DIR), but also defines FONT_NAME, which
# the cells below read.
FONT_NAME        = 'arial-unicode-ms'
FONT_FILENAME    = os.path.join(FONT_DIR, FONT_NAME + '.ttf')
pdfmetrics.registerFont(TTFont(FONT_NAME, FONT_FILENAME)) 


In [9]:
# Work on the first page only for the single-page experiment below.
page              = pages[0]
# Same path as output_filepath, so a later canvas written to pdf_filepath
# replaces the PDF produced by create_pdf above.
pdf_filepath      = output_filepath


In [10]:
# Render only `page` at its original size (scale factor 1).
A4_SCALE_FACTOR       = 1
w, h                  = get_page_dimensions(page)
pagesize              = (w/A4_SCALE_FACTOR, h/A4_SCALE_FACTOR)
c                     = canvas.Canvas(pdf_filepath, pagesize=pagesize)

# `lines` was never assigned at notebook level, so extract it from the page
# here instead of relying on leftover kernel state.
paragraphs, lines     = get_page_paragraphs_lines(page)

for line in lines:
    boundingBox, text = line['boundingBox'], line['text']
    x, y, _, _        = boundingBox
    # Measure y up from the bottom of the page instead of down from the top.
    y                 = h - y
    # The canvas is the first argument of draw_line_text; it was missing before.
    draw_line_text(c, x/A4_SCALE_FACTOR, y/A4_SCALE_FACTOR, text, 1.75, 105, FONT_NAME, 30/A4_SCALE_FACTOR)

c.save()

In [30]:
# Draw one string per DataFrame row onto a canvas scaled down by 4.
# NOTE(review): page_width, page_height and df are not defined by any cell
# visible in this notebook, and this cell's execution count (In [30]) is out
# of sequence — it presumably depends on state from deleted cells; confirm
# before relying on it.
SCALE_FACTOR_A4_72DPI = 4

c = canvas.Canvas(output_filepath, pagesize=(page_width/SCALE_FACTOR_A4_72DPI, page_height/SCALE_FACTOR_A4_72DPI))

for index, row in df.iterrows():
    # `top` is measured up from the bottom of the page; font_size is read from
    # the row but not used, the font size is fixed at 30 before scaling.
    text, left, top, font_size = row['text'], row['left'], page_height - row['top'], row['font_size']
    c.setLineWidth(0.5)
    c.setFont(FONT_NAME, 30/SCALE_FACTOR_A4_72DPI)
    c.drawString(left/SCALE_FACTOR_A4_72DPI, top/SCALE_FACTOR_A4_72DPI, text)

c.save()

In [None]:
def create_pdf_file(pages, pdf_filepath):
    '''
    Draw the words of every page into one A4 PDF, one PDF page per input page.

    pages        : list of page dicts, each with a 'vertices' entry
    pdf_filepath : path of the PDF to write

    NOTE(review): get_page_words and words_to_dataframe are not defined in the
    visible notebook, and the other helpers read page['boundingBox']['vertices']
    rather than page['vertices'] — confirm which page schema this expects.
    '''
    SCALE_FACTOR_A4_72DPI = 4
    c                     = canvas.Canvas(pdf_filepath, pagesize=A4)

    for page in pages:
        _, _, page_width, page_height = vertices_to_boundingbox(page['vertices'])
        words                         = get_page_words(page)
        df                            = words_to_dataframe(words)
        print('total number of words %d, page_width %d, page_height %d' % (len(words), page_width, page_height))

        # Line width and font are the same for every word, so set them once per
        # page. They are set again on each page because showPage() discards
        # the canvas state.
        c.setLineWidth(0.5)
        c.setFont(FONT_NAME, 30/SCALE_FACTOR_A4_72DPI)

        for _, row in df.iterrows():
            # Measure the word's top up from the bottom of the page.
            text, left, top = row['text'], row['left'], page_height - row['top']
            c.drawString(left/SCALE_FACTOR_A4_72DPI, top/SCALE_FACTOR_A4_72DPI, text)

        c.showPage()

    c.save()
