In [1]:
import json
import os
import pandas as pd

In [2]:
from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer  
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle   
from reportlab.lib.enums import TA_CENTER,TA_JUSTIFY       
from reportlab.pdfbase import pdfmetrics      
from reportlab.pdfbase.ttfonts import TTFont   
from reportlab.lib.fonts import addMapping
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

In [3]:
def get_document_pages(filename, encoding='utf-8'):
    """Read a JSON file and return the ``pages`` entry of its first output.

    Parameters
    ----------
    filename : str
        Path of the JSON file to read.
    encoding : str, optional
        Text encoding used to open the file (default ``'utf-8'``).

    Returns
    -------
    object
        The value stored at ``['outputs'][0]['pages']`` in the parsed JSON.
    """
    with open(filename, mode='r', encoding=encoding) as json_file:
        document = json.load(json_file)

    first_output = document['outputs'][0]
    return first_output['pages']

def vertices_to_boundingbox(vertices):
    """Convert a vertex list into a ``(left, top, width, height)`` tuple.

    ``left`` and ``top`` are the ``'x'`` and ``'y'`` of the first vertex;
    ``width`` and ``height`` are the differences between the third vertex
    and the first. The first four entries of ``vertices`` are accessed, so a
    shorter sequence raises ``IndexError``.
    """
    origin, _second, opposite, _fourth = (vertices[i] for i in range(4))

    left = origin['x']
    top = origin['y']
    width = opposite['x'] - origin['x']
    height = opposite['y'] - origin['y']
    return (left, top, width, height)

def get_page_words(page):
    """Collect the word entries of every ``TEXT`` region on a page.

    A region contributes only when it has both a ``'class'`` and a
    ``'children'`` key and its class is ``'TEXT'``. Each child of such a
    region is treated as a line, and the line's own ``'children'`` (when
    present) are appended to the result in encounter order.

    Returns an empty list when the page has no ``'regions'`` key.
    """
    if 'regions' not in page:
        return []

    collected = []
    for region in page['regions']:
        # Skip regions that are missing either key before looking at the class.
        if 'class' not in region or 'children' not in region:
            continue
        if region['class'] != 'TEXT':
            continue
        for text_line in region['children']:
            if 'children' in text_line:
                collected.extend(text_line['children'])
    return collected

def words_to_dataframe(words):
    """Build a position-sorted DataFrame from word entries.

    Entries without a ``'text'`` key, or whose text is empty, are skipped.
    For every remaining entry the bounding box is derived from
    ``word['boundingBox']['vertices']`` and the font size is read from
    ``word['font']['size']``.

    Returns
    -------
    pandas.DataFrame
        Rows sorted ascending by ``top`` and then ``left``, with columns
        ``index, text, top, left, width, height, font_size``. ``index``
        holds each row's position before sorting.
    """
    column_names = ['text', 'top', 'left', 'width', 'height', 'font_size']

    rows = []
    for word in words:
        if 'text' not in word or len(word['text']) == 0:
            continue
        text = word['text']
        left, top, width, height = vertices_to_boundingbox(word['boundingBox']['vertices'])
        rows.append((text, top, left, width, height, word['font']['size']))

    unsorted_frame = pd.DataFrame(rows, columns=column_names)
    sorted_frame = unsorted_frame.sort_values(['top', 'left'], ascending=[True, True])

    # reset_index() without drop=True keeps the pre-sort positions as an
    # 'index' column, which is part of the returned frame.
    return sorted_frame.reset_index()

def show_df(df):
    """Return every row of ``df`` by asking ``head`` for as many rows as it holds."""
    total_rows = len(df.index)
    return df.head(total_rows)

# directory setup

In [4]:
# WORKING_DIR is the parent of the current working directory, not the
# working directory itself.
WORKING_DIR = os.path.dirname(os.getcwd())

# First-level folders under WORKING_DIR.
FONT_DIR, DATA_DIR = (
    os.path.join(WORKING_DIR, folder) for folder in ('font', 'data')
)

# Input and output folders under DATA_DIR.
DATA_INPUT_DIR, DATA_OUTPUT_DIR = (
    os.path.join(DATA_DIR, folder) for folder in ('input', 'output')
)

# Loading the vernacular Kannada font

In [None]:
# Register the TrueType file found in FONT_DIR with reportlab under FONT_NAME,
# so the font can later be selected by that name.
FONT_NAME = 'Lohit-Kannada'
FONT_FILENAME = os.path.join(FONT_DIR, '%s.ttf' % FONT_NAME)
pdfmetrics.registerFont(TTFont(FONT_NAME, FONT_FILENAME))


In [4]:
# Document to convert. The output name is the full input filename with
# '.pdf' appended (the '.json' part is kept).
filename = '5.kannada.json'
input_filepath, output_filepath = (
    os.path.join(DATA_INPUT_DIR, filename),
    os.path.join(DATA_OUTPUT_DIR, filename + '.pdf'),
)

In [5]:
# Load the pages of the input document and report how many there are.
pages = get_document_pages(filename=input_filepath)
print('total number of pages in document %d' % len(pages))

total number of pages in document 1


In [6]:
# Only the first page of the document is processed.
page = pages[0]

# Take (width, height) by slicing instead of unpacking into `_`: assigning
# `_` at the top level of a notebook overwrites IPython's "last output"
# variable, which IPython then stops updating.
page_width, page_height = vertices_to_boundingbox(page['vertices'])[2:]

words = get_page_words(page)
df = words_to_dataframe(words)
print('total number of words %d, page_width %d, page_height %d' % (len(words), page_width, page_height))

total number of words 249, page_width 2484, page_height 3509


In [7]:
# Divisor applied to the page size, every word coordinate and the font size
# when mapping them onto the PDF canvas.
SCALE_FACTOR_A4_72DPI = 4

# Font size before scaling; the same size is used for every word.
# NOTE(review): the per-word 'font_size' column of df is not used here —
# confirm whether each word should be drawn at its own size instead.
FONT_SIZE_UNSCALED = 30

# Make sure the output folder exists before the PDF is written, so a fresh
# checkout does not fail at save time.
os.makedirs(DATA_OUTPUT_DIR, exist_ok=True)

c = canvas.Canvas(output_filepath, pagesize=(page_width/SCALE_FACTOR_A4_72DPI, page_height/SCALE_FACTOR_A4_72DPI))

# Line width and font are identical for every word, so they are set once
# rather than on each iteration.
c.setLineWidth(0.5)
c.setFont(FONT_NAME, FONT_SIZE_UNSCALED/SCALE_FACTOR_A4_72DPI)

# `top` is subtracted from the page height before drawing — presumably the
# source measures y downwards from the top edge while the canvas origin is
# at the bottom-left (TODO confirm against the input format).
for text, left, top in zip(df['text'], df['left'], df['top']):
    c.drawString(left/SCALE_FACTOR_A4_72DPI, (page_height - top)/SCALE_FACTOR_A4_72DPI, text)

c.save()