In [2]:
import os
import sys
import time
import base64
import uuid
import json
import codecs
import pandas as pd
import math

In [3]:
base_dir        = os.path.dirname(os.getcwd())
input_dir       = os.path.join(base_dir, 'data', 'input')
output_dir      = os.path.join(base_dir, 'data', 'output')
filename        = 'judgement.json'

input_filepath  = os.path.join(input_dir, filename)
output_filepath = os.path.join(output_dir, os.path.splitext(os.path.basename(filename))[0]+'.docx')


In [4]:
def get_pages(filepath):
    data  = json.load(codecs.open(filepath, 'r', 'utf-8-sig'))
    pages = data['data']
    return pages

In [5]:
pages =  get_pages(input_filepath)   
print('document has %d pages' % (len(pages)))

document has 54 pages


In [6]:
dfs              = []
page_width       = None
page_height      = None

for page in pages:
    text_tops        = []
    text_lefts       = []
    text_widths      = []
    text_heights     = []
    font_sizes       = []
    font_families    = []
    font_colors      = []
    text_values      = []
    b64_images       = []

    images       = page['images']
    texts        = page['text_blocks']
    page_num     = page['page_no']
    page_width   = page['page_width']
    page_height  = page['page_height']
    
    for text in texts:
        text_tops.append(text['text_top'])
        text_lefts.append(text['text_left'])
        text_widths.append(text['text_width'])
        text_heights.append(text['text_height'])
        font_sizes.append(text['font_size'])
        font_families.append(text['font_family'])
        font_colors.append(text['font_color'])
        b64_images.append(None)
        
        text_value = []
        for processed_text in text['tokenized_sentences']:
            text_value.append(processed_text['src_text'])        
        text_values.append(' '.join(text_value))
    
    for image in images:
        text_tops.append(image['text_top'])
        text_lefts.append(image['text_left'])
        text_widths.append(image['text_width'])
        text_heights.append(image['text_height'])
        b64_images.append(image['base64'])
        text_values.append(None)
        font_sizes.append(None)
        font_families.append(None)
        font_colors.append(None)
    
    df = pd.DataFrame(list(zip(text_tops, text_lefts, text_widths, text_heights,
                                            text_values, font_sizes, font_families, font_colors, b64_images)), 
                              columns =['text_top', 'text_left', 'text_width', 'text_height',
                                          'text', 'font_size', 'font_family', 'font_color', 'base64'])
    df.sort_values('text_top', axis = 0, ascending = True, inplace=True)    
    df.reset_index(inplace = True, drop = True) 
    dfs.append(df)


KeyError: 'tgt_text'

In [7]:
def update_horizontal_spacings_v1(df):
    horizontal_spacings     = []
    prev_top                = 0

    for index in range(df.shape[0]):
        curr_top = df.iloc[index]['text_top']
        if  curr_top > prev_top:
            horizontal_space    = 0
            prev_top            = curr_top
        else:
            horizontal_space    = df.iloc[index]['text_left'] \
                                    - (df.iloc[index-1]['text_left'] + df.iloc[index-1]['text_width'])
        horizontal_spacings.append(horizontal_space)

    new_df = df.copy()
    if 'horizontal_space' in new_df.columns:
        del new_df['horizontal_space']
    new_df['horizontal_space'] = horizontal_spacings
    return new_df


def update_vertical_spacings_v1(df):
    vertical_spacings       = []
    prev_top                = 0
    prev_height             = 0
    prev_vertical_space     = 0

    for index in range(df.shape[0]):
        curr_top    = df.iloc[index]['text_top']
        curr_height = df.iloc[index]['text_height']
        if  curr_top > prev_top:
            vertical_space      = curr_top - (prev_top+prev_height)
            
            prev_top            = curr_top
            prev_height         = curr_height
            prev_vertical_space = vertical_space
        else:
            vertical_space      = prev_vertical_space
        
        if index == 0:
            vertical_space      = 0
        vertical_spacings.append(vertical_space)
    
    new_df = df.copy()
    if 'vertical_space' in new_df.columns:
        del new_df['vertical_space']
    new_df['vertical_space'] = vertical_spacings
    return new_df


In [8]:
index  = 0
in_df  = dfs[index]
new_df = update_horizontal_spacings_v1(in_df)
df     = update_vertical_spacings_v1(new_df)


IndexError: list index out of range

In [9]:
from docx import Document
from docx.shared import Pt
from docx.shared import Twips, Cm, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK
from docx.enum.section import WD_SECTION, WD_ORIENT
from docx.shared import Length


def get_pixel_twips(pixels):
    PIXEL_TO_TWIPS = 14.999903622654
    return int(PIXEL_TO_TWIPS * pixels)

def get_font_point(pixels):
    return pixels * 0.75

def get_cms(pixels):
    PPI          = 108
    INCH_TO_CM   = 2.54
    PIXEL_PER_CM = PPI / 2.54
    
    return pixels / PIXEL_PER_CM

def get_path_from_base64(work_dir, b64_data):
    filepath = os.path.join(work_dir, str(uuid.uuid4().hex) + '.jpg')
    with open(filepath, 'wb') as file:
        file.write(base64.b64decode(b64_data))
    return filepath

def pixel_to_twips(px, dpi=108):
    INCH_TO_TWIPS  = 1440
    px_to_inches   = 1.0 / float(dpi)
    return math.ceil(px * px_to_inches * INCH_TO_TWIPS)

In [10]:
df

Unnamed: 0,text_top,text_left,text_width,text_height,text,font_size,font_family,font_color,base64,horizontal_space,vertical_space
0,0,0,892,1263,,,,,iVBORw0KGgoAAAANSUhEUgAAA3wAAATvCAIAAADSI2UWAA...,-454,0
1,111,694,90,16,Reportable,17.0,"Arial,Bold",#000000,,0,111
2,153,270,334,17,IN THE SUPREME COURT OF INDIA,18.0,"Arial,Bold",#000000,,0,26
3,175,292,307,17,CIVIL APPELLATE JURISDICTION,18.0,"Arial,Bold",#000000,,0,5
4,218,290,318,17,Civil Appeal Nos 9367-9369 of 2011,18.0,"Arial,Bold",#000000,,0,26
5,304,108,328,17,"The Secretary, Ministry of Defence",18.0,"Arial,Bold",#000000,,0,69
6,304,540,221,17,......Appellant,18.0,"Arial,Bold",#000000,,104,69
7,390,16,500,17,Versus,18.0,"Arial,Bold",#000000,,0,69
8,455,108,331,17,Babita Puniya & Ors.,18.0,"Arial,Bold",#000000,,0,48
9,455,594,197,17,......Respondents,18.0,"Arial,Bold",#000000,,155,48


In [12]:
document  = Document()
section             = document.sections[-1]
section.orientation = WD_ORIENT.PORTRAIT
section.page_width  = Cm(get_cms(page_width))
section.page_height = Cm(get_cms(page_height))

section.left_margin   = Cm(0)
section.right_margin  = Cm(0)
section.top_margin    = Cm(0)
section.bottom_margin = Cm(0)
LEFT_ADJUST           = 0
TOP_ADJUST            = 0

# empty p at start of page
# p                     = document.add_paragraph()
# p_format              = p.paragraph_format
# p_format.line_spacing = Pt(18)

###
index = 1
p                      = document.add_paragraph()
p_format               = p.paragraph_format
p_format.left_indent   = Twips(pixel_to_twips(df.iloc[index]['text_left'])) - LEFT_ADJUST
# p_format.space_before  = Twips(pixel_to_twips(df.iloc[index]['vertical_space']))  - TOP_ADJUST

run                    = p.add_run()
font                   = run.font
font.name              = 'Arial'
font.size              = Twips(pixel_to_twips(df.iloc[index]['font_size']))

for num_line_break in range(int(df.iloc[index]['vertical_space']/df.iloc[index]['font_size'])):
    run.add_break(WD_BREAK.LINE)
run.add_text(df.iloc[index]['text'])
###

###
index = 2
p                      = document.add_paragraph()
p_format               = p.paragraph_format
p_format.left_indent   = Twips(pixel_to_twips(df.iloc[index]['text_left'])) - LEFT_ADJUST
# p_format.space_before  = Twips(pixel_to_twips(df.iloc[index]['vertical_space']))  - TOP_ADJUST

run                    = p.add_run()
font                   = run.font
font.name              = 'Arial'
font.size              = Twips(pixel_to_twips(df.iloc[index]['font_size']))
for num_line_break in range(int(df.iloc[index]['vertical_space']/df.iloc[index]['font_size'])):
    run.add_break(WD_BREAK.LINE)
run.add_text(df.iloc[index]['text'])
###

###
index = 3
p                      = document.add_paragraph()
p_format               = p.paragraph_format
p_format.left_indent   = Twips(pixel_to_twips(df.iloc[index]['text_left'])) - LEFT_ADJUST
# p_format.space_before  = Twips(pixel_to_twips(df.iloc[index]['vertical_space']))  - TOP_ADJUST
run                    = p.add_run()
font                   = run.font
font.name              = 'Arial'
font.size              = Twips(pixel_to_twips(df.iloc[index]['font_size']))
for num_line_break in range(int(df.iloc[index]['vertical_space']/df.iloc[index]['font_size'])):
    run.add_break(WD_BREAK.LINE)
run.add_text(df.iloc[index]['text'])
###

###
index = 4
p                      = document.add_paragraph()
p_format               = p.paragraph_format
p_format.left_indent   = Twips(pixel_to_twips(df.iloc[index]['text_left'])) - LEFT_ADJUST
# p_format.space_before  = Twips(pixel_to_twips(df.iloc[index]['vertical_space']))  - TOP_ADJUST

run                    = p.add_run()
font                   = run.font
font.name              = 'Arial'
font.size              = Twips(pixel_to_twips(df.iloc[index]['font_size']))
for num_line_break in range(int(df.iloc[index]['vertical_space']/df.iloc[index]['font_size'])):
    run.add_break(WD_BREAK.LINE)
run.add_text(df.iloc[index]['text'])
###

###
index = 5
p                      = document.add_paragraph()
p_format               = p.paragraph_format
p_format.left_indent   = Twips(pixel_to_twips(df.iloc[index]['text_left'])) - LEFT_ADJUST
# p_format.space_before  = Twips(pixel_to_twips(df.iloc[index]['vertical_space']))  - TOP_ADJUST

run                    = p.add_run()
font                   = run.font
font.name              = 'Arial'
font.size              = Twips(pixel_to_twips(df.iloc[index]['font_size']))
for num_line_break in range(int(df.iloc[index]['vertical_space']/df.iloc[index]['font_size'])):
    run.add_break(WD_BREAK.LINE)
run.add_text(df.iloc[index]['text'])
###

###
index = 6

run                    = p.add_run()
font                   = run.font
font.name              = 'Arial'
font.size              = Twips(pixel_to_twips(df.iloc[index]['font_size']))
run.add_text('\t\t\t')
run                    = p.add_run()
font                   = run.font
font.name              = 'Arial'
font.size              = Twips(pixel_to_twips(df.iloc[index]['font_size']))
run.add_text(df.iloc[index]['text'])
###

###
index = 7
p                      = document.add_paragraph()
p_format               = p.paragraph_format
p_format.left_indent   = Twips(pixel_to_twips(df.iloc[index]['text_left'])) - LEFT_ADJUST
# p_format.space_before  = Twips(pixel_to_twips(df.iloc[index]['vertical_space']))  - TOP_ADJUST

run                    = p.add_run()
font                   = run.font
font.name              = 'Arial'
font.size              = Twips(pixel_to_twips(df.iloc[index]['font_size']))
for num_line_break in range(int(df.iloc[index]['vertical_space']/df.iloc[index]['font_size'])):
    run.add_break(WD_BREAK.LINE)
run.add_text(df.iloc[index]['text'])
###

###
index = 8
p                      = document.add_paragraph()
p_format               = p.paragraph_format
p_format.left_indent   = Twips(pixel_to_twips(df.iloc[index]['text_left'])) - LEFT_ADJUST
# p_format.space_before  = Twips(pixel_to_twips(df.iloc[index]['vertical_space']))  - TOP_ADJUST

run                    = p.add_run()
font                   = run.font
font.name              = 'Arial'
font.size              = Twips(pixel_to_twips(df.iloc[index]['font_size']))
for num_line_break in range(int(df.iloc[index]['vertical_space']/df.iloc[index]['font_size'])):
    run.add_break(WD_BREAK.LINE)
run.add_text(df.iloc[index]['text'])
###

###
index = 9

run                    = p.add_run()
font                   = run.font
font.name              = 'Arial'
font.size              = Twips(pixel_to_twips(df.iloc[index]['font_size']))

tab_stops              = p_format.tab_stops
tab_stop               = tab_stops.add_tab_stop(Inches(1.5))
run.add_text('\t')

run                    = p.add_run()
font                   = run.font
font.name              = 'Arial'
font.size              = Twips(pixel_to_twips(df.iloc[index]['font_size']))
run.add_text(df.iloc[index]['text'])
###

###
index = 10
p                      = document.add_paragraph()
p_format               = p.paragraph_format
p_format.left_indent   = Twips(pixel_to_twips(df.iloc[index]['text_left'])) - LEFT_ADJUST
# p_format.space_before  = Twips(pixel_to_twips(df.iloc[index]['vertical_space']))  - TOP_ADJUST

run                    = p.add_run()
font                   = run.font
font.name              = 'Arial'
font.size              = Twips(pixel_to_twips(df.iloc[index]['font_size']))
for num_line_break in range(int(df.iloc[index]['vertical_space']/df.iloc[index]['font_size'])):
    run.add_break(WD_BREAK.LINE)
run.add_text(df.iloc[index]['text'])
###

###
index = 11
p                      = document.add_paragraph()
p_format               = p.paragraph_format
p_format.left_indent   = Twips(pixel_to_twips(df.iloc[index]['text_left'])) - LEFT_ADJUST
# p_format.space_before  = Twips(pixel_to_twips(df.iloc[index]['vertical_space']))  - TOP_ADJUST

run                    = p.add_run()
font                   = run.font
font.name              = 'Arial'
font.size              = Twips(pixel_to_twips(df.iloc[index]['font_size']))
for num_line_break in range(int(df.iloc[index]['vertical_space']/df.iloc[index]['font_size'])):
    run.add_break(WD_BREAK.LINE)
run.add_text(df.iloc[index]['text'])
###

###
index = 12
p                      = document.add_paragraph()
p_format               = p.paragraph_format
p_format.left_indent   = Twips(pixel_to_twips(df.iloc[index]['text_left'])) - LEFT_ADJUST
# p_format.space_before  = Twips(pixel_to_twips(df.iloc[index]['vertical_space']))  - TOP_ADJUST

run                    = p.add_run()
font                   = run.font
font.name              = 'Arial'
font.size              = Twips(pixel_to_twips(df.iloc[index]['font_size']))
for num_line_break in range(int(df.iloc[index]['vertical_space']/df.iloc[index]['font_size'])):
    run.add_break(WD_BREAK.LINE)
run.add_text(df.iloc[index]['text'])
###

###
index = 13
p                      = document.add_paragraph()
p_format               = p.paragraph_format
p_format.left_indent   = Twips(pixel_to_twips(df.iloc[index]['text_left'])) - LEFT_ADJUST
# p_format.space_before  = Twips(pixel_to_twips(df.iloc[index]['vertical_space']))  - TOP_ADJUST

run                    = p.add_run()
font                   = run.font
font.name              = 'Arial'
font.size              = Twips(pixel_to_twips(df.iloc[index]['font_size']))
for num_line_break in range(int(df.iloc[index]['vertical_space']/df.iloc[index]['font_size'])):
    run.add_break(WD_BREAK.LINE)
run.add_text(df.iloc[index]['text'])
###

###
index = 14
p                      = document.add_paragraph()
p_format               = p.paragraph_format
p_format.left_indent   = Twips(pixel_to_twips(df.iloc[index]['text_left'])) - LEFT_ADJUST
# p_format.space_before  = Twips(pixel_to_twips(df.iloc[index]['vertical_space']))  - TOP_ADJUST

run                    = p.add_run()
font                   = run.font
font.name              = 'Arial'
font.size              = Twips(pixel_to_twips(df.iloc[index]['font_size']))
for num_line_break in range(int(df.iloc[index]['vertical_space']/df.iloc[index]['font_size'])):
    run.add_break(WD_BREAK.LINE)
run.add_text(df.iloc[index]['text'])
###

    
document.save(output_filepath)

In [None]:
import docx


def add_bookmark(paragraph, bookmark_text, bookmark_name):
    run = paragraph.add_run()
    tag = run._r  # for reference the following also works: tag =  document.element.xpath('//w:r')[-1]
    start = docx.oxml.shared.OxmlElement('w:bookmarkStart')
    start.set(docx.oxml.ns.qn('w:id'), '0')
    start.set(docx.oxml.ns.qn('w:name'), bookmark_name)
    tag.append(start)

    text = docx.oxml.OxmlElement('w:r')
    text.text = bookmark_text
    tag.append(text)

    end = docx.oxml.shared.OxmlElement('w:bookmarkEnd')
    end.set(docx.oxml.ns.qn('w:id'), '0')
    end.set(docx.oxml.ns.qn('w:name'), bookmark_name)
    tag.append(end)