In [1]:
import os
import sys
import time
import base64
import uuid
import json
import codecs
import pandas as pd


In [11]:
# Paths are resolved relative to the parent of the current working directory:
#   <parent>/data/input  holds the source JSON,
#   <parent>/data/output receives the generated .docx (same stem as the input).
filename = 'judgement.json'
base_dir = os.path.dirname(os.getcwd())

input_dir, output_dir = (
    os.path.join(base_dir, 'data', leaf) for leaf in ('input', 'output')
)

input_filepath = os.path.join(input_dir, filename)
output_filepath = os.path.join(
    output_dir,
    '%s.docx' % os.path.splitext(os.path.basename(filename))[0],
)


In [12]:
# Echo the resolved output path so the destination can be checked before the document is saved.
output_filepath

'/Users/kd/Workspace/python/DOCX/document-formatting/data/output/judgement.docx'

In [3]:
def get_pages(filepath):
    """Load a JSON file and return the value stored under its top-level 'data' key.

    The file is decoded as 'utf-8-sig', so a leading UTF-8 byte-order mark
    is stripped if present.

    Args:
        filepath: path of the JSON file to read.

    Returns:
        The object found at ``data['data']`` in the parsed JSON.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
        KeyError: if the parsed JSON has no 'data' key.
    """
    # Context manager guarantees the handle is closed even when parsing fails.
    with open(filepath, 'r', encoding='utf-8-sig') as handle:
        data = json.load(handle)
    return data['data']

In [4]:
# Read every page record from the input JSON and report how many there are.
pages = get_pages(input_filepath)
print('document has %d pages' % len(pages))

document has 54 pages


In [5]:
# Column order shared by every per-page DataFrame.
TEXT_COLUMNS = ['text_top', 'text_left', 'text_width', 'text_height',
                'text', 'font_size', 'font_family', 'font_color']


def _text_block_to_record(text):
    """Flatten one entry of a page's 'text_blocks' into a row dict.

    The 'text' value is the space-joined 'src_text' of every item in the
    block's 'tokenized_sentences'; all other values are copied unchanged
    from the block.
    """
    record = {
        key: text[key]
        for key in ('text_top', 'text_left', 'text_width', 'text_height',
                    'font_size', 'font_family', 'font_color')
    }
    record['text'] = ' '.join(
        sentence['src_text'] for sentence in text['tokenized_sentences']
    )
    return record


# One DataFrame per page, in the same order as `pages`.
dfs         = []

page_width  = None
page_height = None

for page in pages:
    # Overwritten on every iteration: after the loop these hold the
    # dimensions of the LAST page only.
    page_width  = page['page_width']
    page_height = page['page_height']

    records = [_text_block_to_record(text) for text in page['text_blocks']]
    df = pd.DataFrame(records, columns=TEXT_COLUMNS)

    # Top-to-bottom order; 'text_left' breaks ties so blocks sharing the same
    # 'text_top' come out left-to-right instead of in an unspecified order.
    df = df.sort_values(['text_top', 'text_left'], ascending=True)

    dfs.append(df)



In [6]:
# Select one page's DataFrame by its position in `dfs` and display it for inspection.
df_index = 0
df       = dfs[df_index]
df

Unnamed: 0,text_top,text_left,text_width,text_height,text,font_size,font_family,font_color
1,111,694,90,16,Reportable,17,"Arial,Bold",#000000
2,153,270,334,17,IN THE SUPREME COURT OF INDIA,18,"Arial,Bold",#000000
3,175,292,307,17,CIVIL APPELLATE JURISDICTION,18,"Arial,Bold",#000000
4,218,290,318,17,Civil Appeal Nos 9367-9369 of 2011,18,"Arial,Bold",#000000
5,304,108,328,17,"The Secretary, Ministry of Defence",18,"Arial,Bold",#000000
6,304,540,221,17,......Appellant,18,"Arial,Bold",#000000
7,390,16,500,17,Versus,18,"Arial,Bold",#000000
8,455,108,331,17,Babita Puniya & Ors.,18,"Arial,Bold",#000000
9,455,594,197,17,......Respondents,18,"Arial,Bold",#000000
10,584,426,46,17,With,18,"Arial,Bold",#000000


In [7]:
from docx import Document
from docx.shared import Pt
from docx.shared import Twips, Cm
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK
from docx.enum.section import WD_SECTION, WD_ORIENT
from docx.shared import Length


# def get_twips(pixels):
#     PIXEL_TO_TWIPS = 0.0666666667
#     return int(PIXEL_TO_TWIPS * pixels)

# def get_cms(pixels):
#     return int(pixels * 0.0264583333)

def get_font_point(pixels):
    """Scale a pixel measurement by a fixed factor of 0.75 to obtain points."""
    points_per_pixel = 0.75
    return pixels * points_per_pixel

def get_cms(pixels, ppi=108):
    """Convert a pixel measurement to centimetres.

    Args:
        pixels: length in pixels.
        ppi: pixels per inch of the source coordinates. Defaults to 108,
            the value this function previously hard-coded.

    Returns:
        The length in centimetres as a float.

    Raises:
        ValueError: if ``ppi`` is not positive.
    """
    if ppi <= 0:
        raise ValueError('ppi must be positive, got %r' % (ppi,))

    INCH_TO_CM   = 2.54
    pixel_per_cm = ppi / INCH_TO_CM

    return pixels / pixel_per_cm


In [8]:
# Build the .docx: one paragraph per text block, pages separated by page breaks.
document  = Document()

if dfs:
    # Every page is written into the document's single (last) section, so its
    # geometry is configured once instead of on every loop iteration.
    # `page_width` / `page_height` are the notebook globals set while the
    # per-page DataFrames were built.
    section   = document.sections[-1]
    section.orientation = WD_ORIENT.PORTRAIT
    section.page_width  = Cm(get_cms(page_width))
    section.page_height = Cm(get_cms(page_height))

    section.left_margin   = Cm(1.27)
    section.right_margin  = Cm(1.27)
    section.top_margin    = Cm(1.27)
    section.bottom_margin = Cm(1.27)

last_page_index = len(dfs) - 1

for page_index, df in enumerate(dfs):
    for index, row in df.iterrows():
        paragraph = document.add_paragraph()

        # NOTE(review): the indent is applied on top of the 1.27 cm left margin;
        # if 'text_left' is measured from the page edge the text ends up shifted
        # right by the margin width -- confirm against the source coordinates.
        paragraph_format             = paragraph.paragraph_format
        paragraph_format.left_indent = Cm(get_cms(row['text_left']))

        run        = paragraph.add_run()
        font       = run.font
        font.name  = 'Arial'
        # Font size is converted with the same pixel-to-cm scale as positions.
        font.size  = Cm(get_cms(row['font_size']))
        run.add_text(row['text'])

    # Break only BETWEEN pages: a break after the final page would leave a
    # trailing blank page at the end of the document.
    if page_index < last_page_index:
        paragraph = document.add_paragraph()
        run       = paragraph.add_run()
        run.add_break(WD_BREAK.PAGE)

# Make sure the destination directory exists; save() does not create it.
output_parent = os.path.dirname(output_filepath)
if output_parent:
    os.makedirs(output_parent, exist_ok=True)

document.save(output_filepath)