In [1]:
import os
import sys
import time
import base64
import uuid
import json
import codecs
import pandas as pd


In [2]:
base_dir        = os.path.dirname(os.getcwd())
input_dir       = os.path.join(base_dir, 'data', 'input')
output_dir      = os.path.join(base_dir, 'data', 'output')
filename        = 'CH_FILE1.json'

input_filepath  = os.path.join(input_dir, filename)
output_filepath = os.path.join(output_dir, os.path.splitext(os.path.basename(filename))[0]+'.docx')


In [3]:
def get_pages(filepath):
    data  = json.load(codecs.open(filepath, 'r', 'utf-8-sig'))
    pages = data['data']
    return pages

In [4]:
pages =  get_pages(input_filepath)   
print('document has %d pages' % (len(pages)))

document has 4 pages


In [10]:
for page in pages:
    if 'images' not in list(page.keys()) or 'text_blocks' not in list(page.keys()):
        print('ignoring this page')
    else:
        print('processing this page')

processing this page
processing this page
processing this page
ignoring this page


In [5]:
dfs              = []
page_width       = None
page_height      = None

for page in pages:
    text_tops        = []
    text_lefts       = []
    text_widths      = []
    text_heights     = []
    font_sizes       = []
    font_families    = []
    font_colors      = []
    text_values      = []
    b64_images       = []

    images       = page['images']
    texts        = page['text_blocks']
    page_num     = page['page_no']
    page_width   = page['page_width']
    page_height  = page['page_height']
    
    for text in texts:
        text_tops.append(text['text_top'])
        text_lefts.append(text['text_left'])
        text_widths.append(text['text_width'])
        text_heights.append(text['text_height'])
        
        if (str(text['font_size'])=='NaN') or text['font_size'] == None:
            text['font_size'] = text['children'][0]['font_size']
            font_sizes.append(text['font_size'])
            
        if (str(text['font_family'])=='NaN') or text['font_family'] == None:
            text['font_family'] = text['children'][0]['font_family']
            font_families.append(text['font_family'])
            
        if (str(text['font_color'])=='NaN') or text['font_color'] == None:
            text['font_color'] = text['children'][0]['font_color']
            font_colors.append(text['font_color'])
        
        b64_images.append(None)
        
        text_value = []
        for processed_text in text['tokenized_sentences']:
            text_value.append(processed_text['src'])        
        text_values.append(' '.join(text_value))
    
    for image in images:
        text_tops.append(image['text_top'])
        text_lefts.append(image['text_left'])
        text_widths.append(image['text_width'])
        text_heights.append(image['text_height'])
        b64_images.append(image['base64'])
        text_values.append(None)
        font_sizes.append(None)
        font_families.append(None)
        font_colors.append(None)
    
    df = pd.DataFrame(list(zip(text_tops, text_lefts, text_widths, text_heights,
                                            text_values, font_sizes, font_families, font_colors, b64_images)), 
                              columns =['text_top', 'text_left', 'text_width', 'text_height',
                                          'text', 'font_size', 'font_family', 'font_color', 'base64'])
    df.sort_values('text_top', axis = 0, ascending = True, inplace=True)    
    dfs.append(df)


KeyError: 'text_blocks'

In [15]:
from docx import Document
from docx.shared import Pt
from docx.shared import Twips, Cm
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK
from docx.enum.section import WD_SECTION, WD_ORIENT
from docx.shared import Length


def get_pixel_twips(pixels):
    PIXEL_TO_TWIPS = 14.999903622654
    return int(PIXEL_TO_TWIPS * pixels)

def get_font_point(pixels):
    return pixels * 0.75

def get_cms(pixels):
    PPI          = 108
    INCH_TO_CM   = 2.54
    PIXEL_PER_CM = PPI / 2.54
    
    return pixels / PIXEL_PER_CM

def get_path_from_base64(work_dir, b64_data):
    filepath = os.path.join(work_dir, str(uuid.uuid4().hex) + '.jpg')
    with open(filepath, 'wb') as file:
        file.write(base64.b64decode(b64_data))
    return filepath

def pixel_to_twips(px, dpi=108):
    INCH_TO_TWIPS  = 1440
    px_to_inches   = 1.0 / float(dpi)
    return int(px * px_to_inches * INCH_TO_TWIPS)

In [16]:
df_index = 2
df       = dfs[df_index]
df

Unnamed: 0,text_top,text_left,text_width,text_height,text,font_size,font_family,font_color,base64
0,101,202,529,101,"the Preliminary Test, so many candidates from ...",,,,


In [20]:
document  = Document()
section             = document.sections[-1]
section.left_margin   = Cm(1.27)
section.right_margin  = Cm(1.27)
section.top_margin    = Cm(1.27)
section.bottom_margin = Cm(1.27)

row       = df.iloc[1]
# empty p at start of page
p1 = document.add_paragraph()
p1_format           = p1.paragraph_format
p1_format.line_spacing = Pt(18)

# text p
p2 = document.add_paragraph()
p2_format = p2.paragraph_format
p2_format.left_indent = Twips(pixel_to_twips(row['text_left']))

run                        = p2.add_run()
font                       = run.font
font.name                  = 'Arial'
font.size                  = Twips(pixel_to_twips(row['font_size']))
run.add_text(row['text'])

# next text on the same page
row       = df.iloc[2]

p3 = document.add_paragraph()
p3_format = p3.paragraph_format
p3_format.left_indent = Cm(get_cms(row['text_left']))

run                        = p3.add_run()
font                       = run.font
font.name                  = 'Arial'
font.size                  = Twips(pixel_to_twips(row['font_size']))
run.add_text(row['text'])

# next text on the same page
row       = df.iloc[3]

p3 = document.add_paragraph()
p3_format = p3.paragraph_format
p3_format.left_indent = Twips(pixel_to_twips(row['text_left']))
p4_format.space_before = Twips(pixel_to_twips(row['text_top'] - df.iloc[2]['text_top']) )


run                        = p3.add_run()
font                       = run.font
font.name                  = 'Arial'
font.size                  = Twips(pixel_to_twips(row['font_size']))
run.add_text(row['text'])

# next text on the same page
row       = df.iloc[4]
p4 = document.add_paragraph()
p4_format = p4.paragraph_format
p4_format.left_indent = Twips(pixel_to_twips(row['text_left']))
p4_format.space_before = Twips(pixel_to_twips(row['text_top'] - df.iloc[3]['text_top']) )

run                        = p4.add_run()
font                       = run.font
font.name                  = 'Arial'
font.size                  = Twips(pixel_to_twips(row['font_size']))
run.add_text(row['text'])


document.save(output_filepath)

In [45]:
page_width, page_height, get_pixel_twips(page_width)

(892, 1263, 13379)

In [32]:
width_dpi  = 892
height_dpi = 1263



In [34]:
dpi = 108
pixel_to_twips(width_dpi, dpi), pixel_to_twips(height_dpi, dpi) 


(11893, 16840)

In [25]:
document  = Document()

for index, df in enumerate(dfs[:1]):
    section             = document.sections[-1]
    section.orientation = WD_ORIENT.PORTRAIT
    section.page_width  = Cm(get_cms(page_width))
    section.page_height = Cm(get_cms(page_height))

    section.left_margin   = Cm(1.27)
    section.right_margin  = Cm(1.27)
    section.top_margin    = Cm(1.27)
    section.bottom_margin = Cm(1.27)

    for index, row in df.iterrows():
        if row['text'] == None and row['base64'] != None:
            pass
#             image_path = get_path_from_base64(output_dir, row['base64'])
#             paragraph = document.add_paragraph()
#             run                        = paragraph.add_run()
#             run.add_drawing(image_path, width=Cm(get_cms(row['text_width'])), 
#                                  height=Cm(get_cms(row['text_height'])))
#             os.remove(image_path)
        else:
            paragraph = document.add_paragraph()

            paragraph_format           = paragraph.paragraph_format
            paragraph_format.left_indent = Cm(get_cms(row['text_left']))

            run                        = paragraph.add_run()
            font                       = run.font
            font.name                  = 'Arial'
            font.size                  = Cm(get_cms(row['font_size']))
            run.add_text(row['text'])
    
    paragraph = document.add_paragraph()
    run       = paragraph.add_run()
    run.add_break(WD_BREAK.PAGE)
    
    
document.save(output_filepath)