In [10]:
import os
import sys
import time
import base64
import uuid
import json
import codecs
import pandas as pd


In [19]:
# Input/output locations, resolved relative to the parent of the current
# working directory: <base_dir>/data/input and <base_dir>/data/output.
filename = 'yolo.json'
base_dir = os.path.dirname(os.getcwd())
input_dir, output_dir = (
    os.path.join(base_dir, 'data', leaf) for leaf in ('input', 'output')
)

# The exported document reuses the input's base name with a .docx extension.
input_filepath = os.path.join(input_dir, filename)
output_filepath = os.path.join(
    output_dir, '%s.docx' % os.path.splitext(os.path.basename(filename))[0]
)


In [20]:
def get_pages(filepath):
    """Load the page list from a JSON layout file.

    Args:
        filepath: Path to a JSON file whose top-level object has a 'data' key.

    Returns:
        The value stored under 'data'.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the top-level object has no 'data' key.
    """
    # 'utf-8-sig' drops a leading byte-order mark if one is present; the
    # context manager closes the handle even when parsing fails.
    with open(filepath, 'r', encoding='utf-8-sig') as handle:
        data = json.load(handle)
    return data['data']

In [21]:
# Parse the input JSON once; later cells iterate over `pages`.
pages = get_pages(input_filepath)
print('document has %d pages' % len(pages))

document has 10 pages


In [22]:
# One DataFrame per page, appended in the order the pages are iterated.
dfs         = []
# After the loop these hold the dimensions of the last page processed;
# a later cell reads them when sizing the document section.
page_width  = None
page_height = None

for page in pages:
    images      = page['images']
    texts       = page['text_blocks']
    page_num    = page['page_no']
    page_width  = page['page_width']
    page_height = page['page_height']

    # Each row is a tuple in column order:
    # (text_top, text_left, text_width, text_height,
    #  text, font_size, font_family, font_color, base64)
    rows = []

    for text in texts:
        geometry   = (text['text_top'], text['text_left'],
                      text['text_width'], text['text_height'])
        font_attrs = (text['font_size'], text['font_family'], text['font_color'])
        # A block's text is the source text of its tokenized sentences,
        # joined with single spaces.
        text_value = ' '.join(
            sentence['src_text'] for sentence in text['tokenized_sentences']
        )
        # Text rows carry no image payload.
        rows.append(geometry + (text_value,) + font_attrs + (None,))

    for image in images:
        geometry = (image['text_top'], image['text_left'],
                    image['text_width'], image['text_height'])
        # Image rows carry no text or font attributes, only the base64 payload.
        rows.append(geometry + (None, None, None, None, image['base64']))

    df = pd.DataFrame(
        rows,
        columns=['text_top', 'text_left', 'text_width', 'text_height',
                 'text', 'font_size', 'font_family', 'font_color', 'base64'],
    )
    # Order rows by their top coordinate; the original index labels are kept.
    df = df.sort_values('text_top', axis=0, ascending=True)
    dfs.append(df)


In [23]:
# Display the frame built for one page (index 0 is the first page processed).
df_index = 0
df = dfs[df_index]
df

Unnamed: 0,text_top,text_left,text_width,text_height,text,font_size,font_family,font_color,base64
27,0,0,918,1188,,,,,iVBORw0KGgoAAAANSUhEUgAAA5YAAASkCAIAAACXfAIBAA...
0,118,346,200,19,You Only Look Once:,21.0,NimbusRomNo9L-Medi,#000000,
1,145,279,334,19,"Unified, Real-Time Object Detection",21.0,NimbusRomNo9L-Medi,#000000,
2,190,205,482,20,"Joseph Redmon ∗ , Santosh Divvala ∗† , Ross Gi...",17.0,NimbusRomNo9L-Regu,#000000,
3,214,243,407,16,"University of Washington ∗ , Allen Institute f...",13.0,NimbusRomNo9L-Regu,#000000,
4,237,312,269,14,http://pjreddie.com/yolo/,17.0,NimbusMonL-Regu,#ec008b,
5,297,219,67,16,Abstract,17.0,NimbusRomNo9L-Medi,#000000,
14,308,721,17,4,Person: 0.64,2.0,Futura-Medium,#ffffff,
15,320,784,16,4,Horse: 0.28,2.0,Futura-Medium,#ffffff,
6,325,75,354,157,"We present YOLO, a new approach to object dete...",14.0,NimbusRomNo9L-ReguItal,#000000,


In [24]:
from docx import Document
from docx.shared import Pt
from docx.shared import Twips, Cm
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK
from docx.enum.section import WD_SECTION, WD_ORIENT
from docx.shared import Length


# def get_twips(pixels):
#     PIXEL_TO_TWIPS = 0.0666666667
#     return int(PIXEL_TO_TWIPS * pixels)

# def get_cms(pixels):
#     return int(pixels * 0.0264583333)

def get_font_point(pixels):
    """Scale a pixel measurement by 0.75 to obtain a point size.

    Args:
        pixels: Size in pixels.

    Returns:
        `pixels` multiplied by 0.75.
    """
    points_per_pixel = 0.75
    return pixels * points_per_pixel

def get_cms(pixels, ppi=108):
    """Convert a pixel measurement to centimetres.

    Args:
        pixels: Length in pixels.
        ppi: Pixels per inch of the source layout. Defaults to 108, the
            value this notebook has always used.

    Returns:
        The length in centimetres as a float.

    Raises:
        ValueError: If `ppi` is not positive.
    """
    if ppi <= 0:
        raise ValueError('ppi must be positive, got %r' % (ppi,))

    INCH_TO_CM   = 2.54
    # Use the named constant instead of repeating the literal 2.54.
    PIXEL_PER_CM = ppi / INCH_TO_CM

    return pixels / PIXEL_PER_CM

def get_path_from_base64(work_dir, b64_data):
    """Decode base64 data into a new, uniquely named file and return its path.

    Args:
        work_dir: Existing directory in which the file is created.
        b64_data: Base64-encoded payload.

    Returns:
        Path of the written file: `work_dir` joined with a random UUID hex
        name. The suffix is always '.jpg'; the payload is not inspected.
    """
    unique_name = uuid.uuid4().hex + '.jpg'
    filepath = os.path.join(work_dir, unique_name)
    with open(filepath, 'wb') as out_file:
        decoded = base64.b64decode(b64_data)
        out_file.write(decoded)
    return filepath

In [25]:
document = Document()

# Only the first page is rendered (dfs[:1]); widen the slice to export more.
# `dfs` was built with one frame per entry of `pages`, in the same order, so
# the enumerate index identifies the source page.
for page_index, df in enumerate(dfs[:1]):
    source_page = pages[page_index]

    # Size the section from the page being rendered rather than from the
    # page_width/page_height globals, which hold the last page processed by
    # the extraction loop.
    # NOTE(review): every iteration configures the same (last) section, so
    # with several pages the final page's size applies to the whole document.
    section             = document.sections[-1]
    section.orientation = WD_ORIENT.PORTRAIT
    section.page_width  = Cm(get_cms(source_page['page_width']))
    section.page_height = Cm(get_cms(source_page['page_height']))

    section.left_margin   = Cm(1.27)
    section.right_margin  = Cm(1.27)
    section.top_margin    = Cm(1.27)
    section.bottom_margin = Cm(1.27)

    # A distinct loop variable keeps the page index from being shadowed.
    for row_index, row in df.iterrows():
        # Rows without text are image rows (or otherwise empty). pd.isna
        # covers both None and NaN, whichever pandas stored for the gap.
        if pd.isna(row['text']):
            # Image embedding is currently disabled. Original draft kept for
            # reference.
            # NOTE(review): python-docx documents Run.add_picture(); confirm
            # that add_drawing exists before re-enabling this.
#             image_path = get_path_from_base64(output_dir, row['base64'])
#             paragraph = document.add_paragraph()
#             run                        = paragraph.add_run()
#             run.add_drawing(image_path, width=Cm(get_cms(row['text_width'])), 
#                                  height=Cm(get_cms(row['text_height'])))
#             os.remove(image_path)
            continue

        paragraph = document.add_paragraph()

        # Horizontal position is approximated with a left indent.
        paragraph_format             = paragraph.paragraph_format
        paragraph_format.left_indent = Cm(get_cms(row['text_left']))

        run       = paragraph.add_run()
        font      = run.font
        font.name = 'Arial'
        # NOTE(review): the pixel font size is converted as a physical length
        # via get_cms; get_font_point()/Pt are defined but unused. Confirm
        # which conversion is intended.
        if pd.notna(row['font_size']):
            font.size = Cm(get_cms(row['font_size']))
        run.add_text(row['text'])

    # Close each rendered page with an explicit page break.
    paragraph = document.add_paragraph()
    run       = paragraph.add_run()
    run.add_break(WD_BREAK.PAGE)


# Make sure the target folder exists before saving.
os.makedirs(output_dir, exist_ok=True)
document.save(output_filepath)