In [1]:
import os
import sys
import time
import base64
import uuid
import pandas as pd
from zipfile import ZipFile
from lxml import etree
import xml.etree.ElementTree as ET
import codecs
import json
from itertools import groupby
import difflib


In [2]:

# Root of the project's data tree. The default is the original author's local
# checkout; set the DOCX_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = os.environ.get(
    'DOCX_DATA_DIR',
    '/Users/kd/Workspace/python/DOCX/document-formatting/data',
)

# Document that gets unpacked and modified, and the directory that receives
# both the extracted package and the rebuilt .docx.
input_filepath = os.path.join(DATA_DIR, 'input', 'Archive1.docx')
output_dir     = os.path.join(DATA_DIR, 'output')

# JSON file read by get_tokenized_sentences().
fetch_content_filepath = os.path.join(DATA_DIR, 'input', 'long_paragraph.json')

# Input file name without extension, and the name given to the rebuilt document.
filename       = os.path.splitext(os.path.basename(input_filepath))[0]
translated_filename = filename + '_translated' + '.docx'


In [3]:
def get_string_xmltree(xml):
    """Serialize an element (and its subtree) with ``etree.tostring`` defaults."""
    serialized = etree.tostring(xml)
    return serialized

def get_xml_tree(xml_string):
    """Parse in-memory XML markup with ``etree.fromstring`` and return the root element."""
    root = etree.fromstring(xml_string)
    return root

def get_xmltree(filepath, parse='xml'):
    """Load a file from disk and parse it with lxml.

    Args:
        filepath: path of the file to parse.
        parse: ``'html'`` parses with ``etree.HTMLParser``; any other value
            (default ``'xml'``) parses as XML with a recovering parser.

    Returns:
        For ``parse='html'`` the object returned by ``etree.parse``;
        otherwise the root element returned by ``etree.fromstring``.
    """
    if parse == 'html':
        parser = etree.HTMLParser()
        # The tree is fully built when etree.parse returns, so the handle can
        # be closed right away instead of being left to the garbage collector.
        with open(filepath, mode='r', encoding='utf-8') as file:
            return etree.parse(file, parser)

    parser = etree.XMLParser(recover=True)
    # Hand the parser the raw bytes: decoding with the platform's default
    # encoding and re-encoding as UTF-8 corrupts non-ASCII content on systems
    # whose default encoding is not UTF-8.
    with open(filepath, 'rb') as file:
        return etree.fromstring(file.read(), parser)

def check_element_is(element, type_char):
    """Tell whether ``element`` carries the WordprocessingML tag ``type_char``.

    ``element.tag`` is compared, in ``{namespace}localname`` form, against both
    namespace URIs listed below; a match on either one counts.
    """
    word_namespaces = (
        "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        'http://purl.oclc.org/ooxml/wordprocessingml/main',
    )
    candidates = ['{%s}%s' % (namespace, type_char) for namespace in word_namespaces]
    return any(element.tag == candidate for candidate in candidates)

def get_specific_tags(node, type_char):
    """Return, in ``node.iter()`` order, every element that ``check_element_is``
    accepts for ``type_char`` (``node`` itself included when it matches)."""
    return [elem for elem in node.iter() if check_element_is(elem, type_char)]

def add_identifier(node):
    """Stamp ``node`` with a freshly generated random UUID in its ``id`` attribute.

    Any existing ``id`` value is overwritten.
    """
    identifier = uuid.uuid4()
    node.attrib['id'] = str(identifier)

def is_run_superscript(run):
    """Return True when the first ``vertAlign`` element under ``run`` has a
    ``val`` attribute (namespace ignored) equal to ``'superscript'``."""
    matches = get_specific_tags(run, 'vertAlign')
    if not matches:
        return False

    # Key the attributes by local name only, dropping the '{namespace}' prefix.
    local_attrs = {
        qualified.split('}')[-1]: value
        for qualified, value in matches[0].attrib.items()
    }
    return local_attrs.get('val') == 'superscript'
    
def update_run_text(r1, r2):
    """Append the text of run ``r2`` to run ``r1`` and blank out ``r2``'s text.

    Only the first ``t`` element of each run is read or written. A ``t``
    element with no text (``.text`` is None) is treated as an empty string.

    Raises:
        IndexError: if either run contains no ``t`` element.
    """
    t1s = get_specific_tags(r1, 't')
    t2s = get_specific_tags(r2, 't')
    if not t1s:
        raise IndexError('update_run_text: destination run has no text element')
    if not t2s:
        raise IndexError('update_run_text: source run has no text element')

    target, source = t1s[0], t2s[0]
    # An empty <w:t/> parses with text None; None + str would raise TypeError.
    target.text = (target.text or '') + (source.text or '')
    source.text = ''
    
def get_run_properties(run):
    """Collect the attributes of the first ``rFonts``, ``sz`` and ``szCs``
    elements found under ``run``.

    Returns a dict keyed ``'<tag>_<attribute local name>'`` (for example
    ``'rFonts_ascii'`` or ``'sz_val'``); tags that are absent contribute nothing.
    """
    properties = {}
    for tag_name in ('rFonts', 'sz', 'szCs'):
        found = get_specific_tags(run, tag_name)
        if not found:
            continue
        for qualified, value in found[0].attrib.items():
            # Strip the '{namespace}' prefix from the attribute name.
            properties['%s_%s' % (tag_name, qualified.split('}')[-1])] = value
    return properties

def update_font_property(p, reduce=4):
    """Lower the ``val`` of every ``szCs`` and ``sz`` element under ``p``.

    Args:
        p: element whose descendants are searched.
        reduce: amount subtracted from each integer ``val`` (default 4).

    The ``val`` attribute is looked up in both namespaces that
    ``check_element_is`` accepts. Elements without a ``val`` attribute, or
    whose value is not an integer, are left untouched. The result is never
    set below 1, so a large ``reduce`` cannot write a zero or negative size.
    """
    val_keys = [
        '{%s}%s' % (namespace, 'val')
        for namespace in (
            "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
            'http://purl.oclc.org/ooxml/wordprocessingml/main',
        )
    ]

    for tag_name in ('szCs', 'sz'):
        for size_node in get_specific_tags(p, tag_name):
            key = next((k for k in val_keys if k in size_node.attrib), None)
            if key is None:
                continue
            try:
                size = int(size_node.attrib[key])
            except ValueError:
                # Not a plain integer; leave the value as it is.
                continue
            size_node.set(key, str(max(1, size - reduce)))
    
def compare_run_properties(run1, run2):
    """Return True only when both runs define ``rFonts_ascii``, ``sz_val`` and
    ``szCs_val`` (see ``get_run_properties``) and all three values are equal."""
    required = ('rFonts_ascii', 'sz_val', 'szCs_val')
    first = get_run_properties(run1)
    second = get_run_properties(run2)

    # A run missing any of the compared properties never matches.
    for properties in (first, second):
        if any(key not in properties for key in required):
            return False

    return all(first[key] == second[key] for key in required)

def get_line_connections(p):
    """Describe how each pair of neighbouring non-superscript runs in ``p`` relates.

    Returns a list of ``(index, index + 1, status)`` tuples, where the indices
    refer to positions in the list of non-superscript runs and ``status`` is
    ``'CONNECTED'`` when ``compare_run_properties`` accepts the pair, otherwise
    ``'NOT_CONNECTED'``.
    """
    text_runs = [run for run in get_specific_tags(p, 'r') if not is_run_superscript(run)]

    connections = []
    for index, (current, following) in enumerate(zip(text_runs, text_runs[1:])):
        if compare_run_properties(current, following):
            status = 'CONNECTED'
        else:
            status = 'NOT_CONNECTED'
        connections.append((index, index + 1, status))
    return connections

def arrange_grouped_line_indices(line_connections, debug=False):
    """Turn pairwise connections into per-group lists of run indices.

    Consecutive connections with the same status are grouped; each group
    becomes ``[sorted unique indices, status]``. When there is more than one
    group, a ``'NOT_CONNECTED'`` group gives up the index it shares with each
    neighbouring group (its first index unless it is the first group, its last
    index unless it is the last group). ``debug=True`` prints the
    intermediate structures.
    """
    grouped = [list(members) for _, members in groupby(line_connections, key=lambda conn: conn[2])]
    if debug:
        print('arrange_grouped_line_indices: %s \n---------\n' % (str(grouped)))

    arranged = []
    for members in grouped:
        touched = set()
        for member in members:
            touched.update((member[0], member[1]))
        arranged.append([sorted(touched), members[0][2]])

    if debug:
        print('arrange_grouped_line_indices,arranged_lines : %s \n---------\n' % (str(arranged)))

    last = len(arranged) - 1
    result = []
    for position, (indices, status) in enumerate(arranged):
        # A lone group keeps all of its indices, whatever its status.
        if last > 0 and status == 'NOT_CONNECTED':
            if position > 0:
                del indices[0]
            if position < last:
                del indices[-1]
        result.append([indices, status])

    if debug:
        print('final_arrange_grouped_line_indices,arranged_lines : %s \n---------\n' % (str(result)))

    return result

def merge_runs(node, grouped_runs, debug=False):
    """Merge each ``'CONNECTED'`` group of runs into that group's first run.

    Args:
        node: element whose non-superscript ``r`` descendants are addressed by
            the indices stored in ``grouped_runs``.
        grouped_runs: list of ``[indices, status]`` entries, the shape returned
            by ``arrange_grouped_line_indices``.
        debug: when True, print every merge that is performed.

    For every connected group the text of the later runs is appended to the
    group's own first run and the emptied runs are removed from the tree.
    Runs without a ``t`` element are left in place and end the current merge
    chain, so text is never moved across them.
    """
    text_runs = [run for run in get_specific_tags(node, 'r') if not is_run_superscript(run)]

    for element in grouped_runs:
        indices, status = element[0], element[1]
        if status != 'CONNECTED':
            continue

        anchor = None
        anchor_index = None
        for run_index in indices:
            run = text_runs[run_index]
            if not get_specific_tags(run, 't'):
                # Nothing to merge; later text must not jump over this run.
                anchor = None
                continue
            if anchor is None:
                # First text-bearing run of the chain receives the merged text.
                anchor, anchor_index = run, run_index
                continue
            if debug:
                print('merge index %d with %d' % (run_index, anchor_index))
            update_run_text(anchor, run)
            run.getparent().remove(run)
                    
def update_document_runs(document):
    '''
    Visit every ``p`` element of ``document`` and merge its neighbouring runs
    whose compared properties match. The tree is modified in place and the
    same ``document`` object is returned.
    '''
    for paragraph in get_specific_tags(document, 'p'):
        connections = get_line_connections(paragraph)
        merge_runs(paragraph, arrange_grouped_line_indices(connections), debug=False)
    return document

def get_text_tags(document):
    """Return the non-blank ``t`` elements of all non-superscript runs.

    Every returned element is also given an ``id`` attribute through
    ``add_identifier``, so calling this function modifies the tree.
    """
    collected = []
    for run in get_specific_tags(document, 'r'):
        if is_run_superscript(run):
            continue
        for text_node in get_specific_tags(run, 't'):
            content = text_node.text
            # Skip elements with no text or whitespace-only text.
            if content and content.strip():
                add_identifier(text_node)
                collected.append(text_node)
    return collected

In [4]:
def extract_docx(filepath, working_dir):
    """Unzip ``filepath`` into ``working_dir/<file name without extension>``.

    Returns:
        ``(extract_dir, filenames)``: the directory that received the files and
        the archive's member names as reported by ``ZipFile.namelist()``.
    """
    stem = os.path.splitext(os.path.basename(filepath))[0]
    extract_dir = os.path.join(working_dir, stem)

    with ZipFile(filepath, 'r') as archive:
        filenames = archive.namelist()
        archive.extractall(path=extract_dir)

    return extract_dir, filenames

def save_docx(extracted_dir, filenames, output_filename):
    """Zip the listed files from ``extracted_dir`` into ``output_filename``.

    Args:
        extracted_dir: directory holding the unpacked package.
        filenames: archive member names, relative to ``extracted_dir``; they
            are written in the given order under the same names.
        output_filename: path of the archive to create (overwritten if present).

    Members are deflate-compressed; ``ZipFile``'s default would store them
    uncompressed and produce a needlessly large file.
    """
    # Local import: only ZipFile is imported at file level.
    from zipfile import ZIP_DEFLATED

    with ZipFile(output_filename, 'w', compression=ZIP_DEFLATED) as docx:
        for filename in filenames:
            docx.write(os.path.join(extracted_dir, filename), filename)
            
def save_document_xml(extracted_dir, xml):
    """Serialize ``xml`` with ``get_string_xmltree`` and write it over
    ``word/document.xml`` inside ``extracted_dir``."""
    target_path = os.path.join(extracted_dir,'word/document.xml')
    with open(target_path, 'wb') as handle:
        payload = get_string_xmltree(xml)
        handle.write(payload)

In [5]:
def get_tokenized_sentences(filepath):
    """Return every entry of every ``tokenized_sentences`` array in a JSON file.

    Args:
        filepath: path of the JSON file; it is decoded as ``utf-8-sig``, so a
            leading byte-order mark is accepted.

    Returns:
        A list with the values matched by the JSONPath expression
        ``$..tokenized_sentences[*]``, in the order the matches are reported.
    """
    from jsonpath_rw import parse

    # Read the file named by the argument (not a module-level global) and
    # close the handle as soon as the JSON is loaded.
    with open(filepath, 'r', encoding='utf-8-sig') as handle:
        json_data = json.load(handle)

    matches = parse('$..tokenized_sentences[*]').find(json_data)
    return [match.value for match in matches]

In [6]:
def get_anchor_text(x, y, w, h, text):
    """Build a paragraph holding an absolutely positioned, borderless text box.

    Args:
        x, y: horizontal and vertical offsets written to the ``wp:posOffset``
            elements (both relative to the page); formatted with ``%d``.
        w, h: width and height written to ``wp:extent`` and ``a:ext``;
            formatted with ``%d``.
        text: plain text shown in the box. It is XML-escaped before insertion,
            so ``&``, ``<`` and ``>`` are safe; do not pass pre-escaped markup.

    Returns:
        The parsed ``w:p`` element, as returned by ``get_xml_tree``.

    NOTE(review): callers in this notebook pass EMU values for x/y/w/h.
    NOTE(review): the ``wp:docPr`` id, paragraph ids and anchor ids are fixed
    in the template, so every element built here shares them.
    """
    # Local import: saxutils is not imported at file level.
    from xml.sax.saxutils import escape

    template = (
        '<w:p xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"'
        ' xmlns:aink="http://schemas.microsoft.com/office/drawing/2016/ink"'
        ' xmlns:am3d="http://schemas.microsoft.com/office/drawing/2017/model3d"'
        ' xmlns:cx="http://schemas.microsoft.com/office/drawing/2014/chartex"'
        ' xmlns:cx1="http://schemas.microsoft.com/office/drawing/2015/9/8/chartex"'
        ' xmlns:cx2="http://schemas.microsoft.com/office/drawing/2015/10/21/chartex"'
        ' xmlns:cx3="http://schemas.microsoft.com/office/drawing/2016/5/9/chartex"'
        ' xmlns:cx4="http://schemas.microsoft.com/office/drawing/2016/5/10/chartex"'
        ' xmlns:cx5="http://schemas.microsoft.com/office/drawing/2016/5/11/chartex"'
        ' xmlns:cx6="http://schemas.microsoft.com/office/drawing/2016/5/12/chartex"'
        ' xmlns:cx7="http://schemas.microsoft.com/office/drawing/2016/5/13/chartex"'
        ' xmlns:cx8="http://schemas.microsoft.com/office/drawing/2016/5/14/chartex"'
        ' xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
        ' xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"'
        ' xmlns:o="urn:schemas-microsoft-com:office:office"'
        ' xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"'
        ' xmlns:v="urn:schemas-microsoft-com:vml"'
        ' xmlns:w10="urn:schemas-microsoft-com:office:word"'
        ' xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml"'
        ' xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml"'
        ' xmlns:w16="http://schemas.microsoft.com/office/word/2018/wordml"'
        ' xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex"'
        ' xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid"'
        ' xmlns:w16se="http://schemas.microsoft.com/office/word/2015/wordml/symex"'
        ' xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml"'
        ' xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"'
        ' xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing"'
        ' xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas"'
        ' xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup"'
        ' xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk"'
        ' xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape"'
        ' w14:paraId="4B2133C5" w14:textId="548353EB" w:rsidR="0025221C" w:rsidRDefault="00720170">'
        '<w:r><w:rPr><w:noProof /></w:rPr>'
        '<mc:AlternateContent><mc:Choice Requires="wps"><w:drawing>'
        '<wp:anchor distT="0" distB="0" distL="114300" distR="114300" simplePos="0"'
        ' relativeHeight="251659264" behindDoc="0" locked="0" layoutInCell="1" allowOverlap="1"'
        ' wp14:anchorId="4D373A9B" wp14:editId="021DFA22">'
        '<wp:simplePos x="0" y="0" />'
        '<wp:positionH relativeFrom="page"><wp:posOffset>%d</wp:posOffset></wp:positionH>'
        '<wp:positionV relativeFrom="page"><wp:posOffset>%d</wp:posOffset></wp:positionV>'
        '<wp:extent cx="%d" cy="%d" />'
        '<wp:effectExtent l="0" t="0" r="0" b="0" />'
        '<wp:wrapNone />'
        '<wp:docPr id="2" name="Text Box 2" />'
        '<wp:cNvGraphicFramePr />'
        '<a:graphic xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">'
        '<a:graphicData uri="http://schemas.microsoft.com/office/word/2010/wordprocessingShape">'
        '<wps:wsp><wps:cNvSpPr txBox="1" />'
        '<wps:spPr><a:xfrm><a:off x="0" y="0" /><a:ext cx="%d" cy="%d" /></a:xfrm>'
        '<a:prstGeom prst="rect"><a:avLst /></a:prstGeom>'
        '<a:noFill /><a:ln w="6350"><a:noFill /></a:ln></wps:spPr>'
        '<wps:txbx><w:txbxContent>'
        '<w:p w14:paraId="01C6C9C7" w14:textId="6420C7CF" w:rsidR="00720170"'
        ' w:rsidRPr="00720170" w:rsidRDefault="00720170">'
        '<w:pPr><w:rPr><w:lang w:val="en-US" /></w:rPr></w:pPr>'
        '<w:r><w:rPr><w:lang w:val="en-US" /></w:rPr><w:t>%s</w:t></w:r></w:p>'
        '</w:txbxContent></wps:txbx>'
        '<wps:bodyPr rot="0" spcFirstLastPara="0" vertOverflow="overflow" horzOverflow="overflow"'
        ' vert="horz" wrap="square" lIns="0" tIns="0" rIns="0" bIns="0" numCol="1" spcCol="0"'
        ' rtlCol="0" fromWordArt="0" anchor="t" anchorCtr="0" forceAA="0" compatLnSpc="1">'
        '<a:prstTxWarp prst="textNoShape"><a:avLst /></a:prstTxWarp><a:noAutofit />'
        '</wps:bodyPr></wps:wsp></a:graphicData></a:graphic></wp:anchor></w:drawing>'
        '</mc:Choice></mc:AlternateContent></w:r></w:p>'
    )

    # The size is written twice: once for the anchor extent, once for the shape.
    xml_element = template % (x, y, w, h, w, h, escape(str(text)))
    return get_xml_tree(xml_element)

def get_pixel_twips(pixels):
    """Scale ``pixels`` by the fixed factor 14.999903622654 and truncate to int."""
    twips_per_pixel = 14.999903622654
    scaled = twips_per_pixel * pixels
    return int(scaled)
    
def pixel_to_twips(px, dpi=108):
    """Convert ``px`` pixels at ``dpi`` dots per inch to twips (1440 per inch),
    truncated to int."""
    twips_per_inch = 1440
    inches_per_pixel = 1.0 / float(dpi)
    twips = px * inches_per_pixel * twips_per_inch
    return int(twips)

def pixels_to_emu(px):
    """Convert ``px`` pixels to EMU at 9525 EMU per pixel, truncated to int."""
    emu_per_pixel = 9525
    emu = emu_per_pixel * px
    return int(emu)



In [7]:
# Unpack the input .docx under output_dir and keep the archive's member list,
# which save_docx() needs later to rebuild the package.
extracted_dir, filenames = extract_docx(input_filepath, output_dir)


In [8]:
# Parse the main document part of the extracted package, then collect every
# `p` element and every `body` element found in it.
document_xml             = get_xmltree(os.path.join(extracted_dir, 'word', 'document.xml'))
ps                       = get_specific_tags(document_xml, 'p')
body                     = get_specific_tags(document_xml, 'body')

# Sentences are loaded from the JSON file configured in the paths cell.
tokenized_sentences      = get_tokenized_sentences(fetch_content_filepath)
# NOTE(review): the message says "text tags", but the first number is the count of `p` elements.
print('document has (%d) text tags, tokenized sentences (%d)' % (len(ps), len(tokenized_sentences)))


document has (3) text tags, tokenized sentences (85)


In [9]:
# Content and geometry of the first text box; the numbers are pixel values,
# converted to EMU with pixels_to_emu() when the anchor is built.
text = "Reportable"
text_height = 19
text_left = 695
text_top = 108
text_width = 90

# First anchored text box, positioned from the pixel geometry above.
t1 = get_anchor_text(pixels_to_emu(text_left), pixels_to_emu(text_top), pixels_to_emu(text_width), \
                     pixels_to_emu(text_height), text)

# Second anchored text box; the numbers are passed through unconverted.
# NOTE(review): presumably these are already EMU values - confirm.
t2 = get_anchor_text(1270, 1880446, 4038600, 419100, 'IN THE SUPREME COURT OF INDIA')

In [10]:
# Drop the first `p` element found in the document from the first `body`.
# NOTE(review): this assumes ps[0] is a direct child of body[0] - confirm for
# other inputs. The cell mutates the tree in place, so re-running it without
# re-running the parsing cell repeats the removal and the insertions.
body[0].remove(ps[0])


# Put the two anchored text boxes at the start of the body, t1 before t2.
body[0].insert(0, t1)
body[0].insert(1, t2)

In [11]:
# Write the modified tree back over word/document.xml in the extracted package,
# then zip the package's original member list into the output .docx.
save_document_xml(extracted_dir, document_xml)
save_docx(extracted_dir, filenames, os.path.join(output_dir, translated_filename))