In [1]:
import os
import sys
import time
import base64
import uuid
import pandas as pd
from zipfile import ZipFile
from lxml import etree
import xml.etree.ElementTree as ET
import codecs
import json
from itertools import groupby

utilities_dir = '/Users/kd/Workspace/python/helpers'
sys.path.append(utilities_dir)

from file_directory_utils import (create_directory, read_directory_files, get_subdirectories, get_all_file_paths)


In [2]:
input_filepath = '/Users/kd/Workspace/python/DOCX/document-formatting/data/input/demo_judgment_1.docx'
output_dir     = '/Users/kd/Workspace/python/DOCX/document-formatting/data/output'

fetch_content_filepath = '/Users/kd/Workspace/python/DOCX/document-formatting/data/input/demo_judgment_1.json'
filename       = os.path.splitext(os.path.basename(input_filepath))[0]


In [3]:
def get_string_xmltree(xml):
    return etree.tostring(xml)

def get_xml_tree(xml_string):
    return etree.fromstring(xml_string)

def get_xmltree(filepath, parse='xml'):
    if parse == 'html':
        parser = etree.HTMLParser()
        tree   = etree.parse(open(filepath, mode='r', encoding='utf-8'), parser)
        return tree
    else:
        with open(filepath,'r') as file:
            xml_string    = file.read()
            return etree.fromstring(bytes(xml_string, encoding='utf-8'))
    return None

def check_element_is(element, type_char):
    word_schema1 = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    word_schema2 = 'http://purl.oclc.org/ooxml/wordprocessingml/main'
     
    return (element.tag == '{%s}%s' % (word_schema1, type_char)) or (element.tag == '{%s}%s' % (word_schema2, type_char))

def get_specific_tags(node, type_char):
    nodes = []
    for elem in node.iter():
        if check_element_is(elem, type_char):
            nodes.append(elem)
    return nodes

def add_identifier(node):
    node.attrib['id'] = str(uuid.uuid4())

def is_run_superscript(run):
    attrib    = {}
    vertAlign = get_specific_tags(run, 'vertAlign')
    if len(vertAlign) > 0:
        for key in vertAlign[0].attrib.keys():
            attrib['vertAlign_' + key.split('}')[-1]] = vertAlign[0].attrib[key]
    if 'vertAlign_val' in attrib:
        if attrib['vertAlign_val'] == 'superscript':
            return True
    return False
    
def update_run_text(r1, r2):
    t1s = get_specific_tags(r1, 't')
    t2s = get_specific_tags(r2, 't')
#     print('r1 text [%s], r2 text [%s]'% (t1s[0].text, t2s[0].text))
    t1s[0].text = t1s[0].text + t2s[0].text
    t2s[0].text = ''
    
def get_run_properties(run):
    attrib = {}
    rFonts = get_specific_tags(run, 'rFonts')
    sz     = get_specific_tags(run, 'sz')
    szCs   = get_specific_tags(run, 'szCs')
    
    if len(rFonts) > 0:
        for key in rFonts[0].attrib.keys():
            attrib['rFonts_' + key.split('}')[-1]] = rFonts[0].attrib[key]
    
    if len(sz) > 0:
        for key in sz[0].attrib.keys():
            attrib['sz_' + key.split('}')[-1]] = sz[0].attrib[key]
        
    if len(szCs) > 0:
        for key in szCs[0].attrib.keys():
            attrib['szCs_' + key.split('}')[-1]] = szCs[0].attrib[key]

    return attrib

def compare_run_properties(run1, run2):
    attrib1 = get_run_properties(run1)
    attrib2 = get_run_properties(run2)
    
    if all (k in attrib1 for k in ('rFonts_ascii', 'sz_val', 'szCs_val')):
        if all (k in attrib2 for k in ('rFonts_ascii', 'sz_val', 'szCs_val')):
            if (attrib1['rFonts_ascii'] == attrib2['rFonts_ascii']) and \
            (attrib1['szCs_val'] == attrib2['szCs_val']) and \
            (attrib1['sz_val'] == attrib2['sz_val']) :
                return True
    
    return False

def get_line_connections(p):
    runs             = get_specific_tags(p, 'r')
    text_runs        = []
    
    for run in runs:
        if is_run_superscript(run) == False:
            text_runs.append(run)

    line_connections = []
    for index in range(len(text_runs) - 1):
        if (compare_run_properties(text_runs[index], text_runs[index+1])):
            line_connections.append((index, index+1, 'CONNECTED'))
        else:
            line_connections.append((index, index+1, 'NOT_CONNECTED'))
    return line_connections

def arrange_grouped_line_indices(line_connections, debug=False):
    lines          = [list(i) for j, i in groupby(line_connections, lambda a: a[2])]
    if debug:
        print('arrange_grouped_line_indices: %s \n---------\n' % (str(lines)))
        
    arranged_lines = []

    for line_items in lines:
        indices = []
        for line_item in line_items:
            indices.append(line_item[0])
            indices.append(line_item[1])
        indices = sorted(list(set(indices)))
        arranged_lines.append([indices, line_items[0][2]])
        
    if debug:
        print('arrange_grouped_line_indices,arranged_lines : %s \n---------\n' % (str(arranged_lines)))
    
    final_arranged_lines = []
    
    if len(arranged_lines) == 1:
        final_arranged_lines.append([arranged_lines[0][0], arranged_lines[0][1]])
    else:
        for index, line_item in enumerate(arranged_lines):
            if index == 0 and line_item[1] == 'NOT_CONNECTED':
                del line_item[0][-1]
            if index > 0 and index < (len(arranged_lines) - 1) and line_item[1] == 'NOT_CONNECTED':
                del line_item[0][0]
                del line_item[0][-1]
            if index == (len(arranged_lines) - 1) and line_item[1] == 'NOT_CONNECTED':
                del line_item[0][0]

            final_arranged_lines.append([line_item[0], line_item[1]])
    if debug:
        print('final_arrange_grouped_line_indices,arranged_lines : %s \n---------\n' % (str(final_arranged_lines)))
            
    return final_arranged_lines

def merge_runs(node, grouped_runs, debug=False):
    runs      = get_specific_tags(node, 'r')
    text_runs = []
    
    for run in runs:
        if is_run_superscript(run) == False:
            text_runs.append(run)

    for element in grouped_runs:
        if (element[1] == 'CONNECTED'):
            for index, run_index in enumerate(element[0]):
                if (index > 0):
                    if (debug):
                        print('merge index %d with %d' % ( run_index, 0))
                    update_run_text(text_runs[0], text_runs[run_index])
                    
def update_document_runs(document):
    '''
    the function iterates through the p tags and merges run that have exactly same
    visual property.
    '''
    tag_name                 = 'p'
    tags                     = get_specific_tags(document, tag_name)
    for p in tags:
        grouped_runs = arrange_grouped_line_indices(get_line_connections(p))
        merge_runs(p, grouped_runs, debug=False)
    return document

def get_text_tags(document):
    tags         = []
    texts        = get_specific_tags(document, 't')
    for text in texts:
        if text.text and len(text.text.strip()) > 0:
            add_identifier(text)
            tags.append(text)
    return tags

In [4]:
def extract_docx(filepath, working_dir):
    filename       = os.path.splitext(os.path.basename(filepath))[0]
    extract_dir    = os.path.join(working_dir, filename)
    
    with ZipFile(filepath, 'r') as file:
        file.extractall(path=extract_dir)
        filenames = file.namelist()
    
    return extract_dir, filenames

def save_docx(extracted_dir, filenames, output_filename):
    with ZipFile(output_filename, 'w') as docx:
        for filename in filenames: 
            docx.write(os.path.join(extracted_dir, filename), filename)
            
def save_document_xml(extracted_dir, xml):
    with open(os.path.join(extracted_dir,'word/document.xml'), 'wb') as f:
        xmlstr = get_string_xmltree(xml)
        f.write(xmlstr)

In [5]:
def get_tokenized_sentences(filepath):
    from jsonpath_rw import jsonpath, parse
    json_data     = json.load(codecs.open(fetch_content_filepath, 'r', 'utf-8-sig'))
    jsonpath_expr = parse('$..tokenized_sentences[*]')
    matches       = jsonpath_expr.find(json_data)

    tokenized_sentences = []
    for match in matches:
        tokenized_sentences.append(match.value)
    
    return tokenized_sentences

In [6]:
def count_occurrences(string, substring): 
    count = 0
    start = 0
    while start < len(string): 
        pos = string.find(substring, start) 

        if pos != -1: 
            start = pos + 1
            count += 1
        else: 
            break
    return count 


def check_string_status(doc_tag, tokenized):
    doc_text       = doc_tag.text.replace(" ", "")
    tokenized_text = tokenized['src'].replace(" ", "")
    
    if len(doc_text) < 2 or len(tokenized_text) < 2:
        if doc_text.isdigit() == False or tokenized_text.isdigit() == False:
            return (False, False)
    
    '''
        perfect match
    '''
    if doc_text == tokenized_text:
        return (True, 0)
    
    count         = 0
    if len(doc_text) > len(tokenized_text):
        count = count_occurrences(doc_text, tokenized_text)
        if count != 0:
            return (True, -1)
    else:
        count = count_occurrences(tokenized_text, doc_text)
        if count != 0:
            return (True, 1)
    
    return (False, False)

def string_overlap(str1, str2):
    str1_list   = [x for x in str1.split(' ') if x]
    str1_set    = set(str1_list)
    
    str2_list   = [x for x in str2.split(' ') if x]
    str2_set    = set(str2_list)
    
    common_set  = str1_set.intersection(str2_set)
    diff_set    = str1_set.difference(str2_set)
    overlap_list = []
    
    if len(str1_list) > len(str2_list):
        for word in str2_list:
            if word in list(common_set):
                overlap_list.append(word)
    else:
        for word in str1_list:
            if word in list(common_set):
                overlap_list.append(word)
                
    return ' '.join(overlap_list)

def check_string_status_v1(doc_tag, tokenized, overlap_threshold=4):
    doc_text       = doc_tag.text.replace(" ", "")
    tokenized_text = tokenized['src'].replace(" ", "")
    
    if len(doc_text) < 2 or len(tokenized_text) < 2:
        if doc_text.isdigit() == False or tokenized_text.isdigit() == False:
            return (False, False)
    
    '''
        perfect match
    '''
    if doc_text == tokenized_text:
        return (True, 0)
    
    doc_text              = doc_tag.text
    tokenized_text        = tokenized['src']
    overlap_str           = string_overlap(doc_text, tokenized_text)

    doc_text_list         = [x for x in doc_text.split(' ') if x]
    tokenized_text_list   = [x for x in tokenized_text.split(' ') if x]
    
    if len(overlap_str) > 0:
        if (len(doc_text_list) <= len(tokenized_text_list)):
            if (abs(len(doc_text) - len(overlap_str)) <= overlap_threshold):
                return (True, 1)
        else:
            if (abs(len(tokenized_text) - len(overlap_str)) <= overlap_threshold):
                return (True, -1)
    
    return (False, False)


In [7]:
def get_as_df(tags, tokenized_sentences):
    doc_texts = []
    doc_ids   = []
    for tag in tags:
        doc_texts.append(tag.text)
        doc_ids.append(tag.attrib['id'])

    tokenized_src_texts = []
    tokenized_tgt_texts = []
    for tokenized_sentence in tokenized_sentences:
        tokenized_src_texts.append(tokenized_sentence['src'])
        tokenized_tgt_texts.append(tokenized_sentence['tgt'])

    if len(doc_texts) > len(tokenized_src_texts):
        empty = [''] * (len(doc_texts) - len(tokenized_src_texts))
        tokenized_src_texts.extend(empty)
        tokenized_tgt_texts.extend(empty)
    else:
        empty = [''] * (len(tokenized_src_texts) - len(doc_texts))
        doc_texts.extend(empty)
        doc_ids.extend(empty)

    df = pd.DataFrame(list(zip(doc_texts, doc_ids, tokenized_src_texts, tokenized_tgt_texts)), 
                                  columns =['doc_texts', 'doc_ids', 'tokenized_src_texts', 'tokenized_tgt_texts'])
    return df

In [8]:
def replace_translated(placeholders, texts):
    for placeholder in placeholders:
        for text in texts:
            if 'id' in text.attrib:
                if text.attrib['id'] == placeholder['id']:
                    text.text = placeholder['tgt']

In [9]:
extracted_dir, filenames = extract_docx(input_filepath, output_dir)


In [10]:
document_xml             = get_xmltree(os.path.join(extracted_dir, 'word', 'document.xml'))
document_xml             = update_document_runs(document_xml)
texts                    = get_text_tags(document_xml)

tokenized_sentences      = get_tokenized_sentences(fetch_content_filepath)
print('document has (%d) text tags, tokenized sentences (%d)' % (len(texts), len(tokenized_sentences)))


document has (54) text tags, tokenized sentences (46)


In [11]:
# df = get_as_df(texts, tokenized_sentences)
# df.head(60)


In [19]:
## re-attempt
matched_ids    = []

founds         = []
is_substrings  = []
doc_texts      = []
srcs           = []
tgts           = []
ids            = []
s_ids          = []

for sent_index in range(len(tokenized_sentences)):
    for text_index in range(len(texts)):
        if (texts[text_index].attrib['id'] in matched_ids):
            continue

        is_found, is_substring = check_string_status_v1(texts[text_index], tokenized_sentences[sent_index])

        if is_found:
            is_substrings.append(is_substring)
            doc_texts.append(texts[text_index].text)
            ids.append(texts[text_index].attrib['id'])
            s_ids.append(tokenized_sentences[sent_index]['s_id'])
            srcs.append(tokenized_sentences[sent_index]['src'])
            tgts.append(tokenized_sentences[sent_index]['tgt'])

df = pd.DataFrame(list(zip(is_substrings, doc_texts, ids, s_ids, srcs, tgts)), 
                                  columns =['is_substrings', 'doc_texts', 'ids', 's_ids', 'srcs', 'tgts'])

print(df.shape())

TypeError: 'tuple' object is not callable

In [17]:
df.to_csv('file1.csv')

In [18]:
tokenized_match

[{'id': '695f0c71-56e1-4790-94c4-e534813e8331',
  's_id': 'ca78342f-ea65-4781-993d-d3d7dbb964d5',
  'src': '1. A quest for equality of opportunity for women seeking Permanent Commissions in the Indian Army forms the basis of these appeals.',
  'text': 'A quest for equality of opportunity for women seeking Permanent Commissions in the Indian Army forms the basis of these appeals. The lead appeal originated in a batch of Writ Petitions which were instituted before the High Court of Delhi in 2003 and 2006.',
  'tgt': '1. भारतीय सेना में स्थायी आयोगों की मांग करने वाली महिलाओं के लिए अवसर की समानता की खोज इन अपीलों का आधार बनती है।'},
 {'id': '695f0c71-56e1-4790-94c4-e534813e8331',
  's_id': 'a82ec304-a717-4c54-83e7-f927e4480040',
  'src': 'The lead appeal originated in a batch of Writ Petitions which were instituted before the High Court of Delhi in 2003 and 2006.',
  'text': 'A quest for equality of opportunity for women seeking Permanent Commissions in the Indian Army forms the basis of

In [14]:
text_match_dict = {}
for elem in text_match:
    if elem['s_id'] not in text_match_dict.keys():
        text_match_dict[elem['s_id']] = [elem]
    else:
        text_match_dict[elem['s_id']].append(elem)

In [15]:
unique_text_match = []
multiple_text_match_dict = {}

for key in text_match_dict.keys():
    if len(text_match_dict[key]) > 1:
        multiple_text_match_dict[key] = text_match_dict[key]
    else:
        unique_text_match.append(text_match_dict[key][0])


In [16]:
for key in multiple_text_match_dict.keys():
    doc_text = ''
    for elem in multiple_text_match_dict[key]:
        doc_text = doc_text + ' ' + elem['text']
        
    src_list = [x for x in elem['src'].split(' ') if x]
    src_set  = set(src_list)
    
    doc_list = [x for x in doc_text.split(' ') if x]
    doc_set  = set(doc_list)
    if len(doc_list) <= len(src_list):
        if (len(doc_set) <= len(src_set.intersection(doc_set))) and \
        len(src_set.difference(doc_set)) <= 2:
            unique_text_match.append(multiple_text_match_dict[key][0])
    else:
        print('repeating text')
        print(doc_list, '\n')
        print(src_list, '\n')

repeating text
['Background', 'of', 'the', 'dispute', 'Background', 'of', 'the', 'dispute'] 

['A', 'Background', 'of', 'the', 'dispute'] 

repeating text
['Background', 'of', 'the', 'dispute', 'Background', 'of', 'the', 'dispute'] 

['A', 'Background', 'of', 'the', 'dispute'] 



In [17]:
replace_translated(perfect_match, texts)
replace_translated(unique_text_match, texts)

replace_translated(tokenized_match, texts)


In [18]:
save_document_xml(extracted_dir, document_xml)
save_docx(extracted_dir, filenames, os.path.join(output_dir, "kd2.docx"))

In [45]:



str1 = 'A quest for equality of opportunity for women seeking Permanent Commissions in the Indian Army forms the basis of these appeals. The lead appeal originated in a batch of Writ Petitions which were instituted before the High Court of Delhi in 2003 and 2006.'
str2 = '1. A quest for equality of opportunity for women seeking Permanent Commissions in the Indian Army forms the basis of these appeals.'
len(str1)
# print(string_overlap(str2, str1))
        

255

In [11]:
tokenized_sentences = tokenized_sentences
tags                = texts
texts_matched       = []
placeholders        = []

for sent_index in range(len(tokenized_sentences)):
    for tag_index in range(len(tags)):
        if (tags[tag_index].attrib['id'] not in texts_matched) and \
            (tokenized_sentences[sent_index]['s_id'] not in texts_matched):
            found, tag_text_substring = check_string_status(tags[tag_index], tokenized_sentences[sent_index])
            if found == True:
#                 print('%s <-> %s <--> %r' % (tags[tag_index].text, tokenized_sentences[sent_index]['src'], tag_text_substring))
                if tag_text_substring == None:
                    texts_matched.append(tags[tag_index].attrib['id'])
                    texts_matched.append(tokenized_sentences[sent_index]['s_id'])
                    placeholders.append({
                        's_id': tokenized_sentences[sent_index]['s_id'],
                        'src': tokenized_sentences[sent_index]['src'],
                        'tgt': tokenized_sentences[sent_index]['tgt'],
                        'id': tags[tag_index].attrib['id'],
                        'text': tags[tag_index].text
                    })
                if tag_text_substring == True:
                    texts_matched.append(tokenized_sentences[sent_index]['s_id'])
                    placeholders.append({
                        's_id': tokenized_sentences[sent_index]['s_id'],
                        'src': tokenized_sentences[sent_index]['src'],
                        'tgt': tokenized_sentences[sent_index]['tgt'],
                        'id': tags[tag_index].attrib['id'],
                        'text': tags[tag_index].text
                    })
        else:
            pass
#             print('tag_id %s %s already matched' % (tags[tag_index].attrib['id'], tags[tag_index].text))

'''
    check how much tokenized didn't match
'''
unmatched_tokenized_sentences = []
for tokenized_sentence in tokenized_sentences:
    if tokenized_sentence['s_id'] in texts_matched:
        pass
    else:
        print('unmatched tokenized text:-> %s' % (tokenized_sentence['src']))
        unmatched_tokenized_sentences.append(tokenized_sentence)

print('tokenized sentences, initial: %d, matched: %d, unmatched: %d' % (len(tokenized_sentences), len(placeholders), len(unmatched_tokenized_sentences)))

unmatched_tags = []
for tag in tags:
    if tag.attrib['id'] in texts_matched:
        pass
    else:
        print('unmatched tag text:-> %s' % (tag.text))
        unmatched_tags.append(tag.text)


unmatched tokenized text:-> G Blanket restriction on criteria appointments
tokenized sentences, initial: 48, matched: 47, unmatched: 1
unmatched tag text:-> ...Appellant
unmatched tag text:-> 
unmatched tag text:-> 
unmatched tag text:-> 
unmatched tag text:-> 
unmatched tag text:-> 
unmatched tag text:-> ...Respondents
unmatched tag text:-> Signature Not Verified
unmatched tag text:-> SANJAY KUMAR
unmatched tag text:-> 13:49:23 IST
unmatched tag text:-> Reason:
unmatched tag text:-> 
unmatched tag text:-> 
unmatched tag text:-> 
unmatched tag text:-> 
unmatched tag text:-> Background of the dispute
unmatched tag text:-> Proposal of the Union of India
unmatched tag text:-> Submissions
unmatched tag text:-> Consequence of the policy letter dated 25 February 2019
unmatched tag text:-> Stereotypes and women in the Armed Forces
unmatched tag text:-> Consequence of non-compliance
unmatched tag text:-> Directions
unmatched tag text:-> A
unmatched tag text:-> Background of the dispute
unmatch

In [12]:
placeholders

[{'id': '979e0e55-6333-447a-8976-ed8cb8ec82be',
  's_id': '78844881-23a5-4a12-955e-ddc7b2df8c1d',
  'src': 'Reportable',
  'text': 'Reportable',
  'tgt': 'रिपोर्ट योग्य'},
 {'id': '3da4ac2a-b07b-4304-80d0-6675c1fa050f',
  's_id': '0a8ee6fb-bc27-482e-ab47-7169c2acccbb',
  'src': 'IN THE SUPREME COURT OF INDIA',
  'text': 'IN THE SUPREME COURT OF INDIA',
  'tgt': 'भारत के सर्वोच्च न्यायालय में'},
 {'id': '7758957c-f62a-46bf-9c58-4b1b992bfd38',
  's_id': '65b11398-df84-4199-aad7-e5eb6646c770',
  'src': 'CIVIL APPELLATE JURISDICTION',
  'text': 'CIVIL APPELLATE JURISDICTION',
  'tgt': 'दीवानी अपीलीय क्षेत्राधिकार'},
 {'id': '66ea0f20-8970-4ccf-bb53-5a7d74f8b285',
  's_id': 'fd77ae53-eadd-48a2-8555-3eb00db0583b',
  'src': 'Civil Appeal Nos 9367-9369 of 2011',
  'text': 'Civil Appeal Nos 9367-9369 of 2011',
  'tgt': '9367 की दीवानी अपील सं. 2011-9369'},
 {'id': 'd88ec932-f403-45d6-a290-1933a471ed27',
  's_id': 'd30d5348-1829-4ab4-ac3c-19ac102c0411',
  'src': 'The Secretary, Ministry of Defen

In [13]:
# replace the target token
for placeholder in placeholders:
    for tag in tags:
        if 'id' in tag.attrib:
            if tag.attrib['id'] == placeholder['id']:
                tag.text = placeholder['tgt']

In [8]:
save_document_xml(extracted_dir, document_xml)
save_docx(extracted_dir, filenames, os.path.join(output_dir, "kd2.docx"))