In [1]:
import os
import sys
import time
import base64
import uuid
import pandas as pd
from zipfile import ZipFile
from lxml import etree
import xml.etree.ElementTree as ET
import codecs
import json
from itertools import groupby
import difflib


In [2]:
input_filepath = '/Users/kd/Workspace/python/DOCX/document-formatting/data/input/long_paragraph.docx'
output_dir     = '/Users/kd/Workspace/python/DOCX/document-formatting/data/output'

fetch_content_filepath = '/Users/kd/Workspace/python/DOCX/document-formatting/data/input/long_paragraph.json'
filename       = os.path.splitext(os.path.basename(input_filepath))[0]
translated_filename = filename + '_translated' + '.docx'


In [3]:
def get_string_xmltree(xml):
    return etree.tostring(xml)

def get_xml_tree(xml_string):
    return etree.fromstring(xml_string)

def get_xmltree(filepath, parse='xml'):
    if parse == 'html':
        parser = etree.HTMLParser()
        tree   = etree.parse(open(filepath, mode='r', encoding='utf-8'), parser)
        return tree
    else:
        with open(filepath,'r') as file:
            xml_string    = file.read()
            return etree.fromstring(bytes(xml_string, encoding='utf-8'))
    return None

def check_element_is(element, type_char):
    word_schema1 = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    word_schema2 = 'http://purl.oclc.org/ooxml/wordprocessingml/main'
     
    return (element.tag == '{%s}%s' % (word_schema1, type_char)) or (element.tag == '{%s}%s' % (word_schema2, type_char))

def get_specific_tags(node, type_char):
    nodes = []
    for elem in node.iter():
        if check_element_is(elem, type_char):
            nodes.append(elem)
    return nodes

def add_identifier(node):
    node.attrib['id'] = str(uuid.uuid4())

def is_run_superscript(run):
    attrib    = {}
    vertAlign = get_specific_tags(run, 'vertAlign')
    if len(vertAlign) > 0:
        for key in vertAlign[0].attrib.keys():
            attrib['vertAlign_' + key.split('}')[-1]] = vertAlign[0].attrib[key]
    if 'vertAlign_val' in attrib:
        if attrib['vertAlign_val'] == 'superscript':
            return True
    return False
    
def update_run_text(r1, r2):
    t1s = get_specific_tags(r1, 't')
    t2s = get_specific_tags(r2, 't')
#     print('r1 text [%s], r2 text [%s]'% (t1s[0].text, t2s[0].text))
    t1s[0].text = t1s[0].text + t2s[0].text
    t2s[0].text = ''
    
def get_run_properties(run):
    attrib = {}
    rFonts = get_specific_tags(run, 'rFonts')
    sz     = get_specific_tags(run, 'sz')
    szCs   = get_specific_tags(run, 'szCs')
    
    if len(rFonts) > 0:
        for key in rFonts[0].attrib.keys():
            attrib['rFonts_' + key.split('}')[-1]] = rFonts[0].attrib[key]
    
    if len(sz) > 0:
        for key in sz[0].attrib.keys():
            attrib['sz_' + key.split('}')[-1]] = sz[0].attrib[key]
        
    if len(szCs) > 0:
        for key in szCs[0].attrib.keys():
            attrib['szCs_' + key.split('}')[-1]] = szCs[0].attrib[key]

    return attrib

def update_font_property(p, reduce=4):    
    szs    = get_specific_tags(p, 'sz')
    szCss  = get_specific_tags(p, 'szCs')
    value  = '{%s}%s' % ("http://schemas.openxmlformats.org/wordprocessingml/2006/main", 'val')

    for szCs in szCss:
        size  = szCs.attrib[value]
        szCs.set(value, str(int(size) - reduce))

    for sz in szs:
        size  = sz.attrib[value]
        sz.set(value, str(int(size) - reduce))
    
def compare_run_properties(run1, run2):
    attrib1 = get_run_properties(run1)
    attrib2 = get_run_properties(run2)
    
    if all (k in attrib1 for k in ('rFonts_ascii', 'sz_val', 'szCs_val')):
        if all (k in attrib2 for k in ('rFonts_ascii', 'sz_val', 'szCs_val')):
            if (attrib1['rFonts_ascii'] == attrib2['rFonts_ascii']) and \
            (attrib1['szCs_val'] == attrib2['szCs_val']) and \
            (attrib1['sz_val'] == attrib2['sz_val']) :
                return True
    
    return False

def get_line_connections(p):
    runs             = get_specific_tags(p, 'r')
    text_runs        = []
    
    for run in runs:
        if is_run_superscript(run) == False:
            text_runs.append(run)

    line_connections = []
    for index in range(len(text_runs) - 1):
        if (compare_run_properties(text_runs[index], text_runs[index+1])):
            line_connections.append((index, index+1, 'CONNECTED'))
        else:
            line_connections.append((index, index+1, 'NOT_CONNECTED'))
    return line_connections

def arrange_grouped_line_indices(line_connections, debug=False):
    lines          = [list(i) for j, i in groupby(line_connections, lambda a: a[2])]
    if debug:
        print('arrange_grouped_line_indices: %s \n---------\n' % (str(lines)))
        
    arranged_lines = []

    for line_items in lines:
        indices = []
        for line_item in line_items:
            indices.append(line_item[0])
            indices.append(line_item[1])
        indices = sorted(list(set(indices)))
        arranged_lines.append([indices, line_items[0][2]])
        
    if debug:
        print('arrange_grouped_line_indices,arranged_lines : %s \n---------\n' % (str(arranged_lines)))
    
    final_arranged_lines = []
    
    if len(arranged_lines) == 1:
        final_arranged_lines.append([arranged_lines[0][0], arranged_lines[0][1]])
    else:
        for index, line_item in enumerate(arranged_lines):
            if index == 0 and line_item[1] == 'NOT_CONNECTED':
                del line_item[0][-1]
            if index > 0 and index < (len(arranged_lines) - 1) and line_item[1] == 'NOT_CONNECTED':
                del line_item[0][0]
                del line_item[0][-1]
            if index == (len(arranged_lines) - 1) and line_item[1] == 'NOT_CONNECTED':
                del line_item[0][0]

            final_arranged_lines.append([line_item[0], line_item[1]])
    if debug:
        print('final_arrange_grouped_line_indices,arranged_lines : %s \n---------\n' % (str(final_arranged_lines)))
            
    return final_arranged_lines

def merge_runs(node, grouped_runs, debug=False):
    runs      = get_specific_tags(node, 'r')
    text_runs = []
    
    for run in runs:
        if is_run_superscript(run) == False:
            text_runs.append(run)

    for element in grouped_runs:
        if (element[1] == 'CONNECTED'):
            for index, run_index in enumerate(element[0]):
                if (index > 0):
                    if (debug):
                        print('merge index %d with %d' % ( run_index, 0))
                    update_run_text(text_runs[0], text_runs[run_index])
                    text_runs[run_index].getparent().remove(text_runs[run_index])
                    
def update_document_runs(document):
    '''
    the function iterates through the p tags and merges run that have exactly same
    visual property.
    '''
    tag_name                 = 'p'
    tags                     = get_specific_tags(document, tag_name)
    for p in tags:
        grouped_runs = arrange_grouped_line_indices(get_line_connections(p))
        merge_runs(p, grouped_runs, debug=False)
    return document

def get_text_tags(document):
    tags         = []
    runs         = get_specific_tags(document, 'r')
    for run in runs:
        if is_run_superscript(run) == False:
            texts = get_specific_tags(run, 't')
            for text in texts:
                if text.text and len(text.text.strip()) > 0:
                    add_identifier(text)
                    tags.append(text)
    return tags

In [4]:
def extract_docx(filepath, working_dir):
    filename       = os.path.splitext(os.path.basename(filepath))[0]
    extract_dir    = os.path.join(working_dir, filename)
    
    with ZipFile(filepath, 'r') as file:
        file.extractall(path=extract_dir)
        filenames = file.namelist()
    
    return extract_dir, filenames

def save_docx(extracted_dir, filenames, output_filename):
    with ZipFile(output_filename, 'w') as docx:
        for filename in filenames: 
            docx.write(os.path.join(extracted_dir, filename), filename)
            
def save_document_xml(extracted_dir, xml):
    with open(os.path.join(extracted_dir,'word/document.xml'), 'wb') as f:
        xmlstr = get_string_xmltree(xml)
        f.write(xmlstr)

In [5]:
def get_tokenized_sentences(filepath):
    from jsonpath_rw import jsonpath, parse
    json_data     = json.load(codecs.open(fetch_content_filepath, 'r', 'utf-8-sig'))
    jsonpath_expr = parse('$..tokenized_sentences[*]')
    matches       = jsonpath_expr.find(json_data)

    tokenized_sentences = []
    for match in matches:
        tokenized_sentences.append(match.value)
    
    return tokenized_sentences

In [6]:
def count_occurrences(string, substring): 
    count = 0
    start = 0
    while start < len(string): 
        pos = string.find(substring, start) 

        if pos != -1: 
            start = pos + 1
            count += 1
        else: 
            break
    return count 


def check_string_status(doc_tag, tokenized):
    doc_text       = doc_tag.text.replace(" ", "")
    tokenized_text = tokenized['src'].replace(" ", "")
    
    if len(doc_text) < 2 or len(tokenized_text) < 2:
        if doc_text.isdigit() == False or tokenized_text.isdigit() == False:
            return (False, False)
    
    '''
        perfect match
    '''
    if doc_text == tokenized_text:
        return (True, 0)
    
    count         = 0
    if len(doc_text) > len(tokenized_text):
        count = count_occurrences(doc_text, tokenized_text)
        if count != 0:
            return (True, -1)
    else:
        count = count_occurrences(tokenized_text, doc_text)
        if count != 0:
            return (True, 1)
    
    return (False, False)

def string_overlap(str1, str2):
    str1_list   = [x for x in str1.split(' ') if x]
    str1_set    = set(str1_list)
    
    str2_list   = [x for x in str2.split(' ') if x]
    str2_set    = set(str2_list)
    
    common_set  = str1_set.intersection(str2_set)
    diff_set    = str1_set.difference(str2_set)
    overlap_list = []
    
    if len(str1_list) > len(str2_list):
        for word in str2_list:
            if word in list(common_set):
                overlap_list.append(word)
    else:
        for word in str1_list:
            if word in list(common_set):
                overlap_list.append(word)
                
    return ' '.join(overlap_list)

def check_string_status_v1(doc_tag, tokenized, overlap_threshold=4):
    doc_text       = doc_tag.text.replace(" ", "")
    tokenized_text = tokenized['src'].replace(" ", "")
    
    if len(doc_text) < 2 or len(tokenized_text) < 2:
        if doc_text.isdigit() == False or tokenized_text.isdigit() == False:
            return (False, False)
    
    '''
        perfect match
    '''
    if doc_text == tokenized_text:
        return (True, 0)
    
    
    doc_text              = doc_tag.text
    tokenized_text        = tokenized['src']
    overlap, start, end, percentage, smaller = get_overlap(doc_text, tokenized_text)

#     overlap_str_list      = [x for x in overlap_str.split(' ') if x]
#     doc_text_list         = [x for x in doc_text.split(' ') if x]
#     tokenized_text_list   = [x for x in tokenized_text.split(' ') if x]
    
#     if len(overlap_str) > 0:
#         if (len(doc_text_list) <= len(tokenized_text_list)):
#             if (abs(len(doc_text_list) - len(overlap_str_list)) <= overlap_threshold):
#                 return (True, 1)
#         else:
#             if (abs(len(tokenized_text_list) - len(overlap_str_list)) <= overlap_threshold):
#                 return (True, -1)
#     else:
#         '''
#          when sentence overlap is not found, trying overlap at character level
#         '''
#         count         = 0
#         if len(doc_text) > len(tokenized_text):
#             count = count_occurrences(doc_text, tokenized_text)
#             if count != 0:
#                 return (True, -1)
#         else:
#             count = count_occurrences(tokenized_text, doc_text)
#             if count != 0:
#                 return (True, 1)
                
    
#     return (False, False)


In [7]:
def get_as_df(tags, tokenized_sentences):
    doc_texts = []
    doc_ids   = []
    for tag in tags:
        doc_texts.append(tag.text)
        doc_ids.append(tag.attrib['id'])

    tokenized_src_texts = []
    tokenized_tgt_texts = []
    for tokenized_sentence in tokenized_sentences:
        tokenized_src_texts.append(tokenized_sentence['src'])
        tokenized_tgt_texts.append(tokenized_sentence['tgt'])

    if len(doc_texts) > len(tokenized_src_texts):
        empty = [''] * (len(doc_texts) - len(tokenized_src_texts))
        tokenized_src_texts.extend(empty)
        tokenized_tgt_texts.extend(empty)
    else:
        empty = [''] * (len(tokenized_src_texts) - len(doc_texts))
        doc_texts.extend(empty)
        doc_ids.extend(empty)

    df = pd.DataFrame(list(zip(doc_texts, doc_ids, tokenized_src_texts, tokenized_tgt_texts)), 
                                  columns =['doc_texts', 'doc_ids', 'tokenized_src_texts', 'tokenized_tgt_texts'])
    return df

In [8]:
def get_updated_tokenized_text_matched(df):
    '''
     return df, df1, creates merged string of tokenized sentences that are equivalent to document text
    '''
    df1 = []
    for doc_id in df['doc_id'].unique():
        temp_df = df[df['doc_id'] == doc_id]

        src_text = ''
        tgt_text = ''
        for index, row in temp_df.iterrows():
            src_text = src_text + row['src'] + ' '
            tgt_text = tgt_text + row['tgt'] + ' '

        temp_df_first = temp_df[0:1].reset_index(drop=True)
        temp_df_first.at[0, 'src'] = src_text
        temp_df_first.at[0, 'tgt'] = tgt_text

        df1.append(temp_df_first)
    
    return pd.concat(df1)

def get_updated_document_text_matched(df):
    '''
     returns two dfs, df3: contains text that are actually substring of tokenized sentence
     df4: are multiple occurance text that have same spelling.
    '''
    df1 = []
    df2 = []

    for s_id in df['s_id'].unique():
        sid_df = df[df['s_id'] == s_id].reset_index(drop=True)

        doc_text = ''
        for index, row in sid_df.iterrows():
            doc_text = doc_text + row['doc_text'] + ' '

        doc_text_list         = [x for x in doc_text.split(' ') if x]
        sid_src_text_list     = [x for x in sid_df.iloc[0]['src'].split(' ') if x]

        if len(doc_text_list) <= len(sid_src_text_list):
            for index, row in sid_df.iterrows():
                if index > 0:
                    sid_df.at[index, 'tgt'] = ''
            df1.append(sid_df)
        else:
            df2.append(sid_df)

    df3 = pd.DataFrame()
    df4 = pd.DataFrame()
    if len(df1) > 0:
        df3 = pd.concat(df1).reset_index(drop=True)
    
    if len(df2) > 0:
        df4 = pd.concat(df2).drop_duplicates(subset='doc_id', keep='first').reset_index(drop=True)
    
    return df3, df4

def replace_translated_df(df, texts):
    for index, row in df.iterrows():
        for text in texts:
            if 'id' in text.attrib:
                if text.attrib['id'] == row['doc_id']:
                    text.text = row['tgt']
                    
                    
def get_matched_dfs(tokenized_sentences, texts):
    matched_ids    = []
    matched_sids   = []

    founds         = []
    is_substrings  = []
    doc_texts      = []
    srcs           = []
    tgts           = []
    ids            = []
    s_ids          = []

    for sent_index in range(len(tokenized_sentences)):
        for text_index in range(len(texts)):
            if (texts[text_index].attrib['id'] in matched_ids) or \
            (tokenized_sentences[sent_index]['s_id'] in matched_sids):
                continue

            is_found, is_substring = check_string_status_v1(texts[text_index], tokenized_sentences[sent_index])

            if is_found and is_substring == 0:
                matched_ids.append(texts[text_index].attrib['id'])
                matched_sids.append(tokenized_sentences[sent_index]['s_id'])

            founds.append(is_found)
            is_substrings.append(is_substring)
            doc_texts.append(texts[text_index].text)
            ids.append(texts[text_index].attrib['id'])
            s_ids.append(tokenized_sentences[sent_index]['s_id'])
            srcs.append(tokenized_sentences[sent_index]['src'])
            tgts.append(tokenized_sentences[sent_index]['tgt'])

    df = pd.DataFrame(list(zip(founds, is_substrings, doc_texts, ids, s_ids, srcs, tgts)), 
                                      columns =['found', 'substr', 'doc_text', 'doc_id', 's_id', 'src', 'tgt'])
    return df.loc[(df['substr'] == 0) & (df['found'] == True)].reset_index(drop=True), \
        df.loc[(df['substr'] == -1) & (df['found'] == True)].reset_index(drop=True), \
        df.loc[(df['substr'] == 1) & (df['found'] == True)].reset_index(drop=True), \
        df.loc[(df['found'] == False)].reset_index(drop=True)
    

In [9]:
def get_overlap(s1, s2):
    if len(s1) < len(s2):
        s = difflib.SequenceMatcher(lambda x: x == '.', s1, s2)
        pos_a, pos_b, size = s.find_longest_match(0, len(s1), 0, len(s2)) 
        return s1[pos_a:pos_a+size], pos_a, (pos_a+size), size/len(s1), s.ratio()

    s = difflib.SequenceMatcher(None, s2, s1)
    pos_a, pos_b, size = s.find_longest_match(0, len(s2), 0, len(s1)) 
    return s2[pos_a:pos_a+size], pos_a, (pos_a+size), size/len(s2), s.ratio()

def get_filtered_dfs(tokenized_sentences, texts):
    percent_threshold = 0.3
    ratio_threshold   = 0.5

    overlaps = []
    percents = []
    ratios   = []
    docs     = []
    srcs     = []
    tgts     = []
    ids      = []
    s_ids    = []

    for sent_index in range(len(tokenized_sentences)):
        for text_index in range(len(texts)):
            doc_text       = texts[text_index].text
            tokenized_text = tokenized_sentences[sent_index]['src']
            overlap, start, end, percent, ratio = get_overlap(doc_text, tokenized_text)

            overlaps.append(overlap)
            percents.append(percent)
            ratios.append(ratio)
            docs.append(texts[text_index].text)
            ids.append(texts[text_index].attrib['id'])
            srcs.append(tokenized_sentences[sent_index]['src'])
            tgts.append(tokenized_sentences[sent_index]['tgt'])
            s_ids.append(tokenized_sentences[sent_index]['s_id'])

    df = pd.DataFrame(list(zip(percents, ratios, overlaps, docs, srcs, tgts, s_ids, ids)), 
                                          columns =['percent', 'ratio', 'overlap', 'doc_text', 'src', 'tgt', 's_id', 'doc_id'])

    filtered_df = df.loc[(df['percent'] >= percent_threshold ) & (df['ratio'] >= ratio_threshold)]
    pm_df       = filtered_df.loc[(filtered_df['percent'] == 1.0 ) & (filtered_df['ratio'] == 1.0)]
    pm_df1      = filtered_df[~filtered_df.index.isin(pm_df.index)]
    return pm_df, pm_df1, df


In [10]:
extracted_dir, filenames = extract_docx(input_filepath, output_dir)


In [11]:
document_xml             = get_xmltree(os.path.join(extracted_dir, 'word', 'document.xml'))
document_xml             = update_document_runs(document_xml)
texts                    = get_text_tags(document_xml)

tokenized_sentences      = get_tokenized_sentences(fetch_content_filepath)
print('document has (%d) text tags, tokenized sentences (%d)' % (len(texts), len(tokenized_sentences)))

pm_df, filtered_df, df   = get_filtered_dfs(tokenized_sentences, texts)
tm_df                    = get_updated_tokenized_text_matched(filtered_df)
substring_df, multiple_df = get_updated_document_text_matched(filtered_df)

df.to_csv('df.csv')
filtered_df.to_csv('filtered_df.csv')
pm_df.to_csv('pm_df.csv')
tm_df.to_csv('tm_df.csv')
substring_df.to_csv('substring_df.csv')
multiple_df.to_csv('multiple_df.csv')

print('filtered match (%d)' % (len(filtered_df)))
print('perfect match (%d), tokenized match (%d) substring match (%d) multiple match (%d)' \
      % (len(pm_df), len(tm_df), len(substring_df), len(multiple_df)))
# pm_df, tm_df, dtm_df, df = get_matched_dfs(tokenized_sentences, texts)
# print('perfect match (%d), tokenized sentences matched (%d), document text matched (%d)' % (len(pm_df), len(tm_df), len(dtm_df)))

document has (51) text tags, tokenized sentences (85)
filtered match (40)
perfect match (48), tokenized match (30) substring match (9) multiple match (21)


In [34]:
'''
 replace and create translated sentence
'''
replace_translated_df(pm_df, texts)
# replace_translated_df(tm_df, texts)
# replace_translated_df(substring_df, texts)
# replace_translated_df(multiple_df, texts)

# replace_translated_df(get_updated_tokenized_text_matched(tm_df), texts)
# substring_df, multiple_df = get_updated_document_text_matched(dtm_df)

# if substring_df.empty == False:
#     replace_translated_df(substring_df, texts)
    
# if multiple_df.empty != False:
#     replace_translated_df(multiple_df, texts)


In [35]:
ps = get_specific_tags(document_xml, 'p')
for p in ps:
    update_font_property(p, 5)

In [36]:
save_document_xml(extracted_dir, document_xml)
save_docx(extracted_dir, filenames, os.path.join(output_dir, translated_filename))

In [32]:
# doc_text = 'recruited prior to change of policy as (ii) aforesaid. The Permanent Commission shall be offered to them after completion of five years. They would also be entitled to all consequential benefits such as promotion and other financial benefits. However, the aforesaid benefits are to be made available only to women officers in service or who have approached this Court by filing these petitions and have retired during the course of pendency of the petitions.'
# tokenized_text = 'This benefit would be conferred to women officers recruited prior to change of policy as (ii) aforesaid.'

tokenized_text = 'The original ToE provided for a contractual period of five years after which the officers were to be released from service. The officers who were granted commission under the Army instruction were not entitled to PC or to any extension beyond five years of commissioned service.'
doc_text = '8. The original ToE provided for a contractual period of five years after which the officers were to be released from service.'


In [33]:
def get_overlap_v1(s1, s2):
    if len(s1) > len(s2):
        return get_diff(s1, s2)
    else:
        return get_diff(s2, s1)

def get_diff(s1, s2):
    s      = difflib.SequenceMatcher(lambda x: x == '.', s1, s2)
    pos_s1, pos_s2, match_size = s.find_longest_match(0, len(s1), 0, len(s2))

    return s1[pos_s1:pos_s1+match_size], pos_s1, pos_s2, \
            len(s1[pos_s1:pos_s1+match_size])/len(s1), \
            len(s2[pos_s2:pos_s2+match_size])/len(s2), \
            s.ratio()

In [42]:
print(get_overlap_v1('1. 1999', '1999'))

('1999', 3, 0, 0.5714285714285714, 1.0, 0.7272727272727273)


In [43]:
print(get_overlap_v1('1999', '1. 1999'))

('1999', 3, 0, 0.5714285714285714, 1.0, 0.7272727272727273)
