In [30]:
import os
import sys
import time
import base64
import uuid
import pandas as pd
from zipfile import ZipFile
from lxml import etree
import xml.etree.ElementTree as ET

utilities_dir = '/Users/kd/Workspace/python/helpers'
sys.path.append(utilities_dir)

from file_directory_utils import (create_directory, read_directory_files, get_subdirectories, get_all_file_paths)


In [31]:
input_filepath = '/Users/kd/Workspace/python/DOCX/document-formatting/data/input/template_1.docx'
output_dir     = '/Users/kd/Workspace/python/DOCX/document-formatting/data/output'
filename       = os.path.splitext(os.path.basename(input_filepath))[0]


In [32]:
def get_string_xmltree(xml):
    return etree.tostring(xml)

def get_xml_tree(xml_string):
    return etree.fromstring(xml_string)

def get_xmltree(filepath, parse='xml'):
    if parse == 'html':
        parser = etree.HTMLParser()
        tree   = etree.parse(open(filepath, mode='r', encoding='utf-8'), parser)
        return tree
    else:
        with open(filepath,'r') as file:
            xml_string    = file.read()
            return etree.fromstring(bytes(xml_string, encoding='utf-8'))
    return None

def check_element_is(element, type_char):
    word_schema1 = 'http://www.w3.org/1999/xhtml'
    word_schema2 = 'http://purl.oclc.org/ooxml/wordprocessingml/main'
     
    return (element.tag == '{%s}%s' % (word_schema1, type_char)) or (element.tag == '{%s}%s' % (word_schema2, type_char))

def check_element_is(element, type_char):     
    return (element.tag == type_char)

def get_specific_tags(node, type_char):
    nodes = []
    for elem in node.iter():
        if check_element_is(elem, type_char):
            nodes.append(elem)
    return nodes


In [33]:
def extract_docx(filepath, working_dir):
    filename       = os.path.splitext(os.path.basename(filepath))[0]
    extract_dir    = os.path.join(working_dir, filename)
    
    with ZipFile(filepath, 'r') as file:
        file.extractall(path=extract_dir)
        filenames = file.namelist()
    
    return extract_dir, filenames

def save_docx(extracted_dir, filenames, output_filename):
    with ZipFile(output_filename, 'w') as docx:
        for filename in filenames: 
            docx.write(os.path.join(extracted_dir, filename), filename)

In [34]:
extracted_dir, filenames = extract_docx(input_filepath, output_dir)



In [36]:
document_xml             = get_xmltree(os.path.join(extracted_dir, 'word', 'document.xml'))
# get_string_xmltree(document_xml)

In [20]:
from lxml.etree import Element, SubElement, QName, tounicode
    
class DOCX_NS_NSDEF_FACTORY:
    def __init__(self, ns, nsdef):
        self.ns      = ns
        self.nsdef   = nsdef
        self.name    = None
        self.attribs = []
        self.root    = None

    def add_name(self, name):
        self.name = name
    
    def add_attribs(self, without_qname, value):
        self.attribs.append({'qname': QName(self.ns, without_qname), 'val':value})
        
    def get_node(self):
        if len(self.attribs) > 0:
            attrib = {}
            for attr in self.attribs:
                self.root.set(attr['qname'], attr['val'])

        return self.root
    
    def add_child(self, parent, child):
        return parent.append(child)
    
    def create_root_node(self, name):
        self.name = name
        self.attribs = []
        self.root = Element(QName(self.ns, self.name), nsmap={self.nsdef:self.ns})

class DOCX_NS_W_FACTORY(DOCX_NS_NSDEF_FACTORY):
    def __init__(self):
        self.ns     = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
        self.nsdef  = 'w'
        super().__init__(self.ns, self.nsdef)
    
class DOCX_NS_PIC_FACTORY(DOCX_NS_NSDEF_FACTORY):
    def __init__(self):
        self.ns    = 'http://schemas.openxmlformats.org/drawingml/2006/picture'
        self.nsdef = 'pic'
        super().__init__(self.ns, self.nsdef)
        
class DOCX_NS_A_FACTORY(DOCX_NS_NSDEF_FACTORY):
    def __init__(self):
        self.ns    = 'http://schemas.openxmlformats.org/drawingml/2006/picture'
        self.nsdef = 'a'
        super().__init__(self.ns, self.nsdef)


In [21]:
class PageSection (DOCX_NS_W_FACTORY):
    def __init__(self):
        super().__init__()

    def get_node(self):
        '''
            <w:sectPr w:rsidR="00A66D74" w:rsidSect="00034616">
            <w:pgSz w:w="11893" w:h="16840"/>
            <w:pgMar w:top="720" w:right="720" w:bottom="720" w:left="720" w:header="720" w:footer="720" w:gutter="0"/>
            <w:cols w:space="720"/>
            <w:docGrid w:linePitch="360"/>
            </w:sectPr>
        '''
        self.create_root_node('sectPr')
        self.add_attribs('rsidR', '00A66D74')
        self.add_attribs('rsidSect', '00034616')
        root = super().get_node()
        
        self.create_root_node('pgSz')
        self.add_attribs('w', '11893')
        self.add_attribs('h', '16840')
        self.add_child(root, super().get_node())
        
        self.create_root_node('pgMar')
        self.add_attribs('top', '720')
        self.add_attribs('right', '720')
        self.add_attribs('bottom', '720')
        self.add_attribs('left', '720')
        self.add_attribs('header', '720')
        self.add_attribs('footer', '720')
        self.add_attribs('gutter', '0')        
        self.add_child(root, super().get_node())

        self.create_root_node('cols')
        self.add_attribs('space', '720')
        self.add_child(root, super().get_node())

        self.create_root_node('docGrid')
        self.add_attribs('linePitch', '360')
        self.add_child(root, super().get_node())
        
        return root


In [37]:
node = PageSection()
print (tounicode(node.get_node(), pretty_print=True))

<w:sectPr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" w:rsidR="00A66D74" w:rsidSect="00034616">
  <w:pgSz w:w="11893" w:h="16840"/>
  <w:pgMar w:top="720" w:right="720" w:bottom="720" w:left="720" w:header="720" w:footer="720" w:gutter="0"/>
  <w:cols w:space="720"/>
  <w:docGrid w:linePitch="360"/>
</w:sectPr>



In [28]:
class A_NODE (DOCX_NS_A_FACTORY):
    def __init__(self):
        super().__init__()
    
    def get_node_blip(self):
        '''
            <a:blip r:embed="rId8">
                <a:extLst>
                    <a:ext uri="{28A0092B-C50C-407E-A947-70E740481C1C}"/>
                </a:extLst>
            </a:blip>
        '''
        self.create_root_node('blip')
        self.add_attribs('embed', 'rId8')
        blip = super().get_node()
        
        return blip
        
        

class Pic (DOCX_NS_PIC_FACTORY):
    def __init__(self):
        super().__init__()
        
    def get_node(self):
        '''
        <pic:pic
            xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">
            <pic:nvPicPr>
                <pic:cNvPr id="0" name="Picture 1"/>
                <pic:cNvPicPr>
                    <a:picLocks noChangeAspect="1" noChangeArrowheads="1"/>
                </pic:cNvPicPr>
            </pic:nvPicPr>
            <pic:blipFill>
                <a:blip r:embed="rId8">
                    <a:extLst>
                        <a:ext uri="{28A0092B-C50C-407E-A947-70E740481C1C}"/>
                    </a:extLst>
                </a:blip>
                <a:srcRect/>
                <a:stretch>
                    <a:fillRect/>
                </a:stretch>
            </pic:blipFill>
            <pic:spPr bwMode="auto">
                <a:xfrm>
                    <a:off x="0" y="0"/>
                    <a:ext cx="4181475" cy="5353050"/>
                </a:xfrm>
                <a:prstGeom prst="rect">
                    <a:avLst/>
                </a:prstGeom>
                <a:noFill/>
            </pic:spPr>
        </pic:pic>
        '''
        self.create_root_node('pic')
        pic = super().get_node()
        
        self.create_root_node('nvPicPr')
        nvPicPr = super().get_node()
        
        self.create_root_node('cNvPr')
        cNvPr   = super().get_node()
        
        self.create_root_node('cNvPicPr')
        cNvPicPr = super().get_node()
        
        self.create_root_node('')
        
        return root

In [29]:
node = A_NODE()
print (tounicode(node.get_node_blip(), pretty_print=True))


<a:blip xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/picture" a:embed="rId8"/>



In [77]:
factory = DOCX_NS_W_FACTORY('sectPr')
factory.add_attribs('rsidR', '00A66D74')
factory.add_attribs('rsidSect', '00034616')
section = factory.get_node()

factory = DOCX_NS_W_FACTORY('sectPr')

print (tounicode(p, pretty_print=True))


<w:sectPr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" w:rsidR="00A66D74" w:rsidSect="00034616"/>



In [36]:
from lxml.etree import Element, SubElement, QName, tounicode
class XMLNamespaces:
   s = 'http://www.w3.org/2003/05/soap-envelope'
   a = 'http://www.w3.org/2005/08/addressing'

root = Element(QName(XMLNamespaces.s, 'Envelope'), nsmap={'s':XMLNamespaces.s, 'a':XMLNamespaces.a})

header = SubElement(root, QName(XMLNamespaces.s, 'Header'))
action  = SubElement(header, QName(XMLNamespaces.a, 'Action'), attrib={
    'notUnderstand':'1',
    QName(XMLNamespaces.s, 'mustUnderstand'):'1'
    })
print (tounicode(root, pretty_print=True))


<s:Envelope xmlns:a="http://www.w3.org/2005/08/addressing" xmlns:s="http://www.w3.org/2003/05/soap-envelope">
  <s:Header>
    <a:Action notUnderstand="1" s:val="1"/>
  </s:Header>
</s:Envelope>



In [39]:
from lxml.etree import Element, SubElement, QName, tounicode

class DOCX_NS:
    w = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    
p        = Element(QName(DOCX_NS.w, 'p'), nsmap={'w':DOCX_NS.w})
pPr      = SubElement(p, QName(DOCX_NS.w, 'pPr'))

framePr  = SubElement(pPr, QName(DOCX_NS.w, 'framePr'), attrib = {
        QName(DOCX_NS.w, 'w'):'3500',
        QName(DOCX_NS.w, 'h'):'3500',
        QName(DOCX_NS.w, 'wrap'):'auto',
        QName(DOCX_NS.w, 'hAnchor'):'page',
        QName(DOCX_NS.w, 'xAlign'):'right',
        QName(DOCX_NS.w, 'yAlign'):'top',
    })

rPr      = SubElement(pPr, QName(DOCX_NS.w, 'rPr'), attrib = {})

print (tounicode(p, pretty_print=True))


<w:p xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
  <w:pPr>
    <w:framePr w:h="3500" w:hAnchor="page" w:w="3500" w:wrap="auto" w:xAlign="right" w:yAlign="top"/>
  </w:pPr>
</w:p>



In [25]:
#save_docx(extracted_dir, filenames, os.path.join(output_dir, "kd.docx"))