In [223]:
#coding: utf-8
# citations: https://pythonmana.com/2021/03/20210329161147051K.html
# citation: https://stackoverflow.com/questions/43637211/retrieve-document-content-with-document-structure-with-python-docx

# Import Libraries
!pip install docx
from docx import Document
from docx.document import Document as _Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
import pandas as pd
import numpy as np
import re
from os.path import exists
!pip install xlrd



In [224]:
########################################################################
# USER INPUTS
########################################################################
# Folder in which all the relevant files (Word shell, Excel data, crosswalk)
# are expected to be saved. Replace with your own location.
# NOTE(review): `path` is not referenced by any code visible in this notebook;
# the file names below are opened exactly as written, i.e. relative to the
# notebook's working directory. Confirm whether they should be joined to `path`.
path = 'C:/Users/jstockham/OneDrive - American Institutes for Research in the Behavioral Sciences/JSmith/PYTHON_CODE_2022/table_automate_jess/general_table_automate/'

######################
# Table Shells in Word
######################

# Word document name (if your Word doc is tableshells.docx, report_doc = 'tableshells.docx')
report_doc = 'Reportshell.docx'

# Paragraph style used for table captions in Word. Replace the default if needed.
# A paragraph is treated as a caption only when its style name equals this value
# exactly, and its text is matched against the crosswalk's 'word_table_title'
# column, so captions cannot contain embedded cross-references.
caption_style = 'Exhibit Title'

# Table style in Word.
# NOTE(review): only referenced from commented-out code in this notebook.
table_style = '__Table Style-AIR 2021'

# Row heading style in Word. Replace the default if needed.
# NOTE(review): only referenced from commented-out code in this notebook.
row_heading_style = 'Table 11 Row Heading'

# Column heading style in Word. Replace the default if needed.
# NOTE(review): only referenced from commented-out code in this notebook.
col_heading_style = 'Table 11 Column Heading'

# The two global overwrite switches below are disabled: the overwrite flags are
# read per exhibit from the crosswalk file instead (its 4th and 5th columns).
# # Overwrite the header rows of your table shell in the Word doc? Enter lowercase 'y' for yes and 'n' for no.
# overwrite_table_row = 'n'

# # Overwrite the header columns of your table shell in the Word doc? Enter 'y' for yes and 'n' for no.
# overwrite_table_col = 'n'

# Name of the updated Word document - OPTIONAL (if your Word doc is outputfile.docx, output_doc = 'outputfile.docx').
# If left empty, the input Word document (report_doc) is overwritten on save.
output_doc = ''

######################
# Excel Input Data
######################

# Are all your data tables in a single Excel workbook (1 table per worksheet)? Enter 'y' for yes and 'n' for no.
# If you are pulling data from multiple workbooks, populate that information in the crosswalk file.
# NOTE(review): this flag is not read by any code visible in this notebook.
single_workbook = 'y'

# Excel document name (if your Excel doc is rawdata.xlsx, excel_doc = 'rawdata.xlsx')
excel_doc = 'rawdata.xlsx'

######################
# Crosswalk File
######################

# Crosswalk Excel document name (if your Excel doc is crosswalk.xlsx, crosswalk_doc = 'crosswalk.xlsx').
# The crosswalk maps each Word caption ('word_table_title') to its worksheet,
# workbook and overwrite flags.
crosswalk_doc = 'crosswalk.xlsx' 

In [225]:
########################################################################
# Load files
########################################################################

# 1. Load the crosswalk (one row per Word exhibit) into a pandas DataFrame.
crosswalk_data = pd.read_excel(crosswalk_doc)

# 2. Load every worksheet of the default workbook into a dict of DataFrames
#    keyed by sheet name. header=None keeps the spreadsheet's header row as
#    data row 0, so DataFrame positions line up with Word table positions.
#    Note: findtables() re-reads each exhibit's worksheet itself from the
#    workbook named in the crosswalk; this dict is only used for the preview.
tables_data = pd.read_excel(excel_doc, sheet_name=None, header=None)

# Preview the first worksheet, whatever it is called. (The previous version
# hard-coded the sheet name 'Table 1', which raises KeyError for any workbook
# without that sheet, and built a Styler whose result was discarded.)
if tables_data:
    first_sheet_name = next(iter(tables_data))
    print(tables_data[first_sheet_name])

# 3. Load the Word document into a python-docx Document object.
document = Document(report_doc)

# 4. When no output name was configured, save over the input Word file.
print(report_doc)
if not output_doc:
    output_doc = report_doc

                                   0            1                2      3
0                           Category  Participant  Pre-Participant  Total
1                      Google/Online        0.028            0.012  0.026
2                            Faculty        0.603            0.612  0.604
3                     Outreach Event        0.095            0.106  0.097
4                      Word-of-Mouth        0.127            0.082   0.12
5                     Job Fair/Email        0.004                0  0.004
6  Advisor/Class/J-Standard Bootcamp        0.006                0  0.006
7                              Other        0.037            0.082  0.044
Reportshell.docx


In [226]:
########################################################################
# Populate Table Data in Word
########################################################################

# Loop through the paragraphs & table pairs in the Word document
#citation: source: https://theprogrammingexpert.com/write-table-fast-python-docx/
def iter_block_items(parent):
    """Yield the paragraphs and tables directly inside ``parent``, in document order.

    Parameters
    ----------
    parent : docx.document.Document or docx.table._Cell
        The container whose direct children are walked.

    Yields
    ------
    docx.text.paragraph.Paragraph or docx.table.Table
        One wrapper object per child paragraph / table element. Children of
        any other element type are skipped.

    Raises
    ------
    ValueError
        If ``parent`` is neither a Document nor a table cell.
    """
    # Pick the XML element whose children are the block-level items.
    if isinstance(parent, _Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError(
            "parent must be a docx Document or a table _Cell, "
            f"got {type(parent).__name__}"
        )

    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)

def findtables(document, crosswalk_data):
    """Populate each captioned table of ``document`` using its crosswalk entry.

    The document is walked in order. A paragraph whose style name equals the
    module-level ``caption_style`` is treated as an exhibit caption; its text
    is looked up in the crosswalk column ``'word_table_title'``. The first
    table that follows a matched caption is filled by ``tablepopulate``.

    The crosswalk is read by column position for the matched row:
    column 1 = worksheet name, column 2 = workbook file, column 3 = overwrite
    first row flag, column 4 = overwrite first column flag.

    Parameters
    ----------
    document : docx.document.Document
        The Word document whose tables are populated in place.
    crosswalk_data : pandas.DataFrame
        Crosswalk with a ``'word_table_title'`` column and the positional
        columns described above.

    Notes
    -----
    * A table that is not preceded by a matched caption is left untouched.
    * Each caption populates only the next table; later tables need their own
      caption, so unrelated tables are never filled with stale data.
    * A caption missing from the crosswalk is reported and skipped.
    """
    # (data, overwrite_row, overwrite_col) for the latest matched caption that
    # is still waiting for its table; None when no table should be populated.
    pending = None

    # Iterate through paragraphs and table objects in document order
    for block in iter_block_items(document):

        if isinstance(block, Paragraph):

            # Only paragraphs in the caption style identify an exhibit
            if block.style.name != caption_style:
                continue

            exhibit_name = block.text
            print(exhibit_name)

            # Look the caption up once instead of re-filtering per field
            matches = crosswalk_data.loc[crosswalk_data['word_table_title'] == exhibit_name]
            if matches.empty:
                print(f"'{exhibit_name}' was not found in the crosswalk column "
                      "'word_table_title'; the table that follows is left unchanged. "
                      "Please double-check the crosswalk for typos.")
                pending = None
                continue

            # Excel workbook and worksheet (positional; see docstring)
            excel_sheet_name = matches.iloc[0, 1]
            workbook_name = matches.iloc[0, 2]
            tables_data = pd.read_excel(workbook_name, sheet_name=excel_sheet_name, header=None)

            # Overwrite settings for this exhibit
            overwrite_table_row = matches.iloc[0, 3]
            overwrite_table_col = matches.iloc[0, 4]

            pending = (tables_data, overwrite_table_row, overwrite_table_col)

        elif isinstance(block, Table):

            # No matched caption is waiting for a table: nothing to fill in
            if pending is None:
                continue

            tables_data, overwrite_table_row, overwrite_table_col = pending
            tablepopulate(block, tables_data, overwrite_table_row, overwrite_table_col)

            # The caption has been consumed by its table
            pending = None

def _keeps_shell_header(flag):
    """Return True when an overwrite flag means "no" (case/whitespace tolerant)."""
    return str(flag).strip().lower() in ('n', 'no')


def tablepopulate(block, tables_data, overwrite_table_row, overwrite_table_col):
    """Copy the values of a DataFrame into the same-position cells of a Word table.

    Parameters
    ----------
    block : docx.table.Table
        Table to fill in place. Only ``rows``, ``columns`` and ``cell(i, j)``
        are used.
    tables_data : pandas.DataFrame
        Data to write; value at position (i, j) goes to table cell (i, j).
    overwrite_table_row : str
        ``'n'`` keeps the first row already present in the Word table (data
        row 0 is skipped); any other value overwrites it.
    overwrite_table_col : str
        ``'n'`` keeps the first column already present in the Word table (data
        column 0 is skipped); any other value overwrites it.

    Raises
    ------
    IndexError
        If the data has more rows or columns than the Word table. The check
        runs before any cell is modified, so the table is never left
        half-filled or written into the wrong cells.
    """
    # Set the initial row for iteration in the dataframe (default = 0)
    initial_row = 1 if _keeps_shell_header(overwrite_table_row) else 0
    print(f"initial_row {initial_row}")

    # Set the initial col for iteration in the dataframe (default = 0)
    initial_col = 1 if _keeps_shell_header(overwrite_table_col) else 0
    print(f"initial_col {initial_col}")

    n_rows, n_cols = tables_data.shape

    # Nothing to write (e.g. a single-column frame whose only column is skipped)
    if initial_row >= n_rows or initial_col >= n_cols:
        return

    # Validate the shape up front so a mismatch is reported clearly
    table_rows, table_cols = len(block.rows), len(block.columns)
    if n_rows > table_rows or n_cols > table_cols:
        raise IndexError(
            f"Excel data is {n_rows} x {n_cols} but the Word table is only "
            f"{table_rows} x {table_cols}; resize the table shell or the worksheet."
        )

    # Materialise the values once rather than once per cell
    values = tables_data.values

    for i in range(initial_row, n_rows):
        for j in range(initial_col, n_cols):
            value = values[i, j]
            # Empty Excel cells arrive as NaN; write a blank instead of "nan"
            block.cell(i, j).text = '' if pd.isna(value) else str(value)

    # TODO: apply table_style / col_heading_style / row_heading_style to the
    # populated table; styling is not implemented yet.


# Walk the loaded Word document and fill each captioned table from the
# worksheet that the crosswalk maps its caption to.
findtables(document, crosswalk_data)

# Save the populated Word document. output_doc equals report_doc when no
# separate output name was configured, in which case the input file is overwritten.
document.save(output_doc)

Exhibit 1. Caption
initial_row 1
initial_col 1
i 1
j 1
i 1
j 2
i 1
j 3
i 2
j 1
i 2
j 2
i 2
j 3
i 3
j 1
i 3
j 2
i 3
j 3
i 4
j 1
i 4
j 2
i 4
j 3
i 5
j 1
i 5
j 2
i 5
j 3
i 6
j 1
i 6
j 2
i 6
j 3
i 7
j 1
i 7
j 2
i 7
j 3
Exhibit 2. Caption
initial_row 1
initial_col 1
i 1
j 1
i 1
j 2
i 2
j 1
i 2
j 2
i 3
j 1
i 3
j 2
