## VENRON-Electricity Dataset preprocessing for Fonduer

This script is used to pre-process the spreadsheets in order to apply the cell annotations from the prediction json files or the corresponding manually labeled annotation range sheet.

In [None]:
import json
import os
import shutil

import pandas as pd

In [None]:
# First create xlsx output folder
# Run in terminal
# ! libreoffice --headless --calc --convert-to xlsx --outdir src/data/gold/xlsx src/data/gold/spreadsheet/*

## Cell annotations based on predictions

We apply the JSON predictions to the gold data set of 114 spreadsheets (the real annotations, not the gold/manually labeled ones).

In [None]:
from openpyxl import load_workbook
from openpyxl.cell.cell import Cell
from openpyxl.styles import Border, Side, PatternFill, Font, GradientFill, Alignment

# Folders used by this cell, all resolved against the current working
# directory: the converted gold workbooks, the JSON prediction files and the
# destination for the annotated workbooks.
xlsx_path, json_path, pred_out_path = (
    f"{os.getcwd()}/data/{sub_dir}"
    for sub_dir in (
        "gold/xlsx",
        "gold_pred_annotated/json_predictions",
        "gold_pred_annotated/spreadsheet",
    )
)

# One entry per prediction file found in json_path.
files_json = os.listdir(json_path)

# Font applied to a cell for each predicted label; labels that share a colour
# code end up with equal (but separately constructed) Font objects.
pred_styles = {
    label: Font(color=color_code)
    for color_code, label_group in (
        ("000001", ("data", "derived")),
        ("000002", ("header", "attributes")),
        ("000003", ("metadata",)),
        ("000004", ("notes",)),
    )
    for label in label_group
}


def excel_wb(f):
    """Load and return the workbook named ``f`` from the ``xlsx_path`` folder."""
    return load_workbook(filename=f"{xlsx_path}/{f}")

# Zero-based column index -> Excel column letters (no upper limit on width).
def int_to_char(i):
    """Return the Excel column letters for the zero-based column index ``i``.

    Excel columns use a bijective base-26 numbering: 0 -> "A", 25 -> "Z",
    26 -> "AA", 51 -> "AZ", 52 -> "BA", 701 -> "ZZ", 702 -> "AAA".

    Raises:
        ValueError: if ``i`` is negative.
    """
    if i < 0:
        raise ValueError(f"column index must be non-negative, got {i}")

    letters = ""
    remaining = i + 1  # switch to the 1-based numbering Excel uses
    while remaining > 0:
        # Subtracting 1 before dividing is what makes the numbering bijective
        # (there is no "zero" digit: after "Z" comes "AA", not "BA").
        remaining, digit = divmod(remaining - 1, 26)
        letters = chr(65 + digit) + letters
    return letters

# Tag every gold workbook with its predicted cell labels.
#
# For each prediction file in json_path, the workbook of the same name is
# loaded from xlsx_path. Each top-level key of the JSON document is a sheet
# name whose value holds three grids indexed [row][column]: 'text', 'labels'
# and 'labels_probs'. Every non-empty cell that currently carries a font
# colour gets the font of its predicted label (pred_styles) when the
# prediction is confident enough, and a plain font otherwise. The result is
# written to pred_out_path. Uses pred_styles, xlsx_path, json_path and
# pred_out_path as defined earlier in this cell.

# Predictions at or below this probability are not trusted.
CONFIDENCE_THRESHOLD = 0.6

for json_file in files_json:
    # Skip stray directory entries (e.g. checkpoint folders) instead of
    # failing on them; only "<name>.json" files are prediction files.
    if not json_file.lower().endswith(".json"):
        continue
    fname = json_file[0:-5]  # drop the ".json" suffix

    # The context manager guarantees the handle is closed again.
    with open(f"{json_path}/{json_file}", "r") as f:
        parsed = json.load(f)

    try:
        wb = excel_wb(f"{fname}.xlsx")

        # For each worksheet in the spreadsheet
        for sheet_name, sheet_values in parsed.items():
            # load annotations
            text = sheet_values['text']
            labels = sheet_values['labels']
            labels_probs = sheet_values['labels_probs']

            # load sheet
            ws = wb[sheet_name]

            # Override the cell-style tags
            for i, row in enumerate(text):
                for j, cell in enumerate(row):
                    # Address the cell by 1-based row/column numbers so that
                    # sheets of any width are handled.
                    ws_cell = ws.cell(row=i + 1, column=j + 1)
                    if cell != "" and ws_cell.font.color is not None:
                        if labels_probs[i][j] > CONFIDENCE_THRESHOLD:
                            ws_cell.font = pred_styles[labels[i][j]]
                        else:
                            # Low confidence: reset to a plain font, which
                            # carries no colour tag.
                            ws_cell.font = Font()

        # save the spreadsheet with annotated worksheets
        wb.save(f"{pred_out_path}/{fname}.xlsx")
    except Exception as err:
        # Just copy the file if errors occur (openpyxl min value issue).
        # The source file is copied rather than re-saving `wb`, because `wb`
        # may be half-annotated or still refer to the previous spreadsheet.
        print(f"FAILED to read {fname}.xlsx")
        print(f"  reason: {err!r}")
        try:
            shutil.copyfile(
                f"{xlsx_path}/{fname}.xlsx",
                f"{pred_out_path}/{fname}.xlsx",
            )
        except OSError as copy_err:
            print(f"  could not copy the original either: {copy_err!r}")

We also apply the json predictions to the full data set of 687 spreadsheets

In [None]:
# First create xlsx output folder
# Run in terminal for each batch (batches are needed to avoid libreoffice failure)
# ! libreoffice --headless --calc --convert-to xlsx --outdir src/data/full/xlsx src/data/full/spreadsheet/batch_X*

In [None]:
from openpyxl import load_workbook
from openpyxl.cell.cell import Cell
from openpyxl.styles import Border, Side, PatternFill, Font, GradientFill, Alignment

# Folders used by this cell, all resolved against the current working
# directory: the converted full-data-set workbooks, the JSON prediction files
# and the destination for the annotated workbooks.
xlsx_path, json_path, pred_out_path = (
    f"{os.getcwd()}/data/{sub_dir}"
    for sub_dir in (
        "full/xlsx",
        "full_pred_annotated/json_predictions",
        "full_pred_annotated/spreadsheet",
    )
)

# One entry per prediction file found in json_path.
files_json = os.listdir(json_path)

# Font applied to a cell for each predicted label; labels that share a colour
# code end up with equal (but separately constructed) Font objects.
pred_styles = {
    label: Font(color=color_code)
    for color_code, label_group in (
        ("000001", ("data", "derived")),
        ("000002", ("header", "attributes")),
        ("000003", ("metadata",)),
        ("000004", ("notes",)),
    )
    for label in label_group
}


def excel_wb(f):
    """Load and return the workbook named ``f`` from the ``xlsx_path`` folder."""
    return load_workbook(filename=f"{xlsx_path}/{f}")

# Zero-based column index -> Excel column letters (no upper limit on width).
def int_to_char(i):
    """Return the Excel column letters for the zero-based column index ``i``.

    Excel columns use a bijective base-26 numbering: 0 -> "A", 25 -> "Z",
    26 -> "AA", 51 -> "AZ", 52 -> "BA", 701 -> "ZZ", 702 -> "AAA".

    Raises:
        ValueError: if ``i`` is negative.
    """
    if i < 0:
        raise ValueError(f"column index must be non-negative, got {i}")

    letters = ""
    remaining = i + 1  # switch to the 1-based numbering Excel uses
    while remaining > 0:
        # Subtracting 1 before dividing is what makes the numbering bijective
        # (there is no "zero" digit: after "Z" comes "AA", not "BA").
        remaining, digit = divmod(remaining - 1, 26)
        letters = chr(65 + digit) + letters
    return letters

# Tag every workbook of the full data set with its predicted cell labels.
#
# For each prediction file in json_path, the workbook of the same name is
# loaded from xlsx_path. Each top-level key of the JSON document is a sheet
# name whose value holds three grids indexed [row][column]: 'text', 'labels'
# and 'labels_probs'. Every non-empty cell that currently carries a font
# colour gets the font of its predicted label (pred_styles) when the
# prediction is confident enough, and a plain font otherwise. The result is
# written to pred_out_path. Uses pred_styles, xlsx_path, json_path and
# pred_out_path as defined earlier in this cell.

# Predictions at or below this probability are not trusted.
CONFIDENCE_THRESHOLD = 0.6

for json_file in files_json:
    # Skip stray directory entries (e.g. checkpoint folders) instead of
    # failing on them; only "<name>.json" files are prediction files.
    if not json_file.lower().endswith(".json"):
        continue
    fname = json_file[0:-5]  # drop the ".json" suffix

    # The context manager guarantees the handle is closed again.
    with open(f"{json_path}/{json_file}", "r") as f:
        parsed = json.load(f)

    try:
        wb = excel_wb(f"{fname}.xlsx")

        # For each worksheet in the spreadsheet
        for sheet_name, sheet_values in parsed.items():
            # load annotations
            text = sheet_values['text']
            labels = sheet_values['labels']
            labels_probs = sheet_values['labels_probs']

            # load sheet
            ws = wb[sheet_name]

            # Override the cell-style tags
            for i, row in enumerate(text):
                for j, cell in enumerate(row):
                    # Address the cell by 1-based row/column numbers so that
                    # sheets of any width are handled.
                    ws_cell = ws.cell(row=i + 1, column=j + 1)
                    if cell != "" and ws_cell.font.color is not None:
                        if labels_probs[i][j] > CONFIDENCE_THRESHOLD:
                            ws_cell.font = pred_styles[labels[i][j]]
                        else:
                            # Low confidence: reset to a plain font, which
                            # carries no colour tag.
                            ws_cell.font = Font()

        # save the spreadsheet with annotated worksheets
        wb.save(f"{pred_out_path}/{fname}.xlsx")
    except Exception as err:
        # Just copy the file if errors occur (openpyxl min value issue).
        # The source file is copied rather than re-saving `wb`, because `wb`
        # may be half-annotated or still refer to the previous spreadsheet.
        print(f"FAILED to read {fname}.xlsx")
        print(f"  reason: {err!r}")
        try:
            shutil.copyfile(
                f"{xlsx_path}/{fname}.xlsx",
                f"{pred_out_path}/{fname}.xlsx",
            )
        except OSError as copy_err:
            print(f"  could not copy the original either: {copy_err!r}")

## HTML modifications

Fonduer makes it difficult to handle image names and document names in the same way as spans.
In order to avoid rewriting all featurizers, we simply construct the HTML files with new spans for the image and document names.

In [46]:
import bs4

def extend_html_file(html_path, file_name):
    """Rewrite ``html_path/file_name`` in place with extra text elements.

    A ``<div>`` reading "Document name: <file_name>" is inserted as the first
    child of ``<body>``, and for every ``<img>`` a new element holding the
    image's ``src`` value is inserted into the image's parent: a ``<div>`` at
    position -1 when the parent is ``<body>``, otherwise a ``<span>`` at
    position 0.

    Calling this again on the same file inserts the elements again.

    NOTE(review): no parser is passed to BeautifulSoup and no encoding to
    ``open``, so both depend on the environment -- consider pinning them.
    """
    target = f"{html_path}/{file_name}"

    # Parse the current contents of the file.
    with open(target) as source:
        soup = bs4.BeautifulSoup(source.read())

    # Document name goes to the very top of the body.
    name_div = soup.new_tag("div")
    name_div.string = f"Document name: {file_name}"
    soup.body.insert(0, name_div)

    # One extra text element per image, carrying the image url.
    for img in soup.find_all("img"):
        url = img["src"]
        holder = img.parent
        directly_in_body = holder.name == "body"
        label = soup.new_tag("div" if directly_in_body else "span")
        label.string = url
        holder.insert(-1 if directly_in_body else 0, label)

    # Overwrite the original file with the extended markup.
    with open(target, "w") as sink:
        sink.write(str(soup))

In [47]:
import os
from html.parser import HTMLParser

# Extend, in place, every file whose name ends in "html" inside the html
# folder of each data set listed below.
paths = ["gold", "full", "gold_pred_annotated", "full_pred_annotated"]

for p in paths:
    html_path = f"{os.getcwd()}/data/{p}/html"
    files_html = [
        entry
        for entry in os.listdir(html_path)
        if entry.endswith("html")
    ]
    for file_html in files_html:
        extend_html_file(html_path, file_html)
    

## Playground

Experimenting