In [None]:
import pandas as pd
import re

#### This notebook helps process the output generated by PyMuPDF, a PDF-to-text Python module. When converting PDF files, PyMuPDF can be used to automatically identify and label strings based on font size, font weight and the most used font. The generated output contains HTML-style tags such as `<h1>, <h2>, <p>, <s1>, <s2>`.

This notebook provides scripts and interactive widgets for:

- Loading the PDF file
- Processing the PDF file with PyMuPDF to extract headers, paragraphs and subscripts
- Inspecting the auto-generated HTML tags
    (select any tag and view random samples of paragraphs with the selected tag)
- Renaming the labels to a desired naming scheme

### Upload a PDF File:

In [None]:
from ipywidgets import FileUpload
from IPython.display import display
# File picker widget configured for a single file with the '.pdf' extension.
# The uploaded content is read from `upload` in the next cell, so a file must
# be chosen here before that cell is run.
upload = FileUpload(accept='.pdf', multiple=False)
print("\nPlease use the upload button to upload a PDF file:")
display(upload)

In [None]:
# Persist the uploaded PDF as "input.pdf" so PyMuPDF can open it by path.
if hasattr(upload, "data"):
    # ipywidgets 7.x: `data` is a list holding the raw bytes of each upload.
    pdf_bytes = upload.data[0] if upload.data else None
else:
    # ipywidgets 8.x: `data` was removed; `value` is a sequence of records
    # whose `content` is a memoryview over the file bytes.
    pdf_bytes = bytes(upload.value[0]["content"]) if upload.value else None

if pdf_bytes is None:
    # Fail with an actionable message instead of a bare IndexError.
    raise ValueError("No PDF has been uploaded yet - use the upload button above, then re-run this cell.")

with open("input.pdf", "wb") as pdf_file:
    pdf_file.write(pdf_bytes)

### Process PDF with PyMuPDF

In [None]:
from operator import itemgetter
import fitz
import json


def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.

    Every text span of every page is visited. Each span is reduced to an
    identifier string (its size only, or size/flags/font/color when
    ``granularity`` is set) and the number of spans per identifier is counted.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(identifier, count), (identifier, count)], dict
    :return: identifiers sorted by span count (most used first), and a dict
        mapping each identifier to the style of the last span seen with it
    :raises ValueError: if the document contains no text spans at all
    """
    styles = {}
    font_counts = {}

    for page in doc:
        # PyMuPDF renamed the camelCase ``getText`` to ``get_text`` and newer
        # releases no longer provide the old alias; prefer the new name and
        # fall back so older installations keep working.
        get_text = getattr(page, "get_text", None) or page.getText
        for b in get_text("dict")["blocks"]:  # iterate through the text blocks
            if b['type'] != 0:  # only type 0 blocks contain text
                continue
            for l in b["lines"]:  # iterate through the text lines
                for s in l["spans"]:  # iterate through the text spans
                    if granularity:
                        identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                        styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                              'color': s['color']}
                    else:
                        identifier = "{0}".format(s['size'])
                        styles[identifier] = {'size': s['size'], 'font': s['font']}

                    font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    # Stable sort: identifiers with equal counts keep first-seen order.
    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if not font_counts:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles


def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.

    The size of the most used style is treated as the paragraph size and gets
    ``<p>``. Larger sizes become ``<h1>``, ``<h2>``, ... from largest to
    smallest; smaller sizes become ``<s1>``, ``<s2>``, ... likewise.

    :param font_counts: (identifier, count) for all fonts occurring in document,
        most used first (as returned by ``fonts``)
    :type font_counts: list
    :param styles: all styles found in the document, keyed by identifier
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # Read each size from ``styles`` instead of parsing the identifier: with
    # granularity enabled the identifier is "size_flags_font_color", which
    # float() cannot parse. Fall back to parsing for identifiers that have no
    # style entry. A set removes repeated sizes so tag numbers stay consecutive.
    distinct_sizes = set()
    for identifier, _count in font_counts:
        style = styles.get(identifier)
        distinct_sizes.add(float(style['size'] if style is not None else identifier))

    # sorting the font sizes high to low, so that we can append the right integer to each tag
    font_sizes = sorted(distinct_sizes, reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0  # restart numbering for the sub-paragraph sizes
            size_tag[size] = '<p>'
        elif size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        else:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag


def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.

    Consecutive spans of the same font size inside one block are joined with a
    space into a single string that starts with the tag for that size. A change
    of font size closes the current string and starts a new one.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with pre-pended element tags; the list may also contain
        empty strings (a block without visible text, or a size change on the
        first span of a block)
    :raises KeyError: if a span has a size that is not a key of ``size_tag``
    """
    header_para = []  # list with headers and paragraphs
    first = True  # True until the first non-blank span has been seen
    previous_s = {}  # previous non-blank span

    for page in doc:
        # ``getText`` was renamed to ``get_text`` in PyMuPDF and the camelCase
        # alias is absent from newer releases; prefer the new name, fall back.
        get_text = getattr(page, "get_text", None) or page.getText
        for b in get_text("dict")["blocks"]:  # iterate through the text blocks
            if b['type'] != 0:  # only type 0 blocks contain text
                continue

            # REMEMBER: multiple fonts and sizes are possible IN one block
            block_string = ""  # text found in block
            for l in b["lines"]:  # iterate through the text lines
                for s in l["spans"]:  # iterate through the text spans
                    if not s['text'].strip():  # ignore whitespace-only spans
                        continue

                    if first:
                        first = False
                        block_string = size_tag[s['size']] + s['text']
                    elif s['size'] != previous_s['size']:
                        # size changed: flush what was collected, start afresh
                        header_para.append(block_string)
                        block_string = size_tag[s['size']] + s['text']
                    elif block_string == "" or all(c == "|" for c in block_string):
                        # new block has started (or only pipe markers were
                        # collected so far), so begin with the size tag
                        block_string = size_tag[s['size']] + s['text']
                    else:  # in the same block, so concatenate strings
                        block_string += " " + s['text']

                    previous_s = s

            header_para.append(block_string)

    return header_para

In [None]:
# Run the three extraction steps while the document is open, then close it so
# "input.pdf" is not left locked (a later re-upload overwrites that file).
with fitz.open("input.pdf") as doc:
    # Size-only identifiers: one style per distinct font size.
    font_counts, styles = fonts(doc, granularity=False)

    # Map every font size to <hN>, <p> or <sN>.
    size_tag = font_tags(font_counts, styles)

    # List of tagged strings, one per run of same-sized text.
    elements = headers_para(doc, size_tag)

In [None]:
# One row per extracted string. Naming the column at construction keeps the
# "section" column present even when `elements` is empty.
df = pd.DataFrame(elements, columns=["section"])
df

### Processing PyMuPDF output file

In [None]:
def extract_tag(s):
    """Return the first ``<...>`` tag found in ``s``, or ``""`` if there is none."""
    found = re.search(r"<[^>]+>", s)
    if found is None:
        return ""
    return found.group(0)

def extract_text(s):
    """Return ``s`` with its first ``<...>`` tag removed.

    Only the first occurrence is removed, so text that happens to repeat the
    tag later in the string is left intact. A string without any tag is
    returned unchanged instead of raising.
    """
    found = re.search(r"<[^>]+>", s)
    if found is None:
        return s
    return s[:found.start()] + s[found.end():]

In [None]:
# Split every extracted string into its tag ("label") and its text ("section").
df["label"] = df["section"].apply(extract_tag)

# Drop strings without a tag. The explicit copy makes the result an independent
# frame, so the column assignment below does not write into a slice of the
# previous frame (which triggers pandas' SettingWithCopyWarning).
df = df[df["label"] != ""].copy()
df["section"] = df["section"].apply(extract_text)

# Distinct tags in order of first appearance.
labels = df["label"].unique()

In [None]:
# Display the frame after tag extraction: "section" holds the text with its
# tag removed, "label" holds the tag.
df

### From here, we can take a look at what the extracted strings look like. The following output shows a list of all the unique HTML tags generated by PyMuPDF, the number of occurrences of each label, as well as a sample of extracted text for each label.

In [None]:
# One example row per label: the first row carrying each label, in order of
# first appearance. This replaces the row-by-row `DataFrame.append` loop,
# which no longer exists in pandas 2.x.
sample = df.drop_duplicates(subset="label", keep="first")[["section", "label"]]

# Number of rows per label.
label_count = df.groupby("label").size().reset_index(name="label_count")

sample = sample.merge(label_count, on="label")

sample

### PyMuPDF differentiates sections/paragraphs/headings/etc. within a PDF file based on their font, font size and font color. Therefore, unless all the PDF files we are using have exactly the same formatting, there is a good chance that PyMuPDF will label the heading types and section types differently for each PDF file.

#### We can use this interactive widget to have a quick look at how strings are labeled by PyMuPDF in the loaded PDF file, inspect the extraction quality, and decide how to map them manually.

In [None]:
import ipywidgets as widgets
from IPython.display import display, Markdown, clear_output

# Dropdown listing every detected label; the first label is preselected,
# so `labels` must not be empty here.
menu = widgets.Dropdown(
       options=labels,
       value=labels[0],
       description='Select Label:')
button = widgets.Button(description='show random string')
# Output area that receives whatever the click handler prints.
output = widgets.Output()

def on_button_clicked(b):
    """Print one randomly chosen section that carries the selected label.

    :param b: the clicked button, passed in by ipywidgets (unused)
    """
    with output:
        # Remove the previously shown sample before printing the next one.
        clear_output()
        selected_label = menu.value
        print(df[df["label"] == selected_label].sample(n=1)["section"].values)
        
button.on_click(on_button_clicked)

print("\n\nTake a look at some sample labeled string extracted from the PDF file by PyMuPDF:")

# Last expression of the cell: renders the dropdown, button and output stacked.
widgets.VBox([menu, button,output])

### Now we can manually map the generated tags to our desired labeling standard:

The inputs will be stored in a dict object

In [None]:
# One editable text box per detected label, pre-filled with the original tag.
# The list keeps the order of `labels`, which the mapping cell relies on.
label_input_list = [
    widgets.Text(description="Rename:" + str(lbl), value=lbl)
    for lbl in labels
]

box = widgets.VBox(label_input_list)

print("Original PyMuPDF Generated Labels:")
print(labels)
print("\nPlease rename the labels below as needed (remember to follow the original orders above):")
print("It is OK to rename multiple labels into the same name")
box

### Confirm that our user-defined label mapping is correctly recorded:

In [None]:
# Current contents of the rename text boxes, in the same order as `labels`.
user_labels = [text_box.value for text_box in label_input_list]

# Pair each PyMuPDF generated label (key) with the user defined label typed
# into the text box at the same position (value).
user_label_mappings = dict(zip(labels, user_labels))

print("User defined label mappings:\n")
print("{Original: User Defined}")
user_label_mappings

### (optional) Download CSV file for manual corrections if needed:

In [None]:
def remap(s):
    """Translate a PyMuPDF generated label into its user defined name."""
    return user_label_mappings[s]


# New frame with the PyMuPDF generated labels replaced; `df` itself is untouched.
df_renamed = df.assign(label=df["label"].apply(remap))

df_renamed.to_csv("processed.csv")

### One-hot encoding:

In [None]:
def remap(s):
    """Translate a PyMuPDF generated label into its user defined name."""
    return user_label_mappings[s]


# Rebuilt here as well, so this cell works when the optional CSV cell is skipped.
df_renamed = df.assign(label=df["label"].apply(remap))

# One indicator column per user defined label; the empty prefix and separator
# make each column name exactly the label name.
transformed = pd.get_dummies(df_renamed, columns=["label"], prefix="", prefix_sep="")
transformed