In [1]:
file = 'input/digital-thermometer-ds18b20.pdf'
FILENAME = 'digital-thermometer'

In [2]:
import fitz  # PyMuPDF
import numpy as np
import pandas as pd
import pdfplumber

def convert_bbox_plumber_to_fitz(bbox, page_height):
    """Convert pdfplumber bbox to fitz bbox."""
    x0, y0, x1, y1 = bbox
    return (x0, page_height - y1, x1, page_height - y0)

def get_table_bboxes(page) -> list:
    """Extract bboxes of tables in PdfPlumber page."""
    return [
        convert_bbox_plumber_to_fitz(table.bbox, page_height=page.height)
        for table in page.find_tables()
    ] 

def get_diagram_bboxes(page) -> list:
    """Extract bboxes of diagrams/images in PdfPlumber page."""
    diagram_bboxes = []
    for objects in [page.rects, page.lines, page.curves]:
        for obj in objects: 
            diagram_bboxes.append(
                convert_bbox_plumber_to_fitz(
                    (obj['x0'], obj['top'], obj['x1'], obj['bottom']),
                    page_height=page.height
                )
            )
    return diagram_bboxes

def get_bboxes_to_skip(pdf_file: str) -> dict:
    """Get bboxes to be skipped in pdfplumber page (images, diagrams and tables)."""
    bboxes = {}  
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            bboxes[page.page_number] = [*get_table_bboxes(page), *get_diagram_bboxes(page)]
    return bboxes

def bbox_inside_bboxes_to_skip(bbox, bboxes_to_skip):
    """Check if bbox is inside any of the bboxes to be skipped."""
    if bboxes_to_skip is None:
        return False
    x0, y0, x1, y1 = bbox
    for skip_bbox in bboxes_to_skip:
        skip_x0, skip_y0, skip_x1, skip_y1 = skip_bbox
        if (x0 >= skip_x0 and y0 >= skip_y0 and x1 <= skip_x1 and y1 <= skip_y1):
            return True
    return False

def extract_formatted_text(pdf_path, bboxes_to_skip=None):
    """Load the PDF from the pdf_path extracting from it the words outside tables and diagrams."""
    doc = fitz.open(pdf_path)
    formatted_text = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        blocks = page.get_text("dict")["blocks"]
        
        for block in blocks:
            if "lines" in block:
                for line in block["lines"]:
                    for span in line["spans"]:
                        if not bbox_inside_bboxes_to_skip(bbox=span['bbox'], bboxes_to_skip=bboxes_to_skip[page_num + 1]):
                            formatted_text.append({
                                "page": page_num + 1,
                                "text": span["text"],
                                "font": span["font"],
                                "size": span["size"],
                                "color": span["color"],
                                "bbox": span["bbox"],
                            })

    return pd.DataFrame.from_records(formatted_text)


def normalize_document_font_size(df: pd.DataFrame) -> pd.Series:
    """Consider the most frequent font size document to be the default.
    
    Words with inferir font size are normalized to have the same font size as the defualt.
    Words with greater font size are considered to be headers of sections.
    """
    font_sizes = df.copy()
    font_sizes = df['size'].value_counts().to_frame().reset_index()
    
    most_frequent_font = font_sizes.loc[0, 'size']
    normalized_font_size = df['size'].copy()
    normalized_font_size.loc[df['size'] < most_frequent_font] = most_frequent_font
    return normalized_font_size
    
    
def get_heading_font_map(df: pd.DataFrame, font_column: str) -> dict[float, str]:
    """
    Produce a header font mapping from the distinct font sizes in the document (pandas dataframe with words)

    Example
    -------
    [10, 11, 12, 14] -> {10: '', 14: '#', 12: '##', 11: '###'}
    """
    from collections import deque
    # TODO: tratar caso em que temos mais tamanhos distintos de fontes do que tipos de headings do amrkdown (~6 níveis)
    ordered_font_sizes = np.sort(df[font_column].unique()).tolist()[::-1]
    queue = deque(ordered_font_sizes)
    queue.appendleft(queue.pop())
    
    heading_font_map = {}
    for heading_level, font_size in enumerate(queue):
        heading_font_map[font_size] = "#" * heading_level
    
    return heading_font_map


bboxes_to_skip = get_bboxes_to_skip(file)
text = extract_formatted_text(file, bboxes_to_skip=bboxes_to_skip)
text['size'] = text['size'].round(1)
text['normalized_font_size'] = normalize_document_font_size(df=text)

font_mapping = get_heading_font_map(df=text, font_column='normalized_font_size')

text

Unnamed: 0,page,text,font,size,color,bbox,normalized_font_size
0,1,General Description,Arial-BoldMT,12.5,48317,"(54.0, 150.78436279296875, 172.78550720214844,...",12.5
1,1,The DS18B20 digital thermometer provides 9-bit...,ArialMT,9.5,0,"(54.0, 167.16021728515625, 297.86151123046875,...",9.5
2,1,12-bit Celsius temperature measurements and ha...,ArialMT,9.5,0,"(54.0, 178.66473388671875, 297.8301696777344, ...",9.5
3,1,alarm function with nonvolatile user-programma...,ArialMT,9.5,0,"(54.0, 190.16925048828125, 297.8301696777344, ...",9.5
4,1,and lower trigger points. The DS18B20 communic...,ArialMT,9.5,0,"(54.0, 201.67376708984375, 297.82354736328125,...",9.5
...,...,...,...,...,...,...,...
1646,20,"© 2019 Maxim Integrated Products, Inc.",ArialMT,8.0,0,"(392.4710998535156, 752.971923828125, 540.8888...",9.5
1647,20,│,ArialMT,6.9,0,"(541.1016235351562, 753.971923828125, 544.8516...",9.5
1648,20,,Arial-BoldMT,8.5,0,"(544.8516235351562, 753.519287109375, 547.2131...",9.5
1649,20,20,ArialMT,8.0,0,"(547.2131958007812, 753.971923828125, 557.9988...",9.5


In [3]:
text['normalized_font_size'].value_counts()

normalized_font_size
9.5     1546
14.0      60
12.5      31
11.0      11
12.0       3
Name: count, dtype: int64

In [4]:
def format_dataframe_into_text(df: pd.DataFrame, heading_map: dict):
    final = [] 
    text_buffer = []
    previous_font_size = None
    
    for idx, row in df.iterrows():
        current_font_size = row['normalized_font_size']
        if current_font_size != previous_font_size:
            final.append(f"\n\n{heading_map[current_font_size]}")
            previous_font_size = current_font_size
            current_font_size = row['size']

        final.append(row['text'])
            
    return ' '.join(final)

final = format_dataframe_into_text(df=text, heading_map=font_mapping)

In [5]:
import os

with open(os.path.join('..', 'outputs', f'{FILENAME}-fitz.md'), 'w') as file:
    file.write(final)

In [6]:
print(final)



## General Description 

 The DS18B20 digital thermometer provides 9-bit to  12-bit Celsius temperature measurements and has an  alarm function with nonvolatile user-programmable upper  and lower trigger points. The DS18B20 communicates  over a 1-Wire bus that by definition requires only one  data line (and ground) for communication with a central    microprocessor. In addition, the DS18B20 can derive  power directly from the data line (“parasite power”),    eliminating the need for an external power supply.  Each DS18B20 has a unique 64-bit serial code, which  allows multiple DS18B20s to function on the same 1-Wire  bus. Thus, it is simple to use one microprocessor to  control many DS18B20s distributed over a large area.  Applications that can benefit from this feature include  HVAC environmental controls, temperature monitoring  systems inside buildings, equipment, or machinery, and  process monitoring and control systems. 

## Applications 

 ● ● Thermostatic Controls ● ● Industri