In [1]:
import pdfplumber
from PyPDF2 import PdfWriter
import fitz  # PyMuPDF


def is_inside_bbox(small_bbox, big_bbox):
    """Return True when ``small_bbox`` lies entirely within ``big_bbox``.

    Both boxes are indexed positionally: items 0 and 1 are compared as
    lower bounds, items 2 and 3 as upper bounds. Coinciding edges count
    as inside.
    """
    return all(small_bbox[i] >= big_bbox[i] for i in (0, 1)) and all(
        small_bbox[i] <= big_bbox[i] for i in (2, 3)
    )


def get_table_bboxes(page) -> list:
    """Collect the ``bbox`` of every table reported by ``page.find_tables()``."""
    table_bboxes = []
    for found_table in page.find_tables():
        table_bboxes.append(found_table.bbox)
    return table_bboxes


def get_diagram_bboxes(page) -> list:
    """Return one ``(x0, top, x1, bottom)`` tuple per drawing object on the page.

    Every entry of ``page.rects``, ``page.lines`` and ``page.curves`` (in
    that order) is treated as part of a diagram.
    """
    return [
        (shape['x0'], shape['top'], shape['x1'], shape['bottom'])
        for shape_group in (page.rects, page.lines, page.curves)
        for shape in shape_group
    ]

def extract_filtered_words_from_page(page, double_column=False):
    """Extract the page's words, skipping those inside a table or diagram box.

    Args:
        page: Page object providing ``bbox``, ``width``, ``height`` and
            ``within_bbox(...).extract_words(...)``.
        double_column: When truthy, the page is processed as a left half
            followed by a right half (split at ``page.width / 2``);
            otherwise the whole ``page.bbox`` is processed at once.

    Returns:
        The word dicts (extracted with the extra ``size`` and ``fontname``
        attributes) whose ``(x0, top, x1, bottom)`` box is not fully inside
        any table or diagram bounding box, in extraction order.
    """
    exclusion_bboxes = list(get_table_bboxes(page=page))
    exclusion_bboxes.extend(get_diagram_bboxes(page=page))

    if not double_column:
        selection_bboxes = [page.bbox]
    else:
        selection_bboxes = [
            (0, 0, page.width / 2, page.height),
            (page.width / 2, 0, page.width, page.height),
        ]

    kept_words = []
    for column_bbox in selection_bboxes:
        column_page = page.within_bbox(column_bbox)
        for word in column_page.extract_words(extra_attrs=['size', 'fontname']):
            word_bbox = (word['x0'], word['top'], word['x1'], word['bottom'])
            for excluded_bbox in exclusion_bboxes:
                if is_inside_bbox(word_bbox, excluded_bbox):
                    # Word sits inside an excluded region: drop it.
                    break
            else:
                kept_words.append(word)

    return kept_words


def get_exclusion_bboxes(pdf_path: str) -> dict[int, list]:
    """Map each zero-based page index of the PDF to its exclusion boxes.

    The list for a page holds the table bounding boxes followed by the
    diagram bounding boxes found on that page.
    """
    with pdfplumber.open(pdf_path) as pdf:
        return {
            index: [
                *get_table_bboxes(page=pdf_page),
                *get_diagram_bboxes(page=pdf_page),
            ]
            for index, pdf_page in enumerate(pdf.pages)
        }


def remove_tables_and_diagrams(pdf_path, output_path):
    """Write a copy of the PDF with table and diagram regions blanked out.

    For every page, each word whose rectangle intersects an exclusion
    region is redacted with a white fill, and each exclusion region is
    then painted over with a white rectangle.

    The exclusion regions come from ``get_exclusion_bboxes`` and are passed
    straight to ``fitz.Rect``.
    NOTE(review): this assumes the boxes are already expressed in the
    coordinate system PyMuPDF uses for the page — confirm for rotated or
    cropped pages.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path the cleaned PDF is saved to.
    """
    bboxes_to_exclude = get_exclusion_bboxes(pdf_path)

    with fitz.open(pdf_path) as doc:
        for page_index, page_bboxes in bboxes_to_exclude.items():
            if not page_bboxes:
                continue  # nothing to remove on this page

            page = doc.load_page(page_index)
            exclusion_rects = [fitz.Rect(bbox) for bbox in page_bboxes]

            # Extract the words once per page; repeating the extraction for
            # every exclusion region is redundant work on pages with many
            # rects/lines/curves.
            word_rects = [fitz.Rect(word[:4]) for word in page.get_text("words")]
            words_to_redact = [
                word_rect
                for word_rect in word_rects
                if any(rect.intersects(word_rect) for rect in exclusion_rects)
            ]
            for word_rect in words_to_redact:
                page.add_redact_annot(word_rect, fill=(1, 1, 1))
            if words_to_redact:
                # A single redaction pass per page handles all marked words.
                page.apply_redactions()

            # Paint the covers only after redactions are applied, so the
            # redaction pass never runs over an already-painted cover.
            for rect in exclusion_rects:
                page.draw_rect(rect, color=(1, 1, 1), fill=(1, 1, 1))

        doc.save(output_path, garbage=4)
        

# Run the clean-up on the datasheet and write the stripped copy to /tmp.
input_pdf, output_pdf = "input/digital-thermometer-ds18b20.pdf", "/tmp/output.pdf"
remove_tables_and_diagrams(pdf_path=input_pdf, output_path=output_pdf)

In [2]:
import pymupdf4llm
# Convert the PDF at /tmp/output.pdf to Markdown; the result is kept in
# `md_text` for the cells below.
md_text = pymupdf4llm.to_markdown("/tmp/output.pdf")

In [3]:
import os

# Save the extracted Markdown under ../outputs.
filename = 'digital-thermometer'
# Explicit UTF-8: the extracted text contains non-ASCII characters (e.g.
# curly quotes), so the platform's default encoding must not be relied on.
with open(
    os.path.join('..', 'outputs', f'{filename}-pdfplumber-overwrite-tables.md'),
    'w',
    encoding='utf-8',
) as file:
    file.write(md_text)

In [4]:
# Show the extracted Markdown inline for a quick visual check.
print(md_text)

_[Click here](https://www.maximintegrated.com/en/storefront/storefront.html)_ _for production status of specific part numbers._

# DS18B20 Programmable Resolution 1-Wire Digital Thermometer


**General Description**
The DS18B20 digital thermometer provides 9-bit to
12-bit Celsius temperature measurements and has an
alarm function with nonvolatile user-programmable upper
and lower trigger points. The DS18B20 communicates
over a 1-Wire bus that by definition requires only one
data line (and ground) for communication with a central
microprocessor. In addition, the DS18B20 can derive
power directly from the data line (“parasite power”),
eliminating the need for an external power supply.

Each DS18B20 has a unique 64-bit serial code, which
allows multiple DS18B20s to function on the same 1-Wire
bus. Thus, it is simple to use one microprocessor to
control many DS18B20s distributed over a large area.
Applications that can benefit from this feature include
HVAC environmental controls, temperat