In [1]:
# Path of the PDF that the extraction cells below open with pdfplumber.
file = 'input/digital-thermometer-ds18b20.pdf'

In [2]:
import os

import numpy as np
import pandas as pd
import pdfplumber

def is_inside_bbox(small_bbox, big_bbox):
    """Return True when `small_bbox` lies entirely within `big_bbox`.

    Both boxes are indexed as (x0, top, x1, bottom); edges that touch count
    as inside.
    """
    # Left / top edges must not start before the enclosing box.
    if not small_bbox[0] >= big_bbox[0]:
        return False
    if not small_bbox[1] >= big_bbox[1]:
        return False
    # Right / bottom edges must not extend past the enclosing box.
    if not small_bbox[2] <= big_bbox[2]:
        return False
    if not small_bbox[3] <= big_bbox[3]:
        return False
    return True

def get_table_bboxes(page) -> list:
    """Return the `bbox` of every table reported by `page.find_tables()`."""
    table_bboxes = []
    for found_table in page.find_tables():
        table_bboxes.append(found_table.bbox)
    return table_bboxes

def get_diagram_bboxes(page) -> list:
    """Return (x0, top, x1, bottom) tuples for the page's rects, lines and curves.

    The boxes are listed in that order: all rects first, then lines, then curves.
    """
    return [
        (shape['x0'], shape['top'], shape['x1'], shape['bottom'])
        for shape_group in (page.rects, page.lines, page.curves)
        for shape in shape_group
    ]

def extract_filtered_words_from_page(page, double_column=False):
    """Extract the words of `page` that are not inside a table or drawing object.

    Args:
        page: A pdfplumber page.
        double_column: When True, the page is split at its horizontal midpoint
            and the left half is read before the right half, so the words of a
            two-column layout come out column by column.

    Returns:
        A list of word dicts as produced by `extract_words(extra_attrs=['size'])`,
        excluding every word whose box lies fully inside a table bbox or inside
        the bbox of a rect, line or curve.
    """
    exclusion_bboxes = [
        *get_table_bboxes(page=page),
        *get_diagram_bboxes(page=page),
    ]

    if double_column:
        # Derive both halves from the page's own bbox instead of assuming the
        # page starts at (0, 0); this matches the single-column branch and
        # keeps the halves inside the page when its origin is offset.
        page_x0, page_top, page_x1, page_bottom = page.bbox
        middle_x = (page_x0 + page_x1) / 2
        selection_bboxes = [
            (page_x0, page_top, middle_x, page_bottom),
            (middle_x, page_top, page_x1, page_bottom),
        ]
    else:
        selection_bboxes = [page.bbox]

    filtered_words = []

    for current_bbox in selection_bboxes:
        region_words = page.within_bbox(current_bbox).extract_words(extra_attrs=['size'])
        for word in region_words:
            word_bbox = (word['x0'], word['top'], word['x1'], word['bottom'])
            # Keep the word only when no exclusion box fully contains it.
            if not any(is_inside_bbox(word_bbox, excl_bbox) for excl_bbox in exclusion_bboxes):
                filtered_words.append(word)

    return filtered_words


# One DataFrame of filtered words per page, each tagged with its page number.
word_dfs = []

with pdfplumber.open(file) as pdf:
    for page in pdf.pages:
        page_words = extract_filtered_words_from_page(page, double_column=True)
        page_df = pd.DataFrame.from_records(page_words).assign(page=page.page_number)
        word_dfs.append(page_df)

# Stack all pages under a fresh 0..n-1 index and round font sizes to one
# decimal so that nearly identical sizes compare equal later on.
words = (
    pd.concat(word_dfs, ignore_index=True)
    .assign(size=lambda frame: frame['size'].round(1))
)
words

Unnamed: 0,text,x0,x1,top,doctop,bottom,upright,height,width,direction,size,page
0,General,54.00000,100.55000,154.3003,154.3003,166.8003,True,12.5,46.5500,ltr,12.5,1
1,Description,104.02500,172.78750,154.3003,154.3003,166.8003,True,12.5,68.7625,ltr,12.5,1
2,The,54.00000,70.52050,169.3478,169.3478,178.8478,True,9.5,16.5205,ltr,9.5,1
3,DS18B20,77.26930,118.38530,169.3478,169.3478,178.8478,True,9.5,41.1160,ltr,9.5,1
4,digital,125.13505,150.40505,169.3478,169.3478,178.8478,True,9.5,25.2700,ltr,9.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...
6445,20,549.10160,557.99760,755.8141,15803.8141,763.8141,True,8.0,8.8960,ltr,8.0,20
6446,ine,307.02060,316.35860,696.5831,15744.5831,703.5831,True,7.0,9.3380,ltr,7.0,20
6447,storefront,318.50480,348.07280,696.5831,15744.5831,703.5831,True,7.0,29.5680,ltr,7.0,20
6448,at,350.21480,356.05280,696.5831,15744.5831,703.5831,True,7.0,5.8380,ltr,7.0,20


In [3]:
def analyze_vertical_margin_words(
    pdf_path: str, percentage: float = 0.3, direction: str = 'top'
) -> tuple[pd.DataFrame, tuple, int]:
    """
    Extract the words contained in the top/bottom `percentage` of each page.

    This step feeds the header/footer-ignoring heuristic.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.
        percentage: Fraction of the page height, in (0, 1], that makes up the
            margin region.
        direction: 'top' for the header region, 'bottom' for the footer region.

    Returns:
        A 3-tuple `(words, bbox, n_pages)`:
        - words: the concatenated per-page word records with a `page_number`
          column;
        - bbox: the (x0, top, x1, bottom) margin region of the LAST page (the
          same region applies to every page only if all pages share one size);
        - n_pages: the number of pages in the PDF.

    Raises:
        ValueError: if `direction` or `percentage` is invalid, or the PDF has
            no pages.
    """
    # Validate the arguments before touching the file so that bad input fails fast.
    if direction not in ('top', 'bottom'):
        raise ValueError("Invalid direction. Use 'top' or 'bottom'.")
    if not 0 < percentage <= 1:
        raise ValueError("percentage must be in the interval (0, 1].")

    dfs = []
    bbox = None

    with pdfplumber.open(pdf_path) as pdf:
        n_pages = len(pdf.pages)
        for page in pdf.pages:
            # Build the region from the page's own bbox so that pages whose
            # origin is not (0, 0) are handled as well.
            page_x0, page_top, page_x1, page_bottom = page.bbox
            height = page_bottom - page_top

            if direction == 'top':
                bbox = (page_x0, page_top, page_x1, page_top + height * percentage)
            else:
                bbox = (page_x0, page_top + height * (1 - percentage), page_x1, page_bottom)

            area_words = page.within_bbox(bbox).extract_words()
            page_df = pd.DataFrame.from_records(area_words)
            page_df['page_number'] = page.page_number
            dfs.append(page_df)

    if not dfs:
        raise ValueError(f"No pages found in {pdf_path!r}.")

    return pd.concat(dfs), bbox, n_pages


def select_words_to_exclude(words: pd.DataFrame, n_pages: int, thresh: float = 0.9) -> list[str]:
    """Pick the words that recur on enough pages to be treated as header/footer text.

    Args:
        words: Word records with at least the columns `text` and `page_number`.
        n_pages: Total number of pages in the document.
        thresh: Fraction of `n_pages` on which a word must appear to be selected.

    Returns:
        The selected word texts, ordered from most to least pages covered.
        An empty list is returned when `words` has no rows or no `text` column.
    """
    # Margin regions without any words yield a frame with no `text` column.
    if words.empty or 'text' not in words.columns:
        return []

    # Count DISTINCT pages per word: a word repeated several times on a single
    # page must not reach a threshold that is expressed in pages.
    pages_per_word = (
        words.groupby('text')['page_number']
        .nunique()
        .sort_values(ascending=False)
    )
    minimum_page_occurences_to_drop = int(n_pages * thresh)
    recurring = pages_per_word[pages_per_word >= minimum_page_occurences_to_drop]
    return recurring.index.to_list()


def exclude_words_from_bbox(words: pd.DataFrame, bbox: tuple, words_to_exclude: list[str]) -> pd.DataFrame:
    """Drop the rows whose text is in `words_to_exclude` AND whose box lies inside `bbox`.

    Args:
        words: Word records with the columns `x0`, `top`, `x1`, `bottom`, `text`.
        bbox: Region as (x0, top, x1, bottom); edges that touch count as inside.
        words_to_exclude: Texts to remove when they are found inside `bbox`.

    Returns:
        A new DataFrame with the matching rows removed; the input frame is left
        untouched and the original index labels are preserved.
    """
    if words.empty:
        return words.copy()

    # Vectorized form of the containment test: every edge of the word box has
    # to fall within the corresponding edge of `bbox`.
    inside_bbox = (
        (words['x0'] >= bbox[0])
        & (words['top'] >= bbox[1])
        & (words['x1'] <= bbox[2])
        & (words['bottom'] <= bbox[3])
    )
    to_delete = inside_bbox & words['text'].isin(words_to_exclude)
    return words.loc[~to_delete].copy()


# Header pass: gather the words found in the top 15% of every page, then pick
# the ones that recur on enough pages.
# NOTE: `area_words` is rebound by the footer pass below, so each
# select_words_to_exclude call must stay directly after the analyze call that
# feeds it.
area_words, header_bbox, n_pages = analyze_vertical_margin_words(file, percentage=0.15, direction='top')
header_to_exclude = select_words_to_exclude(words=area_words, n_pages=n_pages)

# Footer pass: same procedure for the bottom 15% of every page.
area_words, footer_bbox, n_pages = analyze_vertical_margin_words(file, percentage=0.15, direction='bottom')
footer_to_exclude = select_words_to_exclude(words=area_words, n_pages=n_pages)

# Remove the recurring header words first, then the recurring footer words,
# starting from the full `words` frame.
result = exclude_words_from_bbox(words, bbox=header_bbox, words_to_exclude=header_to_exclude)
result = exclude_words_from_bbox(result, bbox=footer_bbox, words_to_exclude=footer_to_exclude)

result

Unnamed: 0,text,x0,x1,top,doctop,bottom,upright,height,width,direction,size,page
0,General,54.00000,100.55000,154.3003,154.3003,166.8003,True,12.5,46.5500,ltr,12.5,1
1,Description,104.02500,172.78750,154.3003,154.3003,166.8003,True,12.5,68.7625,ltr,12.5,1
2,The,54.00000,70.52050,169.3478,169.3478,178.8478,True,9.5,16.5205,ltr,9.5,1
3,DS18B20,77.26930,118.38530,169.3478,169.3478,178.8478,True,9.5,41.1160,ltr,9.5,1
4,digital,125.13505,150.40505,169.3478,169.3478,178.8478,True,9.5,25.2700,ltr,9.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...
6445,20,549.10160,557.99760,755.8141,15803.8141,763.8141,True,8.0,8.8960,ltr,8.0,20
6446,ine,307.02060,316.35860,696.5831,15744.5831,703.5831,True,7.0,9.3380,ltr,7.0,20
6447,storefront,318.50480,348.07280,696.5831,15744.5831,703.5831,True,7.0,29.5680,ltr,7.0,20
6448,at,350.21480,356.05280,696.5831,15744.5831,703.5831,True,7.0,5.8380,ltr,7.0,20


In [4]:
# Tally how many words use each font size, most frequent first. The columns are
# named explicitly because the labels produced by
# `value_counts().to_frame().reset_index()` differ between pandas versions;
# without that, `loc[0, 'size']` could pick up a count instead of a font size.
font_sizes = (
    result['size']
    .value_counts()
    .rename_axis('size')
    .reset_index(name='count')
)

# The most frequent font size is taken to be the body-text size.
normal_size = font_sizes.loc[0, 'size']

# Keep only the body size and anything larger (heading candidates).
font_sizes = font_sizes.query('size >= @normal_size').reset_index(drop=True)

result = result.query('size >= @normal_size')

In [5]:
# Display the words that remain after the font-size filter of the previous cell.
result

Unnamed: 0,text,x0,x1,top,doctop,bottom,upright,height,width,direction,size,page
0,General,54.00000,100.55000,154.3003,154.3003,166.8003,True,12.5,46.5500,ltr,12.5,1
1,Description,104.02500,172.78750,154.3003,154.3003,166.8003,True,12.5,68.7625,ltr,12.5,1
2,The,54.00000,70.52050,169.3478,169.3478,178.8478,True,9.5,16.5205,ltr,9.5,1
3,DS18B20,77.26930,118.38530,169.3478,169.3478,178.8478,True,9.5,41.1160,ltr,9.5,1
4,digital,125.13505,150.40505,169.3478,169.3478,178.8478,True,9.5,25.2700,ltr,9.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...
6263,EEPROM.,366.44700,410.26100,208.8703,13672.8703,218.3703,True,9.5,43.8140,ltr,9.5,18
6308,Ordering,54.00000,106.78750,136.3004,14392.3004,148.8004,True,12.5,52.7875,ltr,12.5,19
6309,Information,110.26250,179.01250,136.3004,14392.3004,148.8004,True,12.5,68.7500,ltr,12.5,19
6333,Revision,54.00000,106.10000,136.3003,15184.3003,148.8003,True,12.5,52.1000,ltr,12.5,20


In [6]:
def get_heading_font_map(df: pd.DataFrame, font_column: str, max_heading_level: int = 6) -> dict:
    """Map each distinct font size to a Markdown heading prefix.

    The smallest size is treated as body text and maps to '' (no prefix). The
    remaining sizes are ranked from largest to smallest and receive '#', '##',
    '###', ... in that order.

    Args:
        df: Frame holding the font sizes.
        font_column: Name of the column with the font sizes.
        max_heading_level: Upper bound on the number of '#' in a prefix.
            Markdown defines six heading levels, so when there are more
            distinct sizes than levels the smallest heading sizes all share
            the deepest level.

    Returns:
        A dict `{font_size: prefix}` with the body size first, followed by the
        heading sizes from largest to smallest. Empty when `df` has no sizes.
    """
    ordered_font_sizes = np.sort(df[font_column].unique())[::-1].tolist()
    if not ordered_font_sizes:
        return {}

    # The list is in descending order, so its last entry is the body-text size.
    *heading_sizes, body_size = ordered_font_sizes

    heading_font_map = {body_size: ""}
    for heading_level, font_size in enumerate(heading_sizes, start=1):
        heading_font_map[font_size] = "#" * min(heading_level, max_heading_level)

    return heading_font_map

# Build the font-size -> heading-prefix mapping from the sizes present in `result`.
heading_map = get_heading_font_map(df=result, font_column='size')
heading_map

{9.5: '', 12.5: '#', 12.0: '##', 11.0: '###'}

In [7]:
def format_dataframe_into_text(df: pd.DataFrame, heading_map: dict):
    """Join the words of `df` into one string, starting a new block on every font-size change.

    Args:
        df: Word records, in reading order, with the columns `size` and `text`.
        heading_map: Mapping from font size to the prefix ('' or '#'...) that
            opens a block of that size.

    Returns:
        All pieces joined with single spaces. Each run of words sharing a font
        size is preceded by a blank line plus the prefix for that size.

    Raises:
        KeyError: if a font size in `df` is missing from `heading_map`.
    """
    pieces = []
    previous_font_size = None

    # Iterate over the two needed columns directly rather than over full rows.
    for current_font_size, text in zip(df['size'], df['text']):
        if current_font_size != previous_font_size:
            # A size change opens a new block with its heading prefix.
            pieces.append(f"\n\n{heading_map[current_font_size]}")
            previous_font_size = current_font_size

        pieces.append(text)

    return ' '.join(pieces)

# Render the filtered words as a single text string, using heading_map for the block prefixes.
final = format_dataframe_into_text(df=result, heading_map=heading_map)

In [10]:
filename = 'digital-thermometer'
output_path = os.path.join('..', 'outputs', f'{filename}-pdfplumber.md')

# Make sure the target directory exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# The handle is deliberately NOT named `file`: that global holds the input PDF
# path, and rebinding it here would break re-running the extraction cells.
# The encoding is explicit because the text contains non-ASCII characters.
with open(output_path, 'w', encoding='utf-8') as output_file:
    output_file.write(final)

In [12]:
# Print the generated Markdown text for inspection.
print(final)



# General Description 

 The DS18B20 digital thermometer provides 9-bit to 12-bit Celsius temperature measurements and has an alarm function with nonvolatile user-programmable upper and lower trigger points. The DS18B20 communicates over a 1-Wire bus that by definition requires only one data line (and ground) for communication with a central microprocessor. In addition, the DS18B20 can derive power directly from the data line (“parasite power”), eliminating the need for an external power supply. Each DS18B20 has a unique 64-bit serial code, which allows multiple DS18B20s to function on the same 1-Wire bus. Thus, it is simple to use one microprocessor to control many DS18B20s distributed over a large area. Applications that can benefit from this feature include HVAC environmental controls, temperature monitoring systems inside buildings, equipment, or machinery, and process monitoring and control systems. 

# Applications 

 ● Thermostatic Controls ● Industrial Systems ● Consumer Prod