In [None]:
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

# Shared appearance passed to the controls below as `style=` / `layout=`:
# an 11em description column and widgets that span 90% of their container.
widget_style = dict(description_width="11em")
widget_layout = widgets.Layout(width="90%")

In [None]:
from traveller_book_parser.books.parse_books import get_book_code_names
from traveller_book_parser.books.load_book_description import load_book_description

# Load one description object per known book code name.
book_code_names = get_book_code_names()
book_descriptions = list(map(load_book_description, book_code_names))

# Dropdown whose labels are the book names and whose values are the
# description objects themselves, so `book_widget.value` is a description.
book_widget = widgets.Dropdown(
    description="Book:",
    options=[
        (description.name, description)
        for description in book_descriptions
    ],
    layout=widget_layout,
    style=widget_style,
)

In [None]:
import pdfplumber

from traveller_book_parser.books.book_description import get_book_file_path

def get_page_max(book_description):
    """Return the highest selectable page number for a book's PDF.

    Args:
        book_description: Object exposing a ``code_name`` attribute, used to
            locate the PDF via ``get_book_file_path``.

    Returns:
        int: The number of pages in the PDF.
    """
    pdf_path = get_book_file_path(book_description.code_name)
    with pdfplumber.open(pdf_path) as pdf:
        # The page selector starts at 1, so the last valid page number equals
        # the page count; adding 1 would allow a page past the end.
        # NOTE(review): assumes get_pdfplumber_page takes 1-based page
        # numbers, matching the selector's minimum of 1 -- confirm.
        return len(pdf.pages)

# Page selector for the chosen book.
# BoundedIntText is used instead of IntText because IntText has no `min`/`max`
# traits: the bounds passed to it are ignored and later `page_widget.max = ...`
# assignments only set a plain attribute. The bounded variant clamps the value
# to [min, max] and re-clamps it whenever `max` is updated.
page_widget = widgets.BoundedIntText(
    value=1,
    min=1,
    max=get_page_max(book_widget.value),
    description="Page:",
    style=widget_style,
    layout=widget_layout,
)

def update_page_widget(*args):
    """Set ``page_widget.max`` from the page limit of the selected book.

    Accepts and ignores any positional arguments so it can be used as a
    widget observer callback.
    """
    selected_book = book_widget.value
    page_widget.max = get_page_max(selected_book)


# Recompute the limit every time a different book is chosen.
book_widget.observe(update_page_widget, names="value")

In [None]:
from traveller_book_parser.data_sources.pdfplumber.pdfplumber_integration import get_pdfplumber_page

# Currently loaded page and its rendered image; both are filled in by
# update_pdf_page() and read by the output callbacks in later cells.
page = im = None


def update_pdf_page(*args):
    """Load the selected page of the selected book into the globals.

    Rebinds the module-level ``page`` to the result of
    ``get_pdfplumber_page`` and ``im`` to that page rendered at resolution
    144. Positional arguments are ignored so the function can serve as an
    observer callback.
    """
    global page, im
    selected_pdf = get_book_file_path(book_widget.value.code_name)
    page = get_pdfplumber_page(
        pdf_path=selected_pdf,
        page_number=page_widget.value,
    )
    im = page.to_image(resolution=144)

# Load the initial selection once, then reload whenever the book or the page
# number changes.
update_pdf_page()

for selector in (book_widget, page_widget):
    selector.observe(update_pdf_page, names="value")

In [None]:
from collections import OrderedDict
import json

import pandas

from traveller_book_parser.data_sources.pdfplumber.pdfplumber_integration import TABLE_SETTINGS_DEFAULTS


def get_table_settings(
    vertical_strategy=TABLE_SETTINGS_DEFAULTS["vertical_strategy"],
    horizontal_strategy=TABLE_SETTINGS_DEFAULTS["horizontal_strategy"],
    snap_x_tolerance=TABLE_SETTINGS_DEFAULTS["snap_x_tolerance"],
    snap_y_tolerance=TABLE_SETTINGS_DEFAULTS["snap_y_tolerance"],
    join_x_tolerance=TABLE_SETTINGS_DEFAULTS["join_x_tolerance"],
    join_y_tolerance=TABLE_SETTINGS_DEFAULTS["join_y_tolerance"],
    min_words_vertical=TABLE_SETTINGS_DEFAULTS["min_words_vertical"],
    min_words_horizontal=TABLE_SETTINGS_DEFAULTS["min_words_horizontal"],
    intersection_x_tolerance=TABLE_SETTINGS_DEFAULTS["intersection_x_tolerance"],
    intersection_y_tolerance=TABLE_SETTINGS_DEFAULTS["intersection_y_tolerance"],
    text_x_tolerance=TABLE_SETTINGS_DEFAULTS["text_x_tolerance"],
    text_y_tolerance=TABLE_SETTINGS_DEFAULTS["text_y_tolerance"],
):
    """Bundle the given table-finder options into an ordered mapping.

    Every parameter defaults to the matching entry of
    ``TABLE_SETTINGS_DEFAULTS``.

    Returns:
        OrderedDict: One entry per parameter, keyed by the parameter name, in
        the same order as the signature.
    """
    return OrderedDict([
        ("vertical_strategy", vertical_strategy),
        ("horizontal_strategy", horizontal_strategy),
        ("snap_x_tolerance", snap_x_tolerance),
        ("snap_y_tolerance", snap_y_tolerance),
        ("join_x_tolerance", join_x_tolerance),
        ("join_y_tolerance", join_y_tolerance),
        ("min_words_vertical", min_words_vertical),
        ("min_words_horizontal", min_words_horizontal),
        ("intersection_x_tolerance", intersection_x_tolerance),
        ("intersection_y_tolerance", intersection_y_tolerance),
        ("text_x_tolerance", text_x_tolerance),
        ("text_y_tolerance", text_y_tolerance),
    ])

# Interactive controls for get_table_settings: option lists for the two
# strategies, (min, max) ranges for the tolerances and word counts, and the
# text tolerances pinned to their defaults via fixed().
table_settings_widget = interactive(
    get_table_settings,
    **{
        "vertical_strategy": ["lines", "lines_strict", "text"],
        "horizontal_strategy": ["lines", "lines_strict", "text"],
        "snap_x_tolerance": (1.0, 40.0),
        "snap_y_tolerance": (1.0, 40.0),
        "join_x_tolerance": (1.0, 40.0),
        "join_y_tolerance": (1.0, 40.0),
        "min_words_vertical": (1, 50),
        "min_words_horizontal": (1, 50),
        "intersection_x_tolerance": (1.0, 40.0),
        "intersection_y_tolerance": (1.0, 40.0),
        "text_x_tolerance": fixed(TABLE_SETTINGS_DEFAULTS["text_x_tolerance"]),
        "text_y_tolerance": fixed(TABLE_SETTINGS_DEFAULTS["text_y_tolerance"]),
    },
)

# Give every generated child the shared style and layout.
for child in table_settings_widget.children:
    child.style, child.layout = widget_style, widget_layout

In [None]:
# Output area that shows the non-default table settings as JSON.
changed_table_settings_json = widgets.Output(style=widget_style, layout=widget_layout)

# Copy of the defaults without the two explicit-line entries, which
# get_table_settings does not expose.
defaults_without_explicit = TABLE_SETTINGS_DEFAULTS.copy()
for explicit_key in ("explicit_vertical_lines", "explicit_horizontal_lines"):
    defaults_without_explicit.pop(explicit_key)

def update_changed_table_settings_json():
    """Print the settings that differ from the defaults as indented JSON.

    The output goes to the ``changed_table_settings_json`` widget. Only
    (key, value) pairs of ``table_settings_widget.result`` that are not
    present in ``defaults_without_explicit`` are included, in the order they
    appear in the result, under a top-level ``"table_settings"`` key.
    """
    with changed_table_settings_json:
        current_settings = table_settings_widget.result
        default_items = set(defaults_without_explicit.items())
        overrides = OrderedDict(
            item
            for item in current_settings.items()
            if item not in default_items
        )
        print(json.dumps({"table_settings": overrides}, indent=4))
    # wait=True postpones the clear until new output arrives, so the JSON
    # printed above stays visible until the next refresh replaces it.
    changed_table_settings_json.clear_output(wait=True)

# Output area for the page image with the table-finder overlay.
pdf_debug_image = widgets.Output(style=widget_style, layout=widget_layout)


def update_pdf_debug_image():
    """Display the page image annotated by the table finder.

    Does nothing while no image has been loaded (``im`` is ``None``).
    Otherwise resets the image, draws the table-finder debug overlay for the
    current ``table_settings_widget.result`` and displays it in
    ``pdf_debug_image``.
    """
    if im is None:
        return

    with pdf_debug_image:
        debug_view = im.reset().debug_tablefinder(table_settings_widget.result)
        display(debug_view)
    # Deferred clear: the image stays until the next refresh replaces it.
    pdf_debug_image.clear_output(wait=True)

# Output area for the extracted table.
table_dataframe = widgets.Output(style=widget_style, layout=widget_layout)


def update_table_dataframe():
    """Extract a table from the current page and display it as a DataFrame.

    Does nothing while no page has been loaded (``page`` is ``None``). The
    first extracted row supplies the column labels and the remaining rows the
    data. Prints "No table found" when extraction yields nothing.
    """
    if page is None:
        return

    with table_dataframe:
        table = page.extract_table(table_settings_widget.result)
        if not table:
            print("No table found")
        else:
            header, *rows = table
            display(pandas.DataFrame(rows, columns=header))
    # Deferred clear: the table stays until the next refresh replaces it.
    table_dataframe.clear_output(wait=True)

In [None]:
def update_output(*args):
    """Refresh the settings JSON, the debug image and the table, in that order.

    Positional arguments are ignored so the function can be registered as a
    widget observer.
    """
    refreshers = (
        update_changed_table_settings_json,
        update_pdf_debug_image,
        update_table_dataframe,
    )
    for refresh in refreshers:
        refresh()

# Render everything once, then re-render whenever the book, the page or any
# table-settings control changes.
update_output()

for input_widget in (book_widget, page_widget, *table_settings_widget.children):
    input_widget.observe(update_output, names="value")

In [None]:
# All input controls stacked in one column.
input_box = widgets.VBox(
    [book_widget, page_widget, table_settings_widget],
    layout={"width": "90%"},
)

# Alternative grid arrangement, kept for reference:
# output_layout = widgets.TwoByTwoLayout(
#     top_left=input_box,
#     top_right=changed_table_settings_json,
#     bottom_left=pdf_debug_image,
#     bottom_right=table_dataframe
# )

# Top row: controls next to the changed-settings JSON.
# Bottom row: debug image next to the extracted table.
settings_row = widgets.HBox([input_box, changed_table_settings_json])
results_row = widgets.HBox([pdf_debug_image, table_dataframe])
output_layout = widgets.VBox([settings_row, results_row])

display(output_layout)

In [None]:
from operator import itemgetter

# Interactive controls for the page's text-line extraction.
# NOTE(review): `page.extract_text_lines` is looked up when this cell runs, so
# the widget stays bound to the page that was loaded at that moment; selecting
# another book or page afterwards does not rebind it -- confirm this is intended.
text_widget = interactive(
    page.extract_text_lines,
    **{
        "use_text_flow": True,
        "keep_blank_chars": False,
        "layout": False,
        "x_tolerance": (0, 4000),
        "y_tolerance": (0, 40),
        "split_at_punctuation": "",
    },
)

# Receives the summary printed by update_text_output().
text_output = widgets.Output()

def get_line_font_size(line):
    """Return the ``"size"`` of the first entry in ``line["chars"]``.

    Only the first character is inspected; the rest of the line is not
    examined.
    """
    first_char = line["chars"][0]
    return first_char["size"]

@text_output.capture(clear_output=True)
def update_text_output(*args):
    """Print the text of the (up to) three lines with the largest font size.

    Lines come from ``text_widget.result`` and are ranked by
    ``get_line_font_size`` in descending order; lines with equal sizes keep
    their original relative order. Pages with fewer than three lines print
    only the lines they have, and a page with no lines (or a result that has
    not been computed yet) prints a short notice instead of raising.

    Positional arguments are ignored so the function can be registered as a
    widget observer.
    """
    # `result` can still be None if the interactive widget has not run yet.
    lines = text_widget.result or []
    if not lines:
        print("No text lines found")
        return

    ranked_lines = sorted(lines, key=get_line_font_size, reverse=True)
    labels = ("Biggest font:", "Second biggest font:", "Third biggest font:")
    # zip() stops at the shorter input, so short pages no longer raise IndexError.
    for label, line in zip(labels, ranked_lines):
        print(label, line["text"])
    
# Print the summary once for the initial control values, then again whenever
# one of the text-extraction controls changes.
update_text_output()
for input_widget in text_widget.children:
    input_widget.observe(update_text_output, names="value")

# Show the selectors, the extraction controls and the summary, in that order.
display(book_widget, page_widget, text_widget, text_output)