In [None]:
import json
import os
from collections import defaultdict
from pprint import pprint

import fitz

from term_extractors import RegexBlockExtractor, BlockExtractor
from table_extractor import TableExtractor
from footnote_extractor import FootnoteExtractor
from parser_utils import print_n_pages, join_jsons, is_inside_bbox, calculate_content_box

In [None]:
# Adjustable parameters
# Everything a user is expected to tune per document lives in this cell.

# NOTE(review): absolute, machine-specific paths — the notebook only runs as-is on the author's machine.
fdir = 'C:/Users/sandra.eiche/Documents/kood/parser_comparison/varia_data/'
# NOTE(review): os.listdir() does not guarantee any ordering, so index 2 may pick a
# different file on another machine/filesystem — confirm the intended file.
fname = os.listdir(fdir)[2]

output_fdir = 'C:\\Users\\sandra.eiche\\OneDrive - Eesti Keele Instituut\\Documents\\KVA\\kva_parsed_jsons'
# Output file name: the input name with its last extension replaced by '.json'.
output_fname = fname.rsplit('.', 1)[0] + '.json'

# For table extraction
table_extraction_strategy = 'lines_strict' # Strategies are described at https://pymupdf.readthedocs.io/en/latest/page.html#Page.find_tables; pick the one that suits the file.
horizontal_sorting = False # Set to False for a column-based document.

# For removing headers and footers
# Passed to the extractors and to calculate_content_box(); presumably heights in PDF points — TODO confirm.
header_height = 60
footer_height = 50

# For footnote extraction
# Matches text starting with digits followed by whitespace; group 1 captures the rest of the line.
footnote_regex =  r'^\d+\s+(.*)'
footnote_group = 1

# For term extraction
# Pages handled by the plain block extractor (none for this document).
block_based_extraction = {
    'pages' : []}

# Pages handled by the regex-based extractor.
# 'pages' presumably holds (first, last) page ranges — verify against RegexBlockExtractor.
# NOTE(review): in this pattern group 1 is the leading separator (`\n\s*\n` or start of text)
# and group 2 is the block content; confirm that 'regex_group': 1 is the intended group.
custom_regex_extraction = {
        'pages': [(22, 430)],
        'regular_expression': r"(\n\s*\n|^)((.|\n)*?)(?=\n\s*\n)",
        'regex_group': 1
}

In [None]:
# Print the selected file name and the metadata embedded in the PDF itself.
# Presumably used as a reference when filling in the bibliographic fields of
# the `metadata` dict by hand — TODO confirm.
# (`pprint` is imported in the notebook's import cell together with the other imports.)
with fitz.open(os.path.join(fdir, fname)) as doc:
    print(fname)
    pprint(doc.metadata)

In [None]:
# Document-level record written to the output JSON: hand-filled bibliographic
# fields first, then the parser settings used for this run (for reproducibility).
# Note that 'custom_regex' holds a reference to the config dict, not a copy.
metadata = dict(
    # Bibliographic information (filled in manually per document)
    filename=fname,
    publication='',
    publication_year=2013,
    title='',
    author='',
    languages=['en', 'fr'],
    field_keywords=[],
    # Parser settings taken from the parameters cell
    header_height=header_height,
    footer_height=footer_height,
    table_extraction_strategy=table_extraction_strategy,
    horizontal_sorting=horizontal_sorting,
    footnote_regex=footnote_regex,
    footnote_group=footnote_group,
    custom_regex=custom_regex_extraction,
)

---
10 lk või bloki kuvamine näitena. Saab välja võtta päiste ja jaluste koordinaadid ning need muudetavatesse parameetritesse kopeerida, et saaks neid lõike ignoreerida.
Koordinaadid on iga bloki neli esimest elementi (x0, y0, x1, y1).

In [None]:
# Preview of the document as raw blocks (display='blocks'); the first four
# elements of each block are its coordinates, which can be read off here to
# choose header_height / footer_height in the parameters cell.
print_n_pages(fdir, fname, display='blocks', horizontal_sorting=horizontal_sorting)

In [None]:
# Same preview as plain text (display='text') — presumably to eyeball the
# reading order produced by the current horizontal_sorting setting; TODO confirm.
print_n_pages(fdir, fname, display='text', horizontal_sorting=horizontal_sorting)

---
### Ekstraheerijate initsialiseerimine

In [None]:
# Build one extractor per content type, configured from the parameters cell.
# Table detection strategy is the PyMuPDF find_tables strategy chosen above.
table_extractor = TableExtractor(table_extraction_strategy=table_extraction_strategy)
# Term extraction: a plain block-based extractor and a regex-driven one;
# which pages each handles is decided when they are run, not here.
block_extractor = BlockExtractor()
regex_extractor = RegexBlockExtractor(regex_pattern=custom_regex_extraction['regular_expression'], 
                                      regex_group=custom_regex_extraction['regex_group'])

# Footnote extraction uses the footnote pattern and capture group from the parameters cell.
footnote_extractor = FootnoteExtractor(footnote_pattern=footnote_regex, footnote_group=footnote_group)

In [None]:
# Run every extractor over the same open document, always excluding the
# header/footer margins. The results are JSON strings (later cells decode them
# with json.loads).
document_path = os.path.join(fdir, fname)

with fitz.open(document_path) as doc:
    # Tables across the whole document
    table_data = table_extractor.extract_tables_from_doc(
        doc,
        header_height=header_height,
        footer_height=footer_height,
    )
    # Terms from the pages assigned to the block-based extractor
    block_data = block_extractor.extract_text_by_page(
        doc,
        block_based_extraction['pages'],
        header_height=header_height,
        footer_height=footer_height,
    )
    # Terms from the pages assigned to the regex-based extractor
    regex_data = regex_extractor.extract_text_by_page(
        doc,
        custom_regex_extraction['pages'],
        header_height=header_height,
        footer_height=footer_height,
    )
    # Footnotes across the whole document
    footnote_data = footnote_extractor.extract_footnotes_from_doc(
        doc,
        header_height=header_height,
        footer_height=footer_height,
    )

# Both term extractors feed a single combined term result.
term_data = join_jsons([block_data, regex_data])

In [None]:
def is_term_page(page_no, term_data):
    """Tell whether ``page_no`` is among the pages present in the term data.

    Args:
        page_no: Page number to look up; compared with each entry's
            'page_number' value.
        term_data: JSON string that decodes to a list of objects, each
            carrying a 'page_number' key.

    Returns:
        True if at least one entry has 'page_number' equal to ``page_no``,
        otherwise False.
    """
    term_pages = json.loads(term_data)
    return any(entry['page_number'] == page_no for entry in term_pages)

In [None]:
# Collect the running body text of every non-term page, leaving out the
# header/footer margins and any block that lies inside a detected table or
# footnote bounding box. Result: `content_text_data`, a list of
# {"page_number", "text"} dicts, one per processed page.
document_path = os.path.join(fdir, fname)

# The extractor outputs are JSON strings. Decode them once up front and index
# the bounding boxes by page number, instead of re-parsing both strings for
# every single text block.
table_boxes_by_page = defaultdict(list)
for table_page in json.loads(table_data):
    table_boxes_by_page[table_page['page_number']].extend(table_page['bboxes'])

footnote_boxes_by_page = defaultdict(list)
for footnote_page in json.loads(footnote_data):
    footnote_boxes_by_page[footnote_page['page_number']].extend(footnote_page['bboxes'])

content_text_data = []

with fitz.open(document_path) as doc:
    for page_no, page in enumerate(doc, 1):
        # Term pages are stored separately in term_data; skip them here.
        if is_term_page(page_no, term_data):
            continue

        # Extract text while leaving out the header/footer area.
        content_box = calculate_content_box(page, header_height, footer_height)
        text_blocks = page.get_text('blocks', sort=horizontal_sorting, clip=content_box)

        # Boxes whose content is already captured as table or footnote data.
        # .get() keeps the defaultdicts from growing empty entries per page.
        excluded_boxes = (
            table_boxes_by_page.get(page_no, [])
            + footnote_boxes_by_page.get(page_no, [])
        )

        # Element 4 of a block is its text; keep only blocks outside every excluded box.
        page_blocks = [
            text_block[4]
            for text_block in text_blocks
            if not any(is_inside_bbox(text_block, box) for box in excluded_boxes)
        ]

        # Store the page's remaining text; page_no is 1-based
        # (PyMuPDF's own page.number is zero-based).
        content_text_data.append({
            "page_number": page_no,
            "text": '\n'.join(page_blocks),
        })

In [None]:
# Merge the extracted content into the metadata record and write it out as a
# single JSON file.
content_dict = {
    'table_data': json.loads(table_data),
    'term_data': json.loads(term_data),
    'footnote_data': json.loads(footnote_data),
    'content_text_data': content_text_data,
}

# In-place update is intentional: the following cell displays `metadata`
# including the extracted content.
metadata.update(content_dict)

# Serialise before opening the file, so a serialisation error cannot leave a
# truncated or empty output file behind.
serialized = json.dumps(metadata)

output_path = os.path.join(output_fdir, output_fname)
print('Saving to', output_path)

# Explicit encoding so the result does not depend on the platform default.
with open(output_path, 'w', encoding='utf-8') as fout:
    fout.write(serialized)

    

In [None]:
# Echo the saved record as the cell output. This relies on the save cell having
# merged the extracted content into `metadata` in place.
# NOTE(review): this displays the whole document as one JSON string, which can
# make the notebook output very large — consider previewing only a slice.
json.dumps(metadata)