# Import packages

In [1]:
from collections import Counter
import re
import spacy
from spacy_langdetect import LanguageDetector
from spacy.language import Language
from collections import Counter
# from utils import detect_para

from operator import itemgetter
import fitz
import json

import jellyfish

  from .autonotebook import tqdm as notebook_tqdm


# Load Spacy voor taal detectie

In [2]:
def get_lang_detector(nlp, name):
    return LanguageDetector()

nlp = spacy.load("nl_core_news_sm")
Language.factory("language_detector", func=get_lang_detector)
nlp.add_pipe('language_detector', last=True)

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x7f183331e890>

## Functie om vaakstvoorkomende taal vast te stellen

In [3]:
def most_frequent(List):
    occurence_count = Counter(List)
    return occurence_count.most_common(1)[0][0]

path = 'Raadsprogramma 2022-2026 Aan de slag voor Aalsmeer.pdf'
path = "Coalitieakkoord-Samen-werken-aan-Delft.pdf"

## Main functies

In [7]:
# functie om te controleren of value een nummer is.
def digitize(value):
    try:
        float(value)
        return True
    except ValueError:
        return False

def fonts(doc, granularity=True):
    """Extracts fonts and their usage in PDF documents.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool

    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = page.get_text("dict", flags=11)["blocks"]
        # print(blocks)
        for b in blocks:  # iterate through the text blocks
            # print(b)
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                  'color': s['color']}
                            
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}
                            
                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    # print(b, identifier)
    # print(styles)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles


def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.

    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict

    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size


    # sorting the font sizes high to low, so that we can append the right integer to each tag
    font_sizes = []
    for (font_size, count) in font_counts:
        font_sizes.append(float(font_size))
    font_sizes.sort(reverse=True)
    
    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = '<p>'
        if size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag


def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict

    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = []  # list with headers and paragraphs
    first = True  # boolean operator for first header
    previous_s = {}  # previous span
    page_num = 1
    b_ = []
    s_ = []
    t_point = 20 # amount of pixels/points that defines a white line is between two lines.

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text+

                # REMEMBER: multiple fonts and sizes are possible IN one block, wil ik dit? Of wil ik gewogen? 
                # Hoe vaak kom je titel tegen met in dezelfde lijn paragraaf_text? functie is niet gearceerde woorden eruit te halen.

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    
                    for s in l["spans"]:  # iterate through the text spans
                        s_.append(s)
                        if first:
                            previous_s = s
                            first = False
                            block_string = {
                                'tag': size_tag[s['size']], 
                                'text': s['text'].strip(),
                                'page_num': page_num,
                                'p_position_x': s['origin'][0],
                                'p_position_y': s['origin'][1],
                                'font_size': s['size']
                                }
                            header_para.append(block_string)
                        else:
                            previous_s = header_para[-1]
                            if size_tag[s['size']] == previous_s['tag'] and 0 <= s['origin'][1] - previous_s['p_position_y'] <= (t_point + previous_s['font_size']): # and size_tag[s['size']] == '<p>':

                                last_entry = header_para[-1]
                                last_entry['text'] += f" {s['text'].strip()}" 
                                last_entry['p_position_x'] = s['origin'][0]
                                last_entry['p_position_y'] = s['origin'][1]

                                header_para[-1] = last_entry
                            
                            elif s['text'].strip() == '':
                                pass

                            else:
                                block_string = {
                                    'tag': size_tag[s['size']], 
                                    'text': s['text'].strip(),
                                    'page_num': page_num,
                                    'p_position_x': s['origin'][0],
                                    'p_position_y': s['origin'][1],
                                    'font_size': s['size']
                                    }
                                header_para.append(block_string)
            else:
                b_.append(b)

        if  digitize(header_para[-1]['text'].strip()): # check if last line on page is a (page) number and remove it.
            header_para.pop(-1)
        
        block_string = {
            'tag': '<p_break>', 
            'text': f'A physical page break on page {page_num}',
            'page_num': page_num,
            'p_position_x': 0,
            'p_position_y': 0,
            'font_size': 0
}
        header_para.append(block_string)
        page_num += 1
    return header_para, s_

def detect_para(path):

    doc = fitz.open(path)

    font_counts, styles = fonts(doc, granularity=False)
    # print(font_counts)
    # print(styles)

    size_tag = font_tags(font_counts, styles)
    # print(size_tag)

    elements, b_ = headers_para(doc, size_tag)
    topscript_list = [elements.index(elem)+1 for elem in elements if elem['tag'] == '<p_break>'] # make list of first entries on page
    topscript_list.pop(-1) # remove last, because there is no topscript on the next page after the last page
    elements = pop_repeating(elements, topscript_list)
    endscript_list = [elements.index(elem)-1 for elem in elements if elem['tag'] == '<p_break>'] # make list of last entries on page
    elements = pop_repeating(elements, endscript_list)

    primary_heading = deter_primary_h(elements)
    return elements, b_, primary_heading

def pop_repeating(elements, s_list): # based on list of first or last entries on page, deter if these entries are similair and thus a standard footer for example.
    sim_list = []
    for i in range(len(s_list)-1):
        a, b = s_list[i], s_list[i+1]
        str_dist = str_distance(a, b, elements)
        if str_dist >= 0.8:
            sim_list.extend([a, b])
    sim_list = list(set(sim_list))
    sim_list.sort(reverse=True)
    if len(sim_list) - len(s_list) <= 3:
        [elements.pop(i) for i in sim_list]
    return elements

def str_distance(a, b, elements):
    a = elements[a]['text']
    b = elements[b]['text']
    dist = jellyfish.jaro_distance(a, b)
    return dist

# beter om terug te zoeken vanaf <p> en alle hogere headings een plek geven.
def deter_primary_h(elements):
    b = [elem['tag'] for elem in elements] # make list of only the tags
    b_ = []
    for i in range(len(b)-2): # walk through list for headings that is inbetween 2 paragraphs
        if b[i] == b[i+2] and b[i] == '<p>':
            b_.append(b[i+1])
    tag_amount = []
    
    for x in list(set(b_)):
        if x.startswith('<h'): # only keep headings
            d_tag_amoung = {
                'tag': x,
                'amount': b_.count(x) 
            }
            tag_amount.append(d_tag_amoung)
    tag_amount = sorted(tag_amount, key=lambda i: i['amount'], reverse=True)
    
    primary = tag_amount[0]['tag'] if len(tag_amount) > 0 else ''
    return primary

## Oud; Initiatie van PDF minen

In [6]:
paragraph = []
heading = []
lang = []

elements = detect_para(path)
s_with = []
# print(elements)
for i in elements:
    print(i)
    s_with.append(i)
    # print(i)
    if (
        i.startswith("<h1>")
        or i.startswith("<h2>")
        or i.startswith("<h3>")
        or i.startswith("<h4>")
    ):
        heading.append(i.split(">")[1].replace("|", ""))

        text = re.sub("[^a-zA-Z]", " ", i)
        if text:
            doc = nlp(text)
            lang.append(doc._.language["language"])
        else:
            print("missing text in sentence")

    elif i.startswith("<p>") or i.startswith("<s1>"):
        paragraph.append(i.split(">")[1].replace("|", ""))

        text = re.sub("[^a-zA-Z]", " ", i)
        if text:
            doc = nlp(text)
            lang.append(doc._.language["language"])
        else:
            print("missing text in sentence")
    else:
        # print("This is not useful text in document")
        pass
print("Language detected is : ", most_frequent(lang))

[('11.0', 128), ('10.0', 14), ('8.0', 10), ('18.0', 5), ('28.0', 3), ('16.0', 3), ('14.0', 2), ('12.0', 2), ('24.0', 1)]
{'28.0': {'size': 28.0, 'font': 'ArialMT'}, '16.0': {'size': 16.0, 'font': 'Arial-BoldMT'}, '11.0': {'size': 11.0, 'font': 'ArialMT'}, '14.0': {'size': 14.0, 'font': 'ArialMT'}, '8.0': {'size': 8.0, 'font': 'ArialMT'}, '24.0': {'size': 24.0, 'font': 'ArialMT'}, '10.0': {'size': 10.0, 'font': 'Verdana-Bold'}, '18.0': {'size': 18.0, 'font': 'ArialMT'}, '12.0': {'size': 12.0, 'font': 'ArialMT'}}
{28.0: '<h1>', 24.0: '<h2>', 18.0: '<h3>', 16.0: '<h4>', 14.0: '<h5>', 12.0: '<h6>', 11.0: '<p>', 10.0: '<s1>', 8.0: '<s2>'}
{'tag': '<h1>', 'text': 'Adviesrapport Data Warehousing'}


AttributeError: 'dict' object has no attribute 'startswith'

## Nieuw; Initiatie van PDF minen

In [8]:
paragraph = []
heading = []
lang = []

elements, b_ ,primary_heading = detect_para(path)


In [9]:
elements

[{'tag': '<h2>',
  'text': 'Samen werken aan Delft',
  'page_num': 1,
  'p_position_x': 147.40159606933594,
  'p_position_y': 447.3299255371094,
  'font_size': 57.0},
 {'tag': '<h4>',
  'text': 'Coalitieakkoord 2022-2026',
  'page_num': 1,
  'p_position_x': 147.40159606933594,
  'p_position_y': 493.3299255371094,
  'font_size': 19.0},
 {'tag': '<p_break>',
  'text': 'A physical page break on page 1',
  'page_num': 1,
  'p_position_x': 0,
  'p_position_y': 0,
  'font_size': 0},
 {'tag': '<h3>',
  'text': 'Samen werken aan Delft',
  'page_num': 2,
  'p_position_x': 138.89759826660156,
  'p_position_y': 169.01220703125,
  'font_size': 40.0},
 {'tag': '<h5>',
  'text': 'Delft heeft veel om trots op te zijn. Een unieke concentratie van kennis, technologie, talent en ondernemerschap, waarmee Delft nationaal en internationaal van betekenis is. Een stad met cultureel erfgoed en historische iconen. Een stad met een grote diversiteit aan bewoners en ondernemers die zich betrokken voelen bij hun 

In [None]:
# HEADING LOGIC
####
# heading above paragraph == head of paragraph
# biggest heading on page == page heading/title (?)
# biggest heading == titel (?)
# when len(heading) > 10, dan geen echte heading maar inleidend stuk 

In [None]:
# FINAL OUTPUT
####
# Paragraph text
# Direct heading
# as dict or as HTML?
# document content based on headings

In [10]:
[elem['text'] for elem in elements if elem['tag'] == primary_heading]
[elem['text'] for elem in elements if elem['tag'] == '<h3>']

['Samen werken aan Delft',
 'Inhoudsopgave',
 'De stad maken we met elkaar',
 'Delft kansrijk voor iedereen',
 'Ruimte voor wonen',
 'Een gezonde en veilige stad',
 'Een duurzame stad',
 'Economische kracht en een bruisende stad',
 'Financiën',
 'Portefeuilleverdeling']

In [60]:
[elem for elem in elements if elem['page_num'] ==8]

[{'tag': '<h1>',
  'text': '1',
  'page_num': 8,
  'p_position_x': 70.03929901123047,
  'p_position_y': 182.0347900390625},
 {'tag': '<h3>',
  'text': 'De stad maken we met elkaar',
  'page_num': 8,
  'p_position_x': 138.89759826660156,
  'p_position_y': 193.16973876953125},
 {'tag': '<h5>',
  'text': 'Mensen maken de stad. Dit zijn de ruim 100.000 Delftenaren die hier wonen en deze stad hun thuis noemen – een aantal dat groeit tot meer dan 120.000 in 2040. De mensen die hier werken, ondernemen, studeren, zorg krijgen, actief zijn of verblijven. De bedrij\xad ven, de zorg\xad en kennisinstellingen, de maatschappelijke organisaties en verenigingen die hier actief zijn. Samen bepalen we hoe Delft leeft, werkt en zich ontwikkelt. We hebben elkaar hard nodig. Wij maken er daarom werk van om meer samen op te pakken en om zichtbaar te zijn in de wijken.',
  'page_num': 8,
  'p_position_x': 45.35430145263672,
  'p_position_y': 371.622314453125},
 {'tag': '<p>',
  'text': 'De mensen en organis

verschil in pixels in eigen beleidsdocument nooit groter dan 15 als het tussen een heading en een paragraaf is.

In [8]:
b_

[{'size': 27.0,
  'flags': 20,
  'font': 'KievitPro-Bold',
  'color': 2236191,
  'ascender': 0.9380000233650208,
  'descender': -0.25,
  'text': 'Inhoud',
  'origin': (361.41729736328125, 56.5201416015625),
  'bbox': (361.41729736328125,
   31.194141387939453,
   444.9822998046875,
   63.2701416015625)},
 {'size': 12.0,
  'flags': 4,
  'font': 'KievitPro-Regular',
  'color': 2236191,
  'ascender': 0.9380000233650208,
  'descender': -0.25,
  'text': 'Voorwoord: Aan de slag voor Aalsmeer! ',
  'origin': (361.41729736328125, 85.32012939453125),
  'bbox': (361.41729736328125,
   74.06413269042969,
   558.6470947265625,
   88.32012939453125)},
 {'size': 12.0,
  'flags': 4,
  'font': 'KievitPro-Regular',
  'color': 2236191,
  'ascender': 0.9380000233650208,
  'descender': -0.25,
  'text': 'Gemeentebestuur                                     ',
  'origin': (361.41729736328125, 114.1201171875),
  'bbox': (361.41729736328125,
   102.86412048339844,
   551.2445678710938,
   117.1201171875)},
 {'