In [1]:
from docx2python import docx2python
import json
import os
import re
from collections import defaultdict
from itertools import zip_longest, filterfalse
import sys
from Levenshtein import distance as editdistance
from organ_utils import preprocess_line, preprocess_doc, getPartFromID, isEmptyBlock, isTable, get_last_text_line

In [2]:
# Show the current working directory; the relative paths used below are resolved against it.
os.getcwd()

'/Users/krane108/git/polifonia-project/organs-dataset/scripts'

In [3]:
# Path to the output data directory, relative to the working directory.
# The final cells write dispositions.json and dispositions.txt below this directory.
outputroot = '../output/'

In [4]:
# Load the organ ids and the mapping from organ id to the path of its input document.
# The paths are built from `outputroot` (instead of repeating the directory literal)
# so that reads and writes stay in sync when the output location changes.
with open(os.path.join(outputroot, 'organids.json'), 'r') as f:
    organids = json.load(f)

with open(os.path.join(outputroot, 'filenames.json'), 'r') as f:
    filename = json.load(f)

## helper functions

### Footnotes

In [5]:
def extract_footnote_symbol(line):
    """Return the first run of one or more asterisks found in `line`.

    Returns the empty string when `line` contains no asterisk.
    """
    found = re.search(r'\*+', line)
    return found.group(0) if found else ''

def match_footnote_symbol(line, symbol):
    """Tell whether `line` (ignoring surrounding whitespace) starts with the footnote marker.

    The marker is identified by the number of asterisks in `symbol`. The line
    must start with exactly that many asterisks followed by a character that is
    not an asterisk, so a '*' marker does not match a line starting with '**'.

    Returns:
        bool: True if the stripped line starts with the marker, False otherwise.
    """
    footno = symbol.count('*')
    # Raw string: the original built this pattern from non-raw literals with
    # invalid escape sequences, which newer Python versions warn about.
    pattern = r'\*{%d}[^*]' % footno
    return re.match(pattern, line.strip()) is not None

def strip_footnote_symbol(line, symbol):
    """Remove every occurrence of `symbol` from `line` and trim surrounding whitespace."""
    without_symbol = line.replace(symbol, '')
    return without_symbol.strip()

def extract_footnote_text(lines, ix_first, symbol):
    """Return the text of the first footnote line marked with `symbol`.

    Only lines from index `ix_first` onwards are considered. The marker is
    removed from the returned text. Returns the empty string when no line
    starts with the marker.
    """
    for candidate in lines[ix_first:]:
        if match_footnote_symbol(candidate, symbol):
            return strip_footnote_symbol(candidate, symbol)
    return ''
    
def add_footnotes_part(divisions, lines, ix_first):
    """Resolve footnote markers in a disposition and store the footnote texts in place.

    Asterisk markers are looked for in the disposition title
    (`divisions['description']`), in each division name and in each stop
    (name and spec together). For every marker found, the matching footnote
    text is searched in `lines` from index `ix_first` onwards and stored under
    the 'footnote' key of the corresponding dict; the marker itself is removed
    from the text it was found in.

    Args:
        divisions: disposition dict with keys 'description' and 'disposition'
            (list of division dicts with 'division_name' and 'stoplist').
        lines: list of text lines that may contain the footnote texts.
        ix_first: index of the first line in `lines` to search.

    Returns:
        None. `divisions` is modified in place.
    """
    # footnote in disposition title?
    symbol = extract_footnote_symbol(divisions['description'])
    if symbol:
        divisions['footnote'] = extract_footnote_text(lines, ix_first, symbol)
        clean_description = strip_footnote_symbol(divisions['description'], symbol)
        # Remove the marker from the title itself, just like it is removed from
        # division names and stops below. Previously only a separate 'spec' key
        # was written and 'description' kept the marker.
        divisions['description'] = clean_description
        # Keep the legacy 'spec' key so existing consumers of the output still find it.
        divisions['spec'] = clean_description
    for division in divisions['disposition']:
        symbol = extract_footnote_symbol(division['division_name'])
        if symbol:
            division['footnote'] = extract_footnote_text(lines, ix_first, symbol)
            division['division_name'] = strip_footnote_symbol(division['division_name'], symbol)
        for stop in division['stoplist']:
            # The marker may be attached to either the stop name or its spec.
            symbol = extract_footnote_symbol(stop['stopname'] + stop['spec'])
            if symbol:
                stop['footnote'] = extract_footnote_text(lines, ix_first, symbol)
                stop['stopname'] = strip_footnote_symbol(stop['stopname'], symbol)
                stop['spec'] = strip_footnote_symbol(stop['spec'], symbol)
            

## Dictionary for all dispositions

In [6]:
# Maps organ id -> extraction result; an unknown id starts out as an empty dict.
result = defaultdict(dict)

## This is for Part 1 and 2

In [7]:
def extract_stops_part1_2(lines, ix_start_stops, ix_end_stops, ix_start_foots, ix_end_foots):
    """Build one division (name plus stop list) from a stop-name column and a pitch column.

    Args:
        lines: list of text lines.
        ix_start_stops, ix_end_stops: inclusive index range of the stop-name column.
        ix_start_foots, ix_end_foots: inclusive index range of the pitch column.

    Returns:
        dict with keys 'division_name' (str), 'footnote' (always '') and
        'stoplist' (list of dicts with 'stopname', 'spec' and 'footnote').
        The stop list is empty when the columns contain no stop rows.
    """
    # collect stop names and pitches (stripped) and pair them row by row
    stopnames = [line.strip() for line in lines[ix_start_stops:ix_end_stops+1]]
    pitches = [line.strip() for line in lines[ix_start_foots:ix_end_foots+1]]
    stoplist = list(zip_longest(stopnames, pitches, fillvalue=''))
    n_rows = len(stoplist)

    # find out whether there is the division name in the first row
    # heuristic: If something is in the second column, it is not a division name
    division_name = ''
    first_stop_ix = 0
    if n_rows > 0 and len(stoplist[0][1].strip()) == 0:
        division_name = stoplist[0][0].strip()
        first_stop_ix = 1
    # skip a row containing 'stem' (if present) directly before the stops
    if first_stop_ix < n_rows and 'stem' in stoplist[first_stop_ix][0]:
        first_stop_ix += 1
    # find first non empty row; the bounds check avoids an IndexError when
    # only a division name and/or empty rows are present
    while first_stop_ix < n_rows and len(stoplist[first_stop_ix][0]) == 0:
        first_stop_ix += 1

    stops = [
        {'stopname': name.strip(), 'spec': pitch.strip(), 'footnote': ''}
        for name, pitch in stoplist[first_stop_ix:]
    ]
    return {'division_name': division_name, 'footnote': '', 'stoplist': stops}
    
def extract_disposition_part1_2(lines, ix):
    """Extract all divisions of the disposition whose title is at `lines[ix]`.

    The table is located through '@' markers: a stop-name column starts at a
    line beginning with '@' and ends at a line ending with '@'; it is followed
    by a pitch column delimited in the same way. Further divisions follow as
    long as the next non-empty line starts with '@'.

    Note: the '@' markers are removed from `lines` in place.

    Args:
        lines: list of text lines (modified in place).
        ix: index of the disposition title line.

    Returns:
        tuple (divisions, ix_end_foots): the list of division dicts and the
        index of the last line of the last pitch column.
    """
    divisions = []
    ix += 1
    # scan for line starting with @
    # if something else comes, then this is no disposition. E.g. a paragraph that starts with 'Dispositie...'
    while not lines[ix].strip().startswith('@'):
        ix += 1
    another_division = True
    while another_division:
        ix_start_stops = ix
        while not lines[ix].strip().endswith('@'):
            ix += 1
        ix_end_stops = ix
        # scan for line starting with @: start of foot sizes
        ix += 1
        while not lines[ix].strip().startswith('@'):
            ix += 1
        ix_start_foots = ix
        ix += 1  # assume '@....@' does not exist
        while not lines[ix].strip().endswith('@'):
            ix += 1
        ix_end_foots = ix
        # remove '@'
        for ix_disp in range(ix_start_stops, ix_end_foots+1):
            lines[ix_disp] = lines[ix_disp].strip().strip('@')
        stops = extract_stops_part1_2(lines, ix_start_stops, ix_end_stops, ix_start_foots, ix_end_foots)
        divisions.append(stops)

        # another division? Next not empty line should start with @.
        # The bounds checks stop the scan at the end of the text, so a table
        # that is the last content of `lines` no longer raises an IndexError.
        ix += 1
        while ix < len(lines) and len(lines[ix].strip()) == 0:
            ix += 1
        another_division = ix < len(lines) and lines[ix].strip().startswith('@')

    return divisions, ix_end_foots

def extract_specification_from_title_part1_2(title):
    """Return the text that follows the first 'ispositie' in a disposition title.

    Matching on 'ispositie' covers both 'Dispositie ...' and
    'Oorspronkelijke dispositie ...'. The returned text is stripped of
    surrounding whitespace and may be empty.

    Raises:
        ValueError: if `title` does not contain 'ispositie'.
    """
    _, marker, remainder = title.partition("ispositie")
    if not marker:
        # Name the offending title instead of raising a bare, message-less exception.
        raise ValueError(f"not a disposition title (no 'ispositie' found): {title!r}")
    return remainder.strip()

def assertDispositie_part1_2(lines, ix):
    """Decide whether the title at `lines[ix]` is followed by an actual disposition table.

    The lines after `ix` are scanned in order:
    - a line starting with '@' means a table follows -> True
    - another line starting with 'Dispositie' comes first -> False
    - the end of the text is reached -> False
    """
    pos = ix + 1
    while pos < len(lines):
        text = lines[pos].strip()
        if text.startswith('@'):
            return True
        if text.startswith('Dispositie'):
            return False
        pos += 1
    return False

#remove markup (e.g. @T1:, @K2:, [T1], [x2], etc.)
def remove_markup_part1_2(lines):
    """Return a copy of `lines` with markup tokens removed.

    A token is either '@' + letter + digit + ':' (e.g. '@T1:') or
    '[' + letter + digit + ']' (e.g. '[x2]').
    """
    # Raw string: the original non-raw literal contained invalid escape
    # sequences, which newer Python versions warn about.
    markup = r'@[a-zA-Z][0-9]:|\[[a-zA-Z][0-9]\]'
    return [re.sub(markup, '', line) for line in lines]

def extract_dispositions_part1_2(lines):
    """Collect every disposition found in `lines` (format of Part 1 and 2).

    A disposition starts at a line beginning with 'Dispositie' or
    'Oorspronkelijke dispositie' that is followed by a table. Each result
    carries the flag 'current', which is True for dispositions found after a
    'Technische gegevens' line and before a 'Bijzonderheden' line.

    Returns:
        list of dicts with keys 'description', 'disposition', 'footnote',
        'line_block' (index of the title line) and 'current'.
    """
    title_prefixes = ('Dispositie', 'Oorspronkelijke dispositie')
    found = []
    in_technical_section = False  # True for the disposition in 'Technische Gegevens'
    for ix, line in enumerate(lines):
        text = line.strip()
        if text.startswith("Technische gegevens"):
            in_technical_section = True
        elif text.startswith("Bijzonderheden"):
            # Maybe a disposition in section 'Bijzonderheden'
            in_technical_section = False
        if not text.startswith(title_prefixes):
            continue
        if not assertDispositie_part1_2(lines, ix):
            continue
        spec = extract_specification_from_title_part1_2(line)
        disp, ix_end_disp = extract_disposition_part1_2(lines, ix)
        entry = {
            'description': spec,
            'disposition': disp,
            'footnote': '',
            'line_block': ix,
            'current': in_technical_section,
        }
        # footnote texts are searched after the end of the table
        add_footnotes_part(entry, lines, ix_end_disp + 1)
        found.append(entry)
    return found

In [8]:
# Extract the dispositions of every organ that belongs to Part 1 or Part 2.
for organid in organids:
    part_path = getPartFromID(organid)
    if re.match(r'Part0[12]$', part_path) is None:
        continue  # handled by one of the other extractors
    fullinputpath = filename[organid]
    doc = docx2python(fullinputpath)
    docbody = preprocess_doc(doc.body, part_path)
    print('.', end = '')  # one dot per processed document
    dispositions = extract_dispositions_part1_2(docbody[0][0][0])
    result[organid] = {'dispositions': dispositions}

......................................................................................................................................................................................................................................................................

## This is for Parts 3 - 5

In [9]:
def extract_stops_part3_5(lines, ix_start_stops, ix_end_stops, ix_start_foots, ix_end_foots):
    """Build one division (name plus stop list) from a stop-name column and a pitch column.

    Args:
        lines: list of text lines.
        ix_start_stops, ix_end_stops: inclusive index range of the stop-name column.
        ix_start_foots, ix_end_foots: inclusive index range of the pitch column.

    Returns:
        dict with keys 'division_name' (str), 'footnote' (always '') and
        'stoplist' (list of dicts with 'stopname', 'spec' and 'footnote').
        The stop list is empty when the columns contain no stop rows.
    """
    # collect stop names and pitches (stripped) and pair them row by row
    stopnames = [line.strip() for line in lines[ix_start_stops:ix_end_stops+1]]
    pitches = [line.strip() for line in lines[ix_start_foots:ix_end_foots+1]]
    stoplist = list(zip_longest(stopnames, pitches, fillvalue=''))
    n_rows = len(stoplist)

    # find out whether there is the division name in the first row
    # heuristic: If something is in the second column, it is not a division name
    division_name = ''
    first_stop_ix = 0
    if n_rows > 0 and len(stoplist[0][1].strip()) == 0:
        division_name = stoplist[0][0].strip()
        first_stop_ix = 1
    # Skip a row containing 'stem' (if present) directly before the stops.
    # The row inspected is the one at first_stop_ix (as in the Part 1/2
    # variant) rather than always row 1: with no division name the old check
    # could drop the first stop, and a single-row column raised an IndexError.
    if first_stop_ix < n_rows and 'stem' in stoplist[first_stop_ix][0]:
        first_stop_ix += 1
    # find first non empty row (bounded, so trailing empty rows cannot overrun)
    while first_stop_ix < n_rows and len(stoplist[first_stop_ix][0]) == 0:
        first_stop_ix += 1

    stops = [
        {'stopname': name.strip(), 'spec': pitch.strip(), 'footnote': ''}
        for name, pitch in stoplist[first_stop_ix:]
    ]
    return {'division_name': division_name, 'footnote': '', 'stoplist': stops}

def extract_disposition_part3_5(lines, ix):
    """Extract all divisions of the disposition whose title is at `lines[ix]`.

    Columns are delimited by lines that end with a newline character: the
    stop-name column runs up to and including the first such line, and the
    pitch column follows directly after it.
    NOTE(review): the trailing newline is presumably a column-break marker
    produced by preprocess_doc -- confirm in organ_utils.

    Returns:
        tuple (divisions, ix_end_foots): the list of division dicts and the
        index of the last line of the last pitch column.
    """
    divisions = []
    ix += 1
    #skip empty line (if any); only truly empty strings are skipped here (no strip)
    while len(lines[ix]) == 0:
        ix += 1
    #scan till line ending with a newline character
    another_division = True
    while another_division:
        # stop-name column: from here up to and including the first line ending with a newline
        ix_start_stops = ix
        while not lines[ix].endswith('\n'):
            ix += 1
        ix_end_stops = ix
        len_stops = ix_end_stops - ix_start_stops + 1
        # pitch column: starts right after the stop names and ends at a line ending
        # with a newline, but is never longer than the stop-name column
        ix += 1
        ix_start_foots = ix
        while ( not lines[ix].endswith('\n') ) and ( ix < ix_start_foots + len_stops - 1 ):
            ix += 1
        ix_end_foots = ix
        stops = extract_stops_part3_5(lines, ix_start_stops, ix_end_stops, ix_start_foots, ix_end_foots)
        divisions.append(stops)
        #another division? Yes if the last pitch line itself ends with a newline
        if lines[ix_end_foots].endswith('\n'):
            another_division = True
        else:
            another_division = False
        ix = ix + 1
    return divisions, ix_end_foots

def extract_specification_from_title_part3_5(title):
    """Return the text that follows the first 'ispositie' in a disposition title.

    Matching on 'ispositie' covers both 'Dispositie ...' and
    'Oorspronkelijke dispositie ...'. The returned text is stripped of
    surrounding whitespace and may be empty.

    Raises:
        ValueError: if `title` does not contain 'ispositie'.
    """
    _, marker, remainder = title.partition("ispositie")
    if not marker:
        # Name the offending title instead of raising a bare, message-less exception.
        raise ValueError(f"not a disposition title (no 'ispositie' found): {title!r}")
    return remainder.strip()

def assertDispositie_part3_5(lines, ix):
    """Decide whether the title at `lines[ix]` is followed by an actual disposition table.

    The lines after `ix` are scanned in order:
    - a line ending with a newline character means a table follows -> True
    - another line starting with 'Dispositie' comes first -> False
    - the end of the text is reached -> False
    """
    pos = ix + 1
    while pos < len(lines):
        candidate = lines[pos]
        if candidate.endswith('\n'):
            return True
        if candidate.strip().startswith('Dispositie'):
            return False
        pos += 1
    return False

def extract_dispositions_part3_5(lines):
    """Collect every disposition found in `lines` (format of Parts 3 - 5).

    A disposition starts at a line beginning with 'Dispositie' or
    'Oorspronkelijke dispositie' that is followed by a table. Each result
    carries the flag 'current', which is True for dispositions found after a
    'Technische gegevens' line and before a 'Bijzonderheden' line.

    Returns:
        list of dicts with keys 'description', 'disposition', 'footnote',
        'line_block' (index of the title line) and 'current'.
    """
    title_prefixes = ('Dispositie', 'Oorspronkelijke dispositie')
    found = []
    in_technical_section = False
    for ix, line in enumerate(lines):
        text = line.strip()
        if text.startswith("Technische gegevens"):
            in_technical_section = True
        elif text.startswith("Bijzonderheden"):
            # Maybe a disposition in section 'Bijzonderheden'
            in_technical_section = False
        if not text.startswith(title_prefixes):
            continue
        if not assertDispositie_part3_5(lines, ix):
            continue
        spec = extract_specification_from_title_part3_5(line)
        disp, ix_end_disp = extract_disposition_part3_5(lines, ix)
        entry = {
            'description': spec,
            'disposition': disp,
            'footnote': '',
            'line_block': ix,
            'current': in_technical_section,
        }
        # footnote texts are searched after the end of the table
        add_footnotes_part(entry, lines, ix_end_disp + 1)
        found.append(entry)
    return found

In [10]:
# Extract the dispositions of every organ that belongs to Parts 3 - 5.
for organid in organids:
    part_path = getPartFromID(organid)
    if re.match(r'Part0[3-5]$', part_path) is None:
        continue  # handled by one of the other extractors
    fullinputpath = filename[organid]
    doc = docx2python(fullinputpath)
    docbody = preprocess_doc(doc.body, part_path)
    print('.', end='')  # one dot per processed document
    dispositions = extract_dispositions_part3_5(docbody[0][0][0])
    result[organid] = {'dispositions': dispositions}

......................................................................................................................................................................................................................................................................................................................

## This is for Parts 6 - 16

In [11]:
def extract_stops_part6_16(cols):
    """Build one division (name plus stop list) from a pair of table columns.

    Args:
        cols: pair (stop names, pitches); each element is a sequence of strings.

    Two layouts of the name column are handled:
    - [name division, stop1, stop2, ...]
    - [name division, number of stops, (empty), stop1, stop2, ...]

    Returns:
        dict with keys 'division_name' (str), 'footnote' (always '') and
        'stoplist' (list of dicts with 'stopname', 'spec' and 'footnote').
        The stop list is empty when the columns contain no stop rows.
    """
    # re-arrange: pair names and pitches row by row
    stoplist = list(zip_longest(cols[0], cols[1], fillvalue=''))
    n_rows = len(stoplist)

    # find out whether there is the division name in the first row
    # heuristic: If something is in the second column, it is not a division name
    division_name = ''
    first_stop_ix = 0
    if n_rows > 0 and len(stoplist[0][1].strip()) == 0:
        division_name = stoplist[0][0].strip()
        first_stop_ix = 1
    # Skip a row containing 'stem' (if present) directly before the stops.
    # The row inspected is the one at first_stop_ix (as in the Part 1/2
    # variant) rather than always row 1: with no division name the old check
    # could drop the first stop, and a single-row column raised an IndexError.
    if first_stop_ix < n_rows and 'stem' in stoplist[first_stop_ix][0]:
        first_stop_ix += 1
    # find first non empty row (bounded, so trailing empty rows cannot overrun)
    while first_stop_ix < n_rows and len(stoplist[first_stop_ix][0]) == 0:
        first_stop_ix += 1

    stops = [
        {'stopname': name.strip(), 'spec': pitch.strip(), 'footnote': ''}
        for name, pitch in stoplist[first_stop_ix:]
    ]
    return {'division_name': division_name, 'footnote': '', 'stoplist': stops}
    
def extract_disposition_part6_16(dis):
    """Turn a disposition table into a list of division dicts.

    The entries of `dis[0]` are taken in pairs of two (stop names,
    pitch heights); each pair yields one division.
    NOTE(review): `dis[0]` is presumably the single row of the table whose
    cells are the columns -- confirm against preprocess_doc.
    """
    columns = dis[0]
    name_columns = columns[::2]
    pitch_columns = columns[1::2]
    return [extract_stops_part6_16(pair) for pair in zip(name_columns, pitch_columns)]

def extract_specification_from_title_part6_16(title):
    """Return the text that follows the first 'ispositie' in a disposition title.

    Matching on 'ispositie' covers both 'Dispositie ...' and
    'Oorspronkelijke dispositie ...'. The returned text is stripped of
    surrounding whitespace and may be empty.

    Raises:
        ValueError: if `title` does not contain 'ispositie'.
    """
    _, marker, remainder = title.partition("ispositie")
    if not marker:
        # Name the offending title instead of raising a bare, message-less exception.
        raise ValueError(f"not a disposition title (no 'ispositie' found): {title!r}")
    return remainder.strip()

def TechnischeGegevensInBlock(block):
    """Tell whether a line of `block[0][0]` starts with 'Technische gegevens'.

    Lines are stripped before the check. An empty block (according to
    isEmptyBlock) yields False.
    """
    if isEmptyBlock(block):
        return False
    stripped_lines = [s.strip() for s in block[0][0]]
    # A leading newline lets the very first line match the '\n'-anchored search as well.
    text = '\n' + '\n'.join(stripped_lines)
    return '\nTechnische gegevens' in text

def BijzonderhedenInBlock(block):
    """Tell whether a line of `block[0][0]` starts with 'Bijzonderheden'.

    Lines are stripped before the check. An empty block (according to
    isEmptyBlock) yields False.
    """
    if isEmptyBlock(block):
        return False
    stripped_lines = [s.strip() for s in block[0][0]]
    # A leading newline lets the very first line match the '\n'-anchored search as well.
    text = '\n' + '\n'.join(stripped_lines)
    return '\nBijzonderheden' in text

def extract_dispositions_part6_16(docbody):
    """Collect every disposition found in `docbody` (format of Parts 6 - 16).

    The blocks of `docbody` are visited in windows of three consecutive
    blocks: when the last text line of the first block is a disposition
    title, the second block is read as the table and the third block is
    searched for footnote texts.

    Returns:
        list of dicts with keys 'description', 'disposition', 'footnote',
        'line_block' (index of the table block) and 'current'.
    """
    title_prefixes = ('Dispositie', 'Oorspronkelijke dispositie')
    found = []
    current = False
    windows = zip(docbody, docbody[1:], docbody[2:])
    for ix, (title_block, table_block, footnote_block) in enumerate(windows):
        if TechnischeGegevensInBlock(title_block):
            current = True
        if BijzonderhedenInBlock(title_block):
            current = False
        lastline = get_last_text_line(title_block)
        if not lastline.strip().startswith(title_prefixes):
            continue
        spec = extract_specification_from_title_part6_16(lastline)
        disp = extract_disposition_part6_16(table_block)
        entry = {
            'description': spec,
            'disposition': disp,
            'footnote': '',
            'line_block': ix + 1,
            'current': current,
        }
        add_footnotes_part(entry, footnote_block[0][0], 0)
        found.append(entry)
    return found

In [12]:
# Extract the dispositions of every organ that belongs to Parts 6 - 16.
for organid in organids:
    part_path = getPartFromID(organid)
    if re.match(r'(Part0[6-9]|Part1.)$', part_path) is None:
        continue  # handled by one of the other extractors
    fullinputpath = filename[organid]
    doc = docx2python(fullinputpath)
    docbody = preprocess_doc(doc.body, part_path)
    print('.', end='')  # one dot per processed document
    dispositions = extract_dispositions_part6_16(docbody)
    result[organid] = {'dispositions': dispositions}

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

## Extract divisions from stop names

In [None]:
# with open(os.path.join(outputroot, 'dispositions.json'), 'r') as f:
#     result = json.load(f)

In [None]:
# Just go over all stops, and see whether they end with 'D', 'B', 'B/D' (not 'Dd', 'd').
# The suffix is moved from the stop name into the 'partition' field.
for organid in organids:
    for disp in result[organid]['dispositions']:
        for division in disp['disposition']:
            for stop in division['stoplist']:
                # Only initialise the field when it is missing: resetting it to ''
                # on every run would wipe the partition when this cell is re-run,
                # because the suffix has already been stripped from the name.
                stop.setdefault('partition', '')
                for div in [' B/D', ' D', ' B']: #space is important
                    if stop['stopname'].endswith(div):
                        stop['stopname'] = stop['stopname'][:-len(div)]
                        stop['partition'] = div[1:]



## Write to disk

In [None]:
# Save all extracted dispositions as pretty-printed JSON with sorted keys.
with open(os.path.join(outputroot, 'dispositions.json'), 'w') as f:
    serialized = json.dumps(result, indent=2, sort_keys=True)
    f.write(serialized)

In [None]:
#save as readable text

def formatStoplist(division_name, stoplist, longest_stoplist, footnoteno_next):
    """Format one division as a fixed-width text column.

    Args:
        division_name: heading of the column.
        stoplist: list of stop dicts with keys 'stopname', 'spec', 'footnote'
            and optionally 'partition' (treated as '' when missing).
        longest_stoplist: number of stop rows to emit; shorter stop lists are
            padded with blank rows so that all columns have the same height.
        footnoteno_next: number of asterisks to use for the next footnote marker.

    Returns:
        tuple (rows, footnotes, footnoteno_next):
        - rows: list of `longest_stoplist + 2` strings (heading, an empty
          string as separator, then one string per stop row),
        - footnotes: list of 'marker text' strings for the stops of this division,
        - footnoteno_next: the updated marker length for the next footnote.
    """
    footnotes = []
    # Width to reserve for footnote markers: the longest marker used in this
    # column is shorter than footnoteno_next + number of footnotes; one more
    # character is added for the separating space.
    nofootnotes = footnoteno_next + sum(1 for stop in stoplist if stop['footnote'])
    if nofootnotes > 0:
        nofootnotes += 1 #space
    # default=0 keeps an empty stop list from raising in max()
    widest_name = max(
        (len(stop['stopname']) + len(stop.get('partition', '')) for stop in stoplist),
        default=0,
    )
    longest_stop = max(len(division_name), widest_name) + 2 + nofootnotes
    longest_foot = max((len(stop['spec']) for stop in stoplist), default=0) + 3

    rows = [''] * (longest_stoplist + 2)
    rows[0] = f'{division_name:{longest_stop}}{"":{longest_foot}}'
    for i in range(longest_stoplist):
        stopname = ' '
        footheight = ' '
        if i < len(stoplist):
            stop = stoplist[i]
            stopname = stop['stopname']
            # .get(): the 'partition' key only exists after the partition step has run
            partition = stop.get('partition', '')
            if partition != '':
                stopname = stopname + ' ' + partition
            footheight = stop['spec']
            if stop['footnote']:
                footnoteno = footnoteno_next
                footnoteno_next += 1
                footnotesymbol = '*' * footnoteno
                stopname = stopname + ' ' + footnotesymbol
                footnotes.append(footnotesymbol + ' ' + stop['footnote'])
        rows[i+2] = f'{stopname:{longest_stop}}{footheight:{longest_foot}}'
    return rows, footnotes, footnoteno_next

def formatDispositions(dispositions):
    """Render a list of dispositions as lines of plain text.

    For each disposition the output contains a title line
    ('Dispositie <description>', followed by ' (current)' when flagged), the
    divisions side by side as fixed-width columns, and the footnotes of the
    stops. Footnote markers are renumbered per disposition, starting with one
    asterisk.

    Returns:
        list of str, one entry per output line.
    """
    output = []
    for disp in dispositions:
        footnotes = []
        footnoteno_next = 1
        # default=0: a disposition without divisions yields only its title
        # instead of raising in max()
        longest_stoplist = max(
            (len(division['stoplist']) for division in disp['disposition']),
            default=0,
        )
        current = " (current)" if disp['current'] else ""
        output.append(f'Dispositie {disp["description"]}' + current)
        output.append('')
        stoplists = []
        for division in disp['disposition']:
            stoplist, new_footnotes, footnoteno_next = formatStoplist(
                division['division_name'], division['stoplist'], longest_stoplist, footnoteno_next)
            stoplists.append(stoplist)
            footnotes.extend(new_footnotes)
        # all columns have the same height, so zipping them yields the text rows
        for line in zip(*stoplists):
            output.append(''.join(line))
        output.append('')
        output.extend(footnotes)
        output.append('')
        output.append('')
    return output

In [None]:
# Write all dispositions as readable text, one section per organ.
# Printing directly to the file (instead of temporarily reassigning sys.stdout)
# guarantees that an exception while formatting cannot leave sys.stdout
# pointing at a closed file.
with open(os.path.join(outputroot, 'dispositions.txt'), 'w') as f:
    for r in sorted(organids):
        print('='*79, file=f)
        print(file=f)
        print(r, file=f)
        print(file=f)
        print('-'*len(r), file=f)
        print(file=f)
        disp = formatDispositions(result[r]['dispositions'])
        for line in disp:
            print(line, file=f)

In [None]:
# Check: list the organs for which no disposition is flagged as current.
for organid in sorted(organids):
    current = any(disps['current'] for disps in result[organid]['dispositions'])
    if not current:
        print(organid)

Part01_006AMDAM
Part01_012AMDAM
Part01_015ABCOU
Part01_026GRON
Part01_063HEESB
Part01_078ROOSE
Part01_088VENLO
Part02_022HERTO
Part02_038THORN
Part02_056RUURL
Part04_026RUURL
Part04_084LEIDE
Part04_115KORNH
Part05_112ALPHE
Part05_120WOUDR
Part06_088aDenbosch
Part07_017Elburgmuseum
Part08_068Wadenoyen
Part08_081Warmond
Part09_039VlijmenHervormdeKerk
Part09_047Haarlem
Part11_011Vlijmen
Part11_078Opwierde
Part15_008Ternaard
