In [3]:
from docx2python import docx2python
import json
import os
import re
from more_itertools import seekable
import sys
from organ_utils import preprocess_line, preprocess_doc, getPartFromID, isEmptyBlock, isTable, get_last_line

In [4]:
# Show the working directory: the relative '../output/' paths used below are resolved against it.
os.getcwd()

'/Users/krane108/git/polifonia-project/organs-dataset/scripts'

In [5]:
# Directory (relative to the working directory) that receives the files written by this notebook:
# compoundstops.json and compoundstops.txt.
outputroot = '../output/'

In [6]:
# Load the organ ids and the organ-id -> input file lookup from the output directory.
# The paths are built from outputroot so the directory is configured in one place only,
# and the encoding is explicit so reading does not depend on the platform default.
with open(os.path.join(outputroot, 'organids.json'), 'r', encoding='utf-8') as f:
    organids = json.load(f)

# filename[organid] is later handed to docx2python as the document to open.
with open(os.path.join(outputroot, 'filenames.json'), 'r', encoding='utf-8') as f:
    filename = json.load(f)

## Dictionary for all dispositions

In [6]:
# Maps organ id -> {'compoundstops': [...]}; filled by the per-part loops below and dumped to disk at the end.
result = {}

## This is for Parts 1 - 5

In [7]:
def extract_compoundstop_line_part1_5(line):
    """Parse a compound stop that is written on one line.

    Expected layout: ``<stopname> <key> <foot> (- <foot>)+``, where the feet
    are separated by a hyphen or an n-dash.

    Returns a dict ``{'stopname': str, 'ranges': [{'key': str, 'composition': [str, ...]}]}``
    or ``None`` when the line holds no dash, or no key name before the first dash.
    """
    pieces = re.split(r'-|–', line)  # hyphen OR n-dash
    if len(pieces) < 2:
        return None

    # The first piece holds stop name, key name and first foot. The stop name may
    # itself contain spaces or key-like letters, so the LAST key-like token
    # (a-g / A-G, optional 'is'/'es', optional digit) is taken as the separator.
    head = pieces[0]
    candidates = list(re.finditer(r'[a-gA-G](is|es)?[0-9]?', head))
    if not candidates:
        return None
    separator = candidates[-1]

    composition = [head[separator.end():].strip()]
    composition.extend(piece.strip() for piece in pieces[1:])
    return {
        'stopname': head[:separator.start()].strip(),
        'ranges': [{'key': separator.group(), 'composition': composition}],
    }

def extract_compoundstop_table_part1_5(lines, ix):
    """Parse a compound stop laid out as a table, starting at ``lines[ix]``.

    ``lines[ix]`` is taken as the stop name. The following lines are scanned:
    a key name (e.g. ``C``, ``cis1``) opens a new range, a foot (``2``,
    ``1 1/3``, ``2/3``) is added to the current range, empty lines are ignored,
    and anything else ends the table.

    As long as no range has been completed, each scanned line is first tried
    as a one-line composition (``<stopname> <key> <foot> - <foot> ...``); if
    that succeeds its result is returned under the table's stop name.

    Returns ``(compoundstop, next_ix)`` where ``next_ix`` is the index of the
    first line the caller still has to process.
    """
    next_ix = ix+1
    stopname = lines[ix].strip()
    foots = []
    keyname = ""
    ranges = []
    for scan_ix, line in enumerate(lines[ix+1:]):
        # scan_ix counts from the line after the stop name: absolute index = ix + 1 + scan_ix
        if len(ranges) == 0:
            compoundstop = extract_compoundstop_line_part1_5(line)
            if compoundstop:
                compoundstop['stopname'] = stopname
                return compoundstop, scan_ix + ix + 2  # continue after the matched line
        stripped = line.strip()
        keyname_match = re.match(r'^[a-gA-G]{1}(is|es)?[0-9]?$', stripped)
        foot_match = re.match(r'(^[0-9]+( +[0-9]+/[0-9]+)?$)|(^[0-9]+/[0-9]+$)', stripped)
        emptyline = ( len(stripped) == 0)
        if keyname_match:
            # a new key closes the range collected so far
            if len(foots) > 0:
                ranges.append({'key':keyname, 'composition': foots})
            foots = []
            keyname = keyname_match.group(0)
        elif foot_match:
            foots.append(foot_match.group(0))
        elif emptyline:
            pass
        else:
            # not part of the table: the caller resumes at this line
            next_ix = scan_ix + ix + 1
            break
    else:
        # Every remaining line belonged to the table. Without this, next_ix would stay
        # at ix+1 and the caller would parse the table's own rows again as stop names.
        next_ix = len(lines)
    if len(foots) > 0: #add last one
        ranges.append({'key':keyname, 'composition': foots})
    return {'stopname':stopname, 'ranges':ranges }, next_ix


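# A compound stop is written either on one line or as a table.
# one line: <stopname> <key> <foot> (- <foot>)+
# table: <stopname> on its own line, followed by <key> lines and <foot> lines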
def extract_compoundstops_part1_5(lines, start_ix, end_ix): #start_ix points to line AFTER head (Samenstelling vulstem)
    """Collect all compound stops found in ``lines[start_ix:end_ix]``.

    Each non-empty line is first tried as a one-line composition; if that
    fails it is treated as the stop name that opens a table.

    The scan never goes beyond ``end_ix`` (exclusive), even when a table
    consumed lines up to or past it. Returns a list of compound-stop dicts.
    """
    compoundstops = []
    next_ix = start_ix
    # Bounding the range (instead of testing ix == end_ix inside the loop) guarantees
    # termination at end_ix even if a table parse returned a next_ix beyond it.
    for ix in range(start_ix, min(end_ix, len(lines))):
        if ix < next_ix:
            # line was already consumed by a table
            continue
        line = lines[ix]
        if len(line.strip())==0: #empty line
            continue
        #on one line? just try
        compoundstop = extract_compoundstop_line_part1_5(line)
        if compoundstop:
            compoundstops.append(compoundstop)
            continue
        #Assume, it is a table
        compoundstop, next_ix = extract_compoundstop_table_part1_5(lines, ix)
        if compoundstop:
            compoundstops.append(compoundstop)
    return compoundstops


#Assume "Toonhoogte" always follows "Samenstelling vulstemmen"
def find_compoundstops_part1_5(lines):
    """Locate the compound-stop section in a flat list of lines.

    Scans up to and including the first line starting with 'Toonhoogte'.
    The last line before it that starts with 'Samenstelling vulstem' is the
    section header.

    Returns ``(header_index, toonhoogte_index)`` when the 'Toonhoogte' index is
    greater than the header index, otherwise ``(0, 0)``. The header index is 0
    when no header was seen.
    """
    header_ix = 0
    closing_ix = 0
    for position, raw in enumerate(lines):
        text = raw.strip()
        if text.startswith('Samenstelling vulstem'):
            header_ix = position
        if text.startswith('Toonhoogte'):
            closing_ix = position
            break
    return (header_ix, closing_ix) if closing_ix > header_ix else (0, 0)

def process_compoundstops_part1_5(lines):
    """Return the list of compound stops found in ``lines`` (empty if none).

    A header index of 0 is treated as "section not found".
    """
    start_ix, end_ix = find_compoundstops_part1_5(lines)
    if start_ix == 0 or end_ix <= start_ix:
        return []
    # parsing starts on the line after the section header
    return extract_compoundstops_part1_5(lines, start_ix + 1, end_ix)

In [8]:
# Parts 1-5: every organ whose part path is Part01..Part05 is read as one flat list of lines.
for organid in organids:
    part_path = getPartFromID(organid)
    if not re.match(r'Part0[1-5]$', part_path):
        continue
    fullinputpath = filename[organid]
    doc = docx2python(fullinputpath)
    # [0][0][0] selects the list of lines handed to the part 1-5 parser
    lines = preprocess_doc(doc.body, part_path)[0][0][0]
    print('.', end='')  # one progress dot per processed document
    compoundstops = process_compoundstops_part1_5(lines)
    result[organid] = {'compoundstops' : compoundstops}

............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

## This is for Parts 6 - 16

In [9]:
def extract_compoundstop_table_part6_16(table):
    """Convert a table block into a compound-stop dict.

    Only ``table[0]`` is read: the first entry of its first element is the
    stop name, and every further element is a column whose first entry is the
    key name and whose remaining entries are the feet of that range.
    """
    columns = table[0]
    return {
        'stopname': columns[0][0],
        'ranges': [
            {'key': column[0], 'composition': list(column[1:])}
            for column in columns[1:]
        ],
    }

def extract_compoundstop_line_part6_16(line):
    """Parse a compound stop that is written on one line.

    Expected layout: ``<stopname> <key> <foot> (- <foot>)+``, where the feet
    are separated by a hyphen or an n-dash.

    Returns a dict ``{'stopname': str, 'ranges': [{'key': str, 'composition': [str, ...]}]}``
    or ``None`` when the line holds no dash, or no key name before the first dash.
    """
    foots = re.split(r'-|–', line) #split on hyphen OR n-dash
    if len(foots) < 2:
        return None
    # The first piece holds stop name, key name and first foot. The stop name may
    # contain spaces, so the LAST key-like token (a-g / A-G, optional 'is'/'es',
    # optional digit) is taken as the separator between stop name and first foot.
    key_matches = list(re.finditer(r'[a-gA-G](is|es)?[0-9]?', foots[0]))
    if not key_matches:
        # A dashed line without any key letter is not a composition; return None
        # (as the part 1-5 parser does) instead of raising IndexError.
        return None
    key_match = key_matches[-1]
    keyname = key_match.group()
    stopname = foots[0][:key_match.start()].strip()
    first_foot = foots[0][key_match.end():].strip()
    foots = [first_foot] + [foot.strip() for foot in foots[1:]]
    return {'stopname': stopname, 'ranges':[{'key':keyname, 'composition':foots}]}

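# A compound stop is written either as a table block or on one line.
# one line: <stopname> <key> <foot> (- <foot>)+
# table: first column holds <stopname>, every further column holds <key> followed by its <foot> entries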
def extract_compoundstops_part6_16(lines, start_ix, end_ix):
    """Collect the compound stops between two ``(block, line)`` positions.

    ``lines`` is indexed as ``lines[block][0][0][line]``. Blocks from
    ``start_ix[0]`` through ``end_ix[0]`` (inclusive) are visited:

    * a block whose first element has more than one entry is parsed as a table;
    * any other block is parsed line by line, starting at ``start_ix[1]`` in
      the first block and stopping before ``end_ix[1]`` in the last block.

    Empty blocks are skipped. Returns a list of compound-stop dicts.
    """
    compoundstops = []
    for block_ix in range(start_ix[0], end_ix[0]+1):
        block = lines[block_ix]
        # Skip empty blocks BEFORE indexing into them; testing for a table first
        # would raise IndexError on an empty block.
        if len(block) == 0 or len(block[0]) == 0:
            continue
        #is it a table?
        if len(block[0]) > 1:
            stopcomposition = extract_compoundstop_table_part6_16(block)
            if stopcomposition:
                compoundstops.append(stopcomposition)
            continue
        #not a table: process lines
        block_lines = block[0][0]
        # only the first / last block of the section is cut at the section boundary
        first_line_ix = start_ix[1] if block_ix == start_ix[0] else 0
        last_line_ix = end_ix[1] if block_ix == end_ix[0] else len(block_lines)
        for line_ix in range(first_line_ix, last_line_ix):
            stopcomposition = extract_compoundstop_line_part6_16(block_lines[line_ix])
            if stopcomposition:
                compoundstops.append(stopcomposition)
    return compoundstops


#Assume "Toonhoogte" always follows "Samenstelling vulstemmen"
def find_compoundstops_part6_16(docbody):
    """Locate the compound-stop section in a block-structured document body.

    Lines are read from ``block[0][0]`` of every block. A line starting with
    'Samenstelling vulstem' sets the start position, a line starting with
    'Toonhoogte' sets the end position and ends the scan of that block only:
    later blocks are still scanned, so a later match replaces an earlier one.

    Returns ``(start, end)`` as ``(block_index, line_index)`` tuples when
    ``end`` lies after ``start``, otherwise ``((0, 0), (0, 0))``.
    """
    start_ix = (0,0) # (block, line)
    end_ix = (0,0)
    for block_ix, b in enumerate(docbody):
        # Blocks without any content have no lines to inspect; testing b itself
        # first avoids an IndexError on b[0] for an empty block.
        if not (b and b[0] and b[0][0]):
            continue
        for line_ix, line in enumerate(b[0][0]):
            stripped = line.strip()
            if stripped.startswith('Samenstelling vulstem'):
                start_ix = (block_ix, line_ix)
            if stripped.startswith('Toonhoogte'):
                end_ix = (block_ix, line_ix)
                break
    if end_ix > start_ix:
        return start_ix, end_ix
    return (0,0), (0,0) #not found

def process_compoundstops_part6_16(docbody):
    """Return the list of compound stops found in ``docbody`` (empty if none).

    A start position of ``(0, 0)`` is treated as "section not found".
    """
    start_ix, end_ix = find_compoundstops_part6_16(docbody)
    if start_ix == (0,0) or end_ix <= start_ix:
        return []
    # Extract from the same docbody the positions were computed on. Reading the
    # notebook-level `doc.body` here made the function depend on whatever `doc`
    # happened to be bound in the kernel.
    return extract_compoundstops_part6_16(docbody, start_ix, end_ix)

In [10]:
# Parts 6-16: every organ whose part path is Part06..Part09 or Part1x keeps its block structure.
for organid in organids:
    part_path = getPartFromID(organid)
    if not re.match(r'(Part0[6-9]|Part1.)$', part_path):
        continue
    fullinputpath = filename[organid]
    doc = docx2python(fullinputpath)
    docbody = preprocess_doc(doc.body, part_path)
    print('.', end='')  # one progress dot per processed document
    compoundstops = process_compoundstops_part6_16(docbody)
    result[organid] = {'compoundstops' : compoundstops}

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [12]:
# Write all collected compound stops to <outputroot>/compoundstops.json,
# indented and with sorted keys so the file is stable between runs and easy to diff.
with open(os.path.join(outputroot, 'compoundstops.json'), 'w') as f:
    json.dump(result, f, indent=2, sort_keys=True)

In [13]:
#write as readable text

def formatCompoundStop(compoundstop):
    """Render one compound stop as a list of text lines.

    The first line is the stop name. Each range follows on its own line,
    indented by four spaces: the key padded to width 7, then the feet padded
    to width 6 and joined with ' - '.
    """
    rendered = [compoundstop['stopname']]
    for entry in compoundstop['ranges']:
        key_column = format(entry["key"], '7')
        feet = ' - '.join(format(foot, '6') for foot in entry['composition'])
        rendered.append('    ' + key_column + feet)
    return rendered

def formatCompoundStops(compoundstops):
    """Render a list of compound stops as text lines, with an empty line after each stop."""
    rendered = []
    for stop in compoundstops:
        rendered.extend(formatCompoundStop(stop))
        rendered.append('')
    return rendered

In [15]:
# Write the human-readable report to <outputroot>/compoundstops.txt.
# The file is passed to print() directly instead of swapping sys.stdout: if anything
# raises inside the loop (e.g. an organ id missing from `result`), the notebook's
# stdout is not left pointing at a closed file. The encoding is explicit so that
# non-ASCII stop names are written the same way on every platform.
with open(os.path.join(outputroot, 'compoundstops.txt'), 'w', encoding='utf-8') as f:
    for r in sorted(organids):
        # per organ: separator line, organ id underlined with dashes, then its compound stops
        print('='*79, file=f)
        print(file=f)
        print(r, file=f)
        print(file=f)
        print('-'*len(r), file=f)
        print(file=f)
        cstops = formatCompoundStops(result[r]['compoundstops'])
        for line in cstops:
            print(line, file=f)