In [2]:
import os, csv
from docx2python import docx2python
import re
from organ_utils import getPartFromID, preprocess_line, preprocess_doc, isTable, isEmptyBlock
import json

In [3]:
# Directory that holds the output data; base.tsv and base.json are written
# here further down. The path is relative, so it is resolved against the
# directory the notebook is run from.
outputroot = '../output/'

In [4]:
# Load the inputs for this notebook from the output directory.
# The paths are built from `outputroot` (instead of repeating the literal
# '../output/') so that reads and writes stay in the same directory when the
# configuration cell is changed.

# organids: the organ identifiers that every loop below iterates over.
with open(os.path.join(outputroot, 'organids.json'), 'r') as f:
    organids = json.load(f)

# filename: organ id -> path of the .docx file that describes that organ.
with open(os.path.join(outputroot, 'filenames.json'), 'r') as f:
    filename = json.load(f)

# fulltexts: organ id -> full text of the description, searched below for the
# monument and organ numbers.
with open(os.path.join(outputroot, 'texts_fulltexts.json'), 'r') as f:
    fulltexts = json.load(f)


In [14]:
# Extract the identifying fields of every organ:
#   * place, year, building and whichorgan from the header lines of its .docx
#   * monumentnumber and organnumber from its full text
# Each result is a dict keyed by organ id.
place = {}
year = {}
building = {}
whichorgan = {}
monumentnumber = {}
organnumber = {}

# The patterns are the same for every organ, so compile them once rather than
# on each loop iteration.
monumentnumber_re = re.compile(r'[Mm]onumentnummer +([0-9]+)', re.S | re.M)
orgelnumber_re = re.compile(r'[Oo]rgelnummer +([0-9]+)', re.S | re.M)

for organid in organids:

    fullinputpath = filename[organid]
    part = getPartFromID(organid)

    doc = docx2python(fullinputpath)
    docbody = preprocess_doc(doc.body, part)

    # Default the header fields to '' so that an empty or malformed file still
    # gets an entry. The cells further down index these dicts for every
    # organ id and would otherwise fail with a KeyError right after the
    # warning below has been printed.
    place[organid] = ''
    year[organid] = ''
    building[organid] = ''
    whichorgan[organid] = ''

    # Positions of the header lines; in Part 2 there is an empty line after
    # the city, which shifts the building and organ lines down by one.
    place_ix = 0
    building_ix = 1
    whichorgan_ix = 2
    if part == 'Part02':
        building_ix = 2
        whichorgan_ix = 3

    if not isEmptyBlock(docbody[0]):
        header = docbody[0][0][0]
        if len(header) > 3:
            # The first line reads "<place>/<year>" (or with a backslash).
            placeyear = re.split(r'\\|/', header[place_ix])
            if len(placeyear) > 2: # / in place name. Part4/53: "Beek/Ubbergen/1805"
                placeyear = [ '/'.join(placeyear[:-1]), placeyear[-1]]
            place[organid] = placeyear[0].strip()
            if len(placeyear) > 1:
                year[organid] = placeyear[1].strip()
            building[organid] = header[building_ix].strip()
            whichorgan[organid] = header[whichorgan_ix].strip()
        else:
            print(fullinputpath, "NOT PRESENT")
    else:
        print(fullinputpath, "EMPTY FILE")

    monumentnumber_match = monumentnumber_re.search(fulltexts[organid])
    orgelnumber_match = orgelnumber_re.search(fulltexts[organid])

    # A missing number is reported and stored as '' so the organ still gets a
    # complete record.
    if monumentnumber_match:
        monumentnumber[organid] = monumentnumber_match.group(1).strip()
    else:
        print(organid + " : No monument number")
        monumentnumber[organid] = ''

    if orgelnumber_match:
        organnumber[organid] = orgelnumber_match.group(1).strip()
    else:
        print(organid + " : No orgel number")
        organnumber[organid] = ''


Part14_000Brouwershaven : No monument number
Part14_000Brouwershaven : No orgel number
Part14_000Niezijl : No monument number
Part14_000Offengawier : No monument number
Part14_000AmsterdamDominicus : No monument number
Part14_000Roodeschool : No monument number
Part14_000Hallum : No monument number
Part14_000CappelleadIJssel : No monument number
Part14_000UithuizenRK : No monument number
Part14_000Delfstrahuizen : No monument number
Part14_000Loosduinen : No monument number
Part14_000Loosduinen : No orgel number
Part14_000Oostkapelle : No monument number
Part14_000Oostkapelle : No orgel number
Part14_000Leerdam : No monument number
Part14_000Leerdam : No orgel number
Part14_000Peperga : No orgel number
Part14_000Delft : No monument number
Part14_000Delft : No orgel number
Part14_000Zeist-HuisterHeide : No monument number
Part14_000Zeist-HuisterHeide : No orgel number
Part14_000Bergeijk : No monument number
Part14_000Bergeijk : No orgel number
Part14_000Meddo : No monument number
Part14

In [15]:
# Build a display name per organ: "<place>, <building>", followed by
# ", <whichorgan>" when that field is non-empty.
name = {}

for organid in organids:
    parts = [place[organid], building[organid]]
    if whichorgan[organid]:
        parts.append(whichorgan[organid])
    name[organid] = ', '.join(parts)

In [16]:
# Write the base data as a tab-separated file: a header row followed by one
# row per organ.
# newline='' is what the csv module requires for files handed to csv.writer;
# without it the writer's own '\r\n' line endings are translated again on
# Windows, which produces an empty line after every row.
with open(os.path.join(outputroot, 'base.tsv'), 'w', newline='') as f:
    writer = csv.writer(f, delimiter = '\t')
    writer.writerow(['organid', 'name', 'place', 'year', 'building', 'whichorgan','organnumber','monumentnumber'])
    for organid in organids:
        writer.writerow([
            organid,
            name[organid],
            place[organid],
            year[organid],
            building[organid],
            whichorgan[organid],
            organnumber[organid],
            monumentnumber[organid]
        ])

In [17]:
# Write the same base data as JSON: one object per organ id, holding the
# fields collected above.
organs_basedata = {
    organid: {
        'name': name[organid],
        'place': place[organid],
        'year': year[organid],
        'building': building[organid],
        'whichorgan': whichorgan[organid],
        'organnumber': organnumber[organid],
        'monumentnumber': monumentnumber[organid],
    }
    for organid in organids
}

base_json_path = os.path.join(outputroot, 'base.json')
with open(base_json_path, 'w') as f:
    json.dump(organs_basedata, f, indent=2, sort_keys=True)