In [29]:
import os
from docx2python import docx2python
import re
from organ_utils import preprocess_line, preprocess_doc, getPartFromID, isEmptyBlock, isTable
import json
from collections import defaultdict

In [13]:
#path to output data: directory (relative to the working directory) that
#receives every JSON file this notebook writes via os.path.join(outputroot, ...)
outputroot = '../output/'

In [14]:
#load organids and filenames
#both files are read from outputroot (the directory the results are written to)
#instead of a second hard-coded copy of that path
with open(os.path.join(outputroot, 'organids.json'), 'r') as f:
    organids = json.load(f)

#filename is indexed by organ id to get the path of that organ's .docx file
with open(os.path.join(outputroot, 'filenames.json'), 'r') as f:
    filename = json.load(f)

In [15]:
#specific for history:

def repair_bullets(block, part):
    r"""Normalise the bullet markers in a document text.

    Every '--' that directly follows a newline becomes '.', so that
    bulleted items share the '.' marker. For part 'Part01' only, a line
    holding nothing but '.' is then merged with the line after it
    ('\n.\n' becomes '\n.').

    block: the text to repair (str).
    part:  part identifier; only the exact value 'Part01' enables the merge.

    Returns the repaired text.
    """
    # split/join on a fixed separator is the same as str.replace
    dotted = '\n.'.join(block.split('\n--'))
    if part != 'Part01':
        return dotted
    return '\n.'.join(dotted.split('\n.\n'))

In [92]:
#extract texts

#one dict per section, keyed by organ id; a section that is not found is stored as ''
fulltexts = {}
kunst_texts = {}
lit_texts = {}
hist_texts = {} #for 'Historische gegevens'
tech_texts = {}
bijz_texts = {}

#per organ: 0-based line number (in the joined document text) of the line on
#which each section heading starts, or -1 when that section was not found
offsets = defaultdict(
    lambda : {
        'kunst':-1,
        'lit'  :-1,
        'hist' :-1,
        'tech' :-1,
        'bijz' :-1
    }
)

#each pattern starts at a heading at the beginning of a line and captures the text after it
#NOTE(review): the end markers are not anchored to a line start, so the same word
#inside running text also ends the capture -- confirm this is intended
kunst_re       = re.compile(r'^Kunsthistorische aspecten(.*?)(?:Monument|Historische|Literatuur)', re.S | re.M)
#'bron(?:nen)?' instead of 'bron|bronnen': an alternation takes the first alternative
#that matches, so the plural heading used to leave 'nen' at the start of the captured text
lit_re         = re.compile(r'^(?:Literatuur|Niet gepubliceerde bron(?:nen)?)(.*?)(?:Monument.*|Historische.*|[Oo]rgelnummer)', re.S | re.M)
hist_re        = re.compile(r'^Historische gegevens(.*?)Technische gegevens', re.S | re.M)
tech_re        = re.compile(r'^Technische gegevens(.*?)(?:Bijzonderheden.*)', re.S | re.M)
tech_nobijz_re = re.compile(r'^Technische gegevens(.*)', re.S | re.M) #use in case Bijzonderheden is missing
bijz_re        = re.compile(r'^Bijzonderheden(.*)', re.S | re.M)

#to debug a single document, loop over a short list of ids instead, e.g. ['Part01_110IJSSE']
for organid in organids:
    fullinputpath = filename[organid]
    part_path = getPartFromID(organid)
    doc = docx2python(fullinputpath)
    try:
        docbody = preprocess_doc(doc.body, part_path)
    finally:
        #newer docx2python releases keep the .docx archive open until close() is
        #called; older releases have no close(), hence the getattr guard
        close_doc = getattr(doc, 'close', None)
        if callable(close_doc):
            close_doc()

    #one text chunk per non-empty block; a table is replaced by a 'TABLE:<block index>' marker
    blocktexts = []
    for ix, block in enumerate(docbody):
        if isEmptyBlock(block):
            continue
        if isTable(block):
            blocktext = '\nTABLE:'+str(ix)+'\n'
        else:
            blocktext = '\n'.join(line.strip().replace('\t', '') for line in block[0][0])
        blocktexts.append(blocktext)
    doctext = repair_bullets('\n'.join(blocktexts), part_path)

    fulltexts[organid] = doctext

    kunst_match = kunst_re.search(doctext)
    lit_match   = lit_re.search(doctext)
    hist_match  = hist_re.search(doctext)
    tech_match  = tech_re.search(doctext)
    bijz_match  = bijz_re.search(doctext)
    if not bijz_match:
        #no 'Bijzonderheden' heading: the technical section runs to the end of the text
        tech_match = tech_nobijz_re.search(doctext)

    #(offsets key, heading used in the message, match, target dict), in reporting order
    sections = (
        ('kunst', 'Kunsthistorische aspecten', kunst_match, kunst_texts),
        ('lit',   'Literatuur',                lit_match,   lit_texts),
        ('hist',  'Historische gegevens',      hist_match,  hist_texts),
        ('tech',  'Technische gegevens',       tech_match,  tech_texts),
        ('bijz',  'Bijzonderheden',            bijz_match,  bijz_texts),
    )

    #looking the organ up first creates its all -1 entry, so an organ without any
    #matching section is still present in offsets (as it is in every text dict)
    organ_offsets = offsets[organid]
    for key, heading, match, texts in sections:
        if match:
            texts[organid] = match.group(1).strip()
            #number of newlines before the heading == 0-based line number of the heading
            lineno = doctext.count('\n', 0, match.start())
            organ_offsets[key] = lineno
        else:
            print(organid, ': ' + heading + ' missing')
            texts[organid] = ''



Part14_000Zunderdorp : Literatuur missing
Part14_000Loosduinen : Literatuur missing
Part14_000Oosterland : Literatuur missing
Part14_000Scheulder : Literatuur missing
Part14_000DenHelder : Literatuur missing
Part14_000Maasbommel : Literatuur missing
Part14_000Borssele : Literatuur missing
Part14_000Breda : Literatuur missing
Part13_048Oudesluis : Literatuur missing
Part13_040Heerde : Literatuur missing
Part13_068Uithoorn : Literatuur missing
Part13_059Hem : Literatuur missing
Part13_129Burgerbrug : Literatuur missing
Part13_028Afferden : Literatuur missing
Part12_086Staphorst : Literatuur missing
Part12_066Echt-Mariahoop : Literatuur missing
Part12_081Oss : Literatuur missing
Part12_070Landgraaf-Rimburg : Literatuur missing
Part12_092Beesd : Literatuur missing
Part15_036RijssenHoeksteen : Bijzonderheden missing
Part15_027Oudenbosch : Literatuur missing
Part15_103Gulpen : Literatuur missing
Part15_088Kapel-Avezaath : Literatuur missing
Part15_049Hengelo-Beckum : Literatuur missing
Part1

In [93]:
#write each collection to its own JSON file under outputroot, in this order
json_outputs = (
    ('texts_fulltexts.json',      fulltexts),
    ('texts_kunst.json',          kunst_texts),
    ('texts_lit.json',            lit_texts),
    ('texts_hist.json',           hist_texts),
    ('texts_tech.json',           tech_texts),
    ('texts_bijzonderheden.json', bijz_texts),
    ('texts_offsets.json',        offsets),
)

for json_name, payload in json_outputs:
    with open(os.path.join(outputroot, json_name), 'w') as f:
        #sorted keys and a fixed indent keep the files stable between runs
        json.dump(payload, f, indent=2, sort_keys=True)
