In [49]:
import argparse, os, csv
from docx2python import docx2python
import re
from organ_utils import preprocess_line, preprocess_doc, getPartFromID, isEmptyBlock, isTable
from pprint import pprint
import json

In [50]:
# Directory that receives every file written by this notebook; output paths
# are built from it with os.path.join.
outputroot = '../output/'

In [51]:
# Load the organ ids and the file names.
# The paths are built from `outputroot`, like the writes further down, so the
# directory is configured in one place only. The files are decoded explicitly
# as UTF-8 so that loading does not depend on the platform's default encoding.
with open(os.path.join(outputroot, 'organids.json'), 'r', encoding='utf-8') as f:
    organids = json.load(f)

with open(os.path.join(outputroot, 'filenames.json'), 'r', encoding='utf-8') as f:
    filename = json.load(f)

In [52]:
# Load the history texts (indexed by organ id further down).
# The path is built from `outputroot`, like the writes further down, and the
# file is decoded explicitly as UTF-8 so that loading does not depend on the
# platform's default encoding.
with open(os.path.join(outputroot, 'texts_hist.json'), 'r', encoding='utf-8') as f:
    texts = json.load(f)

In [53]:
def extract_bouwer(text):
    """Return the value of the 'Bouwer'/'Bouwers' field at the start of ``text``.

    The label must be the very first thing in ``text``. The value is the rest
    of the label's line or, when that is blank, the next non-blank line. A
    leading list enumerator ``'1. '`` is removed from the value.

    Parameters
    ----------
    text : str
        The history text to search.

    Returns
    -------
    str
        The stripped field value, or ``''`` when the text does not start with
        the label.
    """
    # The singular and plural labels cannot both match at position 0, so the
    # order only mirrors the original lookup order.
    # (?:\n|\Z) also accepts a value on the last line without a trailing newline.
    patterns = (
        r'^Bouwer[^s]:?[ \n]*(.*?)(?:\n|\Z)',
        r'^Bouwers:?[ \n]*(.*?)(?:\n|\Z)',
    )
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            # Strip the enumerator only at the start of the value; a global
            # str.replace would also mangle text such as '1871. Utrecht'.
            return re.sub(r'^\s*1\. ', '', match.group(1)).strip()
    return ''

In [54]:
# Builder field per organ id, extracted from that organ's history text.
bouwers = {
    organid: extract_bouwer(texts[organid])
    for organid in organids
}

In [55]:
def extract_bouwjaar(text):
    """Return the value of the build-year field found anywhere in ``text``.

    Recognised labels are 'Jaar van oplevering' / 'Jaar van Oplevering' and
    'Bouwjaar' (tried first), then 'Jaren van oplevering' / 'Jaren van
    Oplevering'. The value is the rest of the label's line or, when that is
    blank, the next non-blank line. A leading list enumerator ``'1. '`` is
    removed from the value.

    Parameters
    ----------
    text : str
        The history text to search.

    Returns
    -------
    str
        The stripped field value, or ``''`` when no label is found.
    """
    # Singular label / 'Bouwjaar' takes precedence over the plural label,
    # wherever each occurs in the text.
    # (?:\n|\Z) also accepts a value on the last line without a trailing newline.
    patterns = (
        r'(?:Jaar van [Oo]plevering|Bouwjaar):?[ \n]*(.*?)(?:\n|\Z)',
        r'Jaren van [Oo]plevering:?[ \n]*(.*?)(?:\n|\Z)',
    )
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            # Strip the enumerator only at the start of the value; a global
            # str.replace would turn '1871. ...' into '187...'.
            return re.sub(r'^\s*1\. ', '', match.group(1)).strip()
    return ''

In [56]:
# Build-year field per organ id, extracted from that organ's history text.
bouwjaren = {
    organid: extract_bouwjaar(texts[organid])
    for organid in organids
}

In [57]:
def extract_oorspronkelijke_locatie(text):
    """Return the value of the 'Oorspronkelijke locatie' field in ``text``.

    The label (also spelled 'lokatie', first letter in either case) must start
    a line. The value is the rest of that line or, when that is blank, the
    next non-blank line.

    Parameters
    ----------
    text : str
        The history text to search.

    Returns
    -------
    str
        The stripped field value, or ``''`` when the label is not found.
    """
    # (?:^|\n) also accepts the label on the very first line, and (?:\n|\Z)
    # a value on the last line without a trailing newline.
    oorspr_locatie_re = re.compile(
        r'(?:^|\n)[Oo]orspronkelijke lo[ck]atie:?[ \n]*(.*?)(?:\n|\Z)'
    )
    match = oorspr_locatie_re.search(text)
    return match.group(1).strip() if match else ''
                               

In [58]:
# Original-location field per organ id, extracted from that organ's history text.
oorspr_locatie = {
    organid: extract_oorspronkelijke_locatie(texts[organid])
    for organid in organids
}

In [59]:
# Combine the three base fields per organ and write them as JSON
# (keys sorted, two-space indentation).
history_base = {
    organid: {
        'builder': bouwers[organid],
        'year': bouwjaren[organid],
        'originallocation': oorspr_locatie[organid],
    }
    for organid in organids
}

with open(os.path.join(outputroot, 'history_base.json'), mode='w') as f:
    json.dump(history_base, fp=f, sort_keys=True, indent=2)

In [60]:
# Write the base fields as a tab-separated table: a header row followed by
# one row per organ id, in sorted order.
# newline='' leaves line endings to the csv module, as its documentation
# requires; without it the writer's '\r\n' row terminator becomes '\r\r\n'
# on Windows. UTF-8 is set explicitly so the output does not depend on the
# platform's default encoding.
with open(os.path.join(outputroot, 'history_base.tsv'), 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(['organid', 'builder', 'year', 'originallocation'])
    for organid in sorted(organids):
        writer.writerow([
            organid,
            bouwers[organid],
            bouwjaren[organid],
            oorspr_locatie[organid]
        ])

In [61]:
def extract_changes_blocks(text):
    """Iterate over the 'changes' blocks in ``text``.

    A block is a head line that directly follows a newline and is itself
    followed by a line starting with '.'; it extends up to (not including)
    the next blank line, or to the end of the text.

    Parameters
    ----------
    text : str
        The history text to scan.

    Returns
    -------
    iterator of re.Match
        One match object per block, in order of appearance.
    """
    # Raw string: in a normal string literal '\.' is an invalid escape
    # sequence and triggers a warning; the compiled pattern is unchanged.
    # re.S lets '.*?' run across the newlines inside a block.
    changes_block_re = re.compile(r'(?<=\n)[^\n]+?\n\..*?(?=\n\n|$)', re.S)
    return changes_block_re.finditer(text)

def extract_changes_head(block):
    """Return the first line of ``block``, i.e. the text before its first newline."""
    head, _, _ = block.partition('\n')
    return head

def extract_changes_items(block):
    """Return the items of ``block``: the pieces that follow each newline-plus-dot.

    The text before the first such separator (the head) is discarded.
    """
    _head, *items = block.split('\n.')
    return items

def extract_name_date(head):
    """Split a block head into ``(name, date)``.

    The date must close the head (trailing whitespace aside). It is either a
    span running from one four-digit year (first digit 1 or 2) to another,
    optionally introduced by 'tussen'/'Tussen', or a single such year,
    optionally introduced by 'ca'/'ca.'. Everything before the date is the
    name. Both parts are stripped; ``('', '')`` is returned when the head
    does not end in a date.
    """
    single_year = r'(.*?)((?:ca\.? *)?[12][0-9]{3})\s*$'
    year_span = r'(.*?)((?:[Tt]ussen *)?[12][0-9]{3}.*?[12][0-9]{3})\s*$'

    # The two-year form wins over the single-year form when both apply.
    for pattern in (year_span, single_year):
        found = re.search(pattern, head)
        if found:
            name, date = found.groups()
            return name.strip(), date.strip()
    return '', ''

def extract_changes(text):
    """Return one dict per changes block found in ``text``.

    Each dict holds the block's ``name`` and ``date`` (parsed from its head
    line), the stripped head line as ``description``, the list of items as
    ``changes`` and, as ``lineno``, the number of newlines in ``text`` before
    the block starts.
    """
    def describe(match):
        # Build the record for a single block match.
        raw = match.group()
        head = extract_changes_head(raw)
        name, date = extract_name_date(head)
        return {
            'name': name,
            'date': date,
            'description': head.strip(),
            'changes': extract_changes_items(raw),
            'lineno': text[:match.start()].count('\n'),
        }

    return [describe(match) for match in extract_changes_blocks(text)]

In [62]:
# List of extracted change blocks per organ id.
changes = {
    organid: extract_changes(texts[organid])
    for organid in organids
}

In [63]:
# Write the change blocks of all organs as JSON (keys sorted, two-space indentation).
with open(os.path.join(outputroot, 'history_projects.json'), mode='w') as f:
    json.dump(changes, fp=f, sort_keys=True, indent=2)

In [64]:
#write human readable
def format_changes_block(block):
    """Render one change record as a list of text lines.

    The list starts with an empty line, followed by the indented
    description, name and date, a 'changes:' heading and one bullet per
    entry in ``block['changes']``. Newlines inside an entry are followed by
    extra indentation so continuation lines align under the bullet text.
    """
    lines = [
        '',
        '    description: ' + block['description'],
        '    name       : ' + block['name'],
        '    date       : ' + block['date'],
        '    changes:',
    ]
    lines.extend(
        '     * ' + item.replace('\n', '\n       ')
        for item in block['changes']
    )
    return lines

# Write a plain-text report: per organ id (in sorted order) a banner with the
# id, then every change block rendered by format_changes_block, then two
# empty lines.
# UTF-8 is set explicitly so that non-ASCII characters in the texts are
# written the same way on every platform instead of depending on the default
# encoding.
with open(os.path.join(outputroot, 'history_projects.txt'), 'w', encoding='utf-8') as f:
    for organid in sorted(organids):
        f.write('='*47+'\n')
        f.write(organid + '\n')
        f.write('='*10+'\n')
        for change in changes[organid]:
            for line in format_changes_block(change):
                f.write(line+'\n')
        f.write('\n')
        f.write('\n')