In [3]:
import argparse, os, csv, json
import docx
import re
from organ_utils import preprocess_line, getPartFromID

In [4]:
# Directory into which this notebook writes its result files (tech.json and tech.tsv).
outputroot = '../output/'

In [5]:
def clean_text(match):
    """Return the stripped first capture group of a two-group regex match.

    Parameters
    ----------
    match : re.Match or None
        Result of a regex search. The field patterns in this notebook all
        define exactly two capture groups (field body, terminating heading).

    Returns
    -------
    str
        The first group's text with surrounding whitespace removed, or ''
        when there is no match or the match does not have exactly two groups.
    """
    if not match:
        return ''
    captured = match.groups()
    if len(captured) != 2:
        return ''
    return captured[0].strip()

In [6]:
# Load the organ ids and the mapping from organ id to source document path.
# JSON is UTF-8 by specification, so the encoding is passed explicitly instead
# of relying on the platform's locale default.
with open('../output/organids.json', 'r', encoding='utf-8') as f:
    organids = json.load(f)

with open('../output/filenames.json', 'r', encoding='utf-8') as f:
    filename = json.load(f)

In [7]:
# Regular expressions for the "Technische gegevens" (technical data) section.

# Everything from the section heading up to the end of the document.
tech_re = re.compile(r'Technische gegevens(.*)', re.S)

# Headings that, at the start of a line, end the field currently being captured.
_TECH_STOP_HEADINGS = (
    'Werktuiglijk', 'Samenstelling', 'Toonhoogte', 'Temperatuur',
    'Manuaalomvang', 'Klavieromvang', 'Pedaalomvang', 'Windvoorziening',
    'Winddruk', 'Plaats', 'Bijzonderheden',
)


def _tech_field_re(start, own_headings):
    """Compile the pattern that captures one field of the technical section.

    Parameters
    ----------
    start : str
        Regex source (verbose syntax) matching the field's own heading.
    own_headings : tuple of str
        Entries of ``_TECH_STOP_HEADINGS`` that belong to this field and
        therefore must not act as terminators for it.

    Returns
    -------
    re.Pattern
        Pattern with exactly two groups: (1) the field body, matched lazily,
        and (2) the terminator. The terminator is the next stop heading or,
        when the field is the last thing in the text, the end of the text.
        Without the end-of-text alternative a trailing field would never
        match and its value would be silently dropped.
    """
    stops = '|'.join(r'\n' + heading
                     for heading in _TECH_STOP_HEADINGS
                     if heading not in own_headings)
    return re.compile(
        start + r'(.*?)(' + stops + r'|\Z)',
        re.IGNORECASE | re.VERBOSE | re.S)


# In verbose mode a literal space must be escaped, hence the "\ " below.
tech_aids_re = _tech_field_re(
    r'(?:\nWerktuiglijk\ register|\nWerktuiglijke\ registers)', ('Werktuiglijk',))
tech_pitch_re = _tech_field_re(r'\nToonhoogte', ('Toonhoogte',))
tech_temperature_re = _tech_field_re(r'\nTemperatuur', ('Temperatuur',))
# The manual compass appears under either of two headings.
tech_range1_re = _tech_field_re(
    r'(?:\nManuaalomvang|\nKlavieromvang)', ('Manuaalomvang', 'Klavieromvang'))
tech_range2_re = _tech_field_re(r'\nPedaalomvang', ('Pedaalomvang',))
tech_windSys_re = _tech_field_re(r'\nWindvoorziening', ('Windvoorziening',))
tech_windPres_re = _tech_field_re(r'\nWinddruk', ('Winddruk',))
tech_consoleLoc_re = _tech_field_re(r'\nPlaats\ klaviatuur', ('Plaats',))

In [8]:
# Output accumulator: maps each organ id to a dict of the extracted technical fields
# (aids, pitch, temperature, range1, range2, windSys, windPres, consoleLoc).
technical_data = {}

In [9]:
# Collect the preprocessed full text of every organ document, keyed by organ id.
Docs = {}
for organid in organids:
    file = filename[organid]
    try:
        doc = docx.Document(file)
    except Exception as exc:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt/SystemExit still stop the run. The document is
        # skipped, but the cause is reported instead of being hidden.
        print('Error opening file: ', file, exc)
        continue
    # Join all paragraphs into one text; a single pass collapses each pair of
    # consecutive newlines into one.
    paragraphs_text = '\n'.join(p.text for p in doc.paragraphs).replace('\n\n', '\n')
    Docs[organid] = preprocess_line(paragraphs_text, getPartFromID(organid))

In [10]:
# Extract the technical fields of every document into technical_data.

# Output field name -> pattern that captures it, in output column order.
field_patterns = {
    'aids': tech_aids_re,
    'pitch': tech_pitch_re,
    'temperature': tech_temperature_re,
    'range1': tech_range1_re,
    'range2': tech_range2_re,
    'windSys': tech_windSys_re,
    'windPres': tech_windPres_re,
    'consoleLoc': tech_consoleLoc_re,
}

for organid, doc in Docs.items():
    section = tech_re.search(doc)
    if section is None:
        # No "Technische gegevens" section: record every field as empty.
        print(organid, ": No technical information found.")
        technical_data[organid] = dict.fromkeys(field_patterns, '')
        continue

    # Search each field pattern within the matched section only.
    section_text = section.group()
    technical_data[organid] = {
        field: clean_text(pattern.search(section_text))
        for field, pattern in field_patterns.items()
    }



In [11]:
# Serialise the extracted data to tech.json in the output directory
# (keys sorted, two-space indentation).
json_path = os.path.join(outputroot, 'tech.json')
with open(json_path, 'w') as json_file:
    json_file.write(json.dumps(technical_data, indent=2, sort_keys=True))

In [12]:
# Write the results as a tab-separated file: a header row, then one fully
# quoted row per collected document.
tsv_fields = ['aids', 'pitch', 'temperature', 'range1', 'range2',
              'windSys', 'windPres', 'consoleLoc']

# newline='' is required by the csv module so the writer controls line endings
# itself (otherwise rows end in "\r\r\n" on Windows). The encoding is explicit
# so non-ASCII text in the extracted fields cannot fail under a legacy locale
# code page.
with open(os.path.join(outputroot, 'tech.tsv'), 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter='\t', quoting=csv.QUOTE_ALL)
    writer.writerow(['organid'] + tsv_fields)
    for organid in Docs:
        record = technical_data[organid]
        writer.writerow([organid] + [record[field] for field in tsv_fields])

