In [None]:
import pdfreader
import re
import string
import json
import csv
from pdfreader import PDFDocument, SimplePDFViewer

# Export configuration: output directory, file-name prefix and suffix,
# column delimiter and file extension of the table written at the end.
settings = dict(
    path='reports',
    prefix='export',
    suffix='',
    delimiter='\t',
    extension='csv',
)

# Binary handle of the PDF report to parse. It is left open on purpose:
# the viewer created in the next cell is constructed from this handle.
# Alternative report names (replace the file name below to use one):
#   '/oes 04-15 barrett.pdf'
#   '/oes 04-15 high.pdf'
#   '/oes m 04-15.pdf'
fd = open(settings['path'] + '/mag m 04-15.pdf', 'rb')


In [None]:
# Parse every page of the opened report into `header` (report-level
# metadata, taken from page 1) and `results` (one record per item, keyed by
# the order number that follows the running item number).

# Patterns for the rendered text stream. Raw strings keep the backslashes as
# regex escapes instead of relying on invalid string escape sequences.
contentLinePattern = re.compile(r'\[(.*)\]')   # line starting with '[' and containing a later ']'
characterPattern = re.compile(r'\((.)\)')      # a single character wrapped in parentheses, e.g. "(a)"
itemStartPattern = re.compile(r'\d+PATH.*')    # first line of an item: digits followed by "PATH"
namePattern = re.compile(r'(.*), (.*)')        # "<last name>, <first name>"
datedLinePattern = re.compile(r'(\d{2}\.\d{2}\.\d{4})(.+)')  # date immediately followed by more text

viewer = SimplePDFViewer(fd)
pageCount = viewer.doc.root['Pages']['Count']
print('pages:', pageCount)

header = dict()
results = dict()
# iterating over all pages
for pageNumber in range(1, pageCount + 1):
    viewer.navigate(pageNumber)
    viewer.render()

    # get all lines of the page
    allLines = viewer.canvas.text_content.split('\n')

    # keep only the bracketed lines (their inner content)
    cleanLines = [
        lineMatch.group(1)
        for lineMatch in map(contentLinePattern.match, allLines)
        if lineMatch is not None
    ]

    # rebuild each line from its single parenthesised characters
    text = [''.join(characterPattern.findall(line)) for line in cleanLines]

    # everything before the first item line is header, the rest is body
    bodyStart = next(
        (index for index, line in enumerate(text) if itemStartPattern.match(line)),
        len(text)
    )
    textHeader = text[:bodyStart]
    textBody = text[bodyStart:]

    # get header information (the value of a field is the line after its label)
    if viewer.current_page_number == 1:
        lokalisation = textHeader[textHeader.index('Lokalisation: ') + 1]
        materialIndex = textHeader.index('Untersuchungsmaterial: ')
        header = {
            # an empty field is directly followed by the next label
            'Lokalisation': '' if lokalisation == 'Diagnose: ' else lokalisation,
            'Start': textHeader[textHeader.index('Zeitraum vom ') + 1],
            'End': textHeader[textHeader.index(' bis ') + 1],
            # an empty field is the last header line
            'Untersuchungsmaterial': '' if materialIndex == len(textHeader) - 1 else textHeader[materialIndex + 1],
        }
        # optional fields: a missing label (ValueError) or a label without a
        # following line (IndexError) yields an empty value
        for key in ('Diagnose', 'Veränderung'):
            try:
                value = textHeader[textHeader.index(key + ': ') + 1]
            except (ValueError, IndexError):
                value = ''
            header[key] = '' if value == 'Zeitraum vom ' else value

    # split text body by items
    items = []
    currentItem = []
    for line in textBody:
        if itemStartPattern.match(line):
            if len(currentItem) > 0:
                items.append(currentItem)
            currentItem = [line]
        else:
            currentItem.append(line)
    # a page without any item line must not produce an empty pseudo item
    if len(currentItem) > 0:
        items.append(currentItem)
    print('page:', viewer.current_page_number, '   items:', len(items))

    # parse items
    # layout used below: item[0] number + order number, item[1] date,
    # item[2:lineEnd] diagnose lines, item[-4] name, item[-3] sex/birthday,
    # item[-2] case number, item[-1] PID
    for item in items:
        itemNumber = re.match(r'(\d+)(.*)', item[0]).groups()
        birthdaySex = re.match(r'(.*), geb\. (.*)', item[-3]).groups()
        icd10 = []
        diagnose = ''
        lineStart = 2
        lineEnd = -4

        # fix diagnose text and patient name merge
        # or empty diagnose text
        if len(item) == 6:
            diagnoseName = re.match(r'(.+\.)\s*?(\S.+)', item[-4])
            if diagnoseName is not None:
                # split "<diagnose>.<name>" into a diagnose line and a name line
                item.insert(-4, diagnoseName.group(1))
                item[-4] = diagnoseName.group(2)
            else:
                # no diagnose text at all: add an empty diagnose line
                item.insert(-4, '')
        nameLines = item[-4]

        # fix multiline patient name
        if not namePattern.match(item[-4]):
            if item[-5].endswith((' ', '-')):
                nameLines = item[-5] + item[-4]
            else:
                nameLines = item[-5] + ' ' + item[-4]
            lineEnd = -5
        name = namePattern.match(nameLines).groups()

        for line in item[lineStart:lineEnd]:
            if len(line) > 0:
                # join wrapped lines; no extra blank after a space or hyphen
                diagnose += line if line.endswith((' ', '-')) else line + ' '
            icdMatch = re.match(r'ICD-10: (.*)', line)
            if icdMatch:
                # tuples of (letter, number part, optional decimal part)
                icd10 = re.findall(r'(?<=([A-Z]))\s*(\d+(\.\d{1,4}\s*)?)', icdMatch.group(1))

        # fix first empty line of diagnose subitem
        # (diagnose text glued to the date in item[1])
        datedLine = datedLinePattern.match(item[1])
        if datedLine:
            separator = '' if item[1].endswith((' ', '-')) else ' '
            diagnose = datedLine.group(2) + separator + diagnose
            item[1] = datedLine.group(1)

        # a later item with the same order number replaces the earlier one
        results[itemNumber[1]] = {
            'Datum': item[1],
            # backslashes are turned into ')' — presumably remnants of escaped
            # ')' from the character extraction above; TODO confirm
            'KT-Diagnose/Diagnose': diagnose.strip().replace('\\', ')').replace('  ', ' '),
            'ICD-10': ', '.join(''.join(icd[0:2]) for icd in icd10),
            'Nachname': name[0],
            'Vorname': name[1],
            'Geschlecht': birthdaySex[0],
            'Geburtsdatum': birthdaySex[1],
            'Fallnummer': item[-2],
            'PID': item[-1]
        }

In [None]:
# create filename:
# <path>/<prefix>[_loc_..][_diag_..][_chg_..][_mat_..]_<start>-<end>[_<suffix>].<extension>
filename = settings['path']
# add a separator only when a directory is given and it does not end in one
if filename and not filename.endswith('/'):
    filename += '/'
filename += settings['prefix']
# optional header fields are added with a short tag, empty ones are skipped
for tag, key in (
    ('loc', 'Lokalisation'),
    ('diag', 'Diagnose'),
    ('chg', 'Veränderung'),
    ('mat', 'Untersuchungsmaterial'),
):
    if not header[key] == '':
        filename += '_' + tag + '_' + header[key]
filename += '_' + header['Start'] + '-' + header['End']
if not settings['suffix'] == '':
    filename += '_' + settings['suffix']
filename += '.' + settings['extension']

# write CSV file
colNames = ['#', 'Auftragsnummer', 'Datum', 'KT-Diagnose/Diagnose', 'ICD-10', 'Nachname', 'Vorname', 'Geschlecht', 'Geburtsdatum', 'Fallnummer', 'PID']
# newline='' lets the csv module control line endings itself; without it
# rows end in "\r\r\n" on platforms that translate "\n" to "\r\n"
with open(filename, 'w', newline='') as f:
    CSVFile = csv.DictWriter(f, delimiter=settings['delimiter'], fieldnames=colNames)
    CSVFile.writeheader()
    # one row per parsed item: running number, order number, then the
    # record fields in column order
    for count, (orderNumber, record) in enumerate(results.items(), start=1):
        row = {'#': count, 'Auftragsnummer': orderNumber}
        row.update((column, record[column]) for column in colNames[2:])
        CSVFile.writerow(row)