In [1]:
# Clear all user-defined names from the kernel namespace; -f skips the
# confirmation prompt.
%reset -f

## Imports

In [2]:
import fitz
import os
import re
from glob import glob
import pandas as pd
from runner import runner
from docx import Document
import ocrmypdf

## Functions

In [3]:
def findExtensions(dr, ext):
    """Return the paths directly inside ``dr`` that match ``*.<ext>``.

    Parameters
    ----------
    dr : str
        Directory to search (not searched recursively).
    ext : str
        File extension without the leading dot, e.g. ``'pdf'``.

    Returns
    -------
    list of str
        Matching paths, each prefixed with ``dr``, in the order ``glob``
        reports them.
    """
    pattern = os.path.join(dr, f"*.{ext}")
    matches = glob(pattern)
    return matches

def getSubsetText(t):
    """Trim ``t`` to the span between the case marker and the closing section.

    Start of the span: the last occurrence of ``'CASE '``, but only when the
    literal marker ``'CASE %'`` appears somewhere in ``t``; otherwise the
    span starts at the beginning of ``t``.

    End of the span: the first ``'DISCUSSION'``; if that is absent, the first
    ``'REFERENCES'``; if both are absent, the end of ``t``.

    Parameters
    ----------
    t : str
        Full text of a document.

    Returns
    -------
    str
        The selected slice of ``t``. When the start marker lies after the end
        marker the slice is empty.
    """
    # The presence test uses 'CASE %', the position uses the shorter 'CASE ',
    # so a match is guaranteed whenever the test succeeds.
    start = t.rfind('CASE ') if 'CASE %' in t else None

    stop = None
    # 'DISCUSSION' takes priority over 'REFERENCES' regardless of position.
    for heading in ('DISCUSSION', 'REFERENCES'):
        pos = t.find(heading)
        if pos != -1:
            stop = pos
            break

    # A None bound leaves that side of the slice open.
    return t[start:stop]

## Variables

In [4]:
# Locations used by the pipeline, relative to the notebook's working directory.
settingsFile = '../settings.ini'
inputDir = '../data/input/'
outputDir = '../data/output/'

# Source documents found directly inside the input directory, by extension.
listOfDocs = findExtensions(inputDir, 'docx')
listOfPDF = findExtensions(inputDir, 'pdf')

# PDFs whose text extraction came back empty; filled in by pdf2Text.
scannedPDFs = []

In [5]:
# Convert pdf to txt
def pdf2Text(listOfPDF):
    """Extract the text of each PDF and write the relevant subset to a .txt file.

    For every path in ``listOfPDF`` the text of all pages is concatenated,
    trimmed with ``getSubsetText`` and written to ``inputDir`` under the
    PDF's base name with a ``.txt`` extension. A PDF that yields no text at
    all is recorded once in the module-level ``scannedPDFs`` list; an empty
    .txt file is still written for it.

    Parameters
    ----------
    listOfPDF : iterable of str
        Paths of the PDF files to convert.

    Returns
    -------
    None
        Results are written to disk and reported with ``print``.
    """
    # Iterate over a snapshot: callers may pass ``scannedPDFs`` itself, and
    # appending to a list while looping over it would never terminate.
    for pdf in list(listOfPDF):
        with fitz.open(pdf) as doc:
            # ``get_text`` is the current PyMuPDF name; fall back to the
            # deprecated camelCase ``getText`` on older releases.
            text = "".join(
                page.get_text() if hasattr(page, "get_text") else page.getText()
                for page in doc
            )
        if text == '' and pdf not in scannedPDFs:
            scannedPDFs.append(pdf)
        subsetText = getSubsetText(text)
        # Swap only the extension; str.replace('pdf', 'txt') would also
        # rewrite any "pdf" occurring elsewhere in the file name.
        fn = os.path.splitext(os.path.basename(pdf))[0] + '.txt'
        outFile = os.path.join(inputDir, fn)

        # Explicit encoding so non-ASCII text does not depend on the
        # platform's default codec.
        with open(outFile, 'w', encoding='utf-8') as of:
            of.write(subsetText)
        print('File written: '+outFile)

In [6]:
# Convert every PDF found in the input directory; PDFs that yield no text are
# collected in scannedPDFs by pdf2Text.
pdf2Text(listOfPDF)

File written: ../data/input/CaseReport16.txt
File written: ../data/input/CaseReport17.txt
File written: ../data/input/CaseReport15.txt
File written: ../data/input/CaseReport14.txt
File written: ../data/input/CaseReport10.txt
File written: ../data/input/CaseReport11.txt
File written: ../data/input/CaseReport13.txt
File written: ../data/input/CaseReport12.txt
File written: ../data/input/CaseReport3.txt
File written: ../data/input/CaseReport2.txt
File written: ../data/input/Epicrisis24.txt
File written: ../data/input/CaseReport1.txt
File written: ../data/input/CaseReport5.txt
File written: ../data/input/CaseReport4.txt
File written: ../data/input/Epicrisis22.txt
File written: ../data/input/CaseReport6.txt
File written: ../data/input/CaseReport7.txt
File written: ../data/input/Epicrisis11.txt
File written: ../data/input/CaseReport9.txt
File written: ../data/input/CaseReport8.txt
File written: ../data/input/CaseReport23.txt
File written: ../data/input/CaseReport22.txt
File written: ../data/

In [7]:
for doc in scannedPDFs:
    ! ocrmypdf $doc $doc

Scanning contents: 100%|████████████████████████| 1/1 [00:00<00:00, 81.91page/s]
OCR: 100%|██████████████████████████████████| 1.0/1.0 [00:02<00:00,  2.19s/page]
Postprocessing...
PDF/A conversion: 100%|█████████████████████████| 1/1 [00:00<00:00,  6.78page/s]
[33mSome input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.[0m
JPEGs: 0image [00:00, ?image/s]
JBIG2: 0item [00:00, ?item/s]
Optimize ratio: 1.00 savings: 0.0%
Output file is a PDF/A-2B (as expected)
Scanning contents: 100%|███████████████████████| 2/2 [00:00<00:00, 136.04page/s]
Start processing 2 pages concurrently
OCR: 100%|██████████████████████████████████| 2.0/2.0 [00:02<00:00,  1.18s/page]
Postprocessing...
[33mSome input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata.[0m
PDF/A conversion: 100%|█████████████████████████| 2/2 [00:00<00:00,  6.11page/s]
JPEGs: 0image [00:0

In [8]:
# Re-extract text from the OCR'd PDFs. A copy of the list is passed because
# pdf2Text appends to scannedPDFs; handing it the list itself would let the
# loop extend the sequence it is iterating over.
pdf2Text(list(scannedPDFs))

File written: ../data/input/Epicrisis24.txt
File written: ../data/input/Epicrisis22.txt


In [9]:
# Convert docx to txt
# Each .docx in listOfDocs is written to inputDir as a UTF-8 .txt file with
# the same base name.
for docx in listOfDocs:
    doc = Document(docx)
    # Keep paragraph boundaries: joining with no separator would fuse the last
    # word of one paragraph with the first word of the next.
    text = '\n'.join(para.text for para in doc.paragraphs)
    # Swap only the extension; str.replace('docx', 'txt') would also rewrite
    # any "docx" occurring elsewhere in the file name.
    fn = os.path.splitext(os.path.basename(docx))[0] + '.txt'
    outFile = os.path.join(inputDir, fn)

    # Explicit encoding so non-ASCII text does not depend on the platform's
    # default codec.
    with open(outFile, 'w', encoding='utf-8') as of:
        of.write(text)
    print('File written: '+outFile)
        

File written: ../data/input/Epicrisis21.txt
File written: ../data/input/Epicrisis2.txt
File written: ../data/input/Epicrisis17.txt
File written: ../data/input/Epicrisis16.txt
File written: ../data/input/Epicrisis3.txt
File written: ../data/input/Epicrisis20.txt
File written: ../data/input/Epicrisis8.txt
File written: ../data/input/Epicrisis4.txt
File written: ../data/input/Epicrisis5.txt
File written: ../data/input/Epicrisis10.txt
File written: ../data/input/Epicrisis9.txt
File written: ../data/input/Epicrisis13.txt
File written: ../data/input/Epicrisis6.txt
File written: ../data/input/Epicrisis25.txt
File written: ../data/input/Epicrisis7.txt
File written: ../data/input/Epicrisis12.txt
File written: ../data/input/Epicrisis19.txt
File written: ../data/input/Epicrisis23.txt
File written: ../data/input/Epicrisis15.txt
File written: ../data/input/Epicrisis14.txt
File written: ../data/input/Epicrisis1.txt
File written: ../data/input/Epicrisis18.txt


In [10]:
# Run runNER
# Calls runner.run_oger with the path of the settings file.
# NOTE(review): the following cell reads runNER_Output.tsv from outputDir,
# presumably produced by this run — confirm against the settings file.
runner.run_oger(settings=settingsFile)

In [11]:
# Load the tab-separated runNER results and discard the columns that are not
# used further down.
df = (
    pd.read_csv(
        os.path.join(outputDir, 'runNER_Output.tsv'),
        sep='\t',
        low_memory=False,
    )
    .drop(columns=['ZONE', 'SENTENCE ID', 'UMLS CUI'])
)
df.head()

Unnamed: 0,DOCUMENT ID,TYPE,START POSITION,END POSITION,MATCHED TERM,PREFERRED FORM,ENTITY ID,ORIGIN,SENTENCE
0,Epicrisis24,biolink:ChemicalSubstance,15,16,G,glycine,CHEBI:15428_SYNONYM,hp.json,UK -3=t — ooo G. Wl 2les —> B2\ or \rar& ...
1,Epicrisis24,biolink:ChemicalSubstance,15,16,G,guanine,CHEBI:16235_SYNONYM,hp.json,UK -3=t — ooo G. Wl 2les —> B2\ or \rar& ...
2,Epicrisis24,biolink:AnatomicalEntity,217,222,blood,blood,UBERON:0000178,hp.json,"""= Re a ea Presenting Complaint: ve blood ..."
3,Epicrisis24,biolink:AnatomicalEntity,234,239,mouth,mouth,UBERON:0000165,hp.json,"""= Re a ea Presenting Complaint: ve blood ..."
4,Epicrisis24,biolink:AnatomicalEntity,234,239,mouth,oral opening,UBERON:0000166_SYNONYM,hp.json,"""= Re a ea Presenting Complaint: ve blood ..."


In [12]:
# Keep only the rows whose ENTITY ID starts with 'HP:', then drop exact
# duplicate rows.
# na=False treats a missing ENTITY ID as "no match"; without it the mask would
# contain NaN and boolean indexing would raise.
filteredDF = df[df['ENTITY ID'].str.startswith('HP:', na=False)].drop_duplicates()
filteredDF.head()

Unnamed: 0,DOCUMENT ID,TYPE,START POSITION,END POSITION,MATCHED TERM,PREFERRED FORM,ENTITY ID,ORIGIN,SENTENCE
9,Epicrisis24,biolink:PhenotypicFeature,344,349,Acute,Acute,HP:0011009,hp.json,| Primary Diagnosis: Acute idiopathic pulmonm...
10,Epicrisis24,biolink:PhenotypicFeature,415,421,sepsis,Sepsis,HP:0100806,hp.json,Secondary Diagnoses: Likely sepsis.
12,Epicrisis24,biolink:PhenotypicFeature,427,435,positive,Position,HP:0012830,hp.json,NPA positive for Rhinovirus/Enterovirus.
20,Epicrisis24,biolink:PhenotypicFeature,736,756,respiratory distress,Respiratory distress,HP:0002098,hp.json,On arrival to CED in Derby he was cyanotic and...
31,Epicrisis24,biolink:PhenotypicFeature,1055,1064,frequency,Frequency,HP:0040279,hp.json,>< remained intubated and ventilated for 5 day...


In [13]:
# Save the filtered rows as a tab-separated file in outputDir. The row index
# is not written; index=False is the documented way to say so.
filteredDF.to_csv(os.path.join(outputDir, 'filteredOutput.tsv'), sep='\t', index=False)