In [1]:
# Clear the interactive namespace without asking for confirmation (-f),
# so the cells below do not depend on variables left over in the kernel.
%reset -f

## Imports

In [2]:
import fitz
import os
import re
from glob import glob
import pandas as pd
from runner import runner
from docx import Document
import ocrmypdf

## Functions

In [3]:
def findExtensions(dr, ext):
    """List the entries directly inside ``dr`` that match ``*.<ext>``.

    Parameters
    ----------
    dr : str
        Directory to look in (not searched recursively).
    ext : str
        File extension without the leading dot, e.g. ``'pdf'``.

    Returns
    -------
    list of str
        Matching paths, each prefixed with ``dr``, in the order ``glob``
        yields them.
    """
    pattern = os.path.join(dr, f"*.{ext}")
    return glob(pattern)

def getSubsetText(t):
    """Trim ``t`` to the span between a case marker and a closing section.

    Start of the slice:
        If the literal text ``CASE %`` occurs in ``t``, the slice starts at
        the last occurrence of ``CASE `` (with trailing space); otherwise it
        starts at the beginning of ``t``.
    End of the slice:
        The first occurrence of ``DISCUSSION``; if that is absent, the first
        occurrence of ``REFERENCES``; if both are absent, the end of ``t``.

    Parameters
    ----------
    t : str
        Full text of a document.

    Returns
    -------
    str
        ``t[start:stop]`` for the boundaries described above. The result is
        empty when the end marker precedes the start marker.
    """
    start = None
    # NOTE(review): '%' is an ordinary character here, so this gate only
    # fires on the literal text "CASE %". If a wildcard such as
    # "CASE REPORT" / "CASE PRESENTATION" was intended, confirm and change
    # the marker; the behaviour is kept as-is for now.
    if 'CASE %' in t:
        # "CASE %" contains "CASE ", so rfind cannot return -1 here.
        start = t.rfind('CASE ')

    stop = None
    # DISCUSSION takes priority over REFERENCES as the closing section.
    for marker in ('DISCUSSION', 'REFERENCES'):
        position = t.find(marker)
        if position != -1:
            stop = position
            break

    # A None bound leaves that side of the slice open.
    return t[start:stop]

## Variables

In [4]:
# Locations, given relative to the working directory of the kernel
inputDir = '../data/input/'
outputDir = '../data/output/'
termsDir = '../terms/'
settingsFile = '../settings.ini'

# Source documents found directly inside the input directory, by extension
listOfPDF, listOfDocs = (findExtensions(inputDir, ext) for ext in ('pdf', 'docx'))

# Filled by pdf2Text with the PDFs that yielded no extractable text
scannedPDFs = []

## JSON => TSV

In [5]:
# One-time setup: uncomment and run once to generate the TSV term files from hp-full.json
#runner.json2tsv(termsDir+'hp-full.json', termsDir+'hp')
#runner.prepare_termlist(termsDir+'hp_nodes.tsv', termsDir+'hp_termlist.tsv')

In [6]:
# Convert pdf to txt
def pdf2Text(listOfPDF):
    """Extract the text of each PDF and write it to a ``.txt`` file in ``inputDir``.

    For every path in ``listOfPDF`` the text of all pages is concatenated,
    trimmed with ``getSubsetText`` and written to
    ``<inputDir>/<pdf base name>.txt``. A PDF that yields no text at all is
    recorded in the module-level ``scannedPDFs`` list (once), and an empty
    text file is still written for it.

    Parameters
    ----------
    listOfPDF : iterable of str
        Paths of the PDF files to convert. May be ``scannedPDFs`` itself.

    Returns
    -------
    None
        Results are written to disk; one ``File written: ...`` line is
        printed per PDF.
    """
    # Iterate over a snapshot: the loop appends to scannedPDFs, and when the
    # caller passes scannedPDFs itself, iterating the live list would keep
    # growing it and never terminate.
    for pdf in list(listOfPDF):
        with fitz.open(pdf) as doc:
            pageTexts = []
            for page in doc:
                # Newer PyMuPDF releases name this method get_text(); fall
                # back to the legacy getText() on older installations.
                extract = getattr(page, 'get_text', None) or page.getText
                pageTexts.append(extract())
            text = ''.join(pageTexts)

        # No text at all: remember the file once so it can be OCR'd later.
        if text == '' and pdf not in scannedPDFs:
            scannedPDFs.append(pdf)

        subsetText = getSubsetText(text)

        # Swap only the extension; str.replace('pdf', 'txt') would also
        # rewrite a 'pdf' that appears elsewhere in the file name.
        stem = os.path.splitext(os.path.basename(pdf))[0]
        outFile = os.path.join(inputDir, stem + '.txt')

        # Explicit UTF-8 so non-ASCII characters do not depend on the
        # platform's default encoding.
        with open(outFile, 'w', encoding='utf-8') as of:
            of.write(subsetText)
            print('File written: '+outFile)

In [7]:
# Convert every PDF found in the input directory to a .txt file
pdf2Text(listOfPDF)

File written: ../data/input/CaseReport16.txt
File written: ../data/input/CaseReport17.txt
File written: ../data/input/CaseReport15.txt
File written: ../data/input/CaseReport14.txt
File written: ../data/input/CaseReport10.txt
File written: ../data/input/CaseReport11.txt
File written: ../data/input/CaseReport13.txt
File written: ../data/input/CaseReport12.txt
File written: ../data/input/CaseReport3.txt
File written: ../data/input/CaseReport2.txt
File written: ../data/input/Epicrisis24.txt
File written: ../data/input/CaseReport1.txt
File written: ../data/input/CaseReport5.txt
File written: ../data/input/CaseReport4.txt
File written: ../data/input/Epicrisis22.txt
File written: ../data/input/CaseReport6.txt
File written: ../data/input/CaseReport7.txt
File written: ../data/input/Epicrisis11.txt
File written: ../data/input/CaseReport9.txt
File written: ../data/input/CaseReport8.txt
File written: ../data/input/CaseReport23.txt
File written: ../data/input/CaseReport22.txt
File written: ../data/

In [8]:
for doc in scannedPDFs:
    ! ocrmypdf $doc $doc

In [9]:
# Pass a snapshot: pdf2Text appends to scannedPDFs whenever a PDF still has
# no text, and iterating the live list while it grows would never finish.
pdf2Text(list(scannedPDFs))

In [10]:
# Convert docx to txt
for docx in listOfDocs:
    doc = Document(docx)
    # Keep one paragraph per line. Concatenating without a separator fuses
    # the last word of a paragraph with the first word of the next one.
    text = '\n'.join(para.text for para in doc.paragraphs)

    # Swap only the extension; str.replace('docx', 'txt') would also rewrite
    # a 'docx' that appears elsewhere in the file name.
    fn = os.path.splitext(os.path.basename(docx))[0] + '.txt'
    outFile = os.path.join(inputDir, fn)

    # Explicit UTF-8 so non-ASCII characters do not depend on the platform's
    # default encoding.
    with open(outFile, 'w', encoding='utf-8') as of:
        of.write(text)
        print('File written: '+outFile)
        

File written: ../data/input/Epicrisis21.txt
File written: ../data/input/Epicrisis2.txt
File written: ../data/input/Epicrisis17.txt
File written: ../data/input/Epicrisis16.txt
File written: ../data/input/Epicrisis3.txt
File written: ../data/input/Epicrisis20.txt
File written: ../data/input/Epicrisis8.txt
File written: ../data/input/Epicrisis4.txt
File written: ../data/input/Epicrisis5.txt
File written: ../data/input/Epicrisis10.txt
File written: ../data/input/Epicrisis9.txt
File written: ../data/input/Epicrisis13.txt
File written: ../data/input/Epicrisis6.txt
File written: ../data/input/Epicrisis25.txt
File written: ../data/input/Epicrisis7.txt
File written: ../data/input/Epicrisis12.txt
File written: ../data/input/Epicrisis19.txt
File written: ../data/input/Epicrisis23.txt
File written: ../data/input/Epicrisis15.txt
File written: ../data/input/Epicrisis14.txt
File written: ../data/input/Epicrisis1.txt
File written: ../data/input/Epicrisis18.txt


In [11]:
# Run runNER: call run_oger from the project's runner module, configured by
# the settings file declared in the Variables section.
# NOTE(review): presumably this writes runNER_Output.tsv into outputDir,
# which the next cell reads - confirm against settings.ini.
runner.run_oger(settings=settingsFile)

In [12]:
# Load the tab-separated runNER result table from the output directory
runNERPath = os.path.join(outputDir, 'runNER_Output.tsv')
df = pd.read_csv(runNERPath, sep='\t', low_memory=False)
df.head()

Unnamed: 0,DOCUMENT ID,TYPE,START POSITION,END POSITION,MATCHED TERM,PREFERRED FORM,ENTITY ID,ZONE,SENTENCE ID,ORIGIN,UMLS CUI,SENTENCE
0,Epicrisis24,biolink:PhenotypicFeature,344,349,Acute,Acute,HP:0011009,,S6,hp-full.json,CUI-less,| Primary Diagnosis: Acute idiopathic pulmonm...
1,Epicrisis24,biolink:PhenotypicFeature,415,421,sepsis,Sepsis,HP:0100806,,S7,hp-full.json,CUI-less,Secondary Diagnoses: Likely sepsis.
2,Epicrisis24,biolink:PhenotypicFeature,427,435,positive,Position,HP:0012830,,S8,hp-full.json,CUI-less,NPA positive for Rhinovirus/Enterovirus.
3,Epicrisis24,biolink:PhenotypicFeature,736,756,respiratory distress,Respiratory distress,HP:0002098,,S11,hp-full.json,CUI-less,On arrival to CED in Derby he was cyanotic and...
4,Epicrisis24,biolink:PhenotypicFeature,1055,1064,frequency,Frequency,HP:0040279,,S15,hp-full.json,CUI-less,>< remained intubated and ventilated for 5 day...


In [13]:
# Keep only rows whose entity id starts with 'HP:'. na=False treats a missing
# ENTITY ID as "no match"; without it the mask contains NaN and boolean
# indexing raises.
isHpTerm = df['ENTITY ID'].str.startswith('HP:', na=False)
filteredDF = df[isHpTerm].drop_duplicates()
filteredDF.head()

Unnamed: 0,DOCUMENT ID,TYPE,START POSITION,END POSITION,MATCHED TERM,PREFERRED FORM,ENTITY ID,ZONE,SENTENCE ID,ORIGIN,UMLS CUI,SENTENCE
0,Epicrisis24,biolink:PhenotypicFeature,344,349,Acute,Acute,HP:0011009,,S6,hp-full.json,CUI-less,| Primary Diagnosis: Acute idiopathic pulmonm...
1,Epicrisis24,biolink:PhenotypicFeature,415,421,sepsis,Sepsis,HP:0100806,,S7,hp-full.json,CUI-less,Secondary Diagnoses: Likely sepsis.
2,Epicrisis24,biolink:PhenotypicFeature,427,435,positive,Position,HP:0012830,,S8,hp-full.json,CUI-less,NPA positive for Rhinovirus/Enterovirus.
3,Epicrisis24,biolink:PhenotypicFeature,736,756,respiratory distress,Respiratory distress,HP:0002098,,S11,hp-full.json,CUI-less,On arrival to CED in Derby he was cyanotic and...
4,Epicrisis24,biolink:PhenotypicFeature,1055,1064,frequency,Frequency,HP:0040279,,S15,hp-full.json,CUI-less,>< remained intubated and ventilated for 5 day...


In [14]:
# Write the filtered table as TSV without the row index (index is a boolean
# option, so pass False explicitly rather than relying on None being falsy).
filteredPath = os.path.join(outputDir, 'filteredOutput.tsv')
filteredDF.to_csv(filteredPath, sep='\t', index=False)