## Text radiology reports

In [15]:
# Sample chest CT angiography report; it is the input text for the tokenization
# and section-extraction cells that follow.
# Note: the string has no space between "post processing." and "COMPARISON:".
raw_report = "EXAM: Chest CT Angiography with IV contrast enhancement using pulmonary embolism protocol, including 3D image post processing.COMPARISON: Chest radiographs 2018/02. FINDINGS: Positive for acute pulmonary embolism. There are bilateral subsegmental pulmonary artery emboli. In the right lower lobe there is an occlusive subsegmental pulmonary artery emboli associated with a wedge-shaped pulmonary infarct. This was seen on the chest radiograph earlier today. IMPRESSION: 1. Positive for acute pulmonary embolism. 2. Wedge-shaped pulmonary infarct in the right lower lobe."

In [20]:
# Bare expression: the notebook echoes the report string as the cell output.
raw_report

'EXAM: Chest CT Angiography with IV contrast enhancement using pulmonary embolism protocol, including 3D image post processing.COMPARISON: Chest radiographs 2018/02. FINDINGS: Positive for acute pulmonary embolism. There are bilateral subsegmental pulmonary artery emboli. In the right lower lobe there is an occlusive subsegmental pulmonary artery emboli associated with a wedge-shaped pulmonary infarct. This was seen on the chest radiograph earlier today. IMPRESSION: 1. Positive for acute pulmonary embolism. 2. Wedge-shaped pulmonary infarct in the right lower lobe.'

## Segmenting the sentences

In [17]:
# Tokenizing text into sentences
from nltk.tokenize import sent_tokenize

# Split the raw report into a list of sentence strings.
# NOTE: as the output below shows, "processing.COMPARISON:" is not split (there is
# no space after the period in the input) and the numbered items "1." / "2." come
# out as separate sentences.
sentences = sent_tokenize(raw_report)
# Bare expression so the notebook displays the resulting list.
sentences


['EXAM: Chest CT Angiography with IV contrast enhancement using pulmonary embolism protocol, including 3D image post processing.COMPARISON: Chest radiographs 2018/02.',
 'FINDINGS: Positive for acute pulmonary embolism.',
 'There are bilateral subsegmental pulmonary artery emboli.',
 'In the right lower lobe there is an occlusive subsegmental pulmonary artery emboli associated with a wedge-shaped pulmonary infarct.',
 'This was seen on the chest radiograph earlier today.',
 'IMPRESSION: 1.',
 'Positive for acute pulmonary embolism.',
 '2.',
 'Wedge-shaped pulmonary infarct in the right lower lobe.']

## Segmenting the sections

In [18]:
## section segmentation
import re
def extract(txt):
    """Split a free-text radiology report into its labelled sections.

    The text is stripped of non-ASCII characters, lower-cased, and has
    newlines, carriage returns and tabs replaced by spaces. For every known
    section header the text that follows the header's first occurrence is
    taken, cut at the next occurrence of any other header (or of the phrase
    'these findings: '), and parenthesised spans are replaced by one space.

    Parameters
    ----------
    txt : str
        Raw report text.

    Returns
    -------
    dict
        Maps each section header ('clinical indication: ', 'support devices: ',
        'comparison: ', 'findings: ', 'impression: ') to its lower-cased
        content, or to a single space ' ' when the header is absent.
    """
    txt = txt.encode("ascii", errors="ignore").decode().lower()
    for whitespace in ('\n', '\r', '\t'):
        txt = txt.replace(whitespace, ' ')

    # Non-greedy match of a parenthesised span, e.g. "(series 3, image 12)".
    parenthetical = re.compile(r'\(.*?\)', re.IGNORECASE | re.DOTALL)

    # Headers that terminate a section. 'impression:' deliberately has no
    # trailing space here, so it also cuts at "impression:" followed directly
    # by text; the dictionary key below keeps the trailing space.
    tags = ['clinical indication: ', 'support devices: ', 'comparison: ', 'findings: ',
            'impression:', 'critical findings', 'important findings']
    sections = {'clinical indication: ': ' ', 'support devices: ': ' ', 'comparison: ': ' ',
                'findings: ': ' ', 'impression: ': ' '}

    for header in sections:
        parts = txt.split(header)
        if len(parts) < 2:
            # Header not present: keep the ' ' placeholder (explicit check
            # instead of relying on a bare except around an IndexError).
            continue
        body = parts[1]
        for tag in tags:
            if tag != header:
                body = body.split(tag)[0]
        # Clean up once, after all terminating headers have been applied.
        sections[header] = parenthetical.sub(' ', body.split('these findings: ')[0])
    return sections

In [19]:
# Run the section extractor on the sample report; the returned dict is shown as the cell output.
extract(raw_report)

{'clinical indication: ': ' ',
 'support devices: ': ' ',
 'comparison: ': 'chest radiographs 2018/02. ',
 'findings: ': 'positive for acute pulmonary embolism. there are bilateral subsegmental pulmonary artery emboli. in the right lower lobe there is an occlusive subsegmental pulmonary artery emboli associated with a wedge-shaped pulmonary infarct. this was seen on the chest radiograph earlier today. ',
 'impression: ': '1. positive for acute pulmonary embolism. 2. wedge-shaped pulmonary infarct in the right lower lobe.'}

In [9]:
# Tokenizing text into bags of words
from nltk.tokenize import word_tokenize
# One token list per sentence
tokenized_docs = list(map(word_tokenize, sentences))
print(tokenized_docs)

# Total token count across all sentences (punctuation tokens included)
print('\nNumber of words:'+str(sum(map(len, tokenized_docs))))

[['EXAM', ':', 'Chest', 'CT', 'Angiography', 'with', 'IV', 'contrast', 'enhancement', 'using', 'pulmonary', 'embolism', 'protocol', ',', 'including', '3D', 'image', 'post', 'processing.COMPARISON', ':', 'Chest', 'radiographs', '2018/02', '.'], ['FINDINGS', ':', 'Positive', 'for', 'acute', 'pulmonary', 'embolism', '.'], ['There', 'are', 'bilateral', 'subsegmental', 'pulmonary', 'artery', 'emboli', '.'], ['In', 'the', 'right', 'lower', 'lobe', 'there', 'is', 'an', 'occlusive', 'subsegmental', 'pulmonary', 'artery', 'emboli', 'associated', 'with', 'a', 'wedge-shaped', 'pulmonary', 'infarct', '.'], ['This', 'was', 'seen', 'on', 'the', 'chest', 'radiograph', 'earlier', 'today', '.'], ['IMPRESSION', ':', '1', '.'], ['Positive', 'for', 'acute', 'pulmonary', 'embolism', '.'], ['2', '.'], ['Wedge-shaped', 'pulmonary', 'infarct', 'in', 'the', 'right', 'lower', 'lobe', '.']]

Number of words:91


In [10]:
# Removing punctuation
import re
import string
# Character class matching any single character from string.punctuation
# (see https://docs.python.org/3/library/string.html#string.punctuation)
regex = re.compile('[%s]' % re.escape(string.punctuation))

# Strip punctuation characters from every token and drop tokens that become empty.
# Punctuation inside a token is removed too, so 'wedge-shaped' becomes 'wedgeshaped'.
tokenized_docs_no_punctuation = [
    [stripped for stripped in (regex.sub(u'', token) for token in review) if stripped != u'']
    for review in tokenized_docs
]

print(tokenized_docs_no_punctuation)

# Total token count once punctuation is gone
print('\nNumber of words after dropping punctuation:'+str(sum(map(len, tokenized_docs_no_punctuation))))

[['EXAM', 'Chest', 'CT', 'Angiography', 'with', 'IV', 'contrast', 'enhancement', 'using', 'pulmonary', 'embolism', 'protocol', 'including', '3D', 'image', 'post', 'processingCOMPARISON', 'Chest', 'radiographs', '201802'], ['FINDINGS', 'Positive', 'for', 'acute', 'pulmonary', 'embolism'], ['There', 'are', 'bilateral', 'subsegmental', 'pulmonary', 'artery', 'emboli'], ['In', 'the', 'right', 'lower', 'lobe', 'there', 'is', 'an', 'occlusive', 'subsegmental', 'pulmonary', 'artery', 'emboli', 'associated', 'with', 'a', 'wedgeshaped', 'pulmonary', 'infarct'], ['This', 'was', 'seen', 'on', 'the', 'chest', 'radiograph', 'earlier', 'today'], ['IMPRESSION', '1'], ['Positive', 'for', 'acute', 'pulmonary', 'embolism'], ['2'], ['Wedgeshaped', 'pulmonary', 'infarct', 'in', 'the', 'right', 'lower', 'lobe']]

Number of words after dropping punctuation:77


In [11]:
# Cleaning text of stopwords
from nltk.corpus import stopwords

# Load the English stopword list once instead of once per token; a set also
# makes each membership test constant-time.
english_stopwords = set(stopwords.words('english'))

# NOTE(review): the comparison is case-sensitive, so capitalized tokens such as
# 'There', 'In' and 'This' are kept (see the output below). Lower-case the token
# before the lookup if they should be dropped as well.
tokenized_docs_no_stopwords = [
    [word for word in doc if word not in english_stopwords]
    for doc in tokenized_docs_no_punctuation
]

print(tokenized_docs_no_stopwords)

## count number of words
print('\nNumber of words after dropping stopwords:'+str(sum(len(doc) for doc in tokenized_docs_no_stopwords)))

[['EXAM', 'Chest', 'CT', 'Angiography', 'IV', 'contrast', 'enhancement', 'using', 'pulmonary', 'embolism', 'protocol', 'including', '3D', 'image', 'post', 'processingCOMPARISON', 'Chest', 'radiographs', '201802'], ['FINDINGS', 'Positive', 'acute', 'pulmonary', 'embolism'], ['There', 'bilateral', 'subsegmental', 'pulmonary', 'artery', 'emboli'], ['In', 'right', 'lower', 'lobe', 'occlusive', 'subsegmental', 'pulmonary', 'artery', 'emboli', 'associated', 'wedgeshaped', 'pulmonary', 'infarct'], ['This', 'seen', 'chest', 'radiograph', 'earlier', 'today'], ['IMPRESSION', '1'], ['Positive', 'acute', 'pulmonary', 'embolism'], ['2'], ['Wedgeshaped', 'pulmonary', 'infarct', 'right', 'lower', 'lobe']]

Number of words after dropping stopwords:62


In [12]:
# Stemming and Lemmatizing
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Three normalizers are instantiated; only the Porter stemmer is applied below.
porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()

# Stem every remaining token, sentence by sentence.
# To compare approaches, swap porter.stem for snowball.stem or wordnet.lemmatize.
preprocessed_docs = [
    [porter.stem(word) for word in doc]
    for doc in tokenized_docs_no_stopwords
]

print(preprocessed_docs)


[['exam', 'chest', 'CT', 'angiographi', 'IV', 'contrast', 'enhanc', 'use', 'pulmonari', 'embol', 'protocol', 'includ', '3D', 'imag', 'post', 'processingcomparison', 'chest', 'radiograph', '201802'], ['find', 'posit', 'acut', 'pulmonari', 'embol'], ['there', 'bilater', 'subsegment', 'pulmonari', 'arteri', 'emboli'], ['In', 'right', 'lower', 'lobe', 'occlus', 'subsegment', 'pulmonari', 'arteri', 'emboli', 'associ', 'wedgeshap', 'pulmonari', 'infarct'], ['thi', 'seen', 'chest', 'radiograph', 'earlier', 'today'], ['impress', '1'], ['posit', 'acut', 'pulmonari', 'embol'], ['2'], ['wedgeshap', 'pulmonari', 'infarct', 'right', 'lower', 'lobe']]
