## Extracting data from radiology reports - anatomy

In [139]:
import pandas as pd

### Read vocabulary

In [117]:
onto_path = './Ontology/'

In [118]:
df_FMA = pd.read_csv(onto_path+'FMA_mod.csv')
df_FMA = df_FMA.fillna('null')

In [119]:
FMAwordMap = {}
FMADterms = []
FMACterms = []
for i in range(df_FMA.shape[0]):
    current_entry = df_FMA.iloc[i]
    prefname = current_entry['Preferred Label']
    prefname = prefname.replace(" ", "_")
    if(current_entry['Synonyms']!= 'null'):
        Sysplit = current_entry['Synonyms'].split('|')
        for k in range(len(Sysplit)):
            FMAwordMap[' ' + Sysplit[k].lower() + ' '] = ' ' + prefname + ' '
            FMADterms.append(' ' + Sysplit[k].lower() + ' ')
    else:
        prefnameSlip = prefname.replace("_", " ")
        FMAwordMap[' ' + prefnameSlip.lower() + ' '] = ' ' + prefname + ' '
        FMADterms.append(' ' + prefnameSlip.lower() + ' ')

In [120]:
def cstr(s, color='black'):
    return "<text style=color:{}>{}</text>".format(color, s)

## Radiology report as text

In [121]:
Radiology_report = "CT Abdomen with contrast, Triphasic Liver Protocol INDICATION: Hepatocellular carcinoma. History of alcoholic cirrhosis status post TIPS placement. COMPARISON: 4/10/2013 TECHNIQUE: After IV administration of 86 ml of Isovue 370, helical CT imaging of the abdomen was performed axially during the arterial, portal venous, and delayed phases. Coronal and sagittal reformatted images were reviewed. Based on a 32-cm body phantom, the estimated radiation dose (CTDIvol [mGy]) for each series in this exam are: 1.99, 11.95, 6.47, 6.43, 6.50. The estimated cumulative dose (DLP [mGy-cm]) is: 512. NOTE: The radiation dose indicators for CT -- the 'volume CT Dose Index' (CTDIvol) given in milli-Gray (mGy), and the Dose Length Product (DLP) given in mGy-centimeters (mGy.cm) -- are generated from the CT scanner to estimate radiation exposure based on technical study parameters and a reference phantom. The CTDI and DLP may therefore substantially over- or underestimate an individual's absorbed dose based on patient size and other factors.  FINDINGS: Lung bases are clear. Heart size is normal. No pleural or pericardial effusion. ABDOMEN: Hepatic morphology suggestive of cirrhosis is again demonstrated, including hypertrophy of the lateral segment, widening of the fissures, mildly nodular hepatic contour, and corkscrew appearance of the hepatic arteries. Multiple small hypervascular hepatic lesions with associated delayed washout appear essentially unchanged and again are diagnostic of hepatocellular carcinoma in this context. Lesions include the following: 10-mm segment 7 lesion (5:76)9 mm segment 8 lesion (5:64)8-mm segment 2 lesion (5:75)7-mm segment 4a lesion (5:47) The following two lesions do not as clearly exhibit washout, likely due to their small size. They are therefore nonspecific but remain concerning.5-mm segment 5 lesion (5:99)5-mm segment 1 lesion (5:64) There is no evidence of vascular invasion by any of these tumors. The TIPS extending from the distal main portal vein, right portal vein, and right hepatic vein appears widely patent. Major hepatic vasculature appears patent. Accessory left hepatic artery is again demonstrated. The gallbladder has been removed. The spleen, pancreas, adrenal glands, and kidneys appear normal. Imaged portions of small and large bowel appear normal. The abdominal aorta is normal in course and caliber. Major abdominal vasculature is patent. No abdominal lymphadenopathy. No ascites. No free intraperitoneal air. No aggressive osseous lesion is identified in the abdomen. IMPRESSION: 1. ESSENTIALLY UNCHANGED APPEARANCE OF 4 SMALL (8-10 MM) HCC LESIONS AS DETAILED. TWO SMALLER ENHANCING LESIONS ARE ALSO CONCERNING BUT NONSPECIFIC AS THEY DO NOT AS CLEARLY EXHIBIT WASHOUT, LIKELY DUE TO THEIR SMALL SIZE. 2. HEPATIC MORPHOLOGY SUGGESTIVE OF CIRRHOSIS.  SUMMARY:4-POSSIBLY SIGNIFICANT FINDING, MAY NEED ACTIONI have personally reviewed the images for this examination and agreedwith the report transcribed above."

In [122]:
html_print(Radiology_report)

In [140]:
from IPython.display import HTML as html_print

report_mod = Radiology_report.lower()
for term in FMADterms: 
    if term in report_mod:
        report_mod = report_mod.replace(term , cstr(term.rstrip(' ') +'|FMA ', color='red'))

In [141]:
html_print(report_mod)

## Extracting data from DICOM header

In [60]:
def myprint(dataset, indent=0):
    """Go through all items in the dataset and print them with custom format

    Modelled after Dataset._pretty_str()
    """
    dont_print = ['Pixel Data', 'File Meta Information Version']

    indent_string = "   " * indent
    next_indent_string = "   " * (indent + 1)

    for data_element in dataset:
        if data_element.VR == "SQ":   # a sequence
            print(indent_string, data_element.name)
            for sequence_item in data_element.value:
                myprint(sequence_item, indent + 1)
                print(next_indent_string + "---------")
        else:
            if data_element.name in dont_print:
                print("""<item not printed -- in the "don't print" list>""")
            else:
                repr_value = repr(data_element.value)
                if len(repr_value) > 50:
                    repr_value = repr_value[:50] + "..."
                print("{0:s} {1:s} = {2:s}".format(indent_string,
                                                   data_element.name,
                                                   repr_value))


In [61]:
ds = pydicom.dcmread('./Data/t2/000000.dcm')

myprint(ds)

 Specific Character Set = 'ISO_IR 100'
 Image Type = ['ORIGINAL', 'PRIMARY', 'OTHER']
 SOP Class UID = '1.2.840.10008.5.1.4.1.1.4'
 SOP Instance UID = '1.3.6.1.4.1.14519.5.2.1.1706.4001.279933547068626...
 Study Date = '19970608'
 Series Date = '19970608'
 Acquisition Date = '19970608'
 Content Date = '19970608'
 Study Time = '090647'
 Series Time = '104416'
 Acquisition Time = '104418'
 Content Time = '104418'
 Accession Number = '2819497684894126'
 Modality = 'MR'
 Manufacturer = 'GE MEDICAL SYSTEMS'
 Referring Physician's Name = ' '
 Station Name = ''
 Study Description = 'MRI, BRAIN W&W/O CONTRAMR'
 Procedure Code Sequence
    Code Value = '6500477'
    Code Meaning = 'MRI, BRAIN W&W/O CONTRAST'
   ---------
 Series Description = '2D WAND T2 WEIGHTED'
 Manufacturer's Model Name = 'GENESIS_SIGNA'
 Patient's Name = ' '
 Patient ID = 'TCGA-02-0003'
 Patient's Birth Date = ''
 Patient's Sex = 'M'
 Patient's Age = '050Y'
 Patient's Weight = "99.790000"
 Additional Patient History = 'L-T

## Parts of speach tagging

In [62]:
import nltk

text = 'there are tiny mediastinal lymph nodes measuring up to 0.1 mm with no mediastinal adenopathy'


In [63]:
def tag_noun_adjtive(text):
    text = nltk.word_tokenize(text)
    result = nltk.pos_tag(text)
    adjective_tags = ["JJ", "JJR", "JJS"]
    noun_tags = ['NN', 'NNS']
    adjective = []
    noun = []
    for a in result: 
        if a[1] in adjective_tags:
            adjective.append(a[0])
        if a[1] in noun_tags:
            noun.append(a[0])
    return adjective, noun

In [64]:
tag_noun_adjtive(text)

(['tiny', 'mediastinal', 'mediastinal'],
 ['lymph', 'nodes', 'mm', 'adenopathy'])

## Steming, stop words, and punctuations removal

In [66]:
Radiology_report

"CT Abdomen with contrast, Triphasic Liver Protocol INDICATION: Hepatocellular carcinoma. History of alcoholic cirrhosis status post TIPS placement. COMPARISON: 4/10/2013 TECHNIQUE: After IV administration of 86 ml of Isovue 370, helical CT imaging of the abdomen was performed axially during the arterial, portal venous, and delayed phases. Coronal and sagittal reformatted images were reviewed. Based on a 32-cm body phantom, the estimated radiation dose (CTDIvol [mGy]) for each series in this exam are: 1.99, 11.95, 6.47, 6.43, 6.50. The estimated cumulative dose (DLP [mGy-cm]) is: 512. NOTE: The radiation dose indicators for CT -- the 'volume CT Dose Index' (CTDIvol) given in milli-Gray (mGy), and the Dose Length Product (DLP) given in mGy-centimeters (mGy.cm) -- are generated from the CT scanner to estimate radiation exposure based on technical study parameters and a reference phantom. The CTDI and DLP may therefore substantially over- or underestimate an individual's absorbed dose bas

## length of the original report in words

In [68]:
len(Radiology_report)

2979

## stop words

In [49]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [73]:
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import re
import string


stemmer = SnowballStemmer("english")
exclude_list =  set(stopwords.words('english'))


In [90]:
words = Radiology_report.split(' ')
words = [stemmer.stem(str(word)) for word in words]
newContent = ' '.join([word for word in words if word not in exclude_list])
newContent = ''.join([i for i in newContent if not i.isdigit()])
newContent = re.sub('['+string.punctuation+']', ' ', newContent)

In [91]:
len(newContent)

2183

In [92]:
newContent

'ct abdomen contrast  triphas liver protocol indication  hepatocellular carcinoma  histori alcohol cirrhosi status post tip placement  comparison     technique  iv administr  ml isovu   helic ct imag abdomen perform axial dure arterial  portal venous  delay phases  coron sagitt reformat imag reviewed  base  cm bodi phantom  estim radiat dose  ctdivol  mgy   seri exam are                 estim cumul dose  dlp  mgy cm   is    note  radiat dose indic ct    volum ct dose index  ctdivol  given milli gray  mgy   dose length product  dlp  given mgy centimet  mgy cm     generat ct scanner estim radiat exposur base technic studi paramet refer phantom  ctdi dlp may therefor substanti over  underestim individu absorb dose base patient size factors   findings  lung base clear  heart size normal  pleural pericardi effusion  abdomen  hepat morpholog suggest cirrhosi demonstrated  includ hypertrophi later segment  widen fissures  mild nodular hepat contour  corkscrew appear hepat arteries  multipl sm

## Document segmentation

### segment impression section

In [94]:
Radiology_report.split('IMPRESSION:')[1].split('SUMMARY:')[0]

' 1. ESSENTIALLY UNCHANGED APPEARANCE OF 4 SMALL (8-10 MM) HCC LESIONS AS DETAILED. TWO SMALLER ENHANCING LESIONS ARE ALSO CONCERNING BUT NONSPECIFIC AS THEY DO NOT AS CLEARLY EXHIBIT WASHOUT, LIKELY DUE TO THEIR SMALL SIZE. 2. HEPATIC MORPHOLOGY SUGGESTIVE OF CIRRHOSIS.  '

### segment finding section

In [97]:
Radiology_report.split('FINDINGS:')[1].split('IMPRESSION:')[0]

' Lung bases are clear. Heart size is normal. No pleural or pericardial effusion. ABDOMEN: Hepatic morphology suggestive of cirrhosis is again demonstrated, including hypertrophy of the lateral segment, widening of the fissures, mildly nodular hepatic contour, and corkscrew appearance of the hepatic arteries. Multiple small hypervascular hepatic lesions with associated delayed washout appear essentially unchanged and again are diagnostic of hepatocellular carcinoma in this context. Lesions include the following: 10-mm segment 7 lesion (5:76)9 mm segment 8 lesion (5:64)8-mm segment 2 lesion (5:75)7-mm segment 4a lesion (5:47) The following two lesions do not as clearly exhibit washout, likely due to their small size. They are therefore nonspecific but remain concerning.5-mm segment 5 lesion (5:99)5-mm segment 1 lesion (5:64) There is no evidence of vascular invasion by any of these tumors. The TIPS extending from the distal main portal vein, right portal vein, and right hepatic vein app

## Sentence segmentation

In [102]:
from nltk import sent_tokenize
 
sents = sent_tokenize(Radiology_report.split('FINDINGS:')[1].split('IMPRESSION:')[0])

for i in range(len(sents)):
    print('Sentence'+str(i)+': '+sents[i])

Sentence0:  Lung bases are clear.
Sentence1: Heart size is normal.
Sentence2: No pleural or pericardial effusion.
Sentence3: ABDOMEN: Hepatic morphology suggestive of cirrhosis is again demonstrated, including hypertrophy of the lateral segment, widening of the fissures, mildly nodular hepatic contour, and corkscrew appearance of the hepatic arteries.
Sentence4: Multiple small hypervascular hepatic lesions with associated delayed washout appear essentially unchanged and again are diagnostic of hepatocellular carcinoma in this context.
Sentence5: Lesions include the following: 10-mm segment 7 lesion (5:76)9 mm segment 8 lesion (5:64)8-mm segment 2 lesion (5:75)7-mm segment 4a lesion (5:47) The following two lesions do not as clearly exhibit washout, likely due to their small size.
Sentence6: They are therefore nonspecific but remain concerning.5-mm segment 5 lesion (5:99)5-mm segment 1 lesion (5:64) There is no evidence of vascular invasion by any of these tumors.
Sentence7: The TIPS ex

## Bag of words

In [133]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sents[10:20])


In [134]:
print(vectorizer.get_feature_names())

['abdomen', 'abdominal', 'adrenal', 'aggressive', 'air', 'and', 'aorta', 'appear', 'ascites', 'been', 'bowel', 'caliber', 'course', 'free', 'gallbladder', 'glands', 'has', 'identified', 'imaged', 'in', 'intraperitoneal', 'is', 'kidneys', 'large', 'lesion', 'lymphadenopathy', 'major', 'no', 'normal', 'of', 'osseous', 'pancreas', 'patent', 'portions', 'removed', 'small', 'spleen', 'the', 'vasculature']


In [135]:
print(X.toarray())

[[0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
  0 1 0]
 [0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0
  1 1 0]
 [0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1
  0 0 0]
 [0 1 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0
  0 1 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0
  0 0 1]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0
  0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
  0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
  0 0 0]
 [1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0
  0 1 0]]


### first sentence

In [137]:
sents[10]

'The gallbladder has been removed.'

### it's vector representation

In [138]:
X[0].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0]], dtype=int64)

## Information extraction

In [113]:
Radiology_report

"CT Abdomen with contrast, Triphasic Liver Protocol INDICATION: Hepatocellular carcinoma. History of alcoholic cirrhosis status post TIPS placement. COMPARISON: 4/10/2013 TECHNIQUE: After IV administration of 86 ml of Isovue 370, helical CT imaging of the abdomen was performed axially during the arterial, portal venous, and delayed phases. Coronal and sagittal reformatted images were reviewed. Based on a 32-cm body phantom, the estimated radiation dose (CTDIvol [mGy]) for each series in this exam are: 1.99, 11.95, 6.47, 6.43, 6.50. The estimated cumulative dose (DLP [mGy-cm]) is: 512. NOTE: The radiation dose indicators for CT -- the 'volume CT Dose Index' (CTDIvol) given in milli-Gray (mGy), and the Dose Length Product (DLP) given in mGy-centimeters (mGy.cm) -- are generated from the CT scanner to estimate radiation exposure based on technical study parameters and a reference phantom. The CTDI and DLP may therefore substantially over- or underestimate an individual's absorbed dose bas

### get all the measurements

In [114]:
re1='(\\d+)'	# Integer Number 1
re2='(.)'	# Any Single Character 1
re3='(mm)'	# Word 1

measurement1 = re.compile(re1+re2+re3,re.DOTALL)
M1 = re.findall(measurement1, Radiology_report)

In [115]:
M1

[('10', '-', 'mm'),
 ('9', ' ', 'mm'),
 ('8', '-', 'mm'),
 ('7', '-', 'mm'),
 ('5', '-', 'mm'),
 ('5', '-', 'mm')]