In [1]:
import pymysql
import pandas as pd
import getpass
from textblob import TextBlob
import re
from collections import Counter

In [2]:
conn = pymysql.connect(host="mysql",
                       port=3306,user="jovyan",
                       passwd=getpass.getpass("Enter MySQL passwd for jovyan"),db='mimic2')
cursor = conn.cursor()

Enter MySQL passwd for jovyan········


### Use Pandas and SQL to create a dataframe with the following:
* subject_id
* hospital admission id
* text of the radiology report
* Limit the number of reports to 10000

In [3]:
rad_data = \
pd.read_sql("""SELECT DISTINCT noteevents.subject_id, 
                      noteevents.hadm_id,
                      noteevents.text 
               FROM noteevents
               WHERE noteevents.category = 'RADIOLOGY_REPORT' LIMIT 5000""",conn)
rad_data.head(5)

Unnamed: 0,subject_id,hadm_id,text
0,56,28766.0,\n\n\n DATE: [**2644-1-17**] 10:53 AM\n ...
1,56,28766.0,\n\n\n DATE: [**2644-1-17**] 10:43 AM\n ...
2,56,28766.0,\n\n\n DATE: [**2644-1-17**] 6:37 AM\n ...
3,56,28766.0,\n\n\n DATE: [**2644-1-19**] 12:09 PM\n ...
4,37,18052.0,\n\n\n DATE: [**3264-8-14**] 6:06 AM\n ...


In [4]:
rad_data.shape

(5000, 3)

In [5]:
print(rad_data.iloc[20]["text"])




     DATE: [**3352-7-10**] 1:35 PM
     CHEST (PORTABLE AP)                                             Clip # [**Clip Number (Radiology) 1633**]
     Reason: Neutropenic fever, day +10 s/p bmt                          
     Admitting Diagnosis: NON-HODGKIN'S LYMPHOMIA\BONE MARROW TRANSPLANT
     ______________________________________________________________________________
     UNDERLYING MEDICAL CONDITION:
        54 year old man with NHL here for auto BMT                                    
     REASON FOR THIS EXAMINATION:
      Neutropenic fever, day +10 s/p bmt                                              
     ______________________________________________________________________________
                                     FINAL REPORT
     INDICATIONS:  10 days post BMT, ? pneumonia. 
     
     PORTABLE AP CHEST:  Comparison is made to the prior study from [**3352-7-6**]. Exam
     remains unchanged from previous study. No new infiltrate is seen. No pleural
     effusions 

### Write a function that returns true or false depending on whether the report contains an impression section

#### Hints

* Not every report will have an impression section
* "INTERPRETATION:" and "CONCLUSIONS:" might be synonyms for "IMPRESSION:"

In [6]:
def count_impression(report):
    for word in report.split():
        if word == "INTERPRETATION:" or word == "CONCLUSIONS:" or word == "IMPRESSION:":
            return True
        pass
    pass


In [7]:
def count_impression2(report):
    headings = ["INTERPRETATION:", "CONCLUSIONS:", "IMPRESSION:"]
    for h in headings:
        if h in report:
            return True
    
    return False


In [8]:
%timeit count_impression2(rad_data.iloc[0]["text"])

284 µs ± 4.91 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [9]:
%timeit count_impression(rad_data.iloc[0]["text"])

359 µs ± 4.17 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [10]:
sum([count_impression(report) for report in rad_data["text"]])

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

### Write a function that returns the impression section of a report



In [11]:
def get_impression(report):
    headings = ["INTERPRETATION:", "CONCLUSIONS:", "IMPRESSION:"]
    for h in headings:
        if h in report:
            return report[report.find(h):]
    
    return report

In [12]:
get_impression(rad_data.iloc[20]["text"])

'IMPRESSION: No acute change from previous exam.\n\n'

### Define Regular expressions for data cleansing

We have a lot of patterns that are unique and don't convey meaningful information

* De-identified names, dates, etc. 
    * `[**Clip Number (Radiology) 12569**]`
    * `DATE: [**3352-7-10**] `
    * `[**Hospital 12568**]`
    * `[**Last Name (NamePattern4) 337**]` 
    * `[**First Name8 (NamePattern2) 12565**]` 
    * `[**Last Name (NamePattern1) 12566**]`
* Separators  (e.g. `__________________`)

### Split into groups and write and test regular expressions to capture these patterns
* Write a regular expression to replace dates in the reports with ``[**DATE**]``
* Write a regular expression to replace times in the reports with ``[**TIME**]``
* Write a regular expression to replaces digits with "d", (e.g. "43 cc" would become "dd cm")

In [13]:
date=re.compile(r"""((?P<month>[A-Z][a-z]{2,}(\.)?) (?P<day>[0-9]{1,2}))""")
last_name=re.compile(r"""(\[\*\*Last Name \((NamePattern|STitle)(\d+)?\) [0-9]*\*\*\])""")
clip = re.compile(r"""\[\*\*Clip Number \(Radiology\) \d+\*\*\]""")
date2 = re.compile(r"""DATE\: \[\*\*\d+-\d+-\d+\*\*\]""")
hospital=re.compile(r"""(\[\*\*Hospital \d+\*\*\])""")
unders = re.compile(r"""_{2,}""")

age2 = re.compile(r"""(?P<age>[0-9]+)(-|\s)y(ear(s)?|\.)(-|\s)?o(ld|\.)""")
age3 = re.compile(r"""\bage(d)? (?P<age>[0-9]+)""")
digits = re.compile(r"""\d""")
def age_in_decades(m):
    age = int(m.group("age"))
    
    return "[** Age in %ss**]"%(int(age/10)*10,)

age_in_decades(next(age2.finditer("74-year-old")))
#tmp = re.sub(age2, age_in_decades, re.sub(age3, age_in_decades, report_txt))

'[** Age in 70s**]'

In [14]:
unders.findall(rad_data.iloc[20]["text"])

['______________________________________________________________________________',
 '______________________________________________________________________________']

In [15]:
hospital.findall("""Dr. [**Last Name (STitle) 13    * `[**Hospital 12568**]`
927**] and Dr. [**Last Name (STitle) 13928**] on""")

['[**Hospital 12568**]']

In [16]:
clip.findall("""BONE SCAN                                                       Clip # [**Clip Number (Radiology) 13926**]""")

['[**Clip Number (Radiology) 13926**]']

In [17]:
date2.findall("""DATE: [**3352-7-10**]""")

['DATE: [**3352-7-10**]']

#### Hints

* Look at some sample reports to see what dates and times look like in the reports
* What order would you need to apply the regular expressions?

In [18]:
def preprocess(report):
    return digits.sub("d",unders.sub("\n",
               hospital.sub("HOSPITAL", 
                            date2.sub("DATE", 
                                      clip.sub("CLIP", 
                                               last_name.sub("LASTNAME", report))))))


In [19]:
for i in range(10):
    print(preprocess(rad_data.iloc[i]['text']))
    print("*"*42,"\n\n")




     DATE dd:dd AM
     MR HEAD W & W/O CONTRAST; MR CONTRAST GADOLIN                   Clip # CLIP
     Reason: R ICB and HX brain mets - eval - also with DWI for CVA Do MR
      Contrast: MAGNEVIST Amt: dd
     

     UNDERLYING MEDICAL CONDITION:
      [**Age over dd **] year old woman with lung CA- mets to brain                                   
     REASON FOR THIS EXAMINATION:
      R ICB and HX brain mets - eval - also with DWI for CVA Do MRI both with and 
      without contast please
     

                                     FINAL REPORT
     EXAMINATION:  MRI of the brain with and without gadolinium.
     
     INDICATION:  [**Age over dd **] year old woman with lung cancer and right intracranial bleed
     and history of brain metastases.  Please evaluate for acute infarct.
     
     TECHNIQUE:  Multiplanar Td and Td-weighted images of the brain with gadolinium
     according to standard departmental protocol.  No prior study for comparison.
     
     FINDINGS:  On d

### You can use the Pandas `iloc` method to grab specific reports

In [20]:
print(rad_data.iloc[0]["text"])




     DATE: [**2644-1-17**] 10:53 AM
     MR HEAD W & W/O CONTRAST; MR CONTRAST GADOLIN                   Clip # [**Clip Number (Radiology) 12569**]
     Reason: R ICB and HX brain mets - eval - also with DWI for CVA Do MR
      Contrast: MAGNEVIST Amt: 15
     ______________________________________________________________________________
     UNDERLYING MEDICAL CONDITION:
      [**Age over 90 **] year old woman with lung CA- mets to brain                                   
     REASON FOR THIS EXAMINATION:
      R ICB and HX brain mets - eval - also with DWI for CVA Do MRI both with and 
      without contast please
     ______________________________________________________________________________
                                     FINAL REPORT
     EXAMINATION:  MRI of the brain with and without gadolinium.
     
     INDICATION:  [**Age over 90 **] year old woman with lung cancer and right intracranial bleed
     and history of brain metastases.  Please evaluate for acute infar

In [21]:
print(rad_data.iloc[0]["text"])




     DATE: [**2644-1-17**] 10:53 AM
     MR HEAD W & W/O CONTRAST; MR CONTRAST GADOLIN                   Clip # [**Clip Number (Radiology) 12569**]
     Reason: R ICB and HX brain mets - eval - also with DWI for CVA Do MR
      Contrast: MAGNEVIST Amt: 15
     ______________________________________________________________________________
     UNDERLYING MEDICAL CONDITION:
      [**Age over 90 **] year old woman with lung CA- mets to brain                                   
     REASON FOR THIS EXAMINATION:
      R ICB and HX brain mets - eval - also with DWI for CVA Do MRI both with and 
      without contast please
     ______________________________________________________________________________
                                     FINAL REPORT
     EXAMINATION:  MRI of the brain with and without gadolinium.
     
     INDICATION:  [**Age over 90 **] year old woman with lung cancer and right intracranial bleed
     and history of brain metastases.  Please evaluate for acute infar

### Create a new column named "impression" for storing just the impression portion of the report

In [22]:
rad_data["impression"] = \
rad_data.apply(lambda row: preprocess(get_impression(row["text"])).lower(), axis=1)

In [23]:
rad_data.head()

Unnamed: 0,subject_id,hadm_id,text,impression
0,56,28766.0,\n\n\n DATE: [**2644-1-17**] 10:53 AM\n ...,\n\n\n date dd:dd am\n mr head w & w/o...
1,56,28766.0,\n\n\n DATE: [**2644-1-17**] 10:43 AM\n ...,impression: stable appearance of right pariet...
2,56,28766.0,\n\n\n DATE: [**2644-1-17**] 6:37 AM\n ...,impression:\n \n cardiomegaly and mild...
3,56,28766.0,\n\n\n DATE: [**2644-1-19**] 12:09 PM\n ...,impression:\n \n marked improvement in...
4,37,18052.0,\n\n\n DATE: [**3264-8-14**] 6:06 AM\n ...,impression: stable cardiomegaly with pulmonary...


In [24]:
print(rad_data.iloc[20]["impression"])

impression: no acute change from previous exam.




### How many unique words occur in the corpus?

#### Hint

1. Use TextBlob
1. Put all the reports into a single string

#### I got 8658

In [25]:
#" ".join(rad_data["impression"])

In [26]:
blob = TextBlob(" ".join(rad_data["impression"]))
unique_impression_words = set(blob.words)
print(len(unique_impression_words))



7402


In [27]:
c = Counter(blob.words)
c.most_common()

[('the', 11920),
 ('of', 8221),
 ('d', 5718),
 ('and', 4548),
 ('is', 4467),
 ('with', 4256),
 ('impression', 4096),
 ('in', 4087),
 ('dd', 3800),
 ('right', 3218),
 ('no', 3184),
 ('left', 2783),
 ('to', 2764),
 ('for', 2465),
 ('clip', 2367),
 ('a', 2061),
 ('chest', 2025),
 ('reason', 2014),
 ('there', 1824),
 ('this', 1650),
 ('tube', 1548),
 ('pleural', 1451),
 ('date', 1339),
 ('report', 1294),
 ('final', 1225),
 ('evidence', 1210),
 ('are', 1199),
 ('at', 1187),
 ('on', 1138),
 ('ct', 1113),
 ('contrast', 1096),
 ('or', 1083),
 ('placement', 1025),
 ('examination', 1008),
 ('be', 988),
 ('small', 978),
 ('bilateral', 974),
 ('pulmonary', 966),
 ('pneumothorax', 949),
 ('effusion', 942),
 ('ap', 925),
 ('underlying', 902),
 ('lower', 895),
 ('lobe', 864),
 ('lung', 847),
 ('interval', 843),
 ('line', 839),
 ('portable', 819),
 ('medical', 809),
 ('study', 788),
 ('findings', 775),
 ('condition', 773),
 ('pm', 767),
 ('prior', 762),
 ('diagnosis', 749),
 ('change', 744),
 ('tip', 

In [28]:
unique_impression_words

{'infiltrative',
 'embolized',
 'hypoxic',
 'caking',
 'gyrus',
 'tendon',
 'exact',
 'wider',
 'esophagogram',
 'unstable',
 'visualizing',
 'complicated',
 'fibrotic',
 'linear',
 'discussion',
 'urologists',
 'conus',
 'intrapelvic',
 'pneumoperitoneum',
 'relayed',
 'increase/development',
 'microvenous',
 'organisms',
 'consolidation',
 'discrete',
 'derived',
 'use',
 'rightclip',
 'wheezing',
 'rectum',
 'steps',
 'posterofrontal',
 'd-dimer',
 'sg',
 'csru',
 'diverticulosis',
 'gave',
 'g/j',
 'vertical',
 'approaching',
 'ray',
 'reformatted',
 'good',
 'o.r',
 'reverses',
 'labeled',
 'observed',
 'dish',
 'aneurysms',
 'give',
 'termination',
 'breathing',
 'updated',
 'compartments',
 'cxrs',
 'obscures',
 'reformats',
 'effussion',
 'gasless',
 'underinflated',
 'lock',
 'sutured',
 'barely',
 'uncal',
 'presented',
 'edema',
 'aliasing',
 'comparison',
 'reactive',
 'plate',
 'huberon',
 'e.g',
 'fu',
 'atheromatous',
 'delineated',
 'overlies',
 'which',
 'bicarb',
 'gu

In [29]:
from gensim.parsing.preprocessing import STOPWORDS
STOPWORDS

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [30]:
my_stop_words = frozenset(["a", "am", "an", "and", "are", "as", "at", "be", "for", "is", "the", "is", "of", "which", ])

## Create a single string with all the reports

#### Hints, etc.
* Use List Comprehension
* Use string joins
* Iterate over the rows of the data frame

### Define a vector space for the radiology corpus

#### Hints

1. How would you build a corpus from words only occuring more than N times?

### Create a new column named `"impression no stops"` where [stop words](https://en.wikipedia.org/wiki/Stop_words) have been dropped from the impression

* The gensim package has stop words defined (``from gensim.parsing.preprocessing import STOPWORDS``

#### Hints
1. Do you agree with dropping all the stop words?
1. How could we create a new stopwords frozen set absent the terms we wan't to keep (double negative?)
1. You could use a regular expressions substitution or token the report first and operate on the list of words.

In [31]:
rad_data["impression no stops"] = \
rad_data.apply(lambda row: \
               " ".join([word for word in row["impression"].split() if  word not in my_stop_words]), 
   axis=1)
rad_data.head()

Unnamed: 0,subject_id,hadm_id,text,impression,impression no stops
0,56,28766.0,\n\n\n DATE: [**2644-1-17**] 10:53 AM\n ...,\n\n\n date dd:dd am\n mr head w & w/o...,date dd:dd mr head w & w/o contrast; mr contra...
1,56,28766.0,\n\n\n DATE: [**2644-1-17**] 10:43 AM\n ...,impression: stable appearance of right pariet...,impression: stable appearance right parietal l...
2,56,28766.0,\n\n\n DATE: [**2644-1-17**] 6:37 AM\n ...,impression:\n \n cardiomegaly and mild...,impression: cardiomegaly mild chf. nasogastric...
3,56,28766.0,\n\n\n DATE: [**2644-1-19**] 12:09 PM\n ...,impression:\n \n marked improvement in...,impression: marked improvement in left perihil...
4,37,18052.0,\n\n\n DATE: [**3264-8-14**] 6:06 AM\n ...,impression: stable cardiomegaly with pulmonary...,impression: stable cardiomegaly with pulmonary...


In [32]:
rad_data.iloc[0]["impression"]

"\n\n\n     date dd:dd am\n     mr head w & w/o contrast; mr contrast gadolin                   clip # clip\n     reason: r icb and hx brain mets - eval - also with dwi for cva do mr\n      contrast: magnevist amt: dd\n     \n\n     underlying medical condition:\n      [**age over dd **] year old woman with lung ca- mets to brain                                   \n     reason for this examination:\n      r icb and hx brain mets - eval - also with dwi for cva do mri both with and \n      without contast please\n     \n\n                                     final report\n     examination:  mri of the brain with and without gadolinium.\n     \n     indication:  [**age over dd **] year old woman with lung cancer and right intracranial bleed\n     and history of brain metastases.  please evaluate for acute infarct.\n     \n     technique:  multiplanar td and td-weighted images of the brain with gadolinium\n     according to standard departmental protocol.  no prior study for comparison.\n 

In [33]:
rad_data.iloc[0]["impression no stops"]

"date dd:dd mr head w & w/o contrast; mr contrast gadolin clip # clip reason: r icb hx brain mets - eval - also with dwi cva do mr contrast: magnevist amt: dd underlying medical condition: [**age over dd **] year old woman with lung ca- mets to brain reason this examination: r icb hx brain mets - eval - also with dwi cva do mri both with without contast please final report examination: mri brain with without gadolinium. indication: [**age over dd **] year old woman with lung cancer right intracranial bleed history brain metastases. please evaluate acute infarct. technique: multiplanar td td-weighted images brain with gadolinium according to standard departmental protocol. no prior study comparison. findings: on diffusion-weighted images there small area restricted diffusion along falx within left occipitotemporal lobe. it also bright on flair-weighted images may represent subacute infarct. clinical correlation recommended. on gradient echo images there large area intraparenchymal hemor

### What are the unique words in our vocabulary?

In [34]:
unique_impression_words = set(TextBlob(" ".join(rad_data["impression no stops"])).words)


### We'll create a vocabulary with `zip` and `dict`

In [35]:
word_map = dict(zip(unique_impression_words,range(len(unique_impression_words))))

In [36]:
len(word_map)

7396

In [37]:
word_map

{'infiltrative': 0,
 'quadriceps': 3654,
 'embolized': 1,
 'hypoxic': 2,
 'deterioration': 3657,
 'caking': 3,
 'gyrus': 4,
 'tendon': 5,
 'exact': 6,
 'wider': 7,
 'esophagogram': 8,
 'window': 3661,
 'ris': 3663,
 'vertebra': 3664,
 'slight': 3665,
 'congestion/acute': 3666,
 'swallow': 3667,
 'intraoperative': 3655,
 'unstable': 9,
 'two-view': 3668,
 'visualizing': 10,
 'complicated': 11,
 'o': 3656,
 'fibrotic': 12,
 'linear': 13,
 'ill-defined': 1292,
 'worsened': 3669,
 'urologists': 15,
 'extracted': 3670,
 'get': 3671,
 's': 3672,
 'conus': 16,
 'mvmt': 3675,
 'intrapelvic': 17,
 'pneumoperitoneum': 18,
 'cardiomediastinal': 3676,
 'liter': 5561,
 'microvenous': 21,
 'organisms': 22,
 'exclude': 3678,
 'seizure': 3679,
 'interactive': 6235,
 'dddf': 3680,
 'consolidation': 23,
 'discrete': 24,
 'murmur': 3682,
 'rightclip': 27,
 'use': 26,
 'productive': 3683,
 'extensions': 6567,
 'wheezing': 28,
 'steps': 30,
 'posterofrontal': 31,
 'd-dimer': 32,
 'sg': 33,
 'csru': 34,
 'f

### Save for use in our next notebook

In [38]:
import gzip
import pickle

with open("rad_data.pickle.gz", "wb") as f0:
    pickle.dump(rad_data, f0)

In [39]:
with open("rad_vocabulary.pickle.gz", "wb") as f0:
    pickle.dump(word_map, f0)