# MIMIC2 Radiology Impression Section Extractor

Identify the impression section of each radiology report and extract it.

In [None]:
%matplotlib inline

In [None]:
import sqlite3 as sq
import re
from textblob import TextBlob
import zipfile
import gzip
import os
import numpy as np
from IPython.display import clear_output
import pickle
import seaborn as sns

In [None]:
# Directory holding the NLP databases and pickles, located under the user's home directory.
DATADIR = os.path.join(os.path.expanduser("~"),"Bdrive","Radiology","NLP","DBs")
# Sanity check: prints True when that directory exists on this machine.
print(os.path.exists(DATADIR))

In [None]:
# Open the reports database relative to the current working directory.
# NOTE(review): no conn.close() is visible in this notebook — confirm the connection is released elsewhere.
conn = sq.connect("./mimic_radreports.sqlite")
cursor = conn.cursor()

### Grab the text from the database

In [None]:
# Fetch the text column of every row in the reports table.
cursor.execute("""SELECT text FROM reports""")
# Each row is a 1-tuple; keep the text with surrounding whitespace stripped.
reports = [r[0].strip() for r in cursor.fetchall()]

### A simple regex to identify digits

In [None]:

# Matches a single digit character; used further down to replace each digit with the letter "d".
rdigit = re.compile(r"""\d""")


## Define code to extract impression section

I provide some "synonyms" for "impression" and try splitting each report on a prioritized list of them. To get a sense of the relative importance of each phrase, I also return the phrase I finally split on.

In [None]:
# Section headers to split on, in priority order: the first one found in a report is used.
splits = ["IMPRESSION:", "INTERPRETATION:", "CONCLUSIONS:", "FINDINGS:"]


def get_split_location(report, splits=None):
    """Locate the highest-priority section header present in a report.

    Args:
        report: Full report text to search.
        splits: Header phrases in priority order; the first phrase that
            occurs in ``report`` wins. Defaults to IMPRESSION, INTERPRETATION,
            CONCLUSIONS and FINDINGS headers.

    Returns:
        A tuple ``(index, phrase)`` giving the offset of the first occurrence
        of the matched phrase, or ``(-1, "NA")`` when no phrase is found or
        ``report`` is not searchable text (for example ``None``).
    """
    if splits is None:
        splits = ["IMPRESSION:", "INTERPRETATION:", "CONCLUSIONS:", "FINDINGS:"]
    for phrase in splits:
        try:
            return report.index(phrase), phrase
        except (ValueError, AttributeError, TypeError):
            # ValueError: phrase is absent. AttributeError/TypeError: the
            # report is not text (e.g. None); treat it as "no match" rather
            # than failing. Anything else (e.g. KeyboardInterrupt) propagates.
            continue
    return -1, "NA"


def get_impressions(reports, splits=None):
    """Cut each report down to the text starting at its split phrase.

    Args:
        reports: Report texts to process.
        splits: Prioritized split phrases, forwarded to ``get_split_location``.

    Returns:
        A list of ``(text_from_phrase_onward, (offset, phrase))`` tuples, one
        per report in which a split phrase was found; other reports are dropped.
    """
    # First pass: locate the split phrase in every report.
    locations = []
    for text in reports:
        locations.append(get_split_location(text, splits=splits))

    # Second pass: keep only the reports where a phrase was found.
    extracted = []
    for text, location in zip(reports, locations):
        start = location[0]
        if start != -1:
            extracted.append((text[start:], location))
    return extracted


def get_reports(fname="pah_mimic2.sqlite", query="""SELECT text FROM mimic_pah_radiology"""):
    """Run a query against a SQLite file in DATADIR and return its first column.

    Args:
        fname: Name of the SQLite database file inside ``DATADIR``.
        query: SQL statement to execute; only the first column of each
            result row is kept.

    Returns:
        A list with the first-column value of every row returned by ``query``.
    """
    from contextlib import closing

    # sqlite3's own context manager only manages transactions, so use
    # closing() to guarantee the connection is released after the read.
    with closing(sq.connect(os.path.join(DATADIR, fname))) as conn:
        cursor = conn.cursor()
        cursor.execute(query)
        return [row[0] for row in cursor.fetchall()]

In [None]:
# Extract the impression sections for both cohorts using the prioritized split phrases.
pah_impressions = get_impressions(get_reports(), splits=splits)
copd_impressions = get_impressions(
    get_reports(query="""SELECT text FROM mimic_copd_radiology"""),
    splits=splits,
)
# Number of reports per cohort in which a split phrase was found (one count per line).
print(len(pah_impressions), len(copd_impressions), sep="\n")



### What does an impression look like?

In [None]:
# Each entry is (text from the split phrase onward, (offset, split phrase)).
pah_impressions[0]

### Get rid of our split phrase

In [None]:
# Each entry is (text, (offset, phrase)) and text begins with phrase.
# Split only on the first occurrence so that a header phrase repeated later
# in the impression does not truncate the extracted text.
ipah = [text.split(phrase, 1)[1] for text, (_, phrase) in pah_impressions]
icopd = [text.split(phrase, 1)[1] for text, (_, phrase) in copd_impressions]

In [None]:
# Tokenize every impression: lowercase it, replace each digit with "d", then
# break it into sentences and each sentence into a list of words.
ipah2 = [
    [
        list(sentence.words)
        for sentence in TextBlob(rdigit.sub("d", text.strip().lower())).sentences
    ]
    for text in ipah
]
icopd2 = [
    [
        list(sentence.words)
        for sentence in TextBlob(rdigit.sub("d", text.strip().lower())).sentences
    ]
    for text in icopd
]

### Read in Radiology phrase generators

In [None]:
# Load the pickled n-gram generators; the object is indexed by "2-gram" and "3-gram" further down.
# NOTE(review): pickle.load can execute arbitrary code from the file — only load pickles from a trusted source.
with gzip.open(os.path.join(DATADIR,"mimic2_demo_n_gram_generators.pickle.gz"),"rb") as f0:
    ngp = pickle.load(f0)

In [None]:
# For each tokenized impression, apply the "2-gram" generator and feed its output to the "3-gram" generator.
# NOTE(review): presumably these are phrase models that merge frequent word sequences into single tokens — confirm against how the pickle was built.
pah_phrases = [ngp["3-gram"][ngp["2-gram"][s]] for s in ipah2]
copd_phrases = [ngp["3-gram"][ngp["2-gram"][s]] for s in icopd2]

In [None]:
def get_impression_words_from_phrases(sp):
    """Flatten one level of nesting: concatenate the items of every element of ``sp``.

    Args:
        sp: An iterable of iterables (e.g. sentences, each a sequence of tokens).

    Returns:
        A single list containing the items of each inner iterable, in order.
    """
    flattened = []
    for sentence in sp:
        flattened.extend(sentence)
    return flattened

In [None]:
def view_impression_phrases(sp):
    """Render an impression as one string.

    Flattens ``sp`` with ``get_impression_words_from_phrases`` and joins the
    resulting tokens with single spaces.
    """
    tokens = get_impression_words_from_phrases(sp)
    return " ".join(tokens)

In [None]:
# Show the impression at index 135 as a single space-joined string.
view_impression_phrases(pah_phrases[135])

In [None]:
# Flatten every impression into one list of tokens.
pah_impression_phrases = list(map(get_impression_words_from_phrases, pah_phrases))
copd_impression_phrases = list(map(get_impression_words_from_phrases, copd_phrases))

### How long is our longest impression section?

In [None]:
# Number of tokens in each flattened impression.
pah_lengths = list(map(len, pah_impression_phrases))
copd_lengths = list(map(len, copd_impression_phrases))

In [None]:
# Summary of impression lengths per cohort as (mean, max, min).
pah_sizes = tuple(stat(pah_lengths) for stat in (np.mean, np.max, np.min))
copd_sizes = tuple(stat(copd_lengths) for stat in (np.mean, np.max, np.min))
print(pah_sizes, copd_sizes)

In [None]:
# Preview the first 50 impression lengths. pah_sizes is only a 3-tuple of
# (mean, max, min), so slicing it to 50 showed nothing new.
pah_lengths[0:50]

In [None]:
# Plot the distribution of impression lengths for each cohort.
# NOTE(review): seaborn deprecated distplot in v0.11 in favour of histplot/displot — confirm against the installed version.
sns.distplot(pah_lengths)
sns.distplot(copd_lengths)

In [None]:
# Interactively print impressions by index; blank or non-numeric input ends the loop.
while True:
    #clear_output()
    raw = input("Enter sentence #\n").strip()
    try:
        i = int(raw)
    except ValueError:
        break
    try:
        # Each entry of pah_phrases is treated elsewhere in this notebook as a
        # nested sequence (sentences of tokens), so flatten before joining.
        print(view_impression_phrases(pah_phrases[i]))
    except IndexError:
        print("No impression at index %d" % i)

In [None]:

# Page through the impressions one at a time; interrupting the prompt stops.
# NOTE(review): `impressions` is not assigned in the visible cells — confirm
# which collection (e.g. pah_impressions) is meant before running.
for i in impressions:
    try:
        clear_output()
        print(i[0])
        print(i[1])
        input('continue')
    except (KeyboardInterrupt, EOFError):
        # Only an interrupted/closed prompt ends the browsing; real errors
        # are no longer silently swallowed.
        break

### Define Regular expressions for further cleansing

Here are some regular expressions for finding dates and times. I decided on a simple conversion of each digit to the letter ``d``, leaving everything else in place.

In [None]:
# Read every line of the gzipped sentences file as text.
# Uses the shared DATADIR constant instead of re-spelling the same directory.
with gzip.open(os.path.join(DATADIR, "mimic2_radsentences.txt.gz"), "rt") as f:
    sentences = f.readlines()

In [None]:
# Peek at the first line read from the sentences file.
sentences[0]

In [None]:
# Recompute the impressions for both cohorts using the default split phrases.
# NOTE(review): pah_reports and copd_reports are not assigned in the visible cells — confirm they are defined (e.g. via get_reports) before running.
pah_impressions = get_impressions(pah_reports)
copd_impressions = get_impressions(copd_reports)