This notebook does the following:
 * Extract data from the MIMIC-III database
 * Inspect the data and choose an appropriate subset of documents
 * Divide the documents into a structured and free-text component
 * Parse the structured component of the documents
 * Convert the free text into bag-of-words (BOW) format
 * Write the documents in ARFF format

In [1]:
import os
import pandas as pd
from random import sample

In [6]:
import sys
# Add the parent directory to the module search path (at position 1, i.e.
# right after the first entry). Presumably this is what makes the local
# `multidriftdetector` package importable - confirm against the repo layout.
# NOTE: '..' is resolved against the current working directory, so this
# only works when the kernel is started from the notebook's own folder.
sys.path.insert(1, '..')

In [7]:
from multidriftdetector.bow_machine import BOWMachine

In [8]:
import wasabi
# Shared console printer; used further down to draw a titled divider
# above each printed example note.
msg = wasabi.Printer()

In [9]:
# Load the MIMIC-III NOTEEVENTS table into `data`.
# The relative path is resolved against the current working directory.
MIMIC_path = os.path.abspath('../../FeatureCat/data/raw/NOTEEVENTS.csv')

# CHARTTIME and STORETIME are sparsely populated (empty in the first rows),
# so pandas' chunk-wise type inference can see some chunks as all-NaN floats
# and others as strings. That is presumably the source of the mixed-type
# warning this cell used to emit. Declaring both columns as strings up front
# removes the guesswork; missing values are still read as NaN.
data = pd.read_csv(
    MIMIC_path,
    dtype={'CHARTTIME': str, 'STORETIME': str},
)
data.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...


## Data Inspection

In [10]:
data.dtypes

ROW_ID           int64
SUBJECT_ID       int64
HADM_ID        float64
CHARTDATE       object
CHARTTIME       object
STORETIME       object
CATEGORY        object
DESCRIPTION     object
CGID           float64
ISERROR        float64
TEXT            object
dtype: object

In [11]:
data['CATEGORY'].value_counts()

Nursing/other        822497
Radiology            522279
Nursing              223556
ECG                  209051
Physician            141624
Discharge summary     59652
Echo                  45794
Respiratory           31739
Nutrition              9418
General                8301
Rehab Services         5431
Social Work            2670
Case Management         967
Pharmacy                103
Consult                  98
Name: CATEGORY, dtype: int64

In [12]:
# The 20 most frequent note descriptions (value_counts sorts descending).
data['DESCRIPTION'].value_counts().head(20)

Report                               1132519
Nursing Progress Note                 191836
CHEST (PORTABLE AP)                   169270
Physician Resident Progress Note       62698
CHEST (PA & LAT)                       43158
CT HEAD W/O CONTRAST                   34485
Respiratory Care Shift Note            31105
Nursing Transfer Note                  30773
Intensivist Note                       26144
CHEST PORT. LINE PLACEMENT             21596
Physician Attending Progress Note      21023
Physician Resident Admission Note      10654
Clinical Nutrition Note                 9395
PORTABLE ABDOMEN                        8143
CHEST (PRE-OP PA & LAT)                 8064
CT CHEST W/CONTRAST                     8001
CT ABDOMEN W/CONTRAST                   7304
MR HEAD W & W/O CONTRAST                7062
CT CHEST W/O CONTRAST                   6745
Generic Note                            6649
Name: DESCRIPTION, dtype: int64

## Look at n examples from each category.

In [13]:
n_examples = 2
SAMPLE_SEED = 42  # fixed seed so the printed examples are the same on every run

# Print up to `n_examples` randomly chosen notes from every category.
# groupby(sort=False) yields the categories in order of first appearance and
# splits the frame once, instead of re-filtering all rows for each category.
# Rows with a missing CATEGORY are skipped by groupby.
for cat, cat_text in data.groupby('CATEGORY', sort=False)['TEXT']:
    # Cap the sample size so that a category with fewer than `n_examples`
    # notes does not raise ValueError. Sampling directly on the Series also
    # avoids copying every note of the category into a Python list first.
    cat_sample = cat_text.sample(
        n=min(n_examples, len(cat_text)),
        random_state=SAMPLE_SEED,
    ).tolist()
    for i, example in enumerate(cat_sample):
        msg.divider(f'{cat} {i+1}')
        print()
        print(example)
        print()

[1m

Name:  [**Known lastname 11465**],[**Known firstname 11466**]                Unit No:   [**Numeric Identifier 11467**]

Admission Date:  [**2200-1-27**]              Discharge Date:   [**2200-2-7**]

Date of Birth:  [**2110-5-17**]             Sex:   F

Service: SURGERY

Allergies:
Heparin Agents

Attending:[**First Name3 (LF) 231**]
Addendum:
[**2200-2-6**] Patient has been noted to have recurrent SOb on arrising
in Am but afterward and remaing of the day is without SOB.
Repeat cxr has been negative for CHF. Echo [**2200-1-28**] showed aortic
valve area of 0.08cm2, ( moderate stenosis ) with mild AI EF
50-55%. Patient awaiting screening for rehab.
[**2200-2-7**] stable. No SOB this am. excellent result from bowel
regment. D/c to rehab.


Discharge Disposition:
Extended Care

Facility:
[**Hospital **] [**Hospital 11468**] Hospital TCU

                             [**First Name11 (Name Pattern1) 77**] [**Last Name (NamePattern1) 237**] MD [**MD Number(1) 238**]

Completed by:[**2

## Parse Discharge Summaries

In [28]:
from multidriftdetector import bow_machine

In [29]:
from importlib import reload  

# Re-execute the already-imported module so that edits made to its source on
# disk take effect without restarting the kernel; the name is rebound to the
# module object returned by reload().
# NOTE: names imported earlier with `from ... import BOWMachine` are NOT
# refreshed by this call and keep pointing at the pre-reload class.
bow_machine = reload(bow_machine)

In [30]:
# 0.4 and 0.6
# NOTE(review): the two figures above are presumably the document-frequency
# thresholds configured inside BOWMachine for this run - confirm in
# bow_machine, since nothing is passed explicitly here.

# Restrict the corpus to discharge summaries and pull out the raw note texts.
chosen_data = data.loc[data['CATEGORY'].eq('Discharge summary')]
free_text = chosen_data.loc[:, 'TEXT'].values

# Convert the texts with a fresh BOWMachine taken from the reloaded module.
mimic_bm = bow_machine.BOWMachine()
bow_df = mimic_bm.convert_train_data(free_text)

In [31]:
bow_df.head()

Unnamed: 0,abdomen,acute,alert,appointment,artery,aspirin,b,bid,bilaterally,bowel,...,sounds,surgery,tablet,telephone,total,treated,use,wbc,week,weeks
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,3,5,0,0,1,...,2,0,0,1,0,1,1,0,1,2
2,0,0,0,0,1,0,1,1,0,0,...,0,1,10,1,1,0,0,1,1,1
3,1,3,1,2,4,5,0,0,1,1,...,1,0,29,3,0,0,4,2,0,1
4,4,2,0,0,1,0,1,0,1,2,...,0,4,2,1,0,0,1,1,0,1


In [32]:
bow_df.shape

(59652, 110)

In [33]:
# Write the current `bow_df` to CSV without the row-index column.
# NOTE(review): the relative file name lands in the current working directory
# and does not encode which BOWMachine settings produced this matrix.
bow_df.to_csv('mimic_bow.csv', index=False)

In [25]:
# 0.1 and 0.9
# NOTE(review): the two figures above are presumably the document-frequency
# thresholds configured inside BOWMachine for this run - confirm in
# bow_machine, since nothing is passed explicitly here.

# Restrict the corpus to discharge summaries and pull out the raw note texts.
chosen_data = data.loc[data['CATEGORY'].eq('Discharge summary')]
free_text = chosen_data.loc[:, 'TEXT'].values

# Convert the texts with a fresh BOWMachine taken from the `bow_machine` module.
mimic_bm = bow_machine.BOWMachine()
bow_df = mimic_bm.convert_train_data(free_text)

In [26]:
bow_df.head()

Unnamed: 0,abd,abdomen,abdominal,able,abnormal,abnormalities,abnormality,abuse,acetaminophen,acid,...,x,xii,xs,y,year,years,yeast,yellow,yo,zosyn
0,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,4,0,2,0,0,1,0,...,5,1,0,0,1,2,0,0,0,0
2,1,0,0,0,0,0,0,0,1,0,...,2,0,0,1,1,3,0,0,1,0
3,0,1,0,0,1,0,0,0,0,2,...,6,0,0,0,0,2,0,0,2,0
4,2,4,0,1,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [27]:
bow_df.shape

(59652, 1101)

In [16]:
# 0.01 and 0.009
# NOTE(review): the two figures above are presumably the document-frequency
# thresholds used for this run; 0.009 may be a typo - confirm. An earlier
# version of the call carried the hint "max_df=0.9, min_df=0.1", but no
# arguments are actually passed here.

# Restrict the corpus to discharge summaries and pull out the raw note texts.
chosen_data = data.loc[data['CATEGORY'].eq('Discharge summary')]
free_text = chosen_data.loc[:, 'TEXT'].values

# Convert the texts with a fresh, directly imported BOWMachine.
mimic_bm = BOWMachine()
bow_df = mimic_bm.convert_train_data(free_text)

In [17]:
bow_df.head()

Unnamed: 0,aa,aaa,aado,aao,aaox,ab,abd,abdomen,abdominal,abdominial,...,zestril,zetia,zinc,zocor,zofran,zoloft,zolpidem,zone,zosyn,zyprexa
0,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,2,0,0,0
4,0,0,0,0,0,0,2,4,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
bow_df.shape

(59652, 5102)