This notebook does the following:
 * Extract data from the MIMIC-III database
 * Inspect the data and choose an appropriate subset of documents
 * Divide the documents into a structured and a free-text component
 * Parse the structured component of the documents
 * Convert the free text into bag-of-words (BOW) format
 * Write the documents in ARFF format

In [3]:
import os
import pandas as pd
from random import sample

In [4]:
from medtext_streams.bow_machine import BOWMachine

In [5]:
import wasabi
# Shared console printer; used further down to draw a divider line above each
# printed example note.
msg = wasabi.Printer()

In [6]:
# Absolute path to the raw MIMIC-III notes table, resolved relative to the
# directory the notebook is run from.
MIMIC_path = os.path.abspath('../../FeatureCat/data/raw/NOTEEVENTS.csv')

# NOTE(review): the truncated warning saved in this cell's output looks like
# pandas' DtypeWarning ("Columns ... have mixed types") - confirm. Parsing the
# file in one pass with low_memory=False lets pandas infer each column's dtype
# from the whole column instead of chunk by chunk, which is the remedy that
# warning recommends. It costs more memory while reading.
data = pd.read_csv(MIMIC_path, low_memory=False)

# Show the first rows as a quick sanity check of the load.
data.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...


## Data Inspection

In [7]:
# Column dtypes as inferred by read_csv (no explicit dtype mapping was given).
data.dtypes

ROW_ID           int64
SUBJECT_ID       int64
HADM_ID        float64
CHARTDATE       object
CHARTTIME       object
STORETIME       object
CATEGORY        object
DESCRIPTION     object
CGID           float64
ISERROR        float64
TEXT            object
dtype: object

In [8]:
# Row count for each CATEGORY value, most frequent first.
data['CATEGORY'].value_counts()

Nursing/other        822497
Radiology            522279
Nursing              223556
ECG                  209051
Physician            141624
Discharge summary     59652
Echo                  45794
Respiratory           31739
Nutrition              9418
General                8301
Rehab Services         5431
Social Work            2670
Case Management         967
Pharmacy                103
Consult                  98
Name: CATEGORY, dtype: int64

In [9]:
# The 20 most frequent DESCRIPTION values together with their row counts.
data['DESCRIPTION'].value_counts().head(20)

Report                               1132519
Nursing Progress Note                 191836
CHEST (PORTABLE AP)                   169270
Physician Resident Progress Note       62698
CHEST (PA & LAT)                       43158
CT HEAD W/O CONTRAST                   34485
Respiratory Care Shift Note            31105
Nursing Transfer Note                  30773
Intensivist Note                       26144
CHEST PORT. LINE PLACEMENT             21596
Physician Attending Progress Note      21023
Physician Resident Admission Note      10654
Clinical Nutrition Note                 9395
PORTABLE ABDOMEN                        8143
CHEST (PRE-OP PA & LAT)                 8064
CT CHEST W/CONTRAST                     8001
CT ABDOMEN W/CONTRAST                   7304
MR HEAD W & W/O CONTRAST                7062
CT CHEST W/O CONTRAST                   6745
Generic Note                            6649
Name: DESCRIPTION, dtype: int64

## Look at n examples from each category.

In [10]:
# Number of notes to print per category, and a fixed seed so that a
# Restart & Run All prints the same notes every time.
n_examples = 2
sample_seed = 0

# groupby visits every category in a single pass instead of re-filtering the
# whole frame once per category; sort=False keeps first-appearance order, the
# same order .unique() would give. Rows with a missing CATEGORY are skipped.
for cat, cat_text in data.groupby('CATEGORY', sort=False)['TEXT']:
    # Cap the draw at the group size: asking for more items than a category
    # holds would otherwise raise instead of printing what is available.
    n_draw = min(n_examples, len(cat_text))
    # Series.sample draws without replacement and without first copying every
    # note of the category into a Python list.
    cat_sample = cat_text.sample(n=n_draw, random_state=sample_seed)
    for i, example in enumerate(cat_sample):
        # Header of the form "<category> <1-based example number>".
        msg.divider(f'{cat} {i+1}')
        print()
        print(example)
        print()

[1m

Admission Date:  [**2117-1-29**]       Discharge Date:  [**2117-2-3**]

Date of Birth:   [**2117-1-29**]       Sex:  F

Service:  NEONATOLOGY
HISTORY OF THE PRESENT ILLNESS:  [**Known lastname 8463**] [**Known lastname 48993**] is a 36
week gestational age female referred to the Newborn Intensive
Care Unit at the request of Dr. [**First Name (STitle) **] for assessment of a
cardiac murmur.

II, para I now II mother with unremarkable prenatal screens.
Blood type O positive, direct antibody test negative,
hepatitis B surface antigen negative, RPR nonreactive,
rubella immune, group beta strep negative. There is a history
of congenital heart disease and trisomy 21 in a paternal
uncle.

for estimated gestational age of 36 2/7 weeks.  This was a
repeat cesarean section under spinal anesthesia.  No maternal
fever or fetal tachycardia.  Membranes were ruptured at the
time of delivery yielding clear amniotic fluid.  At the
delivery, the infant received bulb suctioning, tactile
stimulation

## Parse Discharge Summaries

In [16]:
import medtext_streams.bow_machine

In [26]:
from importlib import reload

import medtext_streams.bow_machine

# `import medtext_streams.bow_machine` binds only the top-level name
# `medtext_streams`, and no visible cell defines a bare `bow_machine`, so the
# module has to be reached through the package attribute. Passing the bare
# name raised NameError on a fresh kernel.
bow_machine = reload(medtext_streams.bow_machine)

# reload() re-executes the module but leaves names that were copied out of it
# earlier via `from ... import` pointing at the old objects. Rebind the class
# so the cells below actually use the reloaded code.
BOWMachine = bow_machine.BOWMachine

In [32]:
# Keep only the discharge summaries and pull their TEXT column out as an array.
chosen_data = data.loc[data['CATEGORY'] == 'Discharge summary']
free_text = chosen_data['TEXT'].values

# Fit a fresh BOWMachine on those texts; the result is the table shown in the
# next cell.
mimic_bm = BOWMachine()
bow_df = mimic_bm.convert_train_data(free_text)

In [33]:
# Preview the first rows of the table returned by convert_train_data.
bow_df.head()

Unnamed: 0,aa,aaa,aado,aao,aaox,ab,abd,abdomen,abdominal,abdominial,...,zestril,zetia,zinc,zocor,zofran,zoloft,zolpidem,zone,zosyn,zyprexa
0,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,2,0,0,0
4,0,0,0,0,0,0,2,4,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Number of entries in the machine's `vocab` attribute.
# NOTE(review): presumably the vocabulary built by convert_train_data above -
# confirm against BOWMachine.
len(mimic_bm.vocab)

5102

In [35]:
# Write the table to 'mimic_bow.csv' (a relative path, so it lands in the
# current working directory). index=False leaves the row index out of the file.
bow_df.to_csv('mimic_bow.csv', index=False)

In [37]:
# Builds and displays a 3-row toy frame with columns x and y.
# NOTE(review): does not use the MIMIC data - looks like a leftover scratch
# cell; confirm whether it should stay.
pd.DataFrame({'x': [1,2,3], 'y': [4,5,6]})

Unnamed: 0,x,y
0,1,4
1,2,5
2,3,6


In [38]:
# Draws two random rows from the same 3-row toy frame. No random_state is
# passed, so which rows are shown can differ from run to run.
# NOTE(review): looks like a leftover scratch cell; confirm whether it should stay.
pd.DataFrame({'x': [1,2,3], 'y': [4,5,6]}).sample(2)

Unnamed: 0,x,y
2,3,6
1,2,5
