In [1]:
import codecs
import sys
sys.path.append('../')

from constants import *
import datasets
import log_reg
from dataproc import extract_wvs
from dataproc import get_discharge_summaries
from dataproc import concat_and_split
from dataproc import build_vocab
from dataproc import word_embeddings

from nltk.tokenize import RegexpTokenizer
import pandas as pd
from tqdm import tqdm

from collections import Counter, defaultdict
import csv
import operator
import re

## Filter to notes that are not effectively empty

In [2]:
## Code lightly edited from that for the following paper:
## 
## Adler Perotte, Rimma Pivovarov, Karthik Natarajan, Nicole Weiskopf, Frank Wood, Noemie Elhadad
## Diagnosis Code Assignment: Models and Evaluation Metrics, JAMIA, 2013
## 
## Columbia University
## Biomedical Informatics
## Author: Adler Perotte
# Basically this removes some of the boilerplate parts of notes, and if nothing remains, exclude this note
# Matches runs of ASCII letters; used to decide whether any real words remain in a note.
term_pattern = re.compile('[A-Za-z]+')

# Boilerplate that is blanked out (in this order) before looking for remaining words.
# Compiled once here instead of being re-parsed for every line of the input file.
boilerplate_patterns = [
    re.compile(r'\[[^\]]+\]'),
    re.compile(r'admission date:', flags=re.I),
    re.compile(r'discharge date:', flags=re.I),
    re.compile(r'date of birth:', flags=re.I),
    re.compile(r'sex:', flags=re.I),
    re.compile(r'service:', flags=re.I),
    re.compile(r'dictated by:.*$', flags=re.I),
    re.compile(r'completed by:.*$', flags=re.I),
    re.compile(r'signed electronically by:.*$', flags=re.I),
]

# latin-1 maps every byte to exactly one character, so reading and writing with it
# copies kept lines through unchanged and cannot fail on non-UTF-8 bytes. It also
# matches the encoding the tokenization cell uses to read the filtered file.
with open('%s/MIMIC_RAW_DSUMS' % (MIMIC_2_DIR), 'r', encoding='latin-1') as f:
    with open('%s/MIMIC_FILTERED_DSUMS' % (MIMIC_2_DIR), 'w', encoding='latin-1') as f2:
        for line in f:
            # The note text is the 7th pipe-separated field of each line.
            raw_dsum = line.split('|')[6]

            for pattern in boilerplate_patterns:
                raw_dsum = pattern.sub(' ', raw_dsum)

            # Single-letter terms do not count as content.
            tokens = [token for token in term_pattern.findall(raw_dsum) if len(token) > 1]

            # Keep the original, unmodified line only if some content survived the stripping.
            if tokens:
                f2.write(line)

## Tokenize/preprocess raw text

In [3]:
# Tokenizer that keeps runs of word characters and drops everything in between.
tokenizer = RegexpTokenizer(pattern=r'\w+')

In [4]:
# Read with the built-in open() rather than codecs.open(): codecs stream readers split
# lines on extra separators (for example U+0085, which latin-1 byte 0x85 decodes to),
# which could break a row apart. newline='' on both files lets the csv module own
# line-ending handling, as the csv documentation recommends.
with open('%s/MIMIC_FILTERED_DSUMS' % MIMIC_2_DIR, 'r', encoding='latin-1', newline='') as f:
    with open('%s/proc_dsums.csv' % MIMIC_2_DIR, 'w', newline='') as of:
        r = csv.reader(f, delimiter='|')
        # Skip the header row of the pipe-delimited input.
        next(r)
        w = csv.writer(of)
        w.writerow(['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'TEXT', 'LABELS'])
        for row in tqdm(r):
            # Restore real line breaks before tokenizing.
            note = row[6].replace('[NEWLINE]', '\n')
            # Lower-case every token and drop purely numeric ones.
            tokens = [t.lower() for t in tokenizer.tokenize(note) if not t.isnumeric()]
            text = ' '.join(tokens)
            # Labels arrive comma-separated; store them ';'-separated.
            codes = row[5].replace(',', ';')
            w.writerow([row[0], row[1], row[2], text, codes])

22815it [00:31, 733.09it/s]


## Split data using the given train/test row indices

In [5]:
# Each index file holds one integer per line; collect them into sets for fast lookup.
with open('%s/training_indices.data' % MIMIC_2_DIR) as f:
    train_ids = {int(row.rstrip()) for row in f}

with open('%s/testing_indices.data' % MIMIC_2_DIR) as f:
    test_ids = {int(row.rstrip()) for row in f}

In [6]:
# All three files are handed to the csv module, so open them with newline='' as the
# csv documentation recommends (otherwise writers emit '\r\r\n' on Windows).
with open('%s/proc_dsums.csv' % MIMIC_2_DIR, 'r', newline='') as nf:
    with open('%s/test_dsums.csv' % MIMIC_2_DIR, 'w', newline='') as test_f:
        with open('%s/train_dsums.csv' % MIMIC_2_DIR, 'w', newline='') as train_f:
            r = csv.reader(nf, delimiter=',')
            test_w = csv.writer(test_f)
            train_w = csv.writer(train_f)
            # Copy the header to both outputs, minus the chart-time column (position 2).
            header = next(r)
            del header[2]
            test_w.writerow(header)
            train_w.writerow(header)
            # i is the 0-based position of the row after the header; membership of that
            # position in train_ids / test_ids decides where the row goes. Rows whose
            # position is in neither set are dropped; train wins if it is in both.
            for i, row in enumerate(r):
                # Chart time is not needed downstream.
                del row[2]
                if i in train_ids:
                    train_w.writerow(row)
                elif i in test_ids:
                    test_w.writerow(row)

## Create vocabulary from training data

In [7]:
# Build the vocabulary file from the training notes only.
# NOTE(review): the first argument (3) is presumably a minimum-occurrence threshold for
# "removing rare terms" — confirm against build_vocab.build_vocab.
vfile = build_vocab.build_vocab(
    3,
    '%s/train_dsums.csv' % MIMIC_2_DIR,
    '%s/vocab.csv' % MIMIC_2_DIR,
)

reading in data...
removing rare terms
30688 terms qualify out of 77895 total
writing output


## Sort by length to get final data ready for models

In [8]:
def _write_length_sorted(src_csv, dst_csv, text_col='TEXT'):
    """Add a token-count column, drop near-empty notes, and write rows sorted by length.

    Args:
        src_csv: path of the CSV to read.
        dst_csv: path of the CSV to write (without the pandas index).
        text_col: name of the column holding the space-separated note text. The split
            cell writes it as 'TEXT' at position 2, which is the column the original
            positional lookup row[2] addressed.

    Returns:
        The filtered, sorted DataFrame that was written to dst_csv.
    """
    notes = pd.read_csv(src_csv)
    # Address the text column by label: integer keys on a label-indexed Series are
    # deprecated in pandas. Missing text counts as zero tokens.
    notes['length'] = notes[text_col].map(
        lambda text: 0 if pd.isnull(text) else len(text.split())
    )
    # Keep only notes with more than one token, shortest first.
    notes = notes[notes['length'] > 1]
    notes = notes.sort_values(['length'])
    notes.to_csv(dst_csv, index=False)
    return notes


df = _write_length_sorted('%s/train_dsums.csv' % MIMIC_2_DIR, '%s/train.csv' % MIMIC_2_DIR)

df = _write_length_sorted('%s/test_dsums.csv' % MIMIC_2_DIR, '%s/test.csv' % MIMIC_2_DIR)

## Pre-train word embeddings

In [9]:
# Train word2vec embeddings on every processed note (train and test text together).
# NOTE(review): the trailing positional arguments (100, 3, 5) are presumably embedding
# size, minimum count and number of iterations — confirm against
# word_embeddings.word_embeddings.
w2v_file = word_embeddings.word_embeddings(
    'full',
    '%s/proc_dsums.csv' % MIMIC_2_DIR,
    100,
    3,
    5,
)

building word2vec vocab on /nethome/jmullenbach3/replication/cnn-medical-text/mimicdata/mimic2//proc_dsums.csv...
training...
writing embeddings to /nethome/jmullenbach3/replication/cnn-medical-text/mimicdata/mimic2//processed_full.w2v


## Save embeddings to be read in and used to initialize embedding layers later

In [10]:
import gensim

# Reload the saved word2vec model from disk.
model = gensim.models.Word2Vec.load(
    '%s/processed_full.w2v' % MIMIC_2_DIR
)

In [11]:
# Keep only the word vectors and drop the reference to the full model.
wv = model.wv
del model

In [12]:
# Read the vocabulary: one term per line, trailing whitespace stripped, duplicates collapsed.
with open('%s/vocab.csv' % MIMIC_2_DIR, 'r') as f:
    vocab = {line.rstrip() for line in f}

# Map 1-based indices to terms in sorted order; index 0 is left unassigned.
ind2w = {i+1: w for i, w in enumerate(sorted(vocab))}

In [13]:
# Build the embedding matrix W and the word list from the index->word map and the
# loaded word vectors.
# NOTE(review): W[i] is assumed to be the vector for words[i], since the two are
# written out side by side in the next cell — confirm in extract_wvs.build_matrix.
W, words = extract_wvs.build_matrix(ind2w, wv)

100%|██████████| 30688/30688 [01:02<00:00, 491.23it/s]


In [14]:
# Write one line per word: the word followed by its vector components, space-separated.
with open('%s/processed_full.embed' % MIMIC_2_DIR, 'w') as f:
    for idx, word in enumerate(words):
        fields = [word] + [str(d) for d in W[idx]]
        f.write(" ".join(fields) + "\n")

## Write description vectors with vocab

In [15]:
# Map each code (column 1 of the mapping file) to its description (column 2).
# Unknown codes look up as the empty string because of defaultdict(str).
desc_dict = defaultdict(str)
with open('%s/MIMIC_ICD9_mapping' % MIMIC_2_DIR, 'r') as f:
    r = csv.reader(f)
    # Skip the header row.
    next(r)
    for fields in r:
        key, description = str(fields[1]), str(fields[2])
        desc_dict[key] = description

In [16]:
# Invert the index->word map so words can be looked up by text.
w2ind = {word: index for index, word in ind2w.items()}

In [17]:
# Tokens missing from the vocabulary all share one index just past the last vocab index.
unk_ind = len(w2ind) + 1

# newline='' lets the csv writer control line endings, as the csv documentation recommends.
with open('%s/description_vectors.vocab' % MIMIC_2_DIR, 'w', newline='') as of:
    w = csv.writer(of, delimiter=' ')
    w.writerow(["CODE", "VECTOR"])
    for code, desc in tqdm(desc_dict.items()):
        # Tokenize descriptions the same way as the notes: lower-cased, numerics dropped.
        tokens = [t.lower() for t in tokenizer.tokenize(desc) if not t.isnumeric()]
        inds = [w2ind.get(t, unk_ind) for t in tokens]
        # One row per code: the code followed by the vocab index of each description token.
        w.writerow([code] + [str(i) for i in inds])

100%|██████████| 7042/7042 [00:00<00:00, 50081.62it/s]
