In [1]:
import sys
sys.path.append('../')
import datasets
import log_reg
from dataproc import extract_wvs
from dataproc import get_discharge_summaries
from dataproc import concat_and_split
from dataproc import build_vocab
from dataproc import vocab_index_descriptions
from dataproc import word_embeddings



import numpy as np
import pandas as pd

from collections import Counter, defaultdict
import csv
import math
import operator

In [16]:
# MIMIC_4_DIR='/Users/kathyx/Documents/研二下课程/6120/final_project/data/mimic4/mimic-iv-3.1'
MIMIC_4_DIR='../../data/mimic4/mimic-iv-3.1'
MIMIC_4_SAVE_DIR='../mimicdata/mimic4_icd9'

Let's do some data processing in a much better way, with a notebook.

First, let's define some stuff.

In [3]:
Y = 'full' #use all available labels in the dataset for prediction
notes_file = f'{MIMIC_4_DIR}/note/discharge.csv' # raw note events downloaded from MIMIC-III
vocab_size = 'full' #don't limit the vocab size to a specific number
vocab_min = 3 #discard tokens appearing in fewer than this many documents

# Data processing

## Combine diagnosis and procedure codes and reformat them

The codes in MIMIC-IV are given in separate files for procedures and diagnoses, and the codes are given without periods, which might lead to collisions if we naively combine them. So we have to add the periods back in the right place.

In [4]:
dfproc = pd.read_csv(f'{MIMIC_4_DIR}/hosp/procedures_icd.csv',
                     dtype={"icd_code": str})
dfdiag = pd.read_csv(f'{MIMIC_4_DIR}/hosp/diagnoses_icd.csv',
                     dtype={"icd_code": str})

In [5]:
print(len(dfproc[dfproc['icd_version']==9]))
print(len(dfproc[dfproc['icd_version']==10]))
print(len(dfproc['icd_version']))

469209
390446
859655


MIMIC IV contains both ICD-9 and ICD-10 code versions; however, for the purposes of processing ICD9-9 codes, we will restrict our focus to ICD-9 codes only.


In [6]:
dfproc9=dfproc[dfproc['icd_version']==9]
dfdiag9=dfdiag[dfdiag['icd_version']==9]

In [7]:
n=5
dfproc9.head(n)

Unnamed: 0,subject_id,hadm_id,seq_num,chartdate,icd_code,icd_version
0,10000032,22595853,1,2180-05-07,5491,9
1,10000032,22841357,1,2180-06-27,5491,9
2,10000032,25742920,1,2180-08-06,5491,9
3,10000068,25022803,1,2160-03-03,8938,9
5,10000280,25852320,1,2151-03-18,8938,9


In [8]:
n=5
dfdiag9.head(n)

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
0,10000032,22595853,1,5723,9
1,10000032,22595853,2,78959,9
2,10000032,22595853,3,5715,9
3,10000032,22595853,4,7070,9
4,10000032,22595853,5,496,9


In [9]:
dfdiag9['absolute_code'] = dfdiag9.apply(lambda row: str(datasets.reformat(str(row[3]), True)), axis=1)
dfproc9['absolute_code'] = dfproc9.apply(lambda row: str(datasets.reformat(str(row[4]), False)), axis=1)

  dfdiag9['absolute_code'] = dfdiag9.apply(lambda row: str(datasets.reformat(str(row[3]), True)), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfdiag9['absolute_code'] = dfdiag9.apply(lambda row: str(datasets.reformat(str(row[3]), True)), axis=1)
  dfproc9['absolute_code'] = dfproc9.apply(lambda row: str(datasets.reformat(str(row[4]), False)), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfproc9['absolute_code'] = dfproc9.apply(lambda row: str(datasets.reformat(str(row[4]), False)), axis=1)


In [10]:
n=5
dfdiag9.head(n)

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,absolute_code
0,10000032,22595853,1,5723,9,572.3
1,10000032,22595853,2,78959,9,789.59
2,10000032,22595853,3,5715,9,571.5
3,10000032,22595853,4,7070,9,70.7
4,10000032,22595853,5,496,9,496.0


In [11]:
n=5
dfproc9.head(n)

Unnamed: 0,subject_id,hadm_id,seq_num,chartdate,icd_code,icd_version,absolute_code
0,10000032,22595853,1,2180-05-07,5491,9,54.91
1,10000032,22841357,1,2180-06-27,5491,9,54.91
2,10000032,25742920,1,2180-08-06,5491,9,54.91
3,10000068,25022803,1,2160-03-03,8938,9,89.38
5,10000280,25852320,1,2151-03-18,8938,9,89.38


In [12]:
dfcodes9 = pd.concat([dfdiag9, dfproc9])

In [13]:
n=5
dfcodes9.head(n)

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,absolute_code,chartdate
0,10000032,22595853,1,5723,9,572.3,
1,10000032,22595853,2,78959,9,789.59,
2,10000032,22595853,3,5715,9,571.5,
3,10000032,22595853,4,7070,9,70.7,
4,10000032,22595853,5,496,9,496.0,


In [17]:
dfcodes9.to_csv(f'{MIMIC_4_SAVE_DIR}/ALL_CODES.csv', index=False,
               columns=['subject_id', 'hadm_id', 'seq_num', 'absolute_code'],
               header=['subject_id', 'hadm_id', 'seq_num', 'ICD9_CODE'])

## How many codes are there?

In [18]:
#In the full dataset (not just discharge summaries)
df = pd.read_csv('%s/ALL_CODES.csv' % MIMIC_4_SAVE_DIR, dtype={"ICD9_CODE": str})
len(df['ICD9_CODE'].unique())

11700

In [19]:
n=5
df.head(n)

Unnamed: 0,subject_id,hadm_id,seq_num,ICD9_CODE
0,10000032,22595853,1,572.3
1,10000032,22595853,2,789.59
2,10000032,22595853,3,571.5
3,10000032,22595853,4,70.7
4,10000032,22595853,5,496.0


## Tokenize and preprocess raw text

Preprocessing time!

This will:
- Select only discharge summaries and their addenda
- remove punctuation and numeric-only tokens, removing 500 but keeping 250mg
- lowercase all tokens

In [20]:
"""
    Reads NOTEEVENTS file, finds the discharge summaries, preprocesses them and writes out the filtered dataset.
"""
import csv

from nltk.tokenize import RegexpTokenizer

from tqdm import tqdm

#retain only alphanumeric
tokenizer = RegexpTokenizer(r'\w+')

def write_discharge_summaries(out_file, data_dir):
    # notes_file = '%s/NOTEEVENTS.csv' % (data_dir)
    notes_file = f'{data_dir}/note/discharge.csv'
    print("processing notes file")
    with open(notes_file, 'r', encoding='utf-8') as csvfile:
        with open(out_file, 'w', encoding='utf-8') as outfile:
            print("writing to %s" % (out_file))
            outfile.write(','.join(['subject_id', 'hadm_id', 'charttime', 'text']) + '\n')
            notereader = csv.reader(csvfile)
            #header
            next(notereader)
            i = 0
            for line in tqdm(notereader):
                subj = int(line[1])
                category = line[3]
                if category == "DS":
                    note = line[7]
                    #tokenize, lowercase and remove numerics
                    tokens = [t.lower() for t in tokenizer.tokenize(note) if not t.isnumeric()]
                    text = '"' + ' '.join(tokens) + '"'
                    outfile.write(','.join([line[1], line[2], line[5], text]) + '\n')
                i += 1
    return out_file

In [21]:
#This reads all notes, selects only the discharge summaries, and tokenizes them, returning the output filename
disch_full_file = write_discharge_summaries(out_file=f"{MIMIC_4_SAVE_DIR}/disch_9_full.csv",data_dir= MIMIC_4_DIR)

processing notes file
writing to ../mimicdata/mimic4_icd9/disch_9_full.csv


331793it [02:19, 2377.03it/s]


Let's read this in and see what kind of data we're working with

In [22]:
df = pd.read_csv(f"{MIMIC_4_SAVE_DIR}/disch_9_full.csv", dtype={'subject_id': str, 'hadm_id': str}, encoding='utf-8', engine='python')

In [23]:
n=5
df.head(n)

Unnamed: 0,subject_id,hadm_id,charttime,text
0,10000032,22595853,2180-05-07 00:00:00,name ___ unit no ___ admission date ___ discha...
1,10000032,22841357,2180-06-27 00:00:00,name ___ unit no ___ admission date ___ discha...
2,10000032,29079034,2180-07-25 00:00:00,name ___ unit no ___ admission date ___ discha...
3,10000032,25742920,2180-08-07 00:00:00,name ___ unit no ___ admission date ___ discha...
4,10000084,23052089,2160-11-25 00:00:00,name ___ unit no ___ admission date ___ discha...


In [24]:
#How many admissions?
len(df['hadm_id'].unique())

331793

In [25]:
print(len(df['subject_id'].unique()))
print(len(df))

145914
331793


In [26]:
#Tokens and types
types = set()
num_tok = 0
for row in df.itertuples():
    for w in row[4].split():
        types.add(w)
        num_tok += 1

In [27]:
print("Num types", len(types))
print("Num tokens", str(num_tok))

Num types 370511
Num tokens 508950426


In [28]:
#Let's sort by SUBJECT_ID and HADM_ID to make a correspondence with the MIMIC-3 label file
df = df.sort_values(['subject_id', 'hadm_id'])

In [29]:
#Sort the label file by the same
dfl = pd.read_csv(f'{MIMIC_4_SAVE_DIR}/ALL_CODES.csv',dtype={"ICD9_CODE": str})
dfl = dfl.sort_values(['subject_id', 'hadm_id'])

In [30]:
len(df['hadm_id'].unique()), len(dfl['hadm_id'].unique())

(331793, 291156)

## Consolidate labels with set of discharge summaries

Looks like there were some HADM_ID's that didn't have discharge summaries, so they weren't included with our notes

In [31]:
#Let's filter out these HADM_ID's
hadm_ids = set(df['hadm_id'])
with open(f'{MIMIC_4_SAVE_DIR}/ALL_CODES.csv', 'r') as lf:
    with open(f'{MIMIC_4_SAVE_DIR}/ALL_CODES_filtered.csv','w') as of:
        w = csv.writer(of)
        w.writerow(['subject_id', 'hadm_id', 'icd9_code', 'admittime', 'dischtime'])
        r = csv.reader(lf)
        #header
        next(r)
        for i,row in enumerate(r):
            hadm_id = row[1]
            #print(hadm_id)
            #break
            if hadm_id in hadm_ids:
                w.writerow(row[:2] + [row[-1], '', ''])

There are also some HADM_ID's that didn't have labels, so they weren't included with our notes.
We need to remove the note.

In [32]:
dfl = pd.read_csv(f'{MIMIC_4_SAVE_DIR}/ALL_CODES_filtered.csv',dtype={"icd9_code": str}, index_col=None)

In [33]:
len(dfl)

2815287

In [34]:
len(dfl['hadm_id'].unique())

209330

In [35]:
#Let's filter out these HADM_ID in the note but not in the label
hadm_lids = set(dfl['hadm_id'])
with open(f'{MIMIC_4_SAVE_DIR}/disch_9_full.csv', 'r', encoding='utf-8') as lf:
    with open(f'{MIMIC_4_SAVE_DIR}/disch_9_filtered.csv','w', encoding='utf-8') as of:
        w = csv.writer(of)
        w.writerow(['subject_id', 'hadm_id', 'charttime', 'text'])
        r = csv.reader(lf)
        #header
        next(r)
        for i,row in enumerate(r):
            hadm_id = int(row[1])
            # hadm_id = row[1]
            #print(hadm_id)
            #break
            if hadm_id in hadm_lids:
                w.writerow(row)

In [36]:
n=5
dfl.head(n)

Unnamed: 0,subject_id,hadm_id,icd9_code,admittime,dischtime
0,10000032,22595853,572.3,,
1,10000032,22595853,789.59,,
2,10000032,22595853,571.5,,
3,10000032,22595853,70.7,,
4,10000032,22595853,496.0,,


In [37]:
#we still need to sort it by HADM_ID
dfl = dfl.sort_values(['subject_id', 'hadm_id'])
dfl.to_csv(f'{MIMIC_4_SAVE_DIR}/ALL_CODES_filtered.csv', index=False)

## Append labels to notes in a single file

In [38]:
#Now let's append each instance with all of its codes
#this is pretty non-trivial so let's use this script I wrote, which requires the notes to be written to file
df = pd.read_csv(f'{MIMIC_4_SAVE_DIR}/disch_9_filtered.csv', index_col=None, encoding='utf-8', engine='python')
df = df.sort_values(['subject_id', 'hadm_id'])
sorted_file = f'{MIMIC_4_SAVE_DIR}/disch_9_filtered.csv'
df.to_csv(sorted_file, index=False, encoding='utf-8')

In [39]:
print(len(df['hadm_id'].unique()))
print(len(dfl['hadm_id'].unique()))
set(dfl['hadm_id'].unique()).issubset(set(df['hadm_id'].unique()))

209330
209330


True

In [40]:
type(df['hadm_id'][0])

numpy.int64

In [41]:
print(len(df['subject_id'].unique()))
print(len(dfl['subject_id'].unique()))
set(int(x) for x in dfl['subject_id'].unique()).issubset(set(df['subject_id'].unique()))

97708
97708


True

In [42]:
"""
    Concatenate the labels with the notes data and split using the saved splits
"""
import csv
from datetime import datetime
import random


import pandas as pd

DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"

def concat_data(labelsfile, notes_file):
    """
        INPUTS:
            labelsfile: sorted by hadm id, contains one label per line
            notes_file: sorted by hadm id, contains one note per line
    """
    with open(labelsfile, 'r', encoding='utf-8') as lf:
        print("CONCATENATING")
        with open(notes_file, 'r', encoding='utf-8') as notesfile:
            outfilename = f'{MIMIC_4_SAVE_DIR}/notes_labeled_icd9_filtered.csv'
            with open(outfilename, 'w', encoding='utf-8') as outfile:
                w = csv.writer(outfile)
                w.writerow(['subject_id', 'hadm_id', 'text', 'labels'])

                labels_gen = next_labels(lf)
                notes_gen = next_notes(notesfile)

                for i, (subj_id, text, hadm_id) in enumerate(notes_gen):
                    if i % 10000 == 0:
                        print(str(i) + " done")
                    cur_subj, cur_labels, cur_hadm = next(labels_gen)

                    if cur_hadm == hadm_id:
                        w.writerow([subj_id, str(hadm_id), text, ';'.join(cur_labels)])
                    else:
                        print("couldn't find matching hadm_id. data is probably not sorted correctly")
                        break
                    
    return outfilename

def next_labels(labelsfile):
    """
        Generator for label sets from the label file
    """
    labels_reader = csv.reader(labelsfile)
    #header
    next(labels_reader)

    first_label_line = next(labels_reader)

    cur_subj = int(first_label_line[0])
    cur_hadm = int(first_label_line[1])
    cur_labels = [first_label_line[2]]

    for row in labels_reader:
        subj_id = int(row[0])
        hadm_id = int(row[1])
        code = row[2]
        #keep reading until you hit a new hadm id
        if hadm_id != cur_hadm or subj_id != cur_subj:
            yield cur_subj, cur_labels, cur_hadm
            cur_labels = [code]
            cur_subj = subj_id
            cur_hadm = hadm_id
        else:
            #add to the labels and move on
            cur_labels.append(code)
    yield cur_subj, cur_labels, cur_hadm

def next_notes(notesfile):
    """
        Generator for notes from the notes file
        This will also concatenate discharge summaries and their addenda, which have the same subject and hadm id
    """
    nr = csv.reader(notesfile)
    #header
    next(nr)

    first_note = next(nr)

    cur_subj = int(first_note[0])
    cur_hadm = int(first_note[1])
    cur_text = first_note[3]
    
    for row in nr:
        subj_id = int(row[0])
        hadm_id = int(row[1])
        text = row[3]
        #keep reading until you hit a new hadm id
        if hadm_id != cur_hadm or subj_id != cur_subj:
            yield cur_subj, cur_text, cur_hadm
            cur_text = text
            cur_subj = subj_id
            cur_hadm = hadm_id
        else:
            #concatenate to the discharge summary and move on
            cur_text += " " + text
    yield cur_subj, cur_text, cur_hadm


In [43]:
df.head(10)

Unnamed: 0,subject_id,hadm_id,charttime,text
0,10000032,22595853,2180-05-07 00:00:00,name ___ unit no ___ admission date ___ discha...
1,10000032,22841357,2180-06-27 00:00:00,name ___ unit no ___ admission date ___ discha...
3,10000032,25742920,2180-08-07 00:00:00,name ___ unit no ___ admission date ___ discha...
2,10000032,29079034,2180-07-25 00:00:00,name ___ unit no ___ admission date ___ discha...
4,10000248,20600184,2192-11-30 00:00:00,name ___ unit no ___ admission date ___ discha...
5,10000560,28979390,2189-10-17 00:00:00,name ___ unit no ___ admission date ___ discha...
6,10000764,27897940,2132-10-19 00:00:00,name ___ unit no ___ admission date ___ discha...
7,10000826,20032235,2146-12-12 00:00:00,name ___ unit no ___ admission date ___ discha...
8,10000826,21086876,2146-12-24 00:00:00,name ___ unit no ___ admission date ___ discha...
9,10000826,28289260,2147-01-02 00:00:00,name ___ unit no ___ admission date ___ discha...


In [44]:
dfl.head(10)

Unnamed: 0,subject_id,hadm_id,icd9_code,admittime,dischtime
0,10000032,22595853,572.3,,
1,10000032,22595853,789.59,,
2,10000032,22595853,571.5,,
3,10000032,22595853,070.70,,
4,10000032,22595853,496,,
5,10000032,22595853,296.80,,
6,10000032,22595853,309.81,,
7,10000032,22595853,V15.82,,
2441408,10000032,22595853,54.91,,
8,10000032,22841357,070.71,,


In [46]:
#For this cell, I do not recommend to run this cell directly.
#You can run through the file data_mimic_IV_concate_note_label.py
#Remember to change the directories there: MIMIC_4_SAVE_DIR, labelsfile,notes_file, output_note_labeled_file
labeled = concat_data(f'{MIMIC_4_SAVE_DIR}/ALL_CODES_filtered.csv', f'{MIMIC_4_SAVE_DIR}/disch_9_filtered.csv')

CONCATENATING
0 done
10000 done
20000 done
30000 done
40000 done
50000 done
60000 done
70000 done
80000 done
90000 done
100000 done
110000 done
120000 done
130000 done
140000 done
150000 done
160000 done
170000 done
180000 done
190000 done
200000 done


Let's sanity check the combined data we just made. Do we have all hadm id's accounted for, and the same vocab stats?

In [48]:
dfnl = pd.read_csv(f'{MIMIC_4_SAVE_DIR}/notes_labeled_icd9_filtered.csv', encoding='utf-8', engine='python')
#Tokens and types
types = set()
num_tok = 0
for row in dfnl.itertuples():
    for w in row[3].split():
        types.add(w)
        num_tok += 1

In [49]:
print("num types", len(types), "num tokens", num_tok)

num types 298427 num tokens 305530808


In [50]:
len(dfnl['hadm_id'].unique())

209330

In [51]:
len(dfnl)

209330

In [52]:
len(dfnl['subject_id'].unique())

97708

## Create train/dev/test splits

In [53]:
def split_data(labeledfile, base_name):
    print("SPLITTING")
    #create and write headers for train, dev, test
    train_name = '%s_train_split.csv' % (base_name)
    dev_name = '%s_dev_split.csv' % (base_name)
    test_name = '%s_test_split.csv' % (base_name)
    train_file = open(train_name, 'w', encoding='utf-8')
    dev_file = open(dev_name, 'w', encoding='utf-8')
    test_file = open(test_name, 'w', encoding='utf-8')
    train_file.write(','.join(['subject_id', 'hadm_id', 'text', 'labels']) + "\n")
    dev_file.write(','.join(['subject_id', 'hadm_id', 'text', 'labels']) + "\n")
    test_file.write(','.join(['subject_id', 'hadm_id', 'text', 'labels']) + "\n")

    hadm_ids = {}

    #read in train, dev, test splits
    for splt in ['train', 'dev', 'test']:
        hadm_ids[splt] = set()
        with open('%s/%s_full_hadm_ids.csv' % (MIMIC_4_SAVE_DIR, splt), 'r') as f:
            for line in f:
                hadm_ids[splt].add(line.rstrip())

    with open(labeledfile, 'r', encoding='utf-8') as lf:
        reader = csv.reader(lf)
        next(reader)
        i = 0
        cur_hadm = 0
        for row in reader:
            #filter text, write to file according to train/dev/test split
            if i % 10000 == 0:
                print(str(i) + " read")

            hadm_id = row[1]

            if hadm_id in hadm_ids['train']:
                train_file.write(','.join(row) + "\n")
            elif hadm_id in hadm_ids['dev']:
                dev_file.write(','.join(row) + "\n")
            elif hadm_id in hadm_ids['test']:
                test_file.write(','.join(row) + "\n")
            else:
                print("Error")

            i += 1

        train_file.close()
        dev_file.close()
        test_file.close()
    return train_name, dev_name, test_name

In [56]:
fname = f'{MIMIC_4_SAVE_DIR}/notes_labeled_icd9_filtered.csv'
base_name = "%s/disch" % MIMIC_4_SAVE_DIR #for output
tr, dv, te = split_data(fname, base_name=base_name)

SPLITTING
0 read
Error
10000 read
Error
20000 read
30000 read
40000 read
50000 read
60000 read
70000 read
Error
80000 read
Error
90000 read
100000 read
Error
110000 read
120000 read
130000 read
140000 read
Error
150000 read
160000 read
170000 read
180000 read
Error
190000 read
200000 read


In [57]:
train_df=pd.read_csv('%s/disch_train_split.csv' % (MIMIC_4_SAVE_DIR), encoding='utf-8',engine='python',dtype={"icd_code": str})
dev_df=pd.read_csv('%s/disch_dev_split.csv' % (MIMIC_4_SAVE_DIR), encoding='utf-8',engine='python',dtype={"icd_code": str})
test_df=pd.read_csv('%s/disch_test_split.csv' % (MIMIC_4_SAVE_DIR), encoding='utf-8',engine='python',dtype={"icd_code": str})
# train_df=pd.read_csv(tr, encoding='utf-8',engine='python')
# dev_df=pd.read_csv(dv, encoding='utf-8',engine='python')
# test_df=pd.read_csv(te, encoding='utf-8',engine='python')

In [58]:
print(len(dfnl['subject_id']))
print(len(train_df['subject_id']))
print(len(dev_df['subject_id']))
print(len(test_df['subject_id']))

209330
188508
7110
13705


In [59]:
print(len(dfnl['subject_id'].unique()))
print(len(train_df['subject_id'].unique()))
print(len(dev_df['subject_id'].unique()))
print(len(test_df['subject_id'].unique()))

97708
87935
3257
6515


In [60]:
print(len(dfnl['subject_id'].unique()))
print(len(train_df['subject_id'].unique()))
print(len(dev_df['subject_id'].unique()))
print(len(test_df['subject_id'].unique()))

97708
87935
3257
6515


## Build vocabulary from training data

In [61]:
import csv
import numpy as np
import operator

from collections import defaultdict
from scipy.sparse import csr_matrix

def build_vocab(vocab_min, infile, vocab_filename):
    """
        INPUTS:
            vocab_min: how many documents a word must appear in to be kept
            infile: (training) data file to build vocabulary from
            vocab_filename: name for the file to output
    """
    with open(infile, 'r', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        #header
        next(reader)

        #0. read in data
        print("reading in data...")
        #holds number of terms in each document
        note_numwords = []
        #indices where notes start
        note_inds = [0]
        #indices of discovered words
        indices = []
        #holds a bunch of ones
        data = []
        #keep track of discovered words
        vocab = {}
        #build lookup table for terms
        num2term = {}
        #preallocate array to hold number of notes each term appears in
        note_occur = np.zeros(400000, dtype=int)
        i = 0
        for row in reader:
            text = row[2]
            numwords = 0
            for term in text.split():
                #put term in vocab if it's not there. else, get the index
                index = vocab.setdefault(term, len(vocab))
                indices.append(index)
                num2term[index] = term
                data.append(1)
                numwords += 1
            #record where the next note starts
            note_inds.append(len(indices))
            indset = set(indices[note_inds[-2]:note_inds[-1]])
            #go thru all the word indices you just added, and add to the note occurrence count for each of them
            for ind in indset:
                note_occur[ind] += 1
            note_numwords.append(numwords)
            i += 1
        #clip trailing zeros
        note_occur = note_occur[note_occur>0]

        #turn vocab into a list so indexing doesn't get fd up when we drop rows
        vocab_list = np.array([word for word,ind in sorted(vocab.items(), key=operator.itemgetter(1))])

        #1. create sparse document matrix
        C = csr_matrix((data, indices, note_inds), dtype=int).transpose()
        #also need the numwords array to be a sparse matrix
        note_numwords = csr_matrix(1. / np.array(note_numwords))
        
        #2. remove rows with less than 3 total occurrences
        print("removing rare terms")
        #inds holds indices of rows corresponding to terms that occur in < 3 documents
        inds = np.nonzero(note_occur >= vocab_min)[0]
        print(str(len(inds)) + " terms qualify out of " + str(C.shape[0]) + " total")
        #drop those rows
        C = C[inds,:]
        note_occur = note_occur[inds]
        vocab_list = vocab_list[inds]

        print("writing output")
        with open(vocab_filename, 'w', encoding='utf-8') as vocab_file:
            for word in vocab_list:
                vocab_file.write(word + "\n")

In [62]:
vocab_min = 3
vname = '%s/vocab.csv' % MIMIC_4_SAVE_DIR
build_vocab(vocab_min, '%s/disch_train_split.csv' % (MIMIC_4_SAVE_DIR), vname)

reading in data...
removing rare terms
102923 terms qualify out of 282160 total
writing output


## Sort each data split by length for batching

In [63]:
for splt in ['train', 'dev', 'test']:
    filename = '%s/disch_%s_split.csv' % (MIMIC_4_SAVE_DIR, splt)
    df = pd.read_csv(filename, encoding='utf-8', engine='python',dtype={"icd_code": str})
    df['length'] = df.apply(lambda row: len(str(row['text']).split()), axis=1)
    df = df.sort_values(['length'])
    df.to_csv('%s/%s_full.csv' % (MIMIC_4_SAVE_DIR, splt), index=False, encoding='utf-8')

## Filter each split to the top 50 diagnosis/procedure codes

In [71]:
Y = 50

In [None]:
# #first calculate the top k
# counts = Counter()
# dfnl = pd.read_csv('%s/notes_labeled.csv' % MIMIC_3_DIR)
# for row in dfnl.itertuples():
#     for label in str(row[4]).split(';'):
#         counts[label] += 1
# codes_50 = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
# codes_50 = [code[0] for code in codes_50[:Y]]

In [66]:
MIMIC_4_DIR_ICD9 = '../mimicdata/mimic4_icd9'
with open(f"{MIMIC_4_DIR_ICD9}/top50_icd9_code_list.txt", "r") as fd:
    codes_50=[x.strip() for x in fd.readlines()]
print(codes_50)

with open('%s/TOP_%s_CODES.csv' % (MIMIC_4_DIR_ICD9, str(Y)), 'w') as of:
    w = csv.writer(of)
    for code in codes_50:
        w.writerow([code])

['274.9', 'V49.86', 'V58.67', '327.23', '428.32', '428.0', '585.6', '300.00', '584.9', 'V45.82', '401.9', '403.90', '427.31', '285.1', '412', '96.6', '38.97', '530.81', '427.89', '585.9', '272.0', '338.29', 'V15.82', '285.9', 'V58.61', 'V45.81', 'V12.54', '599.0', '278.00', '414.00', '486', '244.9', '305.1', 'V12.51', '564.00', '272.4', '250.00', '287.5', '493.90', '311', '276.51', '357.2', '414.01', 'V58.66', '38.93', '276.1', '276.2', '600.00', '496', '733.00']


In [72]:
for splt in ['train', 'dev', 'test']:
    print(splt)
    hadm_ids = set()
    with open('%s/%s_50_hadm_ids.csv' % (MIMIC_4_DIR_ICD9, splt), 'r') as f:
        for line in f:
            hadm_ids.add(line.rstrip())
    with open('%s/notes_labeled_icd9_filtered.csv' % MIMIC_4_DIR_ICD9, 'r', encoding='utf-8') as f:
        with open('%s/%s_%s.csv' % (MIMIC_4_DIR_ICD9, splt, str(Y)), 'w', encoding='utf-8') as of:
            r = csv.reader(f)
            w = csv.writer(of)
            #header
            w.writerow(next(r))
            i = 0
            for row in r:
                hadm_id = row[1]
                if hadm_id not in hadm_ids:
                    continue
                codes = set(str(row[3]).split(';'))
                filtered_codes = codes.intersection(set(codes_50))
                if len(filtered_codes) > 0:
                    w.writerow(row[:3] + [';'.join(filtered_codes)])
                    i += 1

train
dev
test


In [73]:
for splt in ['train', 'dev', 'test']:
    filename = '%s/%s_%s.csv' % (MIMIC_4_DIR_ICD9, splt, str(Y))
    df = pd.read_csv(filename, encoding='utf-8', engine='python')
    df['length'] = df.apply(lambda row: len(str(row['text']).split()), axis=1)
    df = df.sort_values(['length'])
    df.to_csv('%s/%s_%s.csv' % (MIMIC_4_DIR_ICD9, splt, str(Y)), index=False, encoding='utf-8')