In [None]:
import pandas as pd
import numpy as np
import pickle

BASEDIR_MIMIC = '/Volumes/ExternalData/Data/mimiciii/1.4'

In [None]:
def get_note_events(mimic_dir=None, chunksize=100000):
    """Load discharge-summary reports from NOTEEVENTS.csv and add cleaned text.

    The CSV is streamed in chunks and only rows whose CATEGORY is
    "Discharge summary" and whose DESCRIPTION is "Report" are kept. One row
    per (SUBJECT_ID, HADM_ID) is retained (the first after sorting by
    SUBJECT_ID, HADM_ID, CHARTDATE) and a CLEAN_TEXT column is derived from
    TEXT.

    CLEAN_TEXT is built as follows: the note is split on blank lines; only
    sections whose first line contains a "<header>:" prefix are kept, minus a
    fixed list of uninformative headers; the header is stripped; the section
    is split on newlines followed by a digit or whitespace; bracketed
    "[...]" spans are removed; remaining newlines become spaces; and all
    non-empty fragments are joined with "|".

    Args:
        mimic_dir: Directory containing NOTEEVENTS.csv. Defaults to the
            module-level BASEDIR_MIMIC.
        chunksize: Number of CSV rows read per chunk.

    Returns:
        pandas.DataFrame with the filtered, de-duplicated note rows, a fresh
        0..n-1 index, HADM_ID cast to int, CHARTDATE parsed to datetime, and
        the additional CLEAN_TEXT column.

    Raises:
        ValueError: from pandas if the CSV yields no chunks, or if a kept row
            has a missing HADM_ID (the integer cast fails).
    """
    import re

    if mimic_dir is None:
        mimic_dir = BASEDIR_MIMIC

    # Raw strings: the patterns contain \d, \s and \[ which are regex escapes,
    # not Python string escapes.
    header_re = re.compile(r"^(.*?):")
    line_break_re = re.compile(r"\n\d|\n\s+")
    bracket_re = re.compile(r"\[.*?\]")

    irrelevant_tags = frozenset([
        "Admission Date:", "Date of Birth:", "Service:", "Attending:",
        "Facility:", "Medications on Admission:", "Discharge Medications:",
        "Completed by:", "Dictated By:", "Department:", "Provider:",
    ])

    def clean_note(text):
        """Reduce one raw note to '|'-separated text fragments."""
        sections = []
        for section in text.split("\n\n"):
            header = header_re.match(section)
            # Skip sections without a leading "<header>:" and the ignored headers.
            if header is None or header.group() in irrelevant_tags:
                continue
            body = section[header.end():].strip()
            sections.append("|".join(line_break_re.split(body)))

        # "<>" marks section boundaries until the final flattening below.
        joined = bracket_re.sub("", "<>".join(sections))

        fragments = []
        for part in joined.replace("\n", " ").split("|"):
            fragments.extend(piece for piece in part.split("<>") if piece)
        return "|".join(fragments)

    # Stream the file so the full NOTEEVENTS table never has to sit in memory.
    chunks = pd.read_csv(
        f"{mimic_dir}/NOTEEVENTS.csv",
        iterator=True,
        chunksize=chunksize)
    noteevents = pd.concat(
        [chunk[chunk.CATEGORY.isin(["Discharge summary"])
               & chunk.DESCRIPTION.isin(["Report"])]
         for chunk in chunks])

    noteevents["HADM_ID"] = noteevents["HADM_ID"].astype(int)

    key = ["SUBJECT_ID", "HADM_ID"]
    if noteevents.duplicated(key).any():
        print("There are duplicates on Primary Key Set")

    # NOTE(review): the format requires a time component and errors='coerce'
    # turns every non-matching value into NaT -- confirm CHARTDATE really
    # carries "HH:MM:SS", otherwise the sort below cannot order by date.
    noteevents["CHARTDATE"] = pd.to_datetime(
        noteevents["CHARTDATE"], format='%Y-%m-%d %H:%M:%S', errors='coerce')
    # Kept from the original: sets a global pandas display option.
    pd.set_option('display.max_colwidth', 50)

    # Keep the first row per admission after ordering by chart date.
    noteevents = (
        noteevents
        .sort_values(key + ["CHARTDATE"])
        .drop_duplicates(key)
        .reset_index(drop=True)
    )

    noteevents["CLEAN_TEXT"] = [clean_note(text) for text in noteevents["TEXT"]]

    return noteevents

# Build the cleaned discharge-summary table once. mapNotes() reads this
# module-level `noteevents` variable when it looks up CLEAN_TEXT by HADM_ID,
# so this cell has to run before mapNotes() is called.
noteevents = get_note_events()


In [None]:
def mapNotes(dataset, notes=None, mimic_dir=None):
    """Attach cleaned note text to every episode of a dataset and pickle it.

    Episode file names of the form
    "<SUBJECT_ID>_<ICUSTAY_ID>_episode<N>_timeseries_readmission.csv" are read
    from the 'names' entry of ./train_data_mimic3/<dataset>. Each ICUSTAY_ID is
    mapped to its HADM_ID through ICUSTAYS.csv, and the HADM_ID is used to look
    up CLEAN_TEXT in the notes table. Episodes without a note get an empty
    string. The result is written to ./clinical_notes_<dataset>.pickle as a
    DataFrame with columns SUBJECT_ID, HADM_ID, ICUSTAY_ID, EPISODE,
    CLEAN_TEXT, in the same order as the episode names.

    Args:
        dataset: File name of the pickled dataset inside ./train_data_mimic3.
        notes: DataFrame with HADM_ID and CLEAN_TEXT columns. Defaults to the
            module-level `noteevents`.
        mimic_dir: Directory containing ICUSTAYS.csv. Defaults to the
            module-level BASEDIR_MIMIC.

    Raises:
        ValueError: if an episode name does not match the expected pattern.
        KeyError: if an episode's ICUSTAY_ID is not present in ICUSTAYS.csv.
    """
    import re

    print(f"Mapping notes on {dataset}.")

    # Resolve the notebook-level defaults up front so a missing global fails
    # loudly here instead of silently producing empty notes.
    if notes is None:
        notes = noteevents
    if mimic_dir is None:
        mimic_dir = BASEDIR_MIMIC

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f'./train_data_mimic3/{dataset}', 'rb') as handle:
        df = pickle.load(handle)
    episodes = df['names']

    icustays = pd.read_csv(f"{mimic_dir}/ICUSTAYS.csv", index_col=None)

    # One-time lookup tables instead of a full-frame scan per episode.
    hadm_by_icustay = dict(zip(icustays['ICUSTAY_ID'], icustays['HADM_ID']))
    text_by_hadm = {}
    for hadm, note_text in zip(notes['HADM_ID'], notes['CLEAN_TEXT']):
        # First row wins, matching a positional .iloc[0] on the filtered frame.
        text_by_hadm.setdefault(int(hadm), note_text)

    # SUBJECT_ID "_" ICUSTAY_ID "_episode" episode "_timeseries_readmission.csv"
    name_re = re.compile(r"(\d+)_(\d+)_episode(\d+)_timeseries_readmission\.csv")

    sid = []
    hadmids = []
    icustayid = []  # ICUSTAYS.csv ICUSTAY_ID
    episode = []
    notestexts = []
    notextepis = []  # HADM_IDs for which no note text was found
    for epi in episodes:
        found = name_re.search(epi)
        if found is None:
            raise ValueError(f"Unexpected episode file name: {epi!r}")
        subject_id, stay_id, episode_no = (int(group) for group in found.groups())

        if stay_id not in hadm_by_icustay:
            raise KeyError(f"ICUSTAY_ID {stay_id} not found in ICUSTAYS.csv")
        hadm_id = int(hadm_by_icustay[stay_id])

        text = text_by_hadm.get(hadm_id)
        if text is None:
            # No discharge summary for this admission: keep the row, empty text.
            notextepis.append(hadm_id)
            text = ''

        sid.append(subject_id)
        icustayid.append(stay_id)
        episode.append(episode_no)
        hadmids.append(hadm_id)
        notestexts.append(text)

    print(len(episodes), len(notextepis), len(set(notextepis)))
    print(len(sid), len(hadmids), len(df['names']))

    notesfull = pd.DataFrame({'SUBJECT_ID': sid, 'HADM_ID': hadmids, 'ICUSTAY_ID': icustayid,
                              'EPISODE': episode, 'CLEAN_TEXT': notestexts})

    # save full data
    filename = f'./clinical_notes_{dataset}'
    with open(filename + '.pickle', 'wb') as handle:
        pickle.dump(notesfull, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Finished mapping notes on {dataset}.\n")

In [None]:
def combineData(dataset):
    """Merge a dataset's inputs/labels with its notes, dropping note-less rows.

    Loads ./train_data_mimic3/<dataset> (a mapping whose 'data' entry holds
    the inputs at position 0 and the labels at position 1) and
    clinical_notes_<dataset>.pickle (a DataFrame with a CLEAN_TEXT column).
    Rows whose CLEAN_TEXT is the empty string are removed from inputs, labels
    and notes alike, and the result is written to ./full_<dataset>.pickle as a
    dict with keys 'inputs', 'labels' and 'notes'.

    Args:
        dataset: File name of the pickled dataset inside ./train_data_mimic3.

    Raises:
        AssertionError: if inputs, labels and notes differ in length.
    """
    print(f"Combining data for all {dataset}.")

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f'./train_data_mimic3/{dataset}', 'rb') as handle:
        df = pickle.load(handle)
    print(df.keys(), len(df['data']),len(df['names']), df['data'][0].shape, len(df['data'][1]), len(df['names']))

    with open(f'clinical_notes_{dataset}.pickle', 'rb') as handle:
        notes = pickle.load(handle)

    # Positional boolean mask: True where a note is present. Built from the
    # values rather than from index labels so it does not rely on the frame
    # having a default 0..n-1 index.
    has_text = (notes['CLEAN_TEXT'] != '').to_numpy()

    # how many empty text rows
    print(f"There are {int((~has_text).sum())} empty rows in notes.")

    X = df['data'][0]
    y = np.array(df['data'][1])
    N = list(notes.CLEAN_TEXT)

    # check if all three data sets have the same size/length
    assert len(X) == len(y) == len(N), \
        f"length mismatch: inputs={len(X)}, labels={len(y)}, notes={len(N)}"

    good_notes = np.array(N)[has_text]
    good_X = X[has_text]
    good_y = y[has_text]

    print(f"Final shapes = {good_X.shape, good_y.shape, good_notes.shape}")

    data = {'inputs': good_X,
            'labels': good_y,
            'notes': good_notes}

    # save full data
    filename = f'./full_{dataset}'
    with open(filename + '.pickle', 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print("finished.\n")

In [None]:
# Dataset pickles to process. Only the training split is currently enabled;
# the test and validation names are commented out.
all_datasets = ['train_data'] #, 'test_data', 'val_data']

for dataset in all_datasets:
    print(f"Processing dataset {dataset}.")
    # mapNotes writes clinical_notes_<dataset>.pickle, which combineData then
    # reads, so the two calls must stay in this order.
    mapNotes(dataset)
    combineData(dataset)
    