This notebook runs the first preprocessing pass over all the notes in MIMIC-III. This pass:
1. Removes bullet-point numbers (e.g. '1.').
2. Rewrites the abbreviations 'dr.' and 'm.d.' as 'doctor' and 'md'.
3. Removes the separator runs '--', '__' and '=='.
4. Collapses whitespace; digits are kept for later preprocessing.

A later preprocessing stage (not included in this notebook) will:
1. Delete brackets -> Replace all brackets with meaningful tokens - Hausing
2. Handle abbreviations - Hausing
3. Replace digits with [num] tokens. - Chutang

In [1]:
import os
import random
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# STEP 1: load Note datasets
# update these constants to run this script
preprocessed_data_folder = './Preprocessed_Data/'
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race
# of a separate os.path.exists() test; makedirs also creates missing parents.
os.makedirs(preprocessed_data_folder, exist_ok=True)

# Directory that receives the preprocessed notes. This dir will be the input
# dir for create_pretrain_data.sh
OUTPUT_DIR = preprocessed_data_folder
# Path to the MIMIC-III NOTEEVENTS csv file
MIMIC_NOTES_FILE = './physionet.org/files/mimiciii/1.4/NOTEEVENTS.csv'

df_notes = pd.read_csv(MIMIC_NOTES_FILE)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# # STEP 2 (disabled): Create the raw (unpreprocessed) notes file
# number = "all"
# # number = 'test'
# file=open(OUTPUT_DIR + 'RAW_clinical_sentences_category_{}.txt'.format(number),'w')
# all_text_value = df_notes['TEXT'].values
# for i in tqdm(range(len(all_text_value))):
#     if len(all_text_value[i]) > 0:
#         # remove the one token note
#         note = all_text_value[i]
#         file.write(note+'\n')
#         file.write('\n')

1. Delete brackets -> Replace all brackets with meaningful tokens - Hausing
2. Abbreviations - Hausing
3. Replace digits with [num] tokens. - Chutang

---- **DONE** ----
3. Remove bullet points
4. Keep "date"
5. Keep digits

In [4]:
# STEP 3: Preprocessing
def preprocess1(x):
    '''
    First-pass cleanup of a single note string.

    The patterns below are lower-case only, so the text should be
    lower-cased before it is passed in.

    Steps, in order:
    1. Remove bullet-point numbers such as "1." or "1.2." (the sentence
       segmenter would otherwise split on them). A number followed by more
       digits after the dot ("5.5 mg", "98.6 f") is a decimal and is kept.
    2. Rewrite the abbreviations "dr." -> "doctor" and "m.d." -> "md".
    3. Remove the separator runs "--", "__" and "==".
    4. Collapse every run of whitespace to one space and strip both ends.
       Digits are deliberately kept for a later preprocessing stage.

    Args:
        x: the note text (str).

    Returns:
        The cleaned text (str).
    '''
    # y = re.sub(r'\[(.*?)\]', '', x)  # remove de-identified brackets (disabled)

    # One or more "<digits>." groups that are NOT followed by another digit.
    # The negative lookahead is what protects decimals: in "5.5" the "5." is
    # followed by a digit, so nothing is removed.
    y = re.sub(r'(?:[0-9]+\.)+(?![0-9])', '', x)
    y = re.sub(r'dr\.', 'doctor', y)
    y = re.sub(r'm\.d\.', 'md', y)
    # these two kind of make sense
    # y = re.sub('admission date:', '', y)
    # y = re.sub('discharge date:', '', y)

    y = re.sub(r'--|__|==', '', y)

    # split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so the join normalises all spacing.
    return " ".join(y.split())


def preprocessing(df_notes):
    '''
    Normalise the TEXT column of the notes frame and run preprocess1 on it.

    Steps applied to every TEXT value, in order:
    1. Missing values become a single space.
    2. A double newline becomes the marker '<paragraph>'; remaining newlines
       and carriage returns become spaces.
    3. Leading/trailing whitespace is stripped and the text is lower-cased.
    4. preprocess1 is applied.

    Note: the TEXT column of the passed frame is overwritten in place and the
    same frame object is returned (no copy is made).

    Args:
        df_notes: DataFrame with a 'TEXT' column.

    Returns:
        The same DataFrame, with 'TEXT' preprocessed.
    '''
    df_notes['TEXT'] = df_notes['TEXT'].fillna(' ')
    # set to different paragraph
    # regex=False: these are literal replacements; being explicit keeps the
    # result independent of the pandas version's default for `regex`.
    df_notes['TEXT'] = df_notes['TEXT'].str.replace('\n\n', '<paragraph>', regex=False)
    df_notes['TEXT'] = df_notes['TEXT'].str.replace('\n', ' ', regex=False)
    df_notes['TEXT'] = df_notes['TEXT'].str.replace('\r', ' ', regex=False)
    df_notes['TEXT'] = df_notes['TEXT'].str.strip()
    df_notes['TEXT'] = df_notes['TEXT'].str.lower()

    df_notes['TEXT'] = df_notes['TEXT'].apply(preprocess1)

    return df_notes

# df_notes_fold is the same object as df_notes (preprocessing works in place).
df_notes_fold = preprocessing(df_notes)

In [5]:
# STEP 4: Create Pretraining File
number = 'with_number'
pretrain_para_value = df_notes_fold['TEXT'].values
# The context manager guarantees the file is flushed and closed when the loop
# ends, so the next step never reads a partially written file.
with open(OUTPUT_DIR + 'Preproc0_clinical_sentences_all_{}.txt'.format(number), 'w') as file:
    for note in tqdm(pretrain_para_value):
        if len(note) > 0:
            # skip empty notes; each '<paragraph>' marker goes back to a line break
            file.write(note.replace('<paragraph>', '\n') + '\n')
        # a blank line is written after every note, including skipped empty ones
        file.write('\n')

100%|██████████| 2083180/2083180 [00:12<00:00, 160330.15it/s]


In [None]:
# STEP 5: Create the preproc0 file for train and val for pretrained longformer

ori_fn = OUTPUT_DIR + 'Preproc0_clinical_sentences_all_with_number.txt'
train_fn = OUTPUT_DIR + 'Preproc0_clinical_sentences_all_with_number_train.txt'
val_fn = OUTPUT_DIR + 'Preproc0_clinical_sentences_all_with_number_val.txt'

p = 0.7  # probability that a line is written to the train file
SPLIT_SEED = 42  # fixed seed so that re-running the cell reproduces the same split
# A dedicated generator keeps the split independent of any other use of `random`.
split_rng = random.Random(SPLIT_SEED)

# NOTE(review): the split is decided independently for every line of the input
# file, so consecutive lines can end up in different output files - confirm
# that a per-line split is what is wanted.
# The with-statement closes all three files even if an error interrupts the loop.
with open(ori_fn) as bigfile, open(train_fn, "w") as trainfile, open(val_fn, "w") as valfile:
    for line in bigfile:
        writing_file = valfile if split_rng.random() > p else trainfile
        writing_file.write(line)