# Preprocess text data


In [62]:
# import sent2vec
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from scipy.spatial import distance
import pickle
from tqdm import tqdm
import numpy as np
import pandas as pd
# import torch
# from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import re

In [63]:
# English stop-word list from the NLTK corpus, stored as a set so the
# per-token membership tests in the preprocess_* functions are O(1).
stop_words = set(stopwords.words('english'))
# Same characters as string.punctuation minus '!', '.' and '?', so tokens
# filtered against this string keep their sentence-ending marks.
punctuation_less = '"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~'

def preprocess_sentence(text):
    """Return ``text`` lowercased, tokenized and stripped of punctuation/stop words.

    The separators '/', '.-', '.' and "'" are first padded with spaces so the
    tokenizer splits on them. Because '.' is padded after '.-', the dot inside
    an already padded ' .- ' is padded once more.

    Tokens are then dropped when they occur in ``string.punctuation`` or in
    the module-level ``stop_words`` set. ``punctuation`` is a plain string, so
    that check is a substring test: single punctuation characters are removed,
    while multi-character tokens are removed only if they appear contiguously
    in that string.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Order matters: each replacement runs on the output of the previous one.
    paddings = (
        ('/', ' / '),
        ('.-', ' .- '),
        ('.', ' . '),
        ('\'', ' \' '),
    )
    for separator, padded in paddings:
        text = text.replace(separator, padded)
    lowered = text.lower()

    kept = []
    for tok in word_tokenize(lowered):
        if tok in punctuation or tok in stop_words:
            continue
        kept.append(tok)

    return ' '.join(kept)

def preprocess_sentence_leave_dot(text):
    """Return ``text`` lowercased and tokenized, keeping sentence-ending marks.

    Works like ``preprocess_sentence`` but filters tokens against the
    module-level ``punctuation_less`` string instead of ``string.punctuation``.
    That string lacks '!', '.' and '?', so those tokens survive. The check is
    a substring test against ``punctuation_less``; stop words from
    ``stop_words`` are removed as well.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Order matters: '.' is padded after '.-', so the dot of a padded ' .- '
    # gets padded a second time.
    paddings = (
        ('/', ' / '),
        ('.-', ' .- '),
        ('.', ' . '),
        ('\'', ' \' '),
    )
    for separator, padded in paddings:
        text = text.replace(separator, padded)
    lowered = text.lower()

    kept = []
    for tok in word_tokenize(lowered):
        if tok in punctuation_less or tok in stop_words:
            continue
        kept.append(tok)

    return ' '.join(kept)

In [71]:
# source: https://github.com/kexinhuang12345/clinicalBERT/blob/master/preprocess.py
# Huang, Kexin, Jaan Altosaar, and Rajesh Ranganath. “ClinicalBERT: Modeling Clinical Notes and Predicting Hospital Readmission,” April 10, 2019. https://arxiv.org/abs/1904.05342.

def preprocess1(x):
    """Remove de-identification placeholders and boilerplate from one note.

    The substitutions run in order, each on the output of the previous one.
    Matching is case-sensitive and every pattern is lowercase, so the input
    should already be lowercased (``preprocessing`` does this before calling).

    Args:
        x: Note text as a single string.

    Returns:
        The cleaned string. Removed spans are deleted without inserting a
        replacement space, so surrounding whitespace is left as it was.
    """
    # Raw strings keep regex escapes such as \. and \[ away from Python's own
    # string-escape handling; the non-raw form '\.' is an invalid escape
    # sequence that newer interpreters warn about.
    y = re.sub(r'\[(.*?)\]', '', x)  # remove de-identified brackets
    # Remove "1." / "2." style numbering since the segmenter segments based on
    # it. This also eats the integer part of decimals ("3.5" -> "5").
    y = re.sub(r'[0-9]+\.', '', y)
    y = re.sub(r'dr\.', 'doctor', y)
    y = re.sub(r'm\.d\.', 'md', y)
    y = re.sub(r'admission date:', '', y)
    y = re.sub(r'discharge date:', '', y)
    y = re.sub(r'--|__|==', '', y)  # drop two-character separator runs
    return y

def preprocessing(df, column='notes'):
    """Clean one text column of ``df`` and return the same DataFrame.

    The column is overwritten on the passed-in frame; no copy is made.

    Args:
        df: DataFrame holding the text column.
        column: Name of the column to clean (default ``'notes'``).

    Returns:
        ``df`` itself, with ``column`` replaced by the cleaned text.
    """
    # Missing values become a single space so the string operations below
    # always receive a str.
    notes = df[column].fillna(' ')
    # Flatten line breaks, trim the ends, and lowercase before the
    # case-sensitive patterns in preprocess1 are applied.
    notes = notes.str.replace('\n', ' ').str.replace('\r', ' ')
    notes = notes.apply(str.strip).str.lower()
    notes = notes.apply(preprocess1)
    # Stop-word/punctuation filtering is intentionally left disabled:
    # notes = notes.apply(preprocess_sentence)

    df[column] = notes
    return df

# 0. Pre-processing of all datasets

In [86]:
# data_path = '/Users/jplasser/Documents/AI Master/WS2021/MastersThesis/code.nosync/CNEP/src/data/mimic3/'
data_path = '../data/mimic3/'

datasets = ['train','val','test']

# For every split: load the pickled dict, clean its 'notes' and 'eventsnotes'
# entries with preprocessing(), and write the result next to the input with a
# "_prepro" suffix. The variable keeps the name train_data for all splits.
for dataset in datasets:
    source_file = f'{data_path}new_{dataset}_data_unique_CNEP.pickle'
    target_file = f'{data_path}new_{dataset}_data_unique_CNEP_prepro.pickle'

    # Context managers close the handles deterministically instead of leaving
    # that to garbage collection.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # run this on pickles produced by a trusted pipeline.
    with open(source_file, 'rb') as infile:
        train_data = pickle.load(infile)

    df = pd.DataFrame({'notes': train_data['notes'],
                       'eventsnotes': train_data['eventsnotes']})
    # preprocess train_data['notes']
    df = preprocessing(df, 'notes')

    # preprocess train_data['eventsnotes']
    df = preprocessing(df, 'eventsnotes')

    print(f"Pre-processing of file {source_file} finished!")

    # Store the cleaned columns back as NumPy arrays; other keys of the
    # loaded dict are passed through untouched.
    train_data['notes'] = np.array(df['notes'])
    train_data['eventsnotes'] = np.array(df['eventsnotes'])

    # Closing the file on exit guarantees the pickle is fully flushed before
    # the success message is printed.
    with open(target_file, 'wb') as outfile:
        pickle.dump(train_data, outfile)
    print(f"File {target_file} written!")

Pre-processing of file ../data/mimic3/new_train_data_unique_CNEP.pickle finished!
File ../data/mimic3/new_train_data_unique_CNEP_prepro.pickle written!
Pre-processing of file ../data/mimic3/new_val_data_unique_CNEP.pickle finished!
File ../data/mimic3/new_val_data_unique_CNEP_prepro.pickle written!
Pre-processing of file ../data/mimic3/new_test_data_unique_CNEP.pickle finished!
File ../data/mimic3/new_test_data_unique_CNEP_prepro.pickle written!
