# Text Cleaning

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import os
import string
import re
from IPython.display import display, HTML
import spacy
import scispacy
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

##for clustering
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize
try:
  from unidecode import unidecode
except:
  !pip install unidecode
  from unidecode import unidecode

import sys

import warnings
warnings.filterwarnings("ignore")

import spacy
try:
    nlp = spacy.load('en_core_web_sm', disable = ['parser','ner'])
except:
    !python -m spacy download en_core_web_sm
    nlp = spacy.load('en_core_web_sm', disable = ['parser','ner'])

In [None]:
import os

# The `google.colab` package can only be imported when the notebook runs on
# Google Colab, so a failed import means a local run.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    print("We're running Colab")

    # Mount the Google Drive at mount
    mount = '/content/gdrive'
    print("Colab: mounting Google drive on ", mount)
    # connect your colab with the drive
    drive.mount(mount)

    # Switch to the directory on the Google Drive that you want to use
    path_to_repo = mount + "/My Drive/MIMIC-III Text Mining/mimim_iii_readmission"
else:
    # Local run: the repository root is taken to be the parent of the current
    # working directory.
    path_to_repo = os.path.dirname(os.getcwd())

print(path_to_repo)

In [None]:
# PARAMETERS

# True keeps only ICU stays; False works on all hospital stays.
icu_stays = True
# True lemmatizes the tokens; False stems them instead.
lemmatize = False
# True selects the heavier processing variant (tagged "_heavier").
heavier_proc = True
# True lemmatizes with spaCy (only relevant when lemmatize is True).
# NOTE(review): this flag rebinds the name `spacy`, hiding the spaCy module
# imported at the top of the notebook.
spacy = False
# set to True if we want to consider future readmissions and avoid using CMS
expanded_def = True

# Suffixes later appended to the name of the saved dataset.
if lemmatize:
    lemma_tag = "_lemma_spacy" if spacy else "_lemma"
else:
    lemma_tag = ""
heavier_tag = "_heavier" if heavier_proc else ""

# Sub-folder of the data directory that matches the selected cohort.
if not icu_stays:
    icu_folder = 'all_hosp'
elif expanded_def:
    icu_folder = 'expanded'
else:
    icu_folder = 'icu_only'

In [None]:
# Data folder for the selected cohort. The trailing "" makes os.path.join end
# the path with a separator, which the final save cell relies on when it
# appends the file name directly to path_to_data.
path_to_data = os.path.join(path_to_repo, "data", icu_folder,"")
print(path_to_data)

In [None]:
# Load the dataframe stored in Feather format as "df_cleaned" in the data folder.
df = pd.read_feather(os.path.join(path_to_data,"df_cleaned"))

In [None]:
# Print pandas' summary of the loaded dataframe.
df.info()

In [None]:
def clean_text(x):
    """
        Remove markup, enumeration markers and name placeholders from a note.

        x: a string (the raw text of one note)
        return: the cleaned string. Only the ":"-separated segments longer
                than 50 characters are kept, so a short input yields "".
    """
    # collapse every whitespace run (newlines included) into one space
    x = " ".join(x.split())
    # drop the "[**" and "**]" markers but keep the text between them
    x = " ".join(" ".join(x.split("[**")).split("**]"))
    # remove parenthesised text (innermost pair only)
    x = re.sub(r"\([^()]*\)", "", x)
    # splitting on ":" separates short "key" fragments from the longer
    # free-text parts; only fragments longer than 50 characters are kept
    segments = x.split(":")
    kept = " ".join(segment for segment in segments if len(segment) > 50)
    # remove all serialization eg 1. 1)
    # A single quantifier is used on purpose: a nested one such as (\d+)+
    # backtracks exponentially on a long run of digits.
    # NOTE(review): this also strips the integer part of decimals ("3.5" -> "5").
    x = re.sub(r"\d+[.)]", "", kept)
    x = re.sub(r"[*?=]+", "", x)  # removing all *, ? and =
    x = re.sub(r"\b(\w+)( \1\b)+", r"\1", x)  # removing consecutive duplicate words
    x = x.replace("FOLLOW UP", "FOLLOWUP")
    x = x.replace("FOLLOW-UP", "FOLLOWUP")
    x = re.sub(r"(\b)(f|F)(irst)(\b)?[\d\-\d]*(\s)*(\b)?(n|N)(ame)[\d\-\d]*(\s)*[\d\-\d]*(\b)", "", x)  # remove firstname
    x = re.sub(r"(\b)(l|L)(ast)(\b)?[\d\-\d]*(\s)*(\b)?(n|N)(ame)[\d\-\d]*(\s)*[\d\-\d]*(\b)", "", x)  # remove lastname
    x = re.sub(r"(\b)(d|D)\.?(r|R)\.?(\b)", "", x)  # remove Dr abbreviation
    x = re.sub(r"(\b)(m|M)\.?(d|D)\.?(\b)", "", x)  # remove M.D. abbreviation
    x = re.sub(r"([^A-Za-z0-9\s](\s)){2,}", "", x)  # remove consecutive punctuations

    # The removals above can leave runs of several spaces; collapse each run
    # to a single space (a plain replace("  ", " ") only halves them).
    return re.sub(r" {2,}", " ", x)

In [None]:
# Run clean_text on every entry of the "text" column, overwriting the column;
# the IPython %time magic reports how long the statement takes.
%time df["text"] = df["text"].apply(lambda x: clean_text(x))

In [None]:
# NLTK's English stop words as a list.
# NOTE(review): `stop_words` is not referenced by the other cells shown in this
# notebook (they use STOPWORDS) - confirm whether it is still needed.
stop_words = stopwords.words('english') # nltk stopwords

In [None]:
# Raw strings keep the backslashes for the regex engine; in a normal string
# literal "\[" or "\s" are invalid escape sequences that Python warns about.
# Separators and brackets that text_prepare turns into spaces.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is neither a lowercase ASCII letter nor whitespace.
BAD_SYMBOLS_RE = re.compile(r'[^a-z\s]')
# Base stop-word collection: NLTK's English list as a set, so that further
# collections can be merged into it.
STOPWORDS = set(stopwords.words('english')) # import stopwords from nltk

Expanded stopwords list from: https://github.com/kavgan/clinical-concepts

In [None]:
# we import a text file with a list of additional stopwords
# (whitespace-separated); the context manager closes the file once it is read
with open(os.path.join(path_to_repo, "data", "stopwords.txt")) as stopwords_file:
    clinical_stopwords = stopwords_file.read().split()

In [None]:
# TODO: find out and document the source of this list.

# Hand-picked words treated as additional stop words when the heavier
# processing is enabled (entries that were listed twice have been merged).
med_stopwords = {
    'street',
    'address',
    'required',
    'powder',
    'developed',
    'intermittently',
    'weekly',
    'later',
    'echo',
    'commands',
    'comfort',
    'back',
    'ultimately',
    'complete',
    'daughter',
    'nutrition',
    'range',
    'knee',
    'subsequently',
    'summary',
    'upon',
    'service',
    'completed',
    'consistent',
    'pattern',
    'woman',
    'inhaled',
    'times',
    'count',
    'number',
    'underwent',
    'post',
    'oral',
    'general',
    'recommend',
    'goal',
    'remains',
    'need',
    'report',
    'solution',
    'female',
    'exploratory',
    'level',
    'poor',
    'aggressive',
    'support',
    'determined',
    'hand',
    'instructions',
    'follow',
    'rehabilitation',
    'large',
    'decreased',
    'stay',
    'four',
    'hours',
    'intake',
    'name',
    'patient',
    'access',
    'good',
    'comfortable',
    'type',
    'initials',
    'external',
    'percent',
    'descending',
    'output',
    'collection',
    'stop',
    'presented',
    'unit',
    'positive',
    'dictated',
    'line',
    'plus',
    'date',
    'active',
    'done',
    'records',
    'state',
    'month',
    'notable',
    'requiring',
    'factor',
    'current',
    'male',
    'history',
    'tenderness',
    'ward',
    'office',
    'port',
    'impression',
    'trace',
    'improvement',
    'group',
    'scan',
    'given',
    'laboratory',
    'right',
    'upper',
    'however',
    'volume',
    'limited',
    'suggestive',
    'presents',
    'year',
    'also',
    "mg",
    "ml",
    "mm",
    "unchanged",
    "normal",
    "admissions",
    "social",
}
# Without the heavier processing no extra words are added. This must be an
# empty *set*: `{}` is an empty dict, and a set cannot be united with a dict.
if not heavier_proc:
    med_stopwords = set()

In [None]:
# In-place union: the stop words read from the text file and the hand-picked
# med_stopwords are added to the NLTK stop words already in STOPWORDS.
STOPWORDS |= set(clinical_stopwords) | med_stopwords # we merge the sets of stopwords

In [None]:
# WordNet lemmatizer, used by text_prepare when lemmatize is True and the
# spaCy flag is off.
lemmatizer = WordNetLemmatizer()
# English Snowball stemmer, used by text_prepare when lemmatize is False.
# Per the NLTK documentation, ignore_stopwords=True returns NLTK's own stop
# words unchanged instead of stemming them.
stemmer = SnowballStemmer("english", ignore_stopwords=True) # we initialize our stemmer

In [None]:
def text_prepare(text):
    """
        Lower-case and transliterate a text, blank out unwanted symbols, then
        stem or lemmatize every token (depending on the notebook parameters).

        text: a string
        return: modified initial string, tokens separated by single spaces
    """
    text = unidecode(text.lower())  # lowercase, then transliterate with unidecode
    text = REPLACE_BY_SPACE_RE.sub(" ", text)  # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub(" ", text)  # blank out symbols which are in BAD_SYMBOLS_RE
    if not lemmatize:
        text = " ".join(stemmer.stem(token) for token in text.split())
    elif spacy is True:
        # `spacy` is the boolean flag from the parameters cell, which shares its
        # name with the spaCy module; the identity check makes sure a module
        # object bound to that name never selects this branch.
        text = " ".join(token.lemma_ for token in nlp(text))
    else:
        text = " ".join(lemmatizer.lemmatize(token) for token in text.split())
    # normalise whitespace once more before returning
    return " ".join(text.split())

# The stop words go through the same pre-processing as the documents so that
# both are compared in the same stemmed/lemmatized form: the collection is
# joined into one text, pre-processed, and split back into a list of tokens.
STOPWORDS = text_prepare(" ".join(STOPWORDS)).split()

def remove_stopwords(text):
    """
        text: a string of whitespace-separated tokens
        return: the string without the tokens found in STOPWORDS
    """
    # STOPWORDS may be a list; building a set once per call makes each
    # membership test a hash lookup instead of a scan of the whole collection.
    blocked = set(STOPWORDS)
    return " ".join(token for token in text.split() if token not in blocked)

def final_text(text):
    """
        text: a string
        return: the string after text_prepare followed by remove_stopwords
    """
    return remove_stopwords(text_prepare(text))

In [None]:
# Store the fully pre-processed version of each "text" entry in a new "clean"
# column; the IPython %time magic reports how long the statement takes.
%time df['clean'] = df.text.apply(lambda x: final_text(x))

In [None]:
# need to reset the index
# (presumably because to_feather expects a default index - confirm against the
# pandas documentation for the installed version)
df.reset_index(inplace=True, drop=True)
# save our dataset up to now in feather format; the file name carries the
# processing tags. The path is built with os.path.join, like the read at the
# top of the notebook, so it does not depend on path_to_data ending with a
# separator.
# NOTE(review): when both tags are empty the output name is "df_cleaned",
# i.e. the same file the notebook loaded its input from.
df.to_feather(os.path.join(path_to_data, f'df_cleaned{lemma_tag}{heavier_tag}'))