# Text Cleaning

In [12]:
# Import libraries
import numpy as np
import pandas as pd
import os
import string
import re
from IPython.display import display, HTML
import spacy
import scispacy
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

##for clustering
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize
from unidecode import unidecode

import sys

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\luca9\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\luca9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Detect whether the notebook runs on Google Colab by trying to import the
# Colab-only `google.colab` package.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    # Only a failed import means "not on Colab"; any other error should surface
    # instead of being silently swallowed.
    IN_COLAB = False

if IN_COLAB:
    print("We're running Colab")

    # Mount the Google Drive at `mount`
    mount = '/content/gdrive'
    print("Colab: mounting Google drive on ", mount)
    # Connect Colab with the drive
    drive.mount(mount)

    # Directory on the Google Drive that holds the project
    path_to_repo = mount + "/My Drive/MIMIC-III Text Mining/mimim_iii_readmission"
else:
    # Outside Colab the project root is the parent of the current working directory.
    path_to_repo = os.path.dirname(os.getcwd())

print(path_to_repo)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission


In [3]:
# The trailing "" component makes os.path.join end the path with a separator.
data_dir_parts = (path_to_repo, "data", "")
path_to_data = os.path.join(*data_dir_parts)
print(path_to_data)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission\data\


In [4]:
# Load the feather file named "df_cleaned" from the data directory.
cleaned_frame_path = os.path.join(path_to_data, "df_cleaned")
df = pd.read_feather(cleaned_frame_path)

In [5]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33309 entries, 0 to 33308
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   index            33309 non-null  int64         
 1   subject_id       33309 non-null  int64         
 2   hadm_id          33309 non-null  int64         
 3   admittime        33309 non-null  datetime64[ns]
 4   dischtime        33309 non-null  datetime64[ns]
 5   first_careunit   33309 non-null  object        
 6   last_careunit    33309 non-null  object        
 7   age              33309 non-null  float64       
 8   gender           33309 non-null  object        
 9   marital_status   31747 non-null  object        
 10  insurance        33309 non-null  object        
 11  diagnosis        33308 non-null  object        
 12  text             33309 non-null  object        
 13  next_readmit_dt  33309 non-null  float64       
 14  target           33309 non-null  int32

In [7]:
def clean_text(x):
    """Strip de-identification markers and boilerplate from a raw note.

    Steps, in order:
      1. collapse every run of whitespace to a single space;
      2. replace the "[**" and "**]" markers with a space (their content is kept);
      3. remove parenthesised fragments;
      4. split on ":" and keep only the pieces longer than 50 characters,
         which discards short "key: value" style fields;
      5. remove list numbering ("1.", "1)"), the characters * ? = and
         consecutive duplicate words;
      6. normalise "FOLLOW UP" / "FOLLOW-UP" to "FOLLOWUP";
      7. remove "first name" / "last name" placeholders and the Dr / M.D.
         abbreviations;
      8. remove runs of "punctuation + space" and squeeze repeated spaces.

    Parameters
    ----------
    x : str
        Raw note text.

    Returns
    -------
    str
        The cleaned text. A single leading or trailing space may remain.
    """
    x = " ".join(x.split())
    x = x.replace("[**", " ").replace("**]", " ")
    x = re.sub(r"\([^()]*\)", "", x)

    # Keep only the ":"-separated pieces that are longer than 50 characters.
    long_segments = [segment for segment in x.split(":") if len(segment) > 50]
    x = " ".join(long_segments)

    # Remove serialization such as "1." or "1)". A single \d+ is used on purpose:
    # a nested quantifier like (\d+)+ backtracks exponentially on long digit
    # runs that are not followed by "." or ")".
    # NOTE(review): this also strips the integer part of decimals ("3.5" -> "5").
    x = re.sub(r"\d+[.)]", "", x)
    x = re.sub(r"[*?=]+", "", x)  # remove all *, ? and =
    x = re.sub(r"\b(\w+)( \1\b)+", r"\1", x)  # remove consecutive duplicate words
    x = x.replace("FOLLOW UP", "FOLLOWUP")
    x = x.replace("FOLLOW-UP", "FOLLOWUP")
    x = re.sub(r"(\b)(f|F)(irst)(\b)?[\d\-\d]*(\s)*(\b)?(n|N)(ame)[\d\-\d]*(\s)*[\d\-\d]*(\b)", "", x)  # remove firstname
    x = re.sub(r"(\b)(l|L)(ast)(\b)?[\d\-\d]*(\s)*(\b)?(n|N)(ame)[\d\-\d]*(\s)*[\d\-\d]*(\b)", "", x)  # remove lastname
    x = re.sub(r"(\b)(d|D)\.?(r|R)\.?(\b)", "", x)  # remove Dr abbreviation
    x = re.sub(r"(\b)(m|M)\.?(d|D)\.?(\b)", "", x)  # remove M.D. abbreviation
    x = re.sub(r"([^A-Za-z0-9\s](\s)){2,}", "", x)  # remove consecutive punctuations

    # Squeeze every run of spaces; a single str.replace("  ", " ") pass would
    # leave a double space wherever three or more spaces were adjacent.
    return re.sub(r" {2,}", " ", x)

In [8]:
%time df["text"] = df["text"].apply(lambda x: clean_text(x))

CPU times: total: 1min 28s
Wall time: 1min 29s


In [9]:
# English stopword list shipped with NLTK
stop_words = stopwords.words("english")

In [25]:
# Raw string literals keep the regex escapes (\[, \], \|, \s) intact without
# triggering Python's "invalid escape sequence" warning.
# Separator characters that are replaced by a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Every character that is neither a lowercase ASCII letter nor whitespace.
BAD_SYMBOLS_RE = re.compile(r'[^a-z\s]')
# NLTK English stopwords as a set, for fast membership tests
STOPWORDS = set(stopwords.words("english"))

The NLTK stopword list is extended with the clinical stopwords from: https://github.com/kavgan/clinical-concepts

In [27]:
# Read the additional stopwords from a whitespace-separated text file.
# The context manager closes the file handle as soon as the words are read.
with open(os.path.join(path_to_data, "stopwords.txt")) as stopwords_file:
    clinical_stopwords = stopwords_file.read().split()

In [28]:
# Add the clinical stopwords to the NLTK set in place.
STOPWORDS.update(clinical_stopwords)

In [11]:
# English Snowball stemmer, created with ignore_stopwords enabled
stemmer = SnowballStemmer(language="english", ignore_stopwords=True)

In [30]:
def text_prepare(text):
    """Normalise a note into a space-separated string of stemmed tokens.

    The text is lowercased, transliterated with ``unidecode``, stripped of
    everything matched by ``REPLACE_BY_SPACE_RE`` and ``BAD_SYMBOLS_RE``
    (each match becomes a space), tokenised on whitespace, filtered against
    ``STOPWORDS`` and stemmed with the module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        The remaining stems joined by single spaces.
    """
    text = unidecode(text.lower())
    text = REPLACE_BY_SPACE_RE.sub(" ", text)  # separator symbols -> space
    text = BAD_SYMBOLS_RE.sub(" ", text)  # every other unwanted symbol -> space

    # Filter on the surface form first: STOPWORDS holds unstemmed words, so
    # stemming beforehand can turn a listed word into a stem that no longer
    # matches and therefore slips through.
    tokens = [token for token in text.split() if token not in STOPWORDS]
    stems = [stemmer.stem(token) for token in tokens]

    # Second pass keeps the earlier behaviour of dropping any stem that is
    # itself a stopword.
    return " ".join(stem for stem in stems if stem not in STOPWORDS)

In [31]:
%time df['clean'] = df.text.apply(lambda x: text_prepare(x))

CPU times: total: 6min 8s
Wall time: 6min 11s


In [33]:
# Reset to a fresh default index before saving; the old index is dropped.
df = df.reset_index(drop=True)
# Save the dataset in feather format, building the path the same way it is
# built when the file is loaded (independent of a trailing separator).
# NOTE(review): this writes to the same "df_cleaned" file the notebook reads at
# the start, so the input is overwritten and a re-run would clean already
# cleaned text. Confirm this is intended.
df.to_feather(os.path.join(path_to_data, "df_cleaned"))