# Text Processing

After cleaning the discharge notes of irrelevant information, we move on to processing the text (*removing stopwords and lemmatizing everything*) and performing the classical Bag-of-Words tokenization.

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import os
import psycopg2
import matplotlib.pyplot as plt

%matplotlib inline

import string
import re

import spacy
import gensim
import pprint
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim import utils
from nltk.corpus import stopwords # stopword library
import nltk
nltk.download('stopwords')

from spacy.lang.en.stop_words import STOP_WORDS # stopword library
try:
  from stop_words import get_stop_words # stopword library
except:
  !pip install stop_words
  from stop_words import get_stop_words # stopword library

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\luca9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# PARAMETERS

# set to True if we want to have only ICU stays
icu_stays = True
# set to False if we want to avoid using Med7 preprocessing
med_7 = False

# Suffix appended to file names to identify the Med7 variant of the data.
tag_med7 = '_nomed7' if med_7 == False else ''

# Suffix and data sub-folder identifying the selected cohort.
tag_icu, icu_folder = ('_icu', 'icu_only') if icu_stays == True else ('', 'all_hosp')

In [3]:
# Detect whether the notebook runs on Google Colab by trying to import
# the `google.colab` package.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    # Only a failed import means "not on Colab"; any other exception
    # (e.g. KeyboardInterrupt) must not be silently swallowed.
    IN_COLAB = False

if IN_COLAB:
    print("We're running Colab")

    # Mount the Google Drive at mount
    mount = '/content/gdrive'
    print("Colab: mounting Google drive on ", mount)
    # connect your colab with the drive
    drive.mount(mount)

    # Directory on the Google Drive that you want to use.
    # NOTE(review): this path already ends with "Readmission", while the
    # data path built afterwards appends "Readmission" again and the local
    # branch below points at the parent folder -- confirm against the
    # actual Drive layout.
    path_to_repo = mount + "/My Drive/MIMIC-III Text Mining/Readmission"
else:
    # Locally the repository root is the parent of the working directory.
    path_to_repo = os.path.dirname(os.getcwd())

print(path_to_repo)

C:\Users\luca9\Documents\MIMIC-III Text Mining


In [4]:
# Folder holding the data of the selected cohort.
path_to_data = os.path.join(
    path_to_repo,
    "Readmission",
    "data",
    icu_folder,
    "",  # empty last component -> the path ends with a separator
)
print(path_to_data)

C:\Users\luca9\Documents\MIMIC-III Text Mining\Readmission\data\icu_only\


In [5]:
# Load the processed notes from their feather file
# (file name: "df_processed" plus the Med7 tag).
df_final = pd.read_feather(path_to_data + 'df_processed' + tag_med7)

---

In [6]:
def sent_to_words(sentences):
    """
    Lazily split each text into a list of tokens.

    Every item of `sentences` is cast to `str` and handed to
    `gensim.utils.simple_preprocess` with `deacc=True` (accent marks are
    stripped from the tokens). One token list is yielded per input item,
    in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    for text in sentences:
        yield tokenize(str(text), deacc=True)

We first need to lowercase everything.

In [7]:
# Convert every note to lower case
df_final["clean_text"] = df_final["clean_text"].apply(lambda note: note.lower())

In [8]:
# Convert every diagnosis to lower case
df_final["final_diagnosis"] = df_final["final_diagnosis"].apply(lambda diagnosis: diagnosis.lower())

Then we define an expanded stopword list, pulling terms from multiple libraries (spaCy, NLTK, Gensim and stop-words).

In [9]:
# Define stopwords: unique terms across the four English stopword lists
stop_words = list(
    set(stopwords.words('english'))                # nltk stopwords
    | set(STOP_WORDS)                              # spacy stopwords
    | set(get_stop_words('en'))                    # stop_words stopwords
    | set(gensim.parsing.preprocessing.STOPWORDS)  # Gensim set
)
print(len(stop_words))

441


We then proceed with stopword removal.

In [10]:
# Gensim Functions - > the version I had on my PC did not include custom stopwords
def remove_stopwords(s, stopwords=None):
    """Drop every whitespace-separated token of `s` that is a stopword.

    Parameters
    ----------
    s : str
        Text to filter; it is converted with `utils.to_unicode` first.
    stopwords : iterable of str, optional
        Stopwords to remove. The value is forwarded unchanged to
        `remove_stopword_tokens`, which decides what `None` means.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.

    Examples
    --------
    .. sourcecode:: pycon
        >>> remove_stopwords("better late than never", ["than"])
        'better late never'
    """
    text = utils.to_unicode(s)
    kept_tokens = remove_stopword_tokens(text.split(), stopwords)
    return " ".join(kept_tokens)

def remove_stopword_tokens(tokens, stopwords=None):
    """Remove stopword tokens using the collection `stopwords`.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    stopwords : iterable of str, optional
        Stopwords to remove. Any iterable is accepted (list, set,
        generator, ...). If None, Gensim's
        :const:`~gensim.parsing.preprocessing.STOPWORDS` is used.

    Returns
    -------
    list of str
        Tokens that are not in `stopwords`, in their original order.
    """
    if stopwords is None:
        # The notebook never imports the bare name STOPWORDS, so the
        # fallback has to fetch it here instead of raising NameError.
        from gensim.parsing.preprocessing import STOPWORDS
        stopwords = STOPWORDS
    elif not isinstance(stopwords, (set, frozenset)):
        # Materialise once: constant-time lookups instead of scanning a
        # list for every token, and one-shot iterables are not exhausted
        # by the first membership test.
        stopwords = frozenset(stopwords)
    return [token for token in tokens if token not in stopwords]

In [11]:
# Strip the stopwords from every note
df_final["clean_text"] = df_final["clean_text"].apply(
    lambda note: remove_stopwords(note, stop_words)
)

In [12]:
# Strip the stopwords from every diagnosis
df_final["final_diagnosis"] = df_final["final_diagnosis"].apply(
    lambda diagnosis: remove_stopwords(diagnosis, stop_words)
)

Finally we will tokenize words, ignoring punctuation, and lemmatize everything.

In [13]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """
    Lemmatize tokenized documents, keeping only selected parts of speech.

    Each element of `texts` (a sequence of tokens) is joined with single
    spaces and run through spaCy's `en_core_web_sm` pipeline, loaded with
    the parser and NER components disabled. For every document the result
    holds the `lemma_` of each token whose `pos_` is in `allowed_postags`.

    Returns a list with one list of lemmas per input document.

    https://spacy.io/api/annotation
    """
    pipeline = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    def lemmas_of(tokens):
        parsed = pipeline(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [lemmas_of(tokens) for tokens in texts]

In [14]:
def full_cleaning(text, lemmatize = True, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """
    Tokenize a collection of texts and optionally lemmatize the tokens.

    Parameters
    ----------
    text : iterable
        Texts to process; each one is tokenized with `sent_to_words`.
    lemmatize : bool, optional
        If true, the token lists are passed through `lemmatization`;
        otherwise the plain token lists are returned.
    allowed_postags : list of str, optional
        Part-of-speech tags forwarded to `lemmatization`. Only used when
        `lemmatize` is true.

    Returns
    -------
    list of list of str
        One list of tokens (or lemmas) per input text.
    """
    tokenized = list(sent_to_words(text))
    if not lemmatize:
        return tokenized
    # Forward the caller's tags: the previous version always passed a
    # hard-coded list, so the `allowed_postags` argument had no effect.
    return lemmatization(tokenized, allowed_postags=allowed_postags)

For both the note text and the final diagnosis we create two columns: one processed with lemmatization and one without it. The goal is to check whether pre-trained word embedding methods can recognize more words when the text is not lemmatized.

In [15]:
# Tokenized notes, no lemmatization
df_final["text_def_nolemma"] = full_cleaning(df_final["clean_text"], lemmatize=False)

In [16]:
# Tokenized diagnoses, no lemmatization
df_final["diagnosis_def_nolemma"] = full_cleaning(df_final["final_diagnosis"], lemmatize=False)

In [17]:
# Tokenized and lemmatized notes
df_final["text_def"] = full_cleaning(df_final["clean_text"], lemmatize=True)

In [18]:
# Tokenized and lemmatized diagnoses
df_final["diagnosis_def"] = full_cleaning(df_final["final_diagnosis"], lemmatize=True)

In [19]:
# Keep the target column in its own one-column DataFrame
y = pd.DataFrame(df_final["target"])

In [20]:
# Drop the current index in favour of the default one
# (pandas documents to_feather as requiring a default index)
y = y.reset_index(drop=True)
# Save the target in feather format
y.to_feather(path_to_data + 'y' + tag_med7)

In [21]:
# Reload the target from the feather file just written
y = pd.read_feather(path_to_data + 'y' + tag_med7)

In [22]:
# Drop the current index in favour of the default one
# (pandas documents to_feather as requiring a default index)
df_final = df_final.reset_index(drop=True)
# Save the dataset up to now in feather format
df_final.to_feather(path_to_data + 'df_stem' + tag_med7)

In [23]:
# Reload the dataset from the feather file just written
df_final = pd.read_feather(path_to_data + 'df_stem' + tag_med7)