In [16]:
# Prerequisite: nltk must be installed in the kernel's environment, e.g. by running "%pip install nltk" once.

In [17]:
import pandas as pd
import re
import string
import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [18]:
# Load the raw clinical notes and preview the free-text column that the
# cleaning steps below operate on.
RAW_NOTES_CSV = "../data/raw/clinical_notes.csv"
df = pd.read_csv(RAW_NOTES_CSV)
df["note_text"].head()

0    Patient is a 32-year-old Male with a history o...
1    Patient is a 65-year-old Female with a history...
2    Patient is a 20-year-old Female with a history...
3    Patient is a 39-year-old Female with a history...
4    Patient is a 69-year-old Male with a history o...
Name: note_text, dtype: object

In [19]:
# Normalise case so that, for example, "Patient" and "patient" are treated as
# the same word.
lowercased_notes = df["note_text"].str.lower()
df["clean_text"] = lowercased_notes
df["clean_text"].head()

0    patient is a 32-year-old male with a history o...
1    patient is a 65-year-old female with a history...
2    patient is a 20-year-old female with a history...
3    patient is a 39-year-old female with a history...
4    patient is a 69-year-old male with a history o...
Name: clean_text, dtype: object

In [20]:
# Strip punctuation and digits, which carry little meaning for a simple NLP
# task. Each matching character is replaced by a space (not deleted) so the
# words on either side of it, as in "32-year-old", remain separate.
punct_or_digit = re.compile(f"[{re.escape(string.punctuation)}0-9]")
df["clean_text"] = df["clean_text"].apply(lambda note: punct_or_digit.sub(" ", note))

In [21]:
# Tokenisation: split each cleaned note on whitespace into a list of words so
# that individual word frequencies can be analysed.
cleaned_notes = df["clean_text"]
df["tokens"] = cleaned_notes.str.split()

In [22]:
# Remove stopwords: very common words such as "is", "to" and "the" carry
# little signal for the analysis, so they are dropped from every token list.
nltk.download("stopwords")

# Keep the set under its own name. Rebinding `stopwords` would shadow the
# imported nltk corpus reader, so running this cell a second time would fail
# with AttributeError (a set has no `.words`).
english_stopwords = set(stopwords.words("english"))

df["tokens"] = df["tokens"].apply(
    lambda tokens: [word for word in tokens if word not in english_stopwords]
)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hines\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Lemmatisation: normalise each token to its WordNet base form,
# e.g. "diagnoses" -> "diagnosis".
# NOTE(review): lemmatize() is called without a `pos` argument; NLTK documents
# the default as noun, so verb forms such as "running" are presumably left
# unchanged -- confirm before relying on verb normalisation.

nltk.download("wordnet")
# omw-1.4 is version 1.4 of the Open Multilingual Wordnet: a collection of
# wordnets for many languages, structured like Princeton WordNet, that
# provides lexical information useful for multilingual lemmatisation.
nltk.download("omw-1.4")
lemmatizer = WordNetLemmatizer()


def lemmatize_tokens(words):
    """Return the lemma of every token in `words`, preserving order."""
    return [lemmatizer.lemmatize(word) for word in words]


df["tokens"] = df["tokens"].apply(lemmatize_tokens)
df["tokens"].head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hines\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hines\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


0    [patient, year, old, male, history, healthy, p...
1    [patient, year, old, female, history, hyperten...
2    [patient, year, old, female, history, diabetes...
3    [patient, year, old, female, history, asthma, ...
4    [patient, year, old, male, history, hypertensi...
Name: tokens, dtype: object

In [None]:
# Join the tokens back into one space-separated string per note: some models,
# such as TF-IDF (term frequency-inverse document frequency, which reflects how
# important a word is to a document in a collection or corpus), expect a full
# string rather than a list of tokens.
df["processed_text"] = df["tokens"].apply(" ".join)

# Preview the result, as the other cleaning cells do.
df["processed_text"].head()

0    patient year old male history healthy presenti...
1    patient year old female history hypertension p...
2    patient year old female history diabetes prese...
3    patient year old female history asthma present...
4    patient year old male history hypertension pre...
Name: processed_text, dtype: object

In [25]:
# Side-by-side check of the raw notes against their fully processed form.
preview_columns = ["note_text", "processed_text"]
print(df[preview_columns].head(5))

                                           note_text  \
0  Patient is a 32-year-old Male with a history o...   
1  Patient is a 65-year-old Female with a history...   
2  Patient is a 20-year-old Female with a history...   
3  Patient is a 39-year-old Female with a history...   
4  Patient is a 69-year-old Male with a history o...   

                                      processed_text  
0  patient year old male history healthy presenti...  
1  patient year old female history hypertension p...  
2  patient year old female history diabetes prese...  
3  patient year old female history asthma present...  
4  patient year old male history hypertension pre...  
