# Text Cleaning in Python

In [None]:
# Silence every warning category so library notices do not clutter the cell outputs.
import warnings

warnings.filterwarnings(action='ignore')

In [None]:
# Three small sample documents; they deliberately contain mixed case, punctuation,
# a contraction, underscores and '#' characters so each cleaning step has something to do.
raw_docs = [
    "Hello Ben! How are you? These are just basic texts.",
    "I'm just writing it for the demo PURPOSE.",
    "The point is to _learn demonstrate how to use NLTK to perform cleansing_ on #simple # data.",
]

In [None]:
#importing nltk package
import nltk

In [None]:
# Download only the NLTK resources this notebook relies on. A bare nltk.download()
# opens the interactive downloader, which stalls "Restart & Run All" waiting for input.
#   punkt / punkt_tab -> word_tokenize and sent_tokenize (newer NLTK releases look for punkt_tab)
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
NLTK_RESOURCES = ["punkt", "punkt_tab", "stopwords", "wordnet"]

# nltk.download returns True/False per resource; show the results so a failed download is visible.
{resource: nltk.download(resource, quiet=True) for resource in NLTK_RESOURCES}


NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> l
Packages:
  [ ] abc................. Australian Broadcasting Commission 2006
  [ ] alpino.............. Alpino Dutch Treebank
  [ ] averaged_perceptron_tagger Averaged Perceptron Tagger
  [ ] averaged_perceptron_tagger_ru Averaged Perceptron Tagger (Russian)
  [ ] basque_grammars..... Grammars for Basque
  [ ] biocreative_ppi..... BioCreAtIvE (Critical Assessment of Information
                           Extraction Systems in Biology)
  [ ] bllip_wsj_no_aux.... BLLIP Parser: WSJ Model
  [ ] book_grammars....... Grammars from NLTK Book
  [ ] brown............... Brown Corpus
  [ ] brown_tei........... Brown Corpus (TEI XML Version)
  [ ] cess_cat............ CESS-CAT Treebank
  [

True

# Step 1 - convert to lower case

In [None]:
import string  # string.punctuation is used later by the punctuation-removal step

# Lower-case every document so that tokens compare case-insensitively downstream.
raw_docs = list(map(str.lower, raw_docs))
print(raw_docs)

['hello ben! how are you? these are just basic texts.', "i'm just writing it for the demo purpose.", 'the point is to _learn demonstrate how to use nltk to perform cleansing_ on #simple # data.']


# Step 2 - Tokenization

In [None]:
# Word tokenization
from nltk.tokenize import word_tokenize

# Split each document into a list of word and punctuation tokens (one list per document).
tokenized_docs = list(map(word_tokenize, raw_docs))
print(tokenized_docs)


[['hello', 'ben', '!', 'how', 'are', 'you', '?', 'these', 'are', 'just', 'basic', 'texts', '.'], ['i', "'m", 'just', 'writing', 'it', 'for', 'the', 'demo', 'purpose', '.'], ['the', 'point', 'is', 'to', '_learn', 'demonstrate', 'how', 'to', 'use', 'nltk', 'to', 'perform', 'cleansing_', 'on', '#', 'simple', '#', 'data', '.']]


In [None]:
# Sentence tokenization
from nltk.tokenize import sent_tokenize

# Break each document into its sentences (one list of sentences per document).
sent_token = list(map(sent_tokenize, raw_docs))
print(sent_token)

[['hello ben!', 'how are you?', 'these are just basic texts.'], ["i'm just writing it for the demo purpose."], ['the point is to _learn demonstrate how to use nltk to perform cleansing_ on #simple # data.']]


# Step 3 - Punctuation Removal

In [None]:
# Removing punctuation
import re

# Character class that matches any single character listed in string.punctuation
# (see https://docs.python.org/3/library/string.html#string.punctuation).
regex = re.compile('[%s]' % re.escape(string.punctuation))

# Strip punctuation characters from every token, then drop tokens that end up empty
# (i.e. tokens that consisted of punctuation only).
tokenized_docs_no_punctuation = [
    [stripped for stripped in (regex.sub('', token) for token in doc_tokens) if stripped]
    for doc_tokens in tokenized_docs
]

print(tokenized_docs_no_punctuation)

[['hello', 'ben', 'how', 'are', 'you', 'these', 'are', 'just', 'basic', 'texts'], ['i', 'm', 'just', 'writing', 'it', 'for', 'the', 'demo', 'purpose'], ['the', 'point', 'is', 'to', 'learn', 'demonstrate', 'how', 'to', 'use', 'nltk', 'to', 'perform', 'cleansing', 'on', 'simple', 'data']]


# Step 4 - Removing Stopwords

In [None]:
# Cleaning text of stopwords
from nltk.corpus import stopwords

# Build the English stopword collection once, as a set. Calling stopwords.words('english')
# inside the loop re-evaluates it for every token and then scans the resulting list linearly;
# a set built up front gives the same filtering with constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# Keep only the tokens that are not stopwords, preserving the per-document grouping and order.
tokenized_docs_no_stopwords = [
    [word for word in doc if word not in english_stopwords]
    for doc in tokenized_docs_no_punctuation
]

print(tokenized_docs_no_stopwords)

[['hello', 'ben', 'basic', 'texts'], ['writing', 'demo', 'purpose'], ['point', 'learn', 'demonstrate', 'use', 'nltk', 'perform', 'cleansing', 'simple', 'data']]


# Step 5 - Stemming and Lemmatization

In [None]:
# Stemming and Lemmatization
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

porter = PorterStemmer()
# The lemmatizer is created but not applied below; to lemmatize instead of stem,
# replace porter.stem(token) with wordnet.lemmatize(token).
wordnet = WordNetLemmatizer()

# Reduce every remaining token to its Porter stem, keeping the per-document grouping.
preprocessed_docs = [
    [porter.stem(token) for token in doc_tokens]
    for doc_tokens in tokenized_docs_no_stopwords
]

print(preprocessed_docs)

[['hello', 'ben', 'basic', 'text'], ['write', 'demo', 'purpos'], ['point', 'learn', 'demonstr', 'use', 'nltk', 'perform', 'cleans', 'simpl', 'data']]


# Advanced cleaning technique 1 - Normalization

In [None]:
# Sample sentence for the normalisation demo: it mixes dotted abbreviations,
# two date formats and a large comma-grouped number.
text = (
    "F.Y.I, On the 30th Jan 2020 30 January 2020, "
    "W.H.O declared the Ebola a Public Health Emergency "
    "and allocated fund of 490,000,000 U.S.D"
)

In [None]:
!pip install normalise
from normalise import normalise

custom_abbr = {
    "W.H.O": "World Health Organization",
    "D.L.S": "United States Dollars",
    "F.Y.I":"For your information"
    
}

normalized_tokens = normalise(word_tokenize(text), user_abbrevs=custom_abbr, verbose=False)
display(f"Normalized text: {' '.join(normalized_tokens)}")

Collecting normalise
[?25l  Downloading https://files.pythonhosted.org/packages/28/2d/f06cf3d3714502dec10e19238a5da201b71ce198165beda9c1adaf5063da/normalise-0.1.8-py3-none-any.whl (15.7MB)
[K     |████████████████████████████████| 15.7MB 318kB/s 
Collecting roman
  Downloading https://files.pythonhosted.org/packages/c3/9e/47df0bf47ccd7e9bbbf0a539ac86e45ded37c34dba544a0a2e5d01ce5f88/roman-3.3-py2.py3-none-any.whl
Installing collected packages: roman, normalise
Successfully installed normalise-0.1.8 roman-3.3


'Normalized text: For your information , On the thirtieth of Jan twenty twenty the thirtieth of January twenty twenty , World Health Organization declared the Ebola a Public Health Emergency and allocated fund of four hundred and ninety million U S D'