<a href="https://colab.research.google.com/github/imamslogic/NLP/blob/main/Cleaning_Text_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Cleaning in Python Using NLTK

In [46]:
#warnings :)
import warnings
warnings.filterwarnings('ignore')

In [47]:
#Creating bunch of sentences
raw_docs = ["I am writing some very basic english sentences",
"I'm just writing it for the demo PURPOSE to make audience understand the basics .",
"The point is to _learn HOW it works_ on #simple # data."]

In [48]:
#importing nltk package
import nltk

In [49]:
#nltk.download()

#python -m nltk.downloader all

# Step 1 - convert to lower case

In [50]:
import string
raw_docs = [doc.lower() for doc in raw_docs]
print(raw_docs)

['i am writing some very basic english sentences', "i'm just writing it for the demo purpose to make audience understand the basics .", 'the point is to _learn how it works_ on #simple # data.']


In [51]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Step 2 - Tokenization

In [52]:
# word tokenize
from nltk.tokenize import word_tokenize
tokenized_docs = [word_tokenize(doc) for doc in raw_docs]
print(tokenized_docs)

print("#######################################################################################")

#Sentence tokenization

from nltk.tokenize import sent_tokenize
sent_token = [sent_tokenize(doc) for doc in raw_docs]
print(sent_token)

[['i', 'am', 'writing', 'some', 'very', 'basic', 'english', 'sentences'], ['i', "'m", 'just', 'writing', 'it', 'for', 'the', 'demo', 'purpose', 'to', 'make', 'audience', 'understand', 'the', 'basics', '.'], ['the', 'point', 'is', 'to', '_learn', 'how', 'it', 'works_', 'on', '#', 'simple', '#', 'data', '.']]
#######################################################################################
[['i am writing some very basic english sentences'], ["i'm just writing it for the demo purpose to make audience understand the basics ."], ['the point is to _learn how it works_ on #simple # data.']]


# Step 3 - Punctuation Removal

In [53]:
# Removing punctuation
import re
regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html

tokenized_docs_no_punctuation = []

for review in tokenized_docs:
    new_review = []
    for token in review:
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            new_review.append(new_token)
    
    tokenized_docs_no_punctuation.append(new_review)
    
print(tokenized_docs_no_punctuation)

[['i', 'am', 'writing', 'some', 'very', 'basic', 'english', 'sentences'], ['i', 'm', 'just', 'writing', 'it', 'for', 'the', 'demo', 'purpose', 'to', 'make', 'audience', 'understand', 'the', 'basics'], ['the', 'point', 'is', 'to', 'learn', 'how', 'it', 'works', 'on', 'simple', 'data']]


In [54]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Step 4 - Removing Stopwords

In [55]:
# Cleaning text of stopwords
from nltk.corpus import stopwords

tokenized_docs_no_stopwords = []

for doc in tokenized_docs_no_punctuation:
    new_term_vector = []
    for word in doc:
        if not word in stopwords.words('english'):
            new_term_vector.append(word)
    
    tokenized_docs_no_stopwords.append(new_term_vector)

print(tokenized_docs_no_stopwords)

[['writing', 'basic', 'english', 'sentences'], ['writing', 'demo', 'purpose', 'make', 'audience', 'understand', 'basics'], ['point', 'learn', 'works', 'simple', 'data']]


In [56]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Step 5- Stemming and Lemmantization

In [57]:
# Stemming and Lemmatization
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

porter = PorterStemmer()
wordnet = WordNetLemmatizer()

preprocessed_docs = []

for doc in tokenized_docs_no_stopwords:
    final_doc = []
    for word in doc:
        #final_doc.append(porter.stem(word))
        final_doc.append(wordnet.lemmatize(word))
    
    preprocessed_docs.append(final_doc)

print(preprocessed_docs)

[['writing', 'basic', 'english', 'sentence'], ['writing', 'demo', 'purpose', 'make', 'audience', 'understand', 'basic'], ['point', 'learn', 'work', 'simple', 'data']]


In [58]:
pip install normalise



In [59]:
nltk.download('brown')
nltk.download('names')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

# Advance cleaning technique 1 - Normalization 

In [60]:
text = "On the 30th Jan 2020,Corona virus hit India with 1st case in kerala  anywhere, G.O.I started acting and allocated fund of 17287 Crores I.N.R"

In [61]:
from normalise import normalise

custom_abbr = {
    "G.O.I": "Government Of India",
    "I.N.R": "Indian Rupees",
    "ttyl":"talk to you later"
    
}

normalized_tokens = normalise(word_tokenize(text), user_abbrevs=custom_abbr, verbose=False)
display(f"Normalized text: {' '.join(normalized_tokens)}")

'Normalized text: On the thirtieth of Jan twenty twenty , Corona virus hit India with first case in kerala anywhere , Government Of India started acting and allocated fund of seventeen thousand, two hundred and eighty seven Crores Indian Rupees'

# Advance cleaning technique 2 - Type corection Using pyspellchecker, autocorrect and textblob

In [62]:
pip install pyspellchecker



In [63]:
from spellchecker import SpellChecker

In [64]:
Spell = SpellChecker()

In [65]:
doc = ['misspel', 'calandar', 'naccessary', 'dignity', 'bussiness', 'recive']

In [66]:
for word in doc:
  print(f"{word} : {Spell.correction(word)}")

misspel : misspelt
calandar : calendar
naccessary : necessary
dignity : dignity
bussiness : business
recive : receive


In [67]:
for word in doc:
  print(f"{word} : {Spell.candidates(word)}")

misspel : {'misspelt'}
calandar : {'calendar'}
naccessary : {'accessory', 'necessary'}
dignity : {'dignity'}
bussiness : {'fussiness', 'bossiness', 'business'}
recive : {'recife', 'recipe', 'relive', 'revive', 'receive', 'recite'}


In [68]:
pip install autocorrect



In [73]:
from autocorrect import spell

In [75]:
for word in doc:
  print(spell(word))

autocorrect.spell is deprecated,             use autocorrect.Speller instead
missed
autocorrect.spell is deprecated,             use autocorrect.Speller instead
calendar
autocorrect.spell is deprecated,             use autocorrect.Speller instead
necessary
autocorrect.spell is deprecated,             use autocorrect.Speller instead
dignity
autocorrect.spell is deprecated,             use autocorrect.Speller instead
business
autocorrect.spell is deprecated,             use autocorrect.Speller instead
receive


In [76]:
pip install textblob



In [77]:
from textblob import TextBlob, Word

In [78]:
txt = TextBlob("He is very much accomadate in his new locotion")

In [80]:
for word in txt.words:
  print(word,":",word.correct())

He : He
is : is
very : very
much : much
accomadate : accommodate
in : in
his : his
new : new
locotion : location
