# This script demonstrates the text pre-processing steps

In [48]:
# Downloads. Uncomment and execute (after `import nltk`) if you are running this for the first time
#nltk.download('wordnet') # To download wordnet
#nltk.download('omw-1.4') # To download omw-1.4
#nltk.download('averaged_perceptron_tagger') # for POS tagging
#nltk.download('stopwords') # for stop words
#nltk.download('punkt') # for sentence tokenization



# Importing required libraries
import nltk # for natural language processing
import re # for regular expression
import string # for punctuations
from nltk.corpus import stopwords # For stop words removal
from nltk.stem import PorterStemmer # For stemming
from nltk.stem import WordNetLemmatizer # For Lemmatizer
from nltk.tokenize import sent_tokenize # For sentence tokenization
from nltk.tokenize import word_tokenize # For word tokenization

In [49]:
# Loading the text for pre-processing

text = """
'You can't run away from truth'
'What do you mean?'
Truth strikes you every second of your life, You try to hide ,but it is
the light of your life, And  you can't keep it shut, It's 1 lamp that
shows the right way,
TRUTH can never be contained or controlled
"""

In [50]:
# Step 1: Expanding contractions

contractionDictionary = {"can't": "can not", "'s": " is"}


def _compileContractions(mapping):
    """Build one regular expression that matches any key of ``mapping``.

    Keys are escaped so they are matched literally, and they are tried
    longest first so a short key can never pre-empt a longer key that
    starts at the same position.
    """
    orderedKeys = sorted(mapping, key=len, reverse=True)
    return re.compile('(%s)' % '|'.join(re.escape(key) for key in orderedKeys))


# Regular expression for finding the default contractions
contractionsRegExp = _compileContractions(contractionDictionary)


# Function for expanding contraction
def expandContractions(text, contractionDictionary=contractionDictionary):
    """Return ``text`` with every known contraction replaced by its expansion.

    Matching is case-sensitive and purely textual: any occurrence of a key
    is replaced, wherever it appears in ``text``.

    Args:
        text: The string to process.
        contractionDictionary: Maps each contraction to its expansion.
            Defaults to the module-level dictionary; it is only read here.

    Returns:
        The expanded string, or ``text`` unchanged when the mapping is empty.
    """
    # An empty mapping would compile to a pattern that matches the empty
    # string everywhere, and the lookup below would then fail.
    if not contractionDictionary:
        return text

    # Build the pattern from the mapping that was actually passed in, so a
    # caller-supplied dictionary is honoured instead of being looked up
    # against a pattern compiled from the default one.
    pattern = _compileContractions(contractionDictionary)
    return pattern.sub(lambda match: contractionDictionary[match.group(0)], text)

# Expanding Contractions in our text file
text = expandContractions(text)

In [51]:
# Printing the text
print (text)


'You can not run away from truth'
'What do you mean?'
Truth strikes you every second of your life, You try to hide ,but it is
the light of your life, And  you can not keep it shut, It is 1 lamp that
shows the right way,
TRUTH can never be contained or controlled



In [52]:
# Step 2: Converting the text into lowercase
text = text.lower()

In [53]:
# Printing the text
print (text)


'you can not run away from truth'
'what do you mean?'
truth strikes you every second of your life, you try to hide ,but it is
the light of your life, and  you can not keep it shut, it is 1 lamp that
shows the right way,
truth can never be contained or controlled



In [54]:
# Step 3: Removal of punctuations
# Build a character class from string.punctuation (escaped so that regex
# metacharacters are taken literally) and delete every match.
text = re.compile('[%s]' % re.escape(string.punctuation)).sub('', text)

In [55]:
# Printing the text
print (text)


you can not run away from truth
what do you mean
truth strikes you every second of your life you try to hide but it is
the light of your life and  you can not keep it shut it is 1 lamp that
shows the right way
truth can never be contained or controlled



In [56]:
# Step 4: Remove digits and words containing digits
# The pattern matches a whole run of word characters that has at least one
# ASCII digit in it, so a lone "1" and a token such as "abc123" are both
# dropped entirely instead of leaving the letters behind.
text = re.sub(r'\w*[0-9]\w*', '', text)

In [57]:
# Printing the text
print (text)


you can not run away from truth
what do you mean
truth strikes you every second of your life you try to hide but it is
the light of your life and  you can not keep it shut it is  lamp that
shows the right way
truth can never be contained or controlled



In [58]:
# Step 5: Removal of stop words

# A set gives constant-time membership tests while filtering
stopWords = set(stopwords.words('english'))

# Split on whitespace, keep only tokens that are not stop words, and
# re-join them with single spaces
text = " ".join(filter(lambda token: token not in stopWords, text.split()))

In [59]:
# Printing the text
print (text)

run away truth mean truth strikes every second life try hide light life keep shut lamp shows right way truth never contained controlled


In [60]:
# Step 5a: Removal of stop words specific to domain

# Extend the existing stop-word set with a word chosen for this text
stopWords.add('life')

# Filter the text again with the extended set
text = " ".join(token for token in text.split() if token not in stopWords)

In [61]:
# Printing the text
print (text)

run away truth mean truth strikes every second try hide light keep shut lamp shows right way truth never contained controlled


In [62]:
# Step 6: Stemming

stemmer = PorterStemmer()

# Stem each whitespace-separated token and re-join with single spaces
text = " ".join(map(stemmer.stem, text.split()))

In [63]:
# Printing the text
print (text)

run away truth mean truth strike everi second tri hide light keep shut lamp show right way truth never contain control


In [64]:
# Step 7: Lemmatization
# NOTE: the input here is the already stemmed text from Step 6, so the
# lemmatizer receives stems such as 'everi' and 'tri' rather than the
# original words.

lemmatizer = WordNetLemmatizer()

# Lemmatize each whitespace-separated token and re-join with single spaces
text = " ".join(map(lemmatizer.lemmatize, text.split()))

In [65]:
# Printing the text
print (text)

run away truth mean truth strike everi second tri hide light keep shut lamp show right way truth never contain control


In [66]:
# Step 8: Remove Extra Spaces
# Collapse every run of one or more spaces into a single space.

text = re.compile(' +').sub(' ', text)

In [67]:
# Printing the text
print (text)

run away truth mean truth strike everi second tri hide light keep shut lamp show right way truth never contain control


In [68]:
# Step 9: Tokenization (Sentence)
# Split the text into a list of sentences. The text no longer contains any
# punctuation at this point, so the result is a single-element list.
tokenized_sentence = sent_tokenize(text)
print(tokenized_sentence)

['run away truth mean truth strike everi second tri hide light keep shut lamp show right way truth never contain control']


In [69]:
# Step 9a: Tokenization (Words)
# Split the text into a list of individual word tokens.
tokenized_word = word_tokenize(text)
print(tokenized_word)

['run', 'away', 'truth', 'mean', 'truth', 'strike', 'everi', 'second', 'tri', 'hide', 'light', 'keep', 'shut', 'lamp', 'show', 'right', 'way', 'truth', 'never', 'contain', 'control']


### Part-Of-Speech Tagging

Part-of-speech tagging identifies the grammatical group of a given word.

<B>Nouns</B>
NN (noun, singular), NNP (proper noun, singular), NNS (noun, plural) and NNPS (proper noun, plural).

<B>Main Verbs</B>
VB (base form), VBD (past tense), VBG (gerund/present participle), VBN (past participle), VBP (non-3rd person singular present), VBZ (3rd person singular present).


<B>Others</B><BR>
<PRE>CC: coordinating conjunction
CD: cardinal number
DT: determiner
EX: existential there (like: "there is" ... think of it like "there exists")
FW: foreign word
IN: preposition/subordinating conjunction
JJ: adjective	'big'
JJR: adjective, comparative	'bigger'
JJS: adjective, superlative	'biggest'
LS: list marker	1)
MD: modal	could, will
PDT: predeterminer	'all the kids'
POS: possessive ending	parent's
PRP: personal pronoun	I, he, she
PRP$: possessive pronoun	my, his, hers
RB: adverb	very, silently,
RBR: adverb, comparative	better
RBS: adverb, superlative	best
RP: particle	give up
TO: to	go 'to' the store.
UH: interjection	errrrrrrrm
WDT: wh-determiner	which
WP: wh-pronoun	who, what
WP$: possessive wh-pronoun	whose
WRB: wh-adverb	where, when
</PRE>

In [72]:
# Categorization: Part of Speech Tagging (POS)
# Tags an already tokenized sentence; the result is a list of (word, tag)
# pairs. The call is left as a bare expression so the notebook displays it.
nltk.pos_tag(["You", "are", "an", "amazing", "student"])

[('You', 'PRP'),
 ('are', 'VBP'),
 ('an', 'DT'),
 ('amazing', 'JJ'),
 ('student', 'NN')]