# This script demonstrates the text pre-processing steps

In [8]:
# Importing required libraries
import nltk # for natural language processing
import re # for regular expression
import string # for punctuations
from nltk.corpus import stopwords # For stop words removal
from nltk.stem import PorterStemmer # For stemming
from nltk.stem import WordNetLemmatizer # For Lemmatizer
from nltk.tokenize import sent_tokenize # For sentence tokenization
from nltk.tokenize import word_tokenize # For word tokenization

In [34]:
# Loading the text for pre-processing
# The sample mixes contractions ("can't", "It's"), capitalised and upper-case
# words, punctuation, a digit and a doubled space, so each later cleaning
# step has something visible to act on.

text = """
'You can't run away from truth'
'What do you mean?'
Truth strikes you every second of your life, You try to hide ,but it is
the light of your life, And  you can't keep it shut, It's 1 lamp that
shows the right way,
TRUTH can never be contained or controlled
"""

In [35]:
# Step 1: Expanding contractions

# Contraction (or contraction suffix) -> expanded form.
# NOTE: the generic "'s" rule rewrites every "'s", so a possessive such as
# "John's" also becomes "John is".
contractionDictionary = {"can't": "can not","'s":" is"}


def _buildContractionsRegExp(contractions):
    """Compile a pattern that matches any key of ``contractions``.

    Keys are escaped so that regex metacharacters inside a contraction are
    matched literally, and they are tried longest-first so that a short key
    cannot pre-empt a longer key that starts with it.
    """
    orderedKeys = sorted(contractions, key=len, reverse=True)
    return re.compile('(%s)' % '|'.join(re.escape(key) for key in orderedKeys))


# Regular expression for finding the default contractions
contractionsRegExp = _buildContractionsRegExp(contractionDictionary)


# Function for expanding contraction
def expandContractions(text,contractionDictionary=contractionDictionary):
    """Return ``text`` with every known contraction replaced by its expansion.

    Args:
        text: The string to process.
        contractionDictionary: Mapping of contraction -> expansion. Defaults
            to the notebook-level ``contractionDictionary``. The pattern is
            built from the mapping that is actually passed in, so a custom
            mapping is honoured.

    Returns:
        A new string with the contractions expanded; ``text`` unchanged when
        the mapping is empty. Matching is case-sensitive.
    """
    # An empty mapping would compile to "()", which matches the empty string
    # everywhere and then fails the dictionary lookup.
    if not contractionDictionary:
        return text

    pattern = _buildContractionsRegExp(contractionDictionary)
    return pattern.sub(lambda match: contractionDictionary[match.group(0)], text)

# Expanding contractions in the sample text: rebinds `text` to the expanded
# string, using the default contractionDictionary
text = expandContractions(text)

In [36]:
# Show the text after contraction expansion
print(text)


'You can not run away from truth'
'What do you mean?'
Truth strikes you every second of your life, You try to hide ,but it is
the light of your life, And  you can not keep it shut, It is 1 lamp that
shows the right way,
TRUTH can never be contained or controlled



In [37]:
# Step 2: Converting the text into lowercase
# Folding case makes "Truth", "TRUTH" and "truth" the same token for the
# later steps, e.g. the case-sensitive stop-word membership test in Step 5.
text = text.lower()

In [38]:
# Show the lower-cased text
print(text)


'you can not run away from truth'
'what do you mean?'
truth strikes you every second of your life, you try to hide ,but it is
the light of your life, and  you can not keep it shut, it is 1 lamp that
shows the right way,
truth can never be contained or controlled



In [39]:
# Step 3: Removal of punctuations
# Build one character class out of every ASCII punctuation mark (escaped so
# that characters such as "]" or "\" stay literal) and delete all matches.
text = re.compile(
    '[%s]' % re.escape(string.punctuation)
).sub('', text)

In [40]:
# Show the text with punctuation removed
print(text)


you can not run away from truth
what do you mean
truth strikes you every second of your life you try to hide but it is
the light of your life and  you can not keep it shut it is 1 lamp that
shows the right way
truth can never be contained or controlled



In [41]:
# Step 4: Remove digits and words containing digits
# Drop every whole token that contains an ASCII digit ("1", "2nd", "covid19")
# rather than only the digit characters, so no fragments such as "nd" are
# left behind. The removed token leaves its surrounding spaces in place; the
# later split/join and extra-space steps collapse them.
text = re.sub(r'\b\w*[0-9]\w*\b', '', text)

In [42]:
# Show the text with digits removed
print(text)


you can not run away from truth
what do you mean
truth strikes you every second of your life you try to hide but it is
the light of your life and  you can not keep it shut it is  lamp that
shows the right way
truth can never be contained or controlled



In [43]:
# Step 5: Removal of stop words
# Keep the stop words in a set so each membership test is a hash lookup.
stopWords = set(stopwords.words('english'))

# Split on any whitespace, drop the stop words, and re-join with single
# spaces (this also collapses newlines and repeated spaces).
text = " ".join(token for token in text.split() if token not in stopWords)

In [44]:
# Show the text with the English stop words removed
print(text)

run away truth mean truth strikes every second life try hide light life keep shut lamp shows right way truth never contained controlled


In [20]:
# Step 5a: Removal of stop words specific to domain
# Extend the existing stop-word set in place with the domain-specific word,
# then filter the text again with the enlarged set.
stopWords.update(['life'])

text = " ".join(token for token in text.split() if token not in stopWords)

In [21]:
# Show the text with the domain-specific stop word removed
print(text)

run away truth mean truth strikes every second try hide light keep shut lamp shows right way truth never contained controlled


In [45]:
# Step 6: Stemming
# Reduce every whitespace-separated token to its Porter stem.
stemmer = PorterStemmer()

text = " ".join(map(stemmer.stem, text.split()))

In [47]:
# Show the stemmed text
print(text)

run away truth mean truth strike everi second tri hide light keep shut lamp show right way truth never contain control


In [50]:
# Step 7: Lemmatization
# NOTE(review): the lemmatizer receives the already-stemmed tokens from
# Step 6 (e.g. "everi", "tri"); lemmatizing before, or instead of, stemming
# would likely be more useful - confirm the intended order.
lemmatizer = WordNetLemmatizer()

text = " ".join(lemmatizer.lemmatize(token) for token in text.split())

In [51]:
# Show the lemmatized text
print(text)

run away truth mean truth strike everi second tri hide light keep shut lamp show right way truth never contain control


In [26]:
# Step 8: Remove Extra Spaces
# Collapse every run of spaces into a single space.
text = re.compile(' +').sub(' ', text)

In [27]:
# Show the text after collapsing repeated spaces
print(text)

run away truth mean truth strike everi second tri hide light keep shut lamp show right way truth never contain control


In [28]:
# Step 9: Tokenization (Sentence)
# NOTE(review): punctuation was already stripped in Step 3, and the recorded
# output is a single-element list; sentence tokenization would presumably be
# more informative on the raw text - confirm the intended order.
tokenized_sentence = sent_tokenize(text)
print(tokenized_sentence)

['run away truth mean truth strike everi second tri hide light keep shut lamp show right way truth never contain control']


In [29]:
# Step 9a: Tokenization (Words)
# Split the cleaned text into a list of word tokens and display it.
tokenized_word = word_tokenize(text)
print(tokenized_word)

['run', 'away', 'truth', 'mean', 'truth', 'strike', 'everi', 'second', 'tri', 'hide', 'light', 'keep', 'shut', 'lamp', 'show', 'right', 'way', 'truth', 'never', 'contain', 'control']
