# Create an NLP Pipeline to 'Clean' Reviews Data
- Load Input File and Read Reviews
- Tokenize
- Remove Stopwords
- Perform Stemming
- Write cleaned data to output file

*NLTK*

In [2]:
import re

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [3]:
# Shared pipeline components, built once and reused for every review.
tokenizer = RegexpTokenizer(r'\w+')  # emits each run of word characters as one token
en_stopwords = set(stopwords.words('english'))  # a set keeps the membership test cheap while filtering
ps = PorterStemmer()  # reduces a word to its stem, e.g. "loved" -> "love"

In [4]:
def getStemmedReview(review):
    """Clean one raw review and return it as a string of stemmed tokens.

    Steps, in order:
      1. Lower-case the text.
      2. Replace HTML line-break tags with a space.
      3. Split the text into tokens with the module-level ``tokenizer``.
      4. Drop every token found in ``en_stopwords``.
      5. Stem the remaining tokens with ``ps``.

    Parameters
    ----------
    review : str
        Raw review text; may contain ``<br>``, ``<br/>`` or ``<br />`` tags.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (an empty
        string when no token survives).
    """
    review = review.lower()

    # Remove every spelling of the break tag, not only the doubled
    # "<br /><br />" form: a lone "<br />" would otherwise reach the
    # word tokenizer and leave a spurious "br" token in the output.
    # The text is already lower-cased, so a lower-case pattern is enough.
    review = re.sub(r"<br\s*/?>", " ", review)

    tokens = tokenizer.tokenize(review)
    kept_tokens = [token for token in tokens if token not in en_stopwords]
    stemmed_tokens = [ps.stem(token) for token in kept_tokens]

    return " ".join(stemmed_tokens)

In [6]:
# Sample review used to try out the cleaning function. Written as one
# string literal per line with explicit "\n" so the trailing spaces that
# are part of the text stay visible and cannot be stripped by an editor.
sample_text = (
    "I loved this movie <br /><br /> since I was 7 and I saw \n"
    "it on the opening day. It was so touching and beautiful.\n"
    "I strongly recommend seeing for all. It's a movie to watch with your family \n"
    "by far.<br /><br />My MPAA rating: PG-13 for thematic elements, prolonged scenes of disastor, \n"
    "nudity/sexuality and some language."
)

In [7]:
# Run the sample review through the cleaning function; as the cell's last
# expression, the returned string is displayed as the cell output.
getStemmedReview(sample_text)

'love movi sinc 7 saw open day touch beauti strongli recommend see movi watch famili far mpaa rate pg 13 themat element prolong scene disastor nuditi sexual languag'