##### Create a NLP Pipeline to 'Clean' Reviews Data
##### - Load Input File and Read Reviews
##### - Tokenize
##### - Remove Stopwords
##### - Perform Stemming
##### - Write cleaned data to output file


In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
# ### NLTK 

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [0]:
# Downloading the nltk packages
import nltk
nltk.download('all')

In [0]:
from google.colab import files
uploaded = files.upload()

Saving clean_text.py to clean_text.py


## Multimonial Event Model

In [0]:
x = ["This was awesome an awesome movie",
     "Great movie! I liked it a lot",
     "Happy Ending! awesome acting by the hero",
     "loved it! truly great",
     "bad not upto the mark",
     "could have been better",
     "Surely a Disappointing movie"]

y = [1,1,1,1,0,0,0] # 1 - Positive, 0 - Negative Class

In [0]:
x_test = ["i was happy & happy and i loved the acting in the movie",
          "the movie i saw was bad"]

### Cleaning

In [0]:
import clean_text as ct

In [0]:
x_clean=[ct.getCleanReview(i) for i in x] #List comprehension
xt_clean=[ct.getCleanReview(i) for i in x_test]

In [0]:
print(x_clean)
print(xt_clean)

['awesom awesom movi', 'great movi like lot', 'happi end awesom act hero', 'love truli great', 'bad upto mark', 'could better', 'sure disappoint movi']
['happi happi love act movi', 'movi saw bad']


### Vectorization

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
cv = CountVectorizer(ngram_range=(1,2))
x_vect= cv.fit_transform(x_clean).toarray()
print(x_vect.shape)
print(x_vect)

(7, 34)
[[0 0 2 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0]
 [1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0]
 [0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0]]


In [0]:
print(cv.get_feature_names())

['act', 'act hero', 'awesom', 'awesom act', 'awesom awesom', 'awesom movi', 'bad', 'bad upto', 'better', 'could', 'could better', 'disappoint', 'disappoint movi', 'end', 'end awesom', 'great', 'great movi', 'happi', 'happi end', 'hero', 'like', 'like lot', 'lot', 'love', 'love truli', 'mark', 'movi', 'movi like', 'sure', 'sure disappoint', 'truli', 'truli great', 'upto', 'upto mark']


In [0]:

## Vectorization on the test set
xt_vect = cv.transform(xt_clean).toarray()
print(xt_vect)
cv.get_feature_names()
print(xt_vect.shape)

[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]]
(2, 34)


## Multimonial Naive Bayes

In [0]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

In [0]:
mnb= MultinomialNB()
print(mnb)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


In [0]:
mnb.fit(x_vect,y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
mnb.predict(xt_vect)

array([1, 0])

In [0]:
mnb.predict_proba(xt_vect)

array([[0.09580319, 0.90419681],
       [0.61972801, 0.38027199]])

In [0]:
mnb.score(x_vect,y)

1.0

## Multivariate Bernoulli Naive Bayes

In [0]:
bnb= BernoulliNB(binarize=0.0)
print(bnb)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)


In [0]:
bnb.fit(x_vect,y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [0]:
bnb.predict(xt_vect)

array([1, 0])

In [0]:
mnb.predict_proba(xt_vect)

array([[0.09580319, 0.90419681],
       [0.61972801, 0.38027199]])

In [0]:
mnb.score(x_vect,y)

1.0