# Movie Review Sentiment Prediction 

### Importing libraries:

In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

### Loading the training dataset:

In [6]:
df = pd.read_csv('./Movie Rating Prediction data/Train/Train.csv')

In [7]:
df.head(20)

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos
5,Steve Carell comes into his own in his first s...,pos
6,I'm only going to write more because it's requ...,neg
7,"OK, it was a ""risky"" move to rent this flick, ...",neg
8,"Cannibalism, a pair of cinematic references to...",pos
9,This is one of the great modern kung fu films....,pos


In [8]:
df.shape

(40000, 2)

In [9]:
dfx = df.iloc[:,0].values
dfy = df.iloc[:,1].values

In [10]:
type(dfx)

dfx.shape

(40000,)

In [11]:
temp = [x for x in dfy]

In [12]:
y_train = [1 if x == 'pos' else 0 for x in temp]

In [13]:
len(y_train)

40000

### Cleaning the training data


In [14]:
sw = stopwords.words('english')
sw.remove('not')
sw = set(sw)

ps = PorterStemmer()

In [15]:
def cleaning_pipeline(review):
    words = word_tokenize(review.lower())
    words = [ps.stem(word) for word in words if word not in sw and word.isalpha()]
    review = " ".join(words)
    return review

In [16]:
cleaned_reviews = [ cleaning_pipeline(review) for review in dfx]

In [17]:
len(cleaned_reviews)

40000

### Loading and cleaning testing data


In [18]:
dftest = pd.read_csv('./Movie Rating Prediction data/Test/Test.csv').values

In [19]:
dftest.shape

(10000, 1)

In [20]:
test_reviews = dftest.reshape((-1,))

In [21]:
test_reviews.shape

(10000,)

In [22]:
cleaned_test_rev = [cleaning_pipeline(review) for review in test_reviews]

In [23]:
len(cleaned_test_rev)

10000

### Vectorization

In [24]:
cv = CountVectorizer(ngram_range=(1,3))
x_train_vect = cv.fit_transform(cleaned_reviews)

In [25]:
#x_train_vect = x_train_vect.toarray()
x_train_vect.shape

(40000, 6237829)

In [26]:
x_test_vect = cv.transform(cleaned_test_rev)

In [27]:
x_test_vect.shape

(10000, 6237829)

In [28]:
#cv.vocabulary_

### 1.Multinomial NB

In [29]:
from sklearn.naive_bayes import MultinomialNB

In [30]:
mnb = MultinomialNB()

In [31]:
#Trainig
mnb.fit(x_train_vect,y_train)

MultinomialNB()

In [32]:
#Prediction
prediction = mnb.predict(x_test_vect)

In [33]:
type(prediction)

numpy.ndarray

In [34]:
#score
mnb.score(x_train_vect,y_train)

0.99975

In [58]:
import pickle
# now you can save it to a file
with open('MNBmodel.pkl', 'wb') as f:
    pickle.dump(mnb, f)

# and later you can load it
#with open('filename.pkl', 'rb') as f:
    #clf = pickle.load(f)

### 2.Multivariate Bernoulli NB


In [35]:
from sklearn.naive_bayes import BernoulliNB

In [36]:
bnb = BernoulliNB()

In [37]:
# Training 
bnb.fit(x_train_vect,y_train)

BernoulliNB()

In [38]:
# Testing 
pred2 = bnb.predict(x_test_vect)

In [39]:
#score
bnb.score(x_train_vect,y_train)

0.999575