## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hitika\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Import dataset

In [1]:
!wget 'https://www.dropbox.com/s/c6lmwqz67kpdsiq/review_dataset.csv'

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
data = pd.read_csv('review_dataset.csv')
data.head()

Unnamed: 0,review,label
0,The Buddy Holly Story opens on a shot of a yel...,pos
1,***SPOILERS*** Like some evil Tinkers-to-Evers...,neg
2,Ghost of Dragstrip Hollow is a typical 1950's ...,neg
3,"Whatever you do, don't stop watching when you ...",pos
4,This is one of those movies you think that the...,neg


In [3]:
data.shape

(20000, 2)

In [4]:
words = stopwords.words('english')
words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## preprocessing of data

In [5]:
def preprocess(text):
    text = text.lower()
    text = text.replace('<br /><br />','')
    text = re.sub('[^a-z]+'," ", text)
    text = text.split(" ")
    text = [w for w in text if w not in words]
    text = " ".join(text)
    return text

In [6]:
preprocess(data['review'][1])

' spoilers like evil tinkers evers chance double play combination omen iv evil seed deceased antichrist damien thorn come back terrorizing parents schoolmates neighbors finally entire world named delia york asia vieila given deserving couple yorks karen gene fay grant michael woods catholic church st francis orphanage little delia waste time making peasants felt scratching mom house party later delia almost get killed runaway truck saved devil dog named ryder going school delia takes care local bully getting big guy wet front classmates later father threatens yorks law suit head sliced self induced traffic accident delia someone never mess know good meanwhile dalia dad gene becomes big man town thinks getting elected congress champion clean air green trees crowd instead letting smog concrete boys take neighborhood eye white house bratty strange daughter delia anything gene york sudden good fortune later jo ann hearen hired delia nanny truth comes strange evil powers jo new age type rea

In [7]:
data['review'] = data['review'].apply(preprocess)

In [8]:
X = data['review'].values
y = data['label'].values\

y

array(['pos', 'neg', 'neg', ..., 'neg', 'neg', 'pos'], dtype=object)

## Splitting data

In [9]:
X_train,X_test, y_train,y_test = train_test_split(X,y,test_size = 0.15)

In [10]:
X_train.shape

(17000,)

In [11]:
X_test.shape

(3000,)

## data transformation

In [12]:
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [13]:
cv = CountVectorizer(max_features=10000)
cv.fit(X_train)
# X_train = cv.transform(X_train).toarray()

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=10000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [14]:
X_train = cv.transform(X_train).toarray()
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
X_test = cv.transform(X_test).toarray()
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Model creation - Multinomial NB Model

In [16]:
mnb = MultinomialNB()
mnb.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
mnb.predict(X_test)

array([1, 1, 0, ..., 1, 1, 1])

## checking accuracy of model

In [18]:
mnb.score(X_test,y_test)

0.8396666666666667

## using model to predict new data

In [19]:
my_review = 'movie was great. i enjoyed watching it with my friends. It was a great expeireince.'
my_review = preprocess(my_review)
my_review = cv.transform([my_review])
mnb.predict(my_review)

array([1])