<h1> Movie_Review classification using NLP with Naive Bayes</h1>


In [1]:
# Importing relevant modules for use in code

import nltk
import string
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.classify import NaiveBayesClassifier

In [2]:
# Read multiple review text files from each 'neg' and 'pos' directories using glob module. Each text file 
# contains a single review for a movie. There are in total 1000 negative reviews and 1000 positive reviews 
# stored in their respective directories. 

import glob
import errno
path_neg = 'C:\\Users\\Hemang\\Desktop\\Data Science\\Python files\\txt_sentoken\\neg\\*.txt'
path_pos = 'C:\\Users\\Hemang\\Desktop\\Data Science\\Python files\\txt_sentoken\\pos\\*.txt'
files_neg = glob.glob(path_neg)
files_pos = glob.glob(path_pos)

In [3]:
# Creating a list of all 1000 negative reviews

list_neg =[]
for file in files_neg:
    try:
        with open(file,'r') as f:
            lst = f.read()
            list_neg.append(lst)
    except IOError as exc:
        if exc.errno != errno.EISDIR:
            raise

In [4]:
# Creating a list of all 1000 positive reviews
list_pos = []
for file in files_pos:
    try:
        with open(file,'r') as f:
            lst = f.read()
            list_pos.append(lst)
    except IOError as exc:
        if exc.errno != errno.EISDIR:
            raise

In [5]:
len(list_neg),len(list_pos)

(1000, 1000)

In [6]:
# Downloading stopwords package from nltk module
# Creating a list of stop words and punctuations to omit from the 2 lists created above using 
# bag_of_words_features_filtered() function defined in next cell

nltk.download("stopwords")
useless_words = nltk.corpus.stopwords.words("english") + list(string.punctuation)
useless_words[:20]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hemang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

In [7]:
def bag_of_words_features_filtered(words):
    return {word:1 for word in words if not word in useless_words}

In [8]:
nltk.download("punkt") # Download word tokenizer 'punkt' from nltk
stemmer = PorterStemmer()
def clean(lst,sign):
    word_list=[]
    for i in lst:
        word = word_tokenize(i) # split text to each word
        word = [stemmer.stem(word) for word in word] # stemming of data
        word = [word for word in word if len(word)>2] #removing words less than 3 charters longas they do not contribute much
        word = (bag_of_words_features_filtered(word),sign) # returns a tuple with a review and its sign(neg or pos review)
        word_list.append(word)
    return word_list

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hemang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
negative_features = clean(list_neg,'neg') # Calling clean function to perform data cleaning on positive review list
print(negative_features[:1])

[({'plot': 1, 'two': 1, 'teen': 1, 'coupl': 1, 'church': 1, 'parti': 1, 'drink': 1, 'drive': 1, 'get': 1, 'accid': 1, 'one': 1, 'guy': 1, 'die': 1, 'girlfriend': 1, 'continu': 1, 'see': 1, 'life': 1, 'nightmar': 1, 'deal': 1, 'watch': 1, 'movi': 1, 'sorta': 1, 'find': 1, 'critiqu': 1, 'mind-fuck': 1, 'gener': 1, 'touch': 1, 'veri': 1, 'cool': 1, 'idea': 1, 'present': 1, 'bad': 1, 'packag': 1, 'make': 1, 'thi': 1, 'review': 1, 'even': 1, 'harder': 1, 'write': 1, 'sinc': 1, 'applaud': 1, 'film': 1, 'attempt': 1, 'break': 1, 'mold': 1, 'mess': 1, 'head': 1, 'lost': 1, 'highway': 1, 'memento': 1, 'good': 1, 'way': 1, 'type': 1, 'folk': 1, "n't": 1, 'snag': 1, 'correctli': 1, 'seem': 1, 'taken': 1, 'pretti': 1, 'neat': 1, 'concept': 1, 'execut': 1, 'terribl': 1, 'problem': 1, 'well': 1, 'main': 1, 'simpli': 1, 'jumbl': 1, 'start': 1, 'normal': 1, 'downshift': 1, 'fantasi': 1, 'world': 1, 'audienc': 1, 'member': 1, 'dream': 1, 'charact': 1, 'come': 1, 'back': 1, 'dead': 1, 'look': 1, 'like':

In [10]:
positive_features = clean(list_pos,'pos') # Calling clean function to perform data cleaning on negative review list
print(positive_features[:1])

[({'film': 1, 'adapt': 1, 'comic': 1, 'book': 1, 'plenti': 1, 'success': 1, 'whether': 1, "'re": 1, 'superhero': 1, 'batman': 1, 'superman': 1, 'spawn': 1, 'gear': 1, 'toward': 1, 'kid': 1, 'casper': 1, 'arthous': 1, 'crowd': 1, 'ghost': 1, 'world': 1, 'never': 1, 'realli': 1, 'like': 1, 'hell': 1, 'befor': 1, 'starter': 1, 'creat': 1, 'alan': 1, 'moor': 1, 'eddi': 1, 'campbel': 1, 'brought': 1, 'medium': 1, 'whole': 1, 'new': 1, 'level': 1, 'mid': 1, "'80": 1, '12-part': 1, 'seri': 1, 'call': 1, 'watchmen': 1, 'say': 1, 'thoroughli': 1, 'research': 1, 'subject': 1, 'jack': 1, 'ripper': 1, 'would': 1, 'michael': 1, 'jackson': 1, 'start': 1, 'look': 1, 'littl': 1, 'odd': 1, 'graphic': 1, 'novel': 1, '500': 1, 'page': 1, 'long': 1, 'includ': 1, 'nearli': 1, 'consist': 1, 'noth': 1, 'footnot': 1, 'word': 1, "n't": 1, 'dismiss': 1, 'thi': 1, 'becaus': 1, 'sourc': 1, 'get': 1, 'past': 1, 'thing': 1, 'might': 1, 'find': 1, 'anoth': 1, 'stumbl': 1, 'block': 1, 'director': 1, 'albert': 1, 'all

In [11]:
# 80/20 split for training data/test data
split = 800  

# Fitting training data using Naive Bayes Classifier
sentiment_classifier = NaiveBayesClassifier.train(positive_features[:split]+negative_features[:split])

In [12]:
# Find the accuracy, using the training data

accuracy_traindata = nltk.classify.util.accuracy(sentiment_classifier,positive_features[:split]+negative_features[:split])*100
print('Model Accuracy using Training data: '+str(accuracy_traindata)+'%')

Model Accuracy using Training data: 96.0%


In [13]:
# Find the accuracy, using the test data

accuracy_testdata = nltk.classify.util.accuracy(sentiment_classifier,positive_features[split:]+negative_features[split:])*100
print('Model Accuracy using Training data: '+str(accuracy_testdata)+'%')

Model Accuracy using Training data: 70.25%
