In [1]:
import nltk
from nltk.corpus import movie_reviews

Text Pre Processing

In [2]:
len(movie_reviews.words())

1583820

In [3]:
movie_reviews.categories()

['neg', 'pos']

In [4]:
movie_reviews.fileids()[:5]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt']

We remove the punctuations:

In [5]:
text = " ".join(movie_reviews.words())
import string
text_filtered = text.translate(str.maketrans('', '', string.punctuation))

Then remove the stopwords and then make all words lowercase.

In [6]:
from nltk import word_tokenize
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
tokens = word_tokenize(text_filtered)
word_filtered = [w.lower() for w in tokens if w not in stopwords]

We can use FreqDist() function on NLTK to have a dictionary of frequency of apperance of word in a text.

In [7]:
counter_dict = nltk.FreqDist(word_filtered)

In [8]:
print(counter_dict.most_common(15))

[('film', 9519), ('one', 5853), ('movie', 5774), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2170), ('would', 2110), ('much', 2050), ('character', 2020), ('also', 1967), ('get', 1949), ('two', 1912), ('well', 1906)]


In [9]:
docs = [(list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)]

The above section of the code can be translated to: 

In every category (we have either pos or neg), take the entire file IDs (every review has own ID), 

Store the word_tokenized version (list of words) for the file ID, and then followed by the positive or negative label in one big list.

In [None]:
docs

Feature Extraction

We create a list containing 3000 most frequent words in the documents.

In [28]:
#word_features = [w[0] for w in counter_dict.most_common(3000)]
word_features = ['outstanding',
'schumacher',
'mulan',
'ludicrous',
'finest',
'welles',
'jolie',
'embarrassing',
'beautifully',
'religion',
'freddie',
'prinze',
'gon',
'wasted',
'lame',
'anna',
'ridiculous',
'wonderfully',
'garbage',
'idiotic',
'mature',
'alicia',
'anger',
'breathtaking',
'sandler',
'awful',
'lebowski',
'refreshing',
'whatsoever',
'inept',
'uninteresting',
'bore',
'tucker',
'painfully',
'waste',
'jedi',
'laughable',
'ordinary',
'flynt',
'bland',
'sat',
'henstridge',
'diaz',
'damon',
'nomination',
'dull',
'unfunny',
'lifeless',
'terrific',
'poorly',
'badly',
'hanks',
'random',
'damme',
'fincher',
'rape',
'snow',
'superb',
'worst',
'boring',
'era',
'stupid',
'reese',
'traditional',
'anywhere',
'inane',
'allows',
'obi',
'memorable',
'decades',
'friendship',
'skip',
'masterpiece',
'terrible',
'satisfying',
'freedom',
'designer',
'portrayed',
'spacey',
'sports',
'fits',
'snake',
'frankly',
'fantastic',
'innocence',
'subtle',
'extraordinary',
'pointless',
'contrast',
'remarkable',
'jackal',
'italian',
'terribly',
'realistic',
'understanding',
'insult',
'visually',
'mess',
'remake',
'banderas',
'tribe',
'lethal',
'colors',
'derek',
'jar',
'brilliantly',
'delight',
'delightful',
'joy',
'na',
'notch',
'excellent',
'patch',
'tedious',
'portrayal',]

In [29]:
word_features

['outstanding',
 'schumacher',
 'mulan',
 'ludicrous',
 'finest',
 'welles',
 'jolie',
 'embarrassing',
 'beautifully',
 'religion',
 'freddie',
 'prinze',
 'gon',
 'wasted',
 'lame',
 'anna',
 'ridiculous',
 'wonderfully',
 'garbage',
 'idiotic',
 'mature',
 'alicia',
 'anger',
 'breathtaking',
 'sandler',
 'awful',
 'lebowski',
 'refreshing',
 'whatsoever',
 'inept',
 'uninteresting',
 'bore',
 'tucker',
 'painfully',
 'waste',
 'jedi',
 'laughable',
 'ordinary',
 'flynt',
 'bland',
 'sat',
 'henstridge',
 'diaz',
 'damon',
 'nomination',
 'dull',
 'unfunny',
 'lifeless',
 'terrific',
 'poorly',
 'badly',
 'hanks',
 'random',
 'damme',
 'fincher',
 'rape',
 'snow',
 'superb',
 'worst',
 'boring',
 'era',
 'stupid',
 'reese',
 'traditional',
 'anywhere',
 'inane',
 'allows',
 'obi',
 'memorable',
 'decades',
 'friendship',
 'skip',
 'masterpiece',
 'terrible',
 'satisfying',
 'freedom',
 'designer',
 'portrayed',
 'spacey',
 'sports',
 'fits',
 'snake',
 'frankly',
 'fantastic',
 'inn

We consider existence/non-existence of these words in each of reviews as features by defining the following function.


In [30]:
def search_features(doc):
    words = set(doc)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

We can test the function on the first review (docs was defined before as the list of tuples containing reviews with their corresponding sentiments).

In [None]:
search_features(docs[0][0])

These are only the first 14 words in the word_features variable (containing 3000 words) for the first review. 


Applying the function to all the reviews.


In [32]:
featureset = [(search_features(doc), category) for (doc, category) in docs]

Print the first element from featureset and explore the component inside it

In [33]:
print(featureset[0][0])

{'outstanding': False, 'schumacher': False, 'mulan': False, 'ludicrous': False, 'finest': False, 'welles': False, 'jolie': False, 'embarrassing': False, 'beautifully': False, 'religion': False, 'freddie': False, 'prinze': False, 'gon': False, 'wasted': False, 'lame': False, 'anna': False, 'ridiculous': False, 'wonderfully': False, 'garbage': False, 'idiotic': False, 'mature': False, 'alicia': False, 'anger': False, 'breathtaking': False, 'sandler': False, 'awful': False, 'lebowski': False, 'refreshing': False, 'whatsoever': False, 'inept': False, 'uninteresting': False, 'bore': False, 'tucker': False, 'painfully': False, 'waste': False, 'jedi': False, 'laughable': False, 'ordinary': False, 'flynt': False, 'bland': False, 'sat': False, 'henstridge': False, 'diaz': False, 'damon': False, 'nomination': False, 'dull': False, 'unfunny': False, 'lifeless': False, 'terrific': False, 'poorly': False, 'badly': False, 'hanks': False, 'random': False, 'damme': False, 'fincher': False, 'rape': Fal

Training and testing set

Before we can train and test our algorithm, we should first split our data into training and test sets. 

Since our dataset has been shuffled, the first 1600 shuffled reviews (consisting both positive and negative reviews) will be used as the training set. 

The remaining 20% (400 reviews) will be used to perform the test. 


In [34]:
training_set = featureset[:1600]
testing_set = featureset[1600:]

In [35]:
def count_features(list_item):
    negative_count = 0
    positive_count = 0
    for value in list_item:
        if value[1] == 'neg':
            negative_count += 1
        elif value[1] == 'pos':
            positive_count += 1

    total = len(list_item)

    print('negative : ', negative_count / total * 100, '%')
    print('positive : ', positive_count / total * 100, '%')


print('Training set')
count_features(training_set)
print('Testing set')
count_features(testing_set)

Training set
negative :  62.5 %
positive :  37.5 %
Testing set
negative :  0.0 %
positive :  100.0 %


In [36]:
import random

In [37]:
random.shuffle(featureset)

In [38]:
training_set = featureset[:1600]
testing_set = featureset[1600:]

In [39]:
def count_features(list_item):
    negative_count = 0
    positive_count = 0
    for value in list_item:
        if value[1] == 'neg':
            negative_count += 1
        elif value[1] == 'pos':
            positive_count += 1

    total = len(list_item)

    print('negative : ', negative_count / total * 100, '%')
    print('positive : ', positive_count / total * 100, '%')


print('Training set')
count_features(training_set)
print('Testing set')
count_features(testing_set)

Training set
negative :  50.0625 %
positive :  49.9375 %
Testing set
negative :  49.75 %
positive :  50.24999999999999 %


Training

Since our data is labeled, this process is referred to as supervised learning.

We will use Naive Bayes on NLTK as the algorithm as it is a very popular algorithm for text classification.

We create the instance of the model and then train it on the training set. 


In [40]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

Model evaluation

We can evaluate the model accuracy by testing it on the testing set. 

We can use classify.accuracy() function on nltk to determine the accuracy of a trained model.

It needs the trained model instance and the training test.


In [41]:
print("classifier's accuracy for testing_set is: {}" .format(nltk.classify.accuracy(classifier, testing_set)*100))
print("classifier's accuracy for training_set is : {}" .format(nltk.classify.accuracy(classifier, training_set)*100))

classifier's accuracy for testing_set is: 80.75
classifier's accuracy for training_set is : 82.25


Most informative words

we can take it a step further to see what the most valuable words are when it comes to positive or negative reviews.


In [44]:
classifier.show_most_informative_features(10)

Most Informative Features
                 idiotic = True              neg : pos    =     15.6 : 1.0
               ludicrous = True              neg : pos    =     14.6 : 1.0
             outstanding = True              pos : neg    =      9.6 : 1.0
                     obi = True              pos : neg    =      7.7 : 1.0
                  finest = True              pos : neg    =      7.7 : 1.0
                religion = True              pos : neg    =      7.0 : 1.0
                  poorly = True              neg : pos    =      7.0 : 1.0
            breathtaking = True              pos : neg    =      6.8 : 1.0
               painfully = True              neg : pos    =      6.1 : 1.0
                  wasted = True              neg : pos    =      6.0 : 1.0


In [46]:
import pickle
save_classifier = open("naive_bayes_model.pkl", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [47]:
classifier_f = open("naive_bayes_model.pkl", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

In [48]:
custom_review = "I hated the restaurant. It was a disaster eating there. Poor service, arrogant waiters."

from nltk import web_tokenize

custom_review_tokens = word_tokenize(custom_review)
custom_review_set = document_features(custom_review_tokens)
print(classifier.clssify(custom_review_set))

ImportError: cannot import name 'web_tokenize' from 'nltk' (c:\Users\ASUS\anaconda3\Lib\site-packages\nltk\__init__.py)

In [49]:
prob_result = classifier.prob_classify(custom_review_set)
print(prob_result.max())
print(prob_result.prob('pos'))
print(prob_result.prob('neg'))

NameError: name 'custom_review_set' is not defined