In [1]:
import nltk
import os 

In [2]:
# Current Working Directory
os.getcwd()

'E:\\Insofe\\Lab\\Python\\Text_mining_lab\\movie_reviews_sentiment_analysis'

In [3]:
# List of Directory
os.listdir(os.getcwd())

['.ipynb_checkpoints',
 'negative.txt',
 'positive.txt',
 'sentiment_analysis.ipynb']

In [4]:
# Reading files
# Using with so that we don't close file expilcitly every time
with open('positive.txt','r') as file_pos:
    pos = file_pos.read() # positive texts
    
with open('negative.txt','r') as file_neg:
    neg = file_neg.read() # Negative texts

In [5]:
# Converting text to lower case, as it is case sensitive
pos = pos.lower()
neg = pos.lower()

In [6]:
# Splitting at each sentence 
pos_doc = pos.split('\n')
neg_doc = neg.split('\n')

In [7]:
pos_doc[:5]

['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . ',
 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth . ',
 'effective but too-tepid biopic',
 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start . ',
 "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one . "]

In [8]:
# Taking first 1000 sentences of both postive and negative files
posdoc = pos_doc[:1000]
negdoc = neg_doc[:1000]

In [9]:
documents = [] # empty list
for p in posdoc:
    # appending positive text into documents
    documents.append((p,'p')) # appending 'p' for each sentence
for n in negdoc:
    # appending negative text into documents
    documents.append((n,'n')) # appending 'n' for each sentence

In [10]:
# view
documents

[('the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . ',
  'p'),
 ('the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth . ',
  'p'),
 ('effective but too-tepid biopic', 'p'),
 ('if you sometimes like to go to the movies to have fun , wasabi is a good place to start . ',
  'p'),
 ("emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one . ",
  'p'),
 ('the film provides some great insight into the neurotic mindset of all comics -- even those who have reached the absolute top of the game . ',
  'p'),
 ('offers that rare combination of entertainment and education . ', 'p'),
 ('perhaps no picture ever made has more literally showed that t

In [11]:
from nltk.tokenize import RegexpTokenizer

In [12]:
tokenizer = RegexpTokenizer(pattern = r'\w+') # using pattern 

In [13]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')# stop words in english

In [14]:
tokens = tokenizer.tokenize(posdoc[0]) # tokenizing first sentence
# tagging parts of speech to each token
print(nltk.pos_tag(tokens))

[('the', 'DT'), ('rock', 'NN'), ('is', 'VBZ'), ('destined', 'VBN'), ('to', 'TO'), ('be', 'VB'), ('the', 'DT'), ('21st', 'JJ'), ('century', 'NN'), ('s', 'VBD'), ('new', 'JJ'), ('conan', 'NN'), ('and', 'CC'), ('that', 'IN'), ('he', 'PRP'), ('s', 'VBZ'), ('going', 'VBG'), ('to', 'TO'), ('make', 'VB'), ('a', 'DT'), ('splash', 'NN'), ('even', 'RB'), ('greater', 'JJR'), ('than', 'IN'), ('arnold', 'RB'), ('schwarzenegger', 'JJ'), ('jean', 'JJ'), ('claud', 'NN'), ('van', 'NN'), ('damme', 'NN'), ('or', 'CC'), ('steven', 'JJ'), ('segal', 'NN')]


In [15]:
allowed_word = ['JJ'] # Adjective

all_words = [] # empty list
# iterating over each sentence and corresponding label
for doc,label in documents:
    # tokenizing each sentence
    words = tokenizer.tokenize(doc)
    # tagging parts of speech to each token
    tagged_words = nltk.pos_tag(words)
    # iterating over each word and corresponding tag
    for word,tag in tagged_words:
        # if tag is adjective then
        if tag in allowed_word:
            # append to all_words
            all_words.append(word)
            

In [16]:
# length of all_words
len(all_words)

4628

In [17]:
# number of times each word has appeared 
freq_dist = nltk.FreqDist(all_words)

In [18]:
# displaying top ten
freq_dist.most_common(10)

[('good', 98),
 ('s', 84),
 ('little', 50),
 ('great', 48),
 ('old', 48),
 ('new', 46),
 ('much', 46),
 ('romantic', 46),
 ('own', 40),
 ('american', 38)]

In [19]:
# word features
word_features = set(all_words)
word_features

{'vietnamese',
 'absurd',
 'offbeat',
 'dumb',
 'unexpected',
 'fierce',
 'imaginative',
 'afghan',
 'experienced',
 'venerable',
 'maggie',
 'canadian',
 'superior',
 'lead',
 'evocative',
 'youthful',
 'distracted',
 'stunning',
 'amusing',
 'provocative',
 'buzz',
 'third',
 'fit',
 'great',
 'general',
 'unfortunately',
 'static',
 'disturbing',
 'bitter',
 'hugh',
 'faithful',
 'plucky',
 'goofy',
 'epic',
 'popular',
 'riveting',
 'potent',
 'o',
 'timely',
 'unnerving',
 'saturday',
 'noyce',
 'vivid',
 'political',
 'heavy',
 'rapid',
 'rich',
 'manic',
 'modulated',
 'morvern',
 'loose',
 'southern',
 'israeli',
 'playful',
 'charismatic',
 'natural',
 't',
 'tepid',
 'numerous',
 'single',
 'everyday',
 'middle',
 'ikea',
 'romantic',
 'worthwhile',
 'wholesome',
 'key',
 'fi',
 'wet',
 'shambling',
 'modern',
 'incendiary',
 'sly',
 'forgivable',
 'committed',
 'forceful',
 'observed',
 'tu',
 'psychological',
 'peculiar',
 'surprising',
 'metropolis',
 'fine',
 'co',
 'trou

In [20]:
def find_features(document):
    # returnd features of a document.
    document_tokens = tokenizer.tokenize(document) # tokenizing a document
    
    features = {} # empty set
    # iterating over all word_features(which are ajectives of document)
    for w in word_features:
        # dictionary of features which are not in stopwords
        features[w] = (w in document_tokens and w not in stopwords)
    return features
# iterating over sentence and its category in documents
feature_sets = [(find_features(rev),category) for(rev,category) in documents]
        

In [21]:
# length
len(feature_sets)

2000

In [22]:
import random
# shuffling feature_sets
random.shuffle(feature_sets)
# train 
train = feature_sets[:1800]
# test
test = feature_sets[1800:]

In [23]:
# training classifier
classifier = nltk.NaiveBayesClassifier.train(train)

In [24]:
# testing on train
train_acc = nltk.classify.accuracy(classifier,train)*100
print('trainAccuracy',train_acc)

trainAccuracy 54.11111111111111


In [25]:
# most informative features
classifier.show_most_informative_features(20)

Most Informative Features
           sophisticated = True                p : n      =      3.0 : 1.0
                    rich = True                n : p      =      2.3 : 1.0
                    dumb = True                n : p      =      2.3 : 1.0
                     wit = True                n : p      =      2.2 : 1.0
                   acted = True                n : p      =      2.1 : 1.0
                   quiet = True                n : p      =      1.8 : 1.0
              satisfying = True                p : n      =      1.8 : 1.0
                 quality = True                p : n      =      1.8 : 1.0
              unexpected = True                n : p      =      1.8 : 1.0
                 amusing = True                n : p      =      1.8 : 1.0
                 amazing = True                n : p      =      1.8 : 1.0
                original = True                n : p      =      1.7 : 1.0
           contemplative = True                p : n      =      1.7 : 1.0