# MOVIE REVIEWS SENTIMENT ANALYSIS

In [None]:
#Python 3
import re #Please check that you have these!
from os import listdir

In [None]:
import numpy as np #Only end of recommendation system is affected

In [None]:
import matplotlib.pyplot as plt #Only plotting will not work

# Work with a single review

In [None]:
#Open a review and output each of its lines
file1 = 'pos/6_10_tt0100680.txt'
with open(file1,'r') as f: #"with" closes the file even if reading fails
    for line in f:
        print(line) #Lines keep their own newline, so the output is double-spaced

In [None]:
#Collect the whitespace-separated tokens of the whole review
words = [] #List of all words in a review
with open(file1,'r') as f: #The file is closed automatically after the loop
    for line in f:
        #Accumulate instead of overwriting, so that a multi-line review keeps all of its words
        words.extend(line.split()) #split() without arguments splits on any whitespace and ignores the ends

print(sorted(words)) #Punctuation is still attached: "word." or "word:", etc.

In [None]:
#Split the review using common punctuation marks as delimiters
#"|" separates the alternative delimiters inside the pattern
#Raw string: "\." , "\(" etc. must reach the regex engine unchanged (and without invalid-escape warnings)
delimiters = r' |, |: |!|\.|"|\'|\(|\)|\?|/|;|>|<'
words = []
with open(file1,'r') as f: #The file is closed automatically after the loop
    for line in f:
        #Accumulate instead of overwriting, so that a multi-line review keeps all of its words
        words.extend(re.split(delimiters,line.strip()))

print(sorted(words)) #Repeated words are listed again and again - inefficient; better a dictionary with a word and its count

In [None]:
#Count how many times each lower-cased word occurs in the review
#Raw string so that the escaped punctuation reaches the regex engine without invalid-escape warnings
delimiters = r' |, |: |!|\.|"|\'|\(|\)|\?|/|;|>|<'
vocab = {} #Dictionary: {word1:count1, word2:count2,...}
with open(file1,'r') as f: #The file is closed automatically after the loop
    for line in f:
        line = line.strip().lower()
        words = re.split(delimiters,line)
        for word in words:
            if word == '': #No need to have empty strings in the vocabulary
                continue
            vocab[word]=vocab.get(word,0)+1 #Increment the count of the just seen word

print(sorted(vocab)) #Sorted list of the distinct words
vLen = len(vocab) #Length of vocabulary
print("The vocabulary contains ",vLen," words.")

In [None]:
#The above procedure can also be done with the NLTK module
from nltk.tokenize import word_tokenize #Load NLTK word_tokenize module
#If you got an error - just watch the explanation of the next cell
#Resume the work after the next cell

In [None]:
#Count the words of the review again, this time tokenizing with NLTK
vocab = {} #Dictionary: {token:count}
with open(file1,'r') as f: #The file is closed automatically after the loop
    for line in f:
        line = line.strip().lower() #Normalise once, then tokenize the normalised line
        words = word_tokenize(line) #Instead of regular expressions use this to tokenize
        for word in words:
            if word == '':
                continue
            vocab[word]=vocab.get(word,0)+1

print(sorted(vocab))
vLen = len(vocab)
print("The vocabulary contains ",vLen," words.")
#Vocabulary length is different as word_tokenize also included ",",".","...",
#but did not tokenize "story/script", "best.this"

In [None]:
#Show counts for each word in our dictionary
#list() turns the dict view into a real sequence; a bare dict view is not array-like for matplotlib
plt.bar(range(len(vocab)), list(vocab.values()))
plt.show() #Most counts are 1

In [None]:
#Dictionary of the 20 words with the highest count:
#rank the (word, count) pairs by descending count (ties keep their original order) and keep the first 20
top20_vocab = dict(
    sorted(vocab.items(), key=lambda pair: pair[1], reverse=True)[:20]
)
top20_vocab

In [None]:
#Bar chart of 20 words with the highest count
#list() turns the dict views into real sequences that matplotlib can use as heights and tick labels
plt.bar(range(len(top20_vocab)), list(top20_vocab.values()), align='center')
plt.xlim([-1,20]) #Leave a margin around the (at most) 20 bars
plt.xticks(range(len(top20_vocab)), list(top20_vocab.keys()), rotation=90) #One vertical word label per bar
plt.show() #Most polarized words (important for classification) as "good" appear only 1 time

In [None]:
# Load polarity words, adapted from http://sentiwordnet.isti.cnr.it/
# Each data row is read as "word,score"; only the sign of the score is kept
pol_words = {} #Dictionary of polarity words: {pol_word1:polarity1, pol_word2:polarity2,...}
with open('polarity_words_uniq.csv','r') as f: #The file is closed automatically after loading
    next(f, None) #Skip header (the default avoids an error on an empty file)
    for line in f:
        line = line.strip()
        if not line: #Tolerate blank lines, e.g. at the end of the file
            continue
        line = line.split(',')
        #+1 is for positive words, -1 is for negative words (0 if the score is exactly zero)
        pol_words[line[0]] = np.sign(float(line[1]))
pol_words

In [None]:
#Check polarity of several words (None means that the word is not among the polarity words)
tuple(pol_words.get(probe) for probe in ('and','love','interesting'))

In [None]:
#Bar chart of the top20 words, coloured by polarity
#list() turns the dict views into real sequences that matplotlib can use as heights and tick labels
words = list(top20_vocab.keys()) #List of top20 words
counts = list(top20_vocab.values()) #their counts
pos = range(len(top20_vocab)) #their positions on a bar chart
barWords= plt.bar(pos, counts, align='center', color='w') #make bar chart of counts for all top20 words
for i,word in enumerate(words):
    polarity = pol_words.get(word,None) #Get polarity of each word
    if polarity==1:
        barWords[i].set_color('r') #Change the color to "red" for positive word
    elif polarity==-1:
        barWords[i].set_color('b') #Change the color to "blue" for negative word
    #Any other word keeps the initial white color
plt.xlim([-1,20]) #Set the range to display
plt.xticks(pos, words, rotation=90) #Add words to the ticks on X axis
plt.show() #Most polarized words (important for classification) as "good" appear only 1 time

In [None]:
# Polarity vote (referred to as the "Naive Bayes model" in this notebook):
# every distinct word of the review contributes its polarity,
# words that are not among the polarity words contribute 0
# Total above zero --> positive review, below zero --> negative review
vote = sum(pol_words.get(word,0) for word in vocab)
if vote == 0:
    print("Neutral review") #Positive and negative words cancel each other out
elif vote > 0:
    print("Positive review, vote =",vote)
else:
    print("Negative review, vote =",vote)

In [None]:
# The above analysis is combined in this cell.
# For a single review: count words, plot the top 20 coloured by polarity, then vote
file1 = 'pos/6_10_tt0100680.txt' #Change the path to check other reviews
#Same delimiters as in the step-by-step cells, apostrophe included
#Raw string so that the escaped punctuation reaches the regex engine without invalid-escape warnings
delimiters = r' |, |: |!|\.|"|\'|\(|\)|\?|/|;|>|<'

vocab = {} #{word: number of occurrences in the review}
with open(file1,'r') as f: #The file is closed automatically after the loop
    for line in f:
        line = line.strip().lower()
        words = re.split(delimiters,line)
        for word in words:
            if word == '': #No need to have empty strings in the vocabulary
                continue
            vocab[word]=vocab.get(word,0)+1

vLen = len(vocab)
top20_vocab = dict(sorted(vocab.items(), key=lambda x: -x[1])[:20]) #The 20 most frequent words

#list() turns the dict views into real sequences that matplotlib can use as heights and tick labels
counts = list(top20_vocab.values())
words = list(top20_vocab.keys())
pos = range(len(top20_vocab))
barWords= plt.bar(pos, counts, align='center', color='w')
for i,word in enumerate(words):
    polarity = pol_words.get(word,None)
    if polarity==1:
        barWords[i].set_color('r') #Change the color of only the i-th bar
    elif polarity==-1:
        barWords[i].set_color('b')
plt.xlim([-1,20])
plt.xticks(pos, words, rotation=90)
plt.show() #Most polarized words (important for classification) as "good" appear only 1 time

#Every distinct word votes with its polarity; words without a polarity count as 0
vote = 0
for word in vocab:
    polarity = pol_words.get(word,0)
    vote += polarity
if vote>0:
    print("Positive review, vote =",vote)
elif vote == 0:
    print("Neutral review")
else:
    print("Negative review, vote =",vote)

In [None]:
#For a single review with NLTK: count tokens, plot the top 20 coloured by polarity, then vote
file1 = 'pos/6_10_tt0100680.txt'

vocab = {} #{token: number of occurrences in the review}
with open(file1,'r') as f: #The file is closed automatically after the loop
    for line in f:
        line = line.strip().lower() #Normalise once, then tokenize the normalised line
        words = word_tokenize(line)
        for word in words:
            if word == '':
                continue
            vocab[word]=vocab.get(word,0)+1

vLen = len(vocab)
top20_vocab = dict(sorted(vocab.items(), key=lambda x: -x[1])[:20]) #The 20 most frequent tokens

#list() turns the dict views into real sequences that matplotlib can use as heights and tick labels
counts = list(top20_vocab.values())
words = list(top20_vocab.keys())
pos = range(len(top20_vocab))
barWords= plt.bar(pos, counts, align='center', color='w')
for i,word in enumerate(words):
    polarity = pol_words.get(word,None)
    if polarity==1:
        barWords[i].set_color('r') #Change the color of only the i-th bar
    elif polarity==-1:
        barWords[i].set_color('b')
plt.xlim([-1,20])
plt.xticks(pos, words, rotation=90)
plt.show() #Most polarized words (important for classification) as "good" appear only 1 time

#Every distinct token votes with its polarity; tokens without a polarity count as 0
vote = 0
for word in vocab:
    polarity = pol_words.get(word,0)
    vote += polarity
if vote>0:
    print("Positive review, vote =",vote)
elif vote == 0:
    print("Neutral review")
else:
    print("Negative review, vote =",vote)

# Classify all reviews

In [None]:
# Using regular expressions for tokenization
# Classify every review in a folder by summing the polarity of all of its words
fold = 'pos/'
max_files = 5 #Debug limit on the number of reviews; set to None to classify the whole folder
verbose = True #Print each review and its verdict; set to False for a full run
#Compiled once and reused for every line; raw string avoids invalid-escape warnings
delimiters = re.compile(r' |, |: |!|\.|"|\'|\(|\)|\?|/|;|>|<')
n_pos = 0 #Number of reviews classified as positive
n_files = 0 #Number of reviews that were actually classified
for file in listdir(fold): #Get the name of each file in fold
    if max_files is not None and n_files >= max_files:
        break #Stop before counting a review that will not be classified
    file1 = fold + file
    try:
        with open(file1,'r') as f: #The file is closed as soon as it has been read
            lines = f.readlines()
    except (OSError, UnicodeDecodeError) as err:
        #Unreadable entries are reported and left out of the accuracy
        print("Skipping",file1,"-",err)
        continue
    n_files += 1
    vote = 0 #The total vote for each review
    for line in lines:
        if verbose:
            print(line)
        line = line.strip().lower()
        for word in delimiters.split(line):
            vote += pol_words.get(word,0) #Words without a polarity (and empty strings) add 0
    if verbose:
        print('\n')
    if vote>0:
        n_pos += 1
        if verbose:
            print("Positive review, vote =",vote)
    elif verbose:
        if vote == 0:
            print("Neutral review")
        else:
            print("Negative review, vote =",vote)
    if verbose:
        print('\n')

#Check the 4th review with the vote -14. It is indeed difficult to say from the text that this is a positive review.
#The share of reviews in fold that were classified as positive is reported as the accuracy
if n_files:
    print("Classifier accuracy is",n_pos/n_files)
else:
    print("No reviews were classified in",fold)