## Importing Necessary Packages

In [1]:
## Importing alsl the required packages
import nltk
import gensim
import pandas as pd
import string
import random
from nltk.corpus import subjectivity,stopwords
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from unidecode import unidecode
from nltk import word_tokenize,sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.util import extract_unigram_feats, mark_negation
import re
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score



## Importing Data

In [2]:
## Movie review dataset for classifying the reviews into Positive or negative
movieReview = pd.read_csv('labeledTrainDataSentiment.tsv',delimiter='\t')

### For the analysis we ll consider only 1000 rows
movieReview_head = movieReview[0:1000]

movieReview_head.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [4]:
#Preprocessing 
def clean_text(text):
    return text.replace("\\", " ")

In [5]:
movieReview["review"][1]

'\\The Classic War of the Worlds\\" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells\' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur \\"critics\\" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the \\"critics\\". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells\' classic novel, and we found it to be very entertaining. This made it easy to overlook what the \\"critics\\" perceive to be its shortcomings."'

In [6]:
clean_text(movieReview["review"][1])

' The Classic War of the Worlds " by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells\' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur  "critics " look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the  "critics ". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells\' classic novel, and we found it to be very entertaining. This made it easy to overlook what the  "critics " perceive to be its shortcomings."'

## Algorithm 1 : Using Bag of Words Approach

In [93]:
### Step 1: Sentence tokenize and then word tokenize the statements
### Step 2: Clean the given words and do lemmatization / stemming based on sthe use case
### Step 3: Case conversion is done before checking the word is present in positive or negative corpus
### Step 4: Check whether the words are present either in the positive corpus or negative corpus
### Step 5: If found in the positive corpus then score of 1 is given and if found in negative corpus then score of -1 is given
### Step 6: Total score of the statement is the sum of the scores of each individual words

In [7]:
### regex to split the sentences into words
nltk_tokenizer = nltk.tokenize.RegexpTokenizer('\w+')

def split(text):
    
    ### Split is used for converting a paragraph into statements
    ### Sentence tokenizer can also be used for sentence split (Alternative)
    text = "".join([ch for ch in text if ord(ch)<= 128])
    sentences = text.split('. ') 

    
    ### regex is used for word split, instead of this word tokenizer can also be usesd
    tokenized_sentences = [nltk_tokenizer.tokenize(sent) for sent in sentences]

    return tokenized_sentences

In [59]:
movieReview["review"][1]

'\\The Classic War of the Worlds\\" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells\' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur \\"critics\\" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the \\"critics\\". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells\' classic novel, and we found it to be very entertaining. This made it easy to overlook what the \\"critics\\" perceive to be its shortcomings."'

In [8]:
split(movieReview["review"][1])

[['The',
  'Classic',
  'War',
  'of',
  'the',
  'Worlds',
  'by',
  'Timothy',
  'Hines',
  'is',
  'a',
  'very',
  'entertaining',
  'film',
  'that',
  'obviously',
  'goes',
  'to',
  'great',
  'effort',
  'and',
  'lengths',
  'to',
  'faithfully',
  'recreate',
  'H'],
 ['G'],
 ['Wells', 'classic', 'book'],
 ['Mr'],
 ['Hines', 'succeeds', 'in', 'doing', 'so'],
 ['I',
  'and',
  'those',
  'who',
  'watched',
  'his',
  'film',
  'with',
  'me',
  'appreciated',
  'the',
  'fact',
  'that',
  'it',
  'was',
  'not',
  'the',
  'standard',
  'predictable',
  'Hollywood',
  'fare',
  'that',
  'comes',
  'out',
  'every',
  'year',
  'e',
  'g'],
 ['the',
  'Spielberg',
  'version',
  'with',
  'Tom',
  'Cruise',
  'that',
  'had',
  'only',
  'the',
  'slightest',
  'resemblance',
  'to',
  'the',
  'book'],
 ['Obviously',
  'everyone',
  'looks',
  'for',
  'different',
  'things',
  'in',
  'a',
  'movie'],
 ['Those',
  'who',
  'envision',
  'themselves',
  'as',
  'amateur',

# Sentiment Analysis

In [9]:
### Reading the positive and negative corpus and storing it in the training dictionary

training_dictionary = {}

### Positive corpus
with open('pos.txt') as fp:
    for line in fp:
        training_dictionary[line.split('\n')[0].strip()] = 1

### Negative corpus        
with open('neg.txt') as fp:
    for line in fp:
        training_dictionary[line.split('\n')[0].strip()] = -1

In [10]:
training_dictionary

{'a+': 1,
 'abound': 1,
 'abounds': 1,
 'abundance': 1,
 'abundant': 1,
 'accessable': 1,
 'accessible': 1,
 'acclaim': 1,
 'acclaimed': 1,
 'acclamation': 1,
 'accolade': 1,
 'accolades': 1,
 'accommodative': 1,
 'accomodative': 1,
 'accomplish': 1,
 'accomplished': 1,
 'accomplishment': 1,
 'accomplishments': 1,
 'accurate': 1,
 'accurately': 1,
 'achievable': 1,
 'achievement': 1,
 'achievements': 1,
 'achievible': 1,
 'acumen': 1,
 'adaptable': 1,
 'adaptive': 1,
 'adequate': 1,
 'adjustable': 1,
 'admirable': 1,
 'admirably': 1,
 'admiration': 1,
 'admire': 1,
 'admirer': 1,
 'admiring': 1,
 'admiringly': 1,
 'adorable': 1,
 'adore': 1,
 'adored': 1,
 'adorer': 1,
 'adoring': 1,
 'adoringly': 1,
 'adroit': 1,
 'adroitly': 1,
 'adulate': 1,
 'adulation': 1,
 'adulatory': 1,
 'advanced': 1,
 'advantage': 1,
 'advantageous': 1,
 'advantageously': 1,
 'advantages': 1,
 'adventuresome': 1,
 'adventurous': 1,
 'advocate': 1,
 'advocated': 1,
 'advocates': 1,
 'affability': 1,
 'affable'

In [11]:
### Get the score of the given word
### score of +1 is given if the word is found is positive 
### score of -1 is given if the word is found is negative
### score of 0 is given if the word is not found in the dictionary

def get_score_word(word):
    word = word.lower()
    if word in training_dictionary:
        return training_dictionary[word]
    
    elif word in training_dictionary:
        return training_dictionary[word]
    else:
        return 0

In [12]:
[(word,get_score_word(word)) for doc in split(movieReview["review"][1]) for word in doc]

[('The', 0),
 ('Classic', 1),
 ('War', 0),
 ('of', 0),
 ('the', 0),
 ('Worlds', 0),
 ('by', 0),
 ('Timothy', 0),
 ('Hines', 0),
 ('is', 0),
 ('a', 0),
 ('very', 0),
 ('entertaining', 1),
 ('film', 0),
 ('that', 0),
 ('obviously', 0),
 ('goes', 0),
 ('to', 0),
 ('great', 1),
 ('effort', 0),
 ('and', 0),
 ('lengths', 0),
 ('to', 0),
 ('faithfully', 1),
 ('recreate', 0),
 ('H', 0),
 ('G', 0),
 ('Wells', 0),
 ('classic', 1),
 ('book', 0),
 ('Mr', 0),
 ('Hines', 0),
 ('succeeds', 1),
 ('in', 0),
 ('doing', 0),
 ('so', 0),
 ('I', 0),
 ('and', 0),
 ('those', 0),
 ('who', 0),
 ('watched', 0),
 ('his', 0),
 ('film', 0),
 ('with', 0),
 ('me', 0),
 ('appreciated', 1),
 ('the', 0),
 ('fact', 0),
 ('that', 0),
 ('it', 0),
 ('was', 0),
 ('not', 0),
 ('the', 0),
 ('standard', 0),
 ('predictable', 0),
 ('Hollywood', 0),
 ('fare', 0),
 ('that', 0),
 ('comes', 0),
 ('out', 0),
 ('every', 0),
 ('year', 0),
 ('e', 0),
 ('g', 0),
 ('the', 0),
 ('Spielberg', 0),
 ('version', 0),
 ('with', 0),
 ('Tom', 0),
 

In [18]:
## Using only unigrams for sentimental analysis
## For any given paragraph, the text is splitted, score for each word is given and final score is also calculated

def review_score(sentences):   
    
    if len(sentences)<=1:
        return 0
    else:
        final_score = 0
        tagged_sentences =  split(sentences)
        
        for sentence_tokens in tagged_sentences:    
            total_score = 0
            
            for i,current_token in enumerate(sentence_tokens):
                
                token_score = get_score_word(current_token)
                total_score = total_score + token_score

            final_score = final_score + total_score
    if(final_score) < 0 : return -1
    elif(final_score)>0 : return 1
    else : return 0

In [20]:
print(review_score('I had a critical week'))

-1


In [14]:
### Creating a separate column for storing the output of the sentiment for algorithm 1
movieReview_head['new_score_algo1'] = movieReview_head.apply(lambda x:review_score(x['review']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [16]:
## Accuracy Measurement for the unsupervised sentiment analysis algorithm 
print('Accuracy is: ',round(accuracy_score(movieReview_head['sentiment'],movieReview_head['new_score_algo1'])  * 100,2))
print('Precision is:',round(precision_score(movieReview_head['sentiment'],movieReview_head['new_score_algo1']) * 100,2))
print('Recall is:',round(recall_score(movieReview_head['sentiment'],movieReview_head['new_score_algo1']) * 100,2))
print('F1 Score is:', round(f1_score(movieReview_head['sentiment'],movieReview_head['new_score_algo1']) * 100,2))

Accuracy is:  71.3
Precision is: 68.36
Recall is: 75.31
F1 Score is: 71.67


In [18]:
#How to improve the results ?
## For Example:
## 1. I want a burrito so good
## 2. I just had a burrito which was not good.

print(review_score('I h|ad a burrito so good'))
print(review_score('I just had a burrito which was not good'))

#What about sentence 2 ? Is it actually positive?

#Hence we can go to next algorithm using Bag of Words and Modifiers

1
1


## Algorithm 2 : Using Bag of Words and Modifiers

In [27]:
modifiers_dictionary = {}
modifiers_dictionary['very'] = 'inc'
modifiers_dictionary['not'] = 'inv'
modifiers_dictionary['too'] = 'inc'
modifiers_dictionary['so'] = 'inc'

In [31]:
#give more options to improve the accuracy???
#add incrementers and inverters 
def review_score(sentences):    
    
    if len(sentences)<2:
        return 0
    else:
        final_list = []
        tagged_sentences = split(sentences)
        
        for sentence_tokens in tagged_sentences:
            total_score = 0
            for i,current_token in enumerate(sentence_tokens):
                
                token_score = get_score_word(current_token)
                previous_token = sentence_tokens[i-1] if i>1 else None 
                
                if previous_token is not None:
                    
                    if previous_token in modifiers_dictionary and modifiers_dictionary[previous_token]=='inc':
                        token_score *= 2.0
                    elif previous_token in modifiers_dictionary and modifiers_dictionary[previous_token]=='dec':
                        token_score /= 2.0
                    elif previous_token in modifiers_dictionary and modifiers_dictionary[previous_token]=='inv':
                        token_score *= -1.0
                    else:
                        pass
                total_score +=token_score
            final_list.append(total_score)
    return total_score

In [32]:
#How to improve the results ?
## For Example:
## 1. I want a burrito so good
## 2. I just had a burrito which was not good.

#What about sentence 2 ? It is actually positive 
print(review_score('I want a burrito so good'))
print(review_score('I just had a burrito which was not good'))

2.0
-1.0


In [23]:
### Creating a separate column for storing the output of the sentiment for algorithm 2
movieReview_head['new_score_algo2'] = movieReview_head.apply(lambda x:review_score(x['review']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [24]:
## Accuracy Measurement for the new algorithm with incrementers, decrementers and inverters

print('Accuracy is: ',round(accuracy_score(movieReview_head['sentiment'],movieReview_head['new_score_algo2'])  * 100,2))
print('Precision is:',round(precision_score(movieReview_head['sentiment'],movieReview_head['new_score_algo2']) * 100,2))
print('Recall is:',round(recall_score(movieReview_head['sentiment'],movieReview_head['new_score_algo2']) * 100,2))
print('F1 Score is:', round(f1_score(movieReview_head['sentiment'],movieReview_head['new_score_algo2']) * 100,2))

Accuracy is:  71.7
Precision is: 68.74
Recall is: 75.73
F1 Score is: 72.06


In [59]:
## Algorithms for Unsupervised Sentimental Analysis

## Two algorithms for unsupervised sentimental analysis

## Algorithm 3 : Textblob for Sentimental Analysis

In [34]:
from textblob import TextBlob

### Output of TextBlob gives two scores
## Polarity: Polarity ranges between -1 and 1
## 1. Polarity score with negative value means negative statement
## 2. Polarity score with positive value means positive statement
## 3. Polarity score with 0 means neutral statement

## Subjectivity: Subjectivity ranges between 0 and 1
## Subjective sentences generally refer to personal opinion, emotion or judgment whereas objective refers to factual information.
## 1. Subjectivity score of 0 is Objective statement
## 2. Subjectivity score of 1 is subjective statement

statements =TextBlob("Analytics Vidhya is a great platform to learn data science")
print(statements.sentiment)

statements =TextBlob("Mars is better than earth")
print(statements.sentiment)

statements = TextBlob('The food here is good')
print(statements.sentiment)

statements = TextBlob('The food here is bad')
print(statements.sentiment)

statements = TextBlob('The food here is very bad')
print(statements.sentiment)

statements = TextBlob('I was feeling good earlier but I am not feeling good now')
print(statements.sentiment)

Sentiment(polarity=0.8, subjectivity=0.75)
Sentiment(polarity=0.5, subjectivity=0.5)
Sentiment(polarity=0.7, subjectivity=0.6000000000000001)
Sentiment(polarity=-0.6999999999999998, subjectivity=0.6666666666666666)
Sentiment(polarity=-0.9099999999999998, subjectivity=0.8666666666666667)
Sentiment(polarity=0.4666666666666666, subjectivity=0.5666666666666668)


In [31]:
### Textblob takes the input as sentences and returns two output
### The scoring for each statement is calculated and if the score is more than 0 assuming that it is positive
### if the score is less than 0 assuming that it is negative
### the final score will be calculated based on the sum of the score across the sentences

def textblob(sentences):
    total_score = 0
    
    for sentence in sent_tokenize(sentences):
        sentence = "".join([ch for ch in sentence if ord(ch)<= 128])

        if sentence=='':
            return 0
        else:
            sent_score = TextBlob(sentence).sentiment[0]
        
        total_score += sent_score
    return 1 if total_score>0 else 0

In [32]:
### Creating a separate column for storing the output of textblob
movieReview_head['textblob'] = movieReview_head.apply(lambda x:textblob(x['review']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [33]:
## Accuracy Measurement for Textblob sentimental analysis

print('Accuracy is: ',round(accuracy_score(movieReview_head['sentiment'],movieReview_head['textblob']) * 100,2))
print('Precision is:',round(precision_score(movieReview_head['sentiment'],movieReview_head['textblob']) * 100,2))
print('Recall is:',round(recall_score(movieReview_head['sentiment'],movieReview_head['textblob']) * 100,2))
print('F1 Score is:', round(f1_score(movieReview_head['sentiment'],movieReview_head['textblob']) * 100,2))

Accuracy is:  68.3
Precision is: 61.16
Recall is: 93.78
F1 Score is: 74.04


## Algorithm 4: Vader Sentiment Analyzer

In [35]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [36]:
## Normal statement with neutral polarity
sents = "I just got a call from my boss - does he realise it's Saturday?"
print(sid.polarity_scores(sents))

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


In [37]:
sid.polarity_scores('good')

{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4404}

In [38]:
## Normal statement with neutral polarity 
## Let's try adding emoticons to the statement as well

sents = "I just got a call from my boss - does he realise it's Saturday? :("
print(sid.polarity_scores(sents))

## Adding an emoticons has changed the polarity of the statement
sents = "I just got a call from my boss - does he realise it's Saturday? WTF"
print(sid.polarity_scores(sents))


{'neg': 0.209, 'neu': 0.791, 'pos': 0.0, 'compound': -0.4404}
{'neg': 0.292, 'neu': 0.708, 'pos': 0.0, 'compound': -0.6739}


In [36]:
## Advantage of using Incrementers/ Decrementers, inverters, & capitalization
sents = 'the food here is good'
print(sid.polarity_scores(sents))

## adding incrementer/decrementer words to the statement increases/decreases the sentiment of the statement
sents = "The food here is so good"
print(sid.polarity_scores(sents))

sents = "The food here is so bad"
print(sid.polarity_scores(sents))

## It also considers the inverters before the keywords and increases/decreases the sentiment of the statement accordingly
sents = "'I was feeling good earlier but I am not feeling good now'"
print(sid.polarity_scores(sents))


{'neg': 0.0, 'neu': 0.58, 'pos': 0.42, 'compound': 0.4404}
{'neg': 0.0, 'neu': 0.572, 'pos': 0.428, 'compound': 0.5777}
{'neg': 0.473, 'neu': 0.527, 'pos': 0.0, 'compound': -0.6696}
{'neg': 0.0, 'neu': 0.443, 'pos': 0.557, 'compound': 0.7783}


In [40]:
## adding capitalization for a word increases the intensity of the word
sents = "The food here is so GOOD"
print(sid.polarity_scores(sents))

## It also handles the 'but' & 'and' case to change the sentiment score accordingly
print(sid.polarity_scores("The food is really GOOD! But the service is dreadful."))
print(sid.polarity_scores("The food is really GOOD! and the service is awesome."))

{'neg': 0.0, 'neu': 0.518, 'pos': 0.482, 'compound': 0.6866}
{'neg': 0.192, 'neu': 0.529, 'pos': 0.279, 'compound': 0.3222}
{'neg': 0.0, 'neu': 0.49, 'pos': 0.51, 'compound': 0.8526}


In [43]:
## Vader sentimental analysis

def vaderSentiment(sentences):
    total_score = 0
    for sentence in sent_tokenize(sentences):
        sentence = "".join([ch for ch in sentence if ord(ch)<= 128])
        pos = neg =neu = sent_score= 0

        if sentence=='':
            return 0
        else:
            sent_score = sid.polarity_scores(sentence)
            neg = sent_score['neg']
            pos = sent_score['pos']
            neu = sent_score['neu']
        
            if pos>=neg and pos>=neu:
                sent_score = 1
            elif neu>=pos and neu>= neg:
                sent_score = 1
            else:
                sent_score = -1
        
        total_score += sent_score
    return 1 if total_score>0 else 0

In [44]:
### Creating a separate column for storing the output of vader sentimental analysis
movieReview_head['vader'] = movieReview_head.apply(lambda x:vaderSentiment(x['review']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [45]:
## Accuracy Measurement for the vader sentiment algorithm

print('Accuracy is: ',round(accuracy_score(movieReview_head['sentiment'],movieReview_head['vader']) * 100,2))
print('Precision is:',round(precision_score(movieReview_head['sentiment'],movieReview_head['vader']) * 100,2))
print('Recall is:',round(recall_score(movieReview_head['sentiment'],movieReview_head['vader']) * 100,2))
print('F1 Score is:', round(f1_score(movieReview_head['sentiment'],movieReview_head['vader']) * 100,2))

Accuracy is:  48.2
Precision is: 48.2
Recall is: 100.0
F1 Score is: 65.05
