# Sentiment Analysis Example

We went over a simple sentiment analysis example in the previous lecture. Today we will walk through the steps of that example with practical code.

Decisions to make for any analysis:
- Input data
- Possible outputs
- Decision algorithm
- Evaluation method



For this sentiment analysis exercise, the only input is the text itself, and we decide whether it is positive or negative using the dictionary (lexicon) method. At the end, we will evaluate our system on the movie_reviews dataset of NLTK, using accuracy as the measure.


In [None]:
#import libraries
#download corpora

import nltk

nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('opinion_lexicon')


from nltk import word_tokenize
from nltk import punkt
from nltk.corpus import stopwords
from nltk.corpus import opinion_lexicon
from nltk.corpus import movie_reviews
import numpy as np


In [None]:
def sentiment_score(input_tokens, positive_words, negative_words):
    '''
    Compute a lexicon-based sentiment score for a sequence of tokens.

    Every token found in positive_words adds 1 to the score. A token that is
    not positive but is found in negative_words subtracts 1. All other tokens
    leave the score unchanged.

    type input_tokens: list
    param input_tokens: The tokens of the text to be analyzed
    type positive_words: list or set
    param positive_words: The positive words from the lexicon
    type negative_words: list or set
    param negative_words: The negative words from the lexicon
    rtype: int
    return: sentiment score of the input tokens (0 for an empty input)
    '''

    def polarity(word):
        #the positive lexicon is checked first, so a word present in both
        #lexicons counts as positive
        if word in positive_words:
            return 1
        if word in negative_words:
            return -1
        return 0

    return sum(polarity(word) for word in input_tokens)

In [None]:
def preprocess_text(input_text):
    '''
    Lowercase and tokenize a text, then remove English stopwords.

    type input_text: string
    param input_text: The text to be analyzed
    rtype: list
    return: input tokens, in their original order, without English stopwords
    '''

    #small preprocessing without lemmatization, stemming, part-of-speech tagging,...
    tokens = word_tokenize(input_text.lower(), language='english')

    #fetch the stopword list once and keep it as a set: calling
    #stopwords.words('english') inside the comprehension would evaluate it
    #again for every token and test membership against a list each time
    english_stopwords = set(stopwords.words('english'))

    filtered_tokens = [token for token in tokens if token not in english_stopwords]

    return filtered_tokens

In [None]:


#keep the lexicon words in sets for the membership tests in sentiment_score
pos_list = set(opinion_lexicon.positive())
neg_list = set(opinion_lexicon.negative())

#score a single example sentence
input_text = 'This is disgusting.'
tokens = preprocess_text(input_text)
score = sentiment_score(tokens, pos_list, neg_list)
print(score)



In [None]:
#let's write an evaluator for the sentiment score method
def evaluate(sentiment_predicted_labels, sentiment_gold_labels):
    '''
    Compute the accuracy of predicted labels against gold standard labels.

    Accuracy is the number of positions where both labels are equal divided
    by the total number of labels.

    type sentiment_predicted_labels: list
    param sentiment_predicted_labels: The predicted labels for sentiments
    type sentiment_gold_labels: list
    param sentiment_gold_labels: The list of sentiment gold standard labels
    rtype: float
    return: accuracy of the predicted labels (NaN when both lists are empty)
    raises ValueError: if the two label lists do not have the same shape
    '''

    predicted_np = np.array(sentiment_predicted_labels)
    gold_np = np.array(sentiment_gold_labels)

    #without this check a length-1 list would be broadcast against a longer
    #one and silently produce a meaningless accuracy
    if predicted_np.shape != gold_np.shape:
        raise ValueError(
            'predicted and gold labels must have the same shape, got {} and {}'.format(
                predicted_np.shape, gold_np.shape))

    #accuracy is undefined without any labels; return NaN explicitly instead
    #of dividing zero by zero
    if predicted_np.size == 0:
        return float('nan')

    matched = (predicted_np == gold_np)
    accuracy = matched.sum() / matched.size

    return accuracy

In [None]:

from collections import defaultdict

#keep the lexicon words in sets for the membership tests in sentiment_score
pos_list = set(opinion_lexicon.positive())
neg_list = set(opinion_lexicon.negative())

#group the review file ids by the part of the id before the first '/'
documents = defaultdict(list)
for file_id in movie_reviews.fileids():
    category = file_id.split('/')[0]
    documents[category].append(file_id)

sentiment_predicted_labels = []
sentiment_gold_labels = []

#process the 'neg' reviews first and the 'pos' reviews second; the gold label
#is False for 'neg' and True for 'pos'
for category, gold_label in (('neg', False), ('pos', True)):
    for file_id in documents[category]:
        sentiment_gold_labels.append(gold_label)
        input_text = movie_reviews.raw(file_id)
        tokens = preprocess_text(input_text)
        #a review is predicted positive only when its score is strictly above zero
        predicted_label = sentiment_score(tokens, pos_list, neg_list) > 0
        sentiment_predicted_labels.append(predicted_label)

accuracy = evaluate(sentiment_predicted_labels, sentiment_gold_labels)
print('The accuracy measure for the sentiment analyzer is {:.2f}'.format(accuracy))


