# Sentiment Module
BY --<i>BHARAT SRI HARSHA KARPURAPU</i>

<p>This is a general-purpose sentiment module that I developed for my other projects, which use it as a library for computing sentiment. It combines several classifiers: Naive Bayes, Multinomial Naive Bayes, Bernoulli Naive Bayes, Linear SVC and Logistic Regression. The final sentiment is the mode (majority vote) of the individual classifiers' predictions. The module also returns a confidence score: the fraction of classifiers that agree with the winning label.</p>
<p>Kindly go through the entire notebook for a detailed understanding. If you have any questions, please feel free to contact <b><i>kbsriharsha@gmail.com</i></b>.</p>

In [1]:
# Importing necessary libraries
import nltk
import random
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from nltk.tokenize import word_tokenize
from collections import Counter

In [2]:
# Ensemble classifier: majority vote over several classifiers, plus an agreement-based confidence score
class VoteClassifier(ClassifierI):
    """Combine several classifiers by majority vote.

    Every wrapped classifier is asked for a label via ``classify(features)``;
    the most common label wins.
    """

    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        """Return the label chosen by the largest number of classifiers."""
        ballots = [clf.classify(features) for clf in self._classifiers]
        winner, _ = Counter(ballots).most_common(1)[0]
        return winner

    def confidence(self, features):
        """Return the fraction of classifiers that voted for the winning label."""
        ballots = [clf.classify(features) for clf in self._classifiers]
        _, winning_count = Counter(ballots).most_common(1)[0]
        return float(winning_count) / float(len(ballots))

In [3]:
# Force the process-wide default string encoding to UTF-8 so that implicit
# str/unicode conversions do not fail on non-ASCII text.
# NOTE(review): Python 2 only -- the `reload` builtin and
# `sys.setdefaultencoding` are not available on Python 3; confirm the target
# interpreter before reusing this cell.
import sys  

# `sys` must be reloaded first: Python 2 removes setdefaultencoding from the
# module at startup.
reload(sys)  
sys.setdefaultencoding('utf8')


In [4]:
# Performing natural language processing operation like tokenizing, stemming, chunking, etc.
import nltk
import re
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction import text
def required_words(tweet):
    """Reduce a piece of text to its adjectives, nouns, adverbs and base-form verbs.

    The text is tokenized; stop words (NLTK and scikit-learn English lists) and
    bare punctuation tokens are dropped; the remaining tokens are lower-cased
    and stripped of digits, a few punctuation characters and any non-ASCII
    characters; tokens that end up empty are discarded; the rest is POS-tagged.

    Returns one space-joined string: adjectives first, then nouns, then
    adverbs and base-form verbs, each group in original token order.
    """
    punc = set(["#",".","{","}","{}","@","\\","/",",","!","?","'","|"])
    # Build the exclusion set once per call rather than once per token.
    excluded = set(stopwords.words('english')) | set(text.ENGLISH_STOP_WORDS) | punc

    cleaned = []
    for token in word_tokenize(tweet):
        lowered = token.lower()
        # The stop-word lists are lower-case, so test the lower-cased token;
        # testing the raw token lets capitalized stop words ("The", "I") through.
        if lowered in excluded:
            continue
        lowered = re.sub(r"[\-\d\\./?'|]", "", lowered)
        lowered = re.sub(r"[^\x00-\x7F]", "", lowered)
        if lowered != "":
            cleaned.append(lowered)

    pos_words = nltk.pos_tag(cleaned)
    adjectives = [word for word, tag in pos_words if tag in ("JJ", "JJR", "JJS")]
    nouns = [word for word, tag in pos_words if tag in ("NN", "NNP", "NNPS", "NNS")]
    # Despite the historical name "verbs", this group is adverbs plus base-form verbs.
    adverbs_and_verbs = [word for word, tag in pos_words if tag in ("RBS", "RBR", "RB", "VB")]
    return " ".join(adjectives + nouns + adverbs_and_verbs)

In [5]:
# Reading the positive text for features.
# Each line of positive.txt becomes one ("pos"-labelled) document; the words of
# that document whose POS tag is in req_tags are collected as candidate features.
req_tags = ["JJ","JJR","JJS","VB","RB","RBR","RBS"]
documents = []
all_words = []
with open("positive.txt","r") as f:
    positive_words = []
    for line in f:
        try:
            lines = required_words(line)
            documents.append( (lines, "pos") )
            tokenize = word_tokenize(lines)
            pos_tag = nltk.pos_tag(tokenize)
            positive_words.append([x[0].lower() for x in pos_tag if x[1] in req_tags])
        except Exception:
            # Best effort: skip lines that cannot be processed, but do not
            # swallow KeyboardInterrupt/SystemExit as a bare `except:` would.
            continue

In [22]:
# Reading negative text for features.
# Each line of negative.txt becomes one ("neg"-labelled) document; the words of
# that document whose POS tag is in req_tags are collected as candidate features.
with open("negative.txt","r") as f:
    negative_words = []
    for line in f:
        try:
            lines = required_words(line)
            documents.append( (lines, "neg") )
            tokenize = word_tokenize(lines)
            neg_tag = nltk.pos_tag(tokenize)
            negative_words.append([x[0].lower() for x in neg_tag if x[1] in req_tags])
        except Exception:
            # Best effort: skip lines that cannot be processed, but do not
            # swallow KeyboardInterrupt/SystemExit as a bare `except:` would.
            continue

In [9]:
# Flatten the per-line word lists into one flat list of positive words
flat_positive = []
for word_list in positive_words:
    flat_positive.extend(word_list)
positive_words = flat_positive

In [23]:
# Total number of positive feature-word tokens (shown as the cell output)
len(positive_words)

14497

In [24]:
# Flatten the per-line word lists into one flat list of negative words
flat_negative = []
for word_list in negative_words:
    flat_negative.extend(word_list)
negative_words = flat_negative

In [25]:
# Total number of negative feature-word tokens (shown as the cell output)
len(negative_words)

13728

In [26]:
# Frequency distribution (word -> occurrence count) of the positive words
freq_pos = nltk.FreqDist(positive_words)

In [27]:
# Probability distribution of the positive words: a list with one
# single-entry {word: relative frequency} dict per distinct word.
# Normalise by the actual number of counted tokens (freq_pos.N()) instead of a
# hard-coded total, which silently goes stale whenever the corpus changes.
total_pos = float(freq_pos.N())
prop_pos = [{word: count / total_pos} for word, count in freq_pos.items()]

In [28]:
# Frequency distribution (word -> occurrence count) of the negative words
freq_neg = nltk.FreqDist(negative_words)

In [29]:
# Probability distribution of the negative words: a list with one
# single-entry {word: relative frequency} dict per distinct word.
# Normalise by the actual number of counted tokens (freq_neg.N()) instead of a
# hard-coded total, which silently goes stale whenever the corpus changes.
total_neg = float(freq_neg.N())
prop_neg = [{word: count / total_neg} for word, count in freq_neg.items()]

In [30]:
# Unique feature words: the distinct words that occur in either the positive
# or the negative word list (FreqDist is used only to de-duplicate here).
features_words = nltk.FreqDist(positive_words+negative_words).keys()

In [31]:
# Number of distinct feature words (shown as the cell output)
len(features_words)

6093

In [34]:
# Saving the labelled positive and negative documents.
# The `with` block closes the file even if pickling fails.
with open("documents.pickle","wb") as document_save:
    pickle.dump(documents, document_save)

In [35]:
# Saving feature words.
# Dump the feature-word collection itself -- not the literal string
# "features_words" -- as a plain list so it pickles the same way regardless
# of what .keys() returned. The `with` block closes the file even on error.
with open("features.pickle","wb") as features_save:
    pickle.dump(list(features_words), features_save)

In [36]:
# Function for constructing the feature vector of a document
def find_features(document):
    """Build the boolean bag-of-words feature dict for one document.

    Returns a dict with one key per entry of the module-level
    ``features_words``; the value is True when that word is one of the
    document's tokens and False otherwise.
    """
    # A set gives constant-time membership tests instead of scanning the token
    # list once for every feature word.
    tokens = set(word_tokenize(document))
    return {w: (w in tokens) for w in features_words}

In [37]:
# Constructing the feature sets used for training and testing: one
# (feature dict, label) pair per document, in the order of `documents`.
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [34]:
# Number of labelled feature sets (shown as the cell output)
len(featuresets)

10658

In [39]:
# Shuffle in place so that positive and negative examples are mixed before
# the data is split into training and testing parts.
# NOTE(review): no random seed is set, so the order differs between runs.
random.shuffle(featuresets)
len(featuresets)

10658

In [40]:
# First 9000 shuffled feature sets are used for training, the remainder for testing
training_data = featuresets[:9000]
testing_data = featuresets[9000:]

Here the different classifiers were trained and saved, so that they can be reused at any time by simply opening the trained classifier.

In [38]:
# Classifiers
#base_classifier = nltk.NaiveBayesClassifier.train(training_data)
#MNB_classifier = SklearnClassifier(MultinomialNB()).train(training_data)
#BernoulliNB_classifier = SklearnClassifier(BernoulliNB()).train(training_data)
#LinearSVC_classifier = SklearnClassifier(LinearSVC()).train(training_data)
#LogisticRegression_classifier = SklearnClassifier(LogisticRegression()).train(training_data)

In [39]:
#save_classifier = open("algorithms_pickle/naive_bayes.pickle","wb")
#pickle.dump(base_classifier, save_classifier)
#save_classifier.close()

In [40]:
#save_classifier = open("algorithms_pickle/MNB.pickle","wb")
#pickle.dump(MNB_classifier, save_classifier)
#save_classifier.close()

In [41]:
#save_classifier = open("algorithms_pickle/Berno.pickle","wb")
#pickle.dump(BernoulliNB_classifier, save_classifier)
#save_classifier.close()

In [42]:
#save_classifier = open("algorithms_pickle/LineaSVC.pickle","wb")
#pickle.dump(LinearSVC_classifier, save_classifier)
#save_classifier.close()

In [43]:
#save_classifier = open("algorithms_pickle/Logistic.pickle","wb")
#pickle.dump(LogisticRegression_classifier, save_classifier)
#save_classifier.close()

In [50]:
# Opening the trained Naive Bayes Classifier.
# NOTE: pickle.load can execute arbitrary code -- only load pickle files you trust.
# The `with` block closes the file even if unpickling fails.
with open("algorithms_pickle/naive_bayes.pickle","rb") as open_file:
    Naive_Bayes_Classifier = pickle.load(open_file)

In [44]:
# Opening the trained Multinomial Naive Bayes Classifier.
# NOTE: pickle.load can execute arbitrary code -- only load pickle files you trust.
# The `with` block closes the file even if unpickling fails.
with open("algorithms_pickle/MNB.pickle","rb") as open_file:
    MNB_Classifier = pickle.load(open_file)

In [46]:
# Opening the trained Bernoulli Classifier.
# NOTE: pickle.load can execute arbitrary code -- only load pickle files you trust.
# The `with` block closes the file even if unpickling fails.
with open("algorithms_pickle/Berno.pickle","rb") as open_file:
    Berno_Classifier = pickle.load(open_file)

In [47]:
# Opening the trained Linear SVC Classifier.
# NOTE: pickle.load can execute arbitrary code -- only load pickle files you trust.
# The `with` block closes the file even if unpickling fails.
with open("algorithms_pickle/LineaSVC.pickle","rb") as open_file:
    Linear_SVC_Classifier = pickle.load(open_file)

In [48]:
# Opening the trained Logistic Regression Classifier.
# NOTE: pickle.load can execute arbitrary code -- only load pickle files you trust.
# The `with` block closes the file even if unpickling fails.
with open("algorithms_pickle/Logistic.pickle","rb") as open_file:
    Logistic_Classifier = pickle.load(open_file)

In [51]:
# Building the final classifier, which returns the mode (majority vote) of the
# five loaded classifiers' results. With an odd number of voters and two
# labels ("pos"/"neg") a tie cannot occur.
Classifier_winner = VoteClassifier(Naive_Bayes_Classifier,
                                   MNB_Classifier,
                                   Berno_Classifier,
                                   Linear_SVC_Classifier,
                                   Logistic_Classifier)

In [53]:
# Public entry point: other programs call this to obtain the sentiment polarity of a text
def sentiment(tweet):
    """Classify a piece of text.

    Returns a ``(label, confidence)`` tuple: the majority-vote label from
    ``Classifier_winner`` and the fraction of classifiers that agreed with it.
    """
    feature_vector = find_features(tweet)
    label = Classifier_winner.classify(feature_vector)
    agreement = Classifier_winner.confidence(feature_vector)
    return label, agreement

In [63]:
# Quick manual check of the voting classifier on a sample sentence
sentiment("I am awesome classifier")

('pos', 1.0)

<h4>Note</h4>
<p>We can increase the performance by providing more training data. This module is called by many of my other programs to evaluate sentiment polarity.</p>
<p>To use it in other programs, first import the function into your current program with <br><i>from Sentiment_Module import sentiment</i> <br> and then pass the statement whose polarity needs to be determined to the function: <br><i>sentiment(&lt;statement&gt;)</i></p>

<p>I calculated the probability distribution of the feature words, but I didn't use it for training. However, the model could be made more precise by using those probability distributions to supply probabilities of negativity and positivity.</p>
<p>I believe this module can be developed much further by taking all of the above ideas into consideration.</p>

# Skills

<p><b> Topics: </b> Machine Learning, Natural Language Processing, Sentiment analysis</p>
<p><b> Machine Learning Algorithms: </b> Naive Bayes classifier, Multinomial Naive Bayes classifier, Logistic Regression classifier, Linear SVC classifier, Bernoulli classifier</p>
<p><b> Natural Language Processing:</b> Parsing, chunking, Parts of Speech tagging, Named Entity Recognition</p>
<p><b> Compute: </b> Python</p>
<p><b> Data Tools: </b> Jupyter Notebook</p>
<p><b> Version Control: </b> Git</p>
    