We are going to build a movie review classifier, train it with short reviews and then perform live twitter sentiment analysis

In [2]:
#import statements
import nltk
import random
from sklearn.naive_bayes import MultinomialNB
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify import ClassifierI
from statistics import mode

In [3]:
class VoteClassifier(ClassifierI):
    def __init__(self,*classifiers):
        self.classifiers=classifiers
    def classify(self,features):
        vote=[]
        for c in self.classifiers:
            v=c.classify(features)
            vote.append(v)
        return mode(vote)   
    def confidence(self,features):
        vote=[]
        for c in self.classifiers:
            v=c.classify(features)
            vote.append(v)
        choice_votes=vote.count(mode(vote)) 
        conf= choice_votes/len(vote)
        return conf

In [6]:
short_neg_reviews=open('./negative.txt','r').read()
short_pos_reviews=open('./positive.txt','r').read()

In [7]:
documents=[]
for w in short_neg_reviews.split('\n'):
    documents.append((w,'neg'))
for w in short_pos_reviews.split('\n'):
    documents.append((w,'pos'))    

In [11]:
from nltk.tokenize import word_tokenize
all_words=[]
short_pos_words=word_tokenize(short_pos_reviews)
short_neg_words=word_tokenize(short_neg_reviews)
for w in short_pos_words:
    all_words.append(w.lower())
for w in short_neg_words:
    all_words.append(w.lower())    

In [12]:
#calculating frequecy of occurence of each word
all_words=nltk.FreqDist(all_words)

In [13]:
#now we are going to use top 5000 words as features
word_features=list(all_words.keys())[:5000]

In [14]:
#method to extract features from a given piece of text
def find_features(document):
    words=word_tokenize(document)
    features={}
    for w in word_features:
        features[w]=(w in words)
    return features    

In [15]:
#now we are extracting featureset from our document
featuresets = [(find_features(rev), category) for (rev, category) in documents]
random.shuffle(featuresets)

In [16]:
print(len(featuresets))

10664


In [17]:
#we have 10664 samples in our dataset, 
#we will use 10,000 for training and remaining for testing purpose
training_set=featuresets[:10000]
testing_set=featuresets[10000:]

In [18]:
#now using naive bayes classifier
classifier= nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes Algo accuracy percent: ", (nltk.classify.accuracy(classifier,testing_set))*100)
classifier.show_most_informative_features(15)

Naive Bayes Algo accuracy percent:  72.59036144578313
Most Informative Features
              engrossing = True              pos : neg    =     21.0 : 1.0
                provides = True              pos : neg    =     18.3 : 1.0
               inventive = True              pos : neg    =     14.3 : 1.0
              refreshing = True              pos : neg    =     14.3 : 1.0
            refreshingly = True              pos : neg    =     12.3 : 1.0
               wonderful = True              pos : neg    =     12.2 : 1.0
                    warm = True              pos : neg    =     12.2 : 1.0
                  stupid = True              neg : pos    =     11.4 : 1.0
              thoughtful = True              pos : neg    =     11.4 : 1.0
             mesmerizing = True              pos : neg    =     11.0 : 1.0
                  unless = True              neg : pos    =     10.3 : 1.0
                    ages = True              pos : neg    =     10.3 : 1.0
               reali

In [19]:
#multinomial classifier
multinomial_classifier=SklearnClassifier(MultinomialNB())
multinomial_classifier.train(training_set)
print("Multinomial Algo accuracy percent: ", (nltk.classify.accuracy(multinomial_classifier,testing_set))*100)

Multinomial Algo accuracy percent:  70.33132530120481


In [20]:
#now using other ml classification algorithms
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC


In [21]:
logistic_classifier= SklearnClassifier(LogisticRegression())
logistic_classifier.train(training_set)
print("Logistic Algo accuracy percent: ", (nltk.classify.accuracy(logistic_classifier,testing_set))*100)

Logistic Algo accuracy percent:  73.79518072289156


In [22]:
sgd_classifier= SklearnClassifier(SGDClassifier())
sgd_classifier.train(training_set)
print("SGD Algo accuracy percent: ", (nltk.classify.accuracy(sgd_classifier,testing_set))*100)



SGD Algo accuracy percent:  69.27710843373494


In [23]:
svc_classifier= SklearnClassifier(SVC())
svc_classifier.train(training_set)
print("SVC Algo accuracy percent: ", (nltk.classify.accuracy(svc_classifier,testing_set))*100)

SVC Algo accuracy percent:  49.6987951807229


In [24]:
lsvc_classifier= SklearnClassifier(LinearSVC())
lsvc_classifier.train(training_set)
print("LinearVC Algo accuracy percent: ", (nltk.classify.accuracy(lsvc_classifier,testing_set))*100)

LinearVC Algo accuracy percent:  73.04216867469879


In [25]:
nusvc_classifier= SklearnClassifier(NuSVC())
nusvc_classifier.train(training_set)
print("NuSVC Algo accuracy percent: ", (nltk.classify.accuracy(nusvc_classifier,testing_set))*100)

NuSVC Algo accuracy percent:  73.79518072289156


In [28]:
voted_classifier=VoteClassifier(logistic_classifier,lsvc_classifier,nusvc_classifier)
print("Voted Algo accuracy percent: ", (nltk.classify.accuracy(voted_classifier,testing_set))*100)

Voted Algo accuracy percent:  73.64457831325302


In [29]:
def sentiment(text):
    feats=find_features(text)
    return voted_classifier.classify(feats), voted_classifier.confidence(feats)

In [42]:
print(sentiment('engrossing movie'))


('pos', 1.0)
