In [1]:
import nltk
import random
from nltk.corpus import movie_reviews
print([category for category in movie_reviews.categories()])
documents=[(list(movie_reviews.words(fileid)),category) 
          for category in movie_reviews.categories()
          for fileid in movie_reviews.fileids(category)]
# print(*documents,sep="\n\n")

['neg', 'pos']


In [2]:
random.shuffle(documents)


In [3]:
all_words=[w.lower() for w in movie_reviews.words()]

In [4]:
all_words=nltk.FreqDist(all_words)
print(all_words["stupid"])
print(all_words.most_common(10))

253
[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822)]


# Converting words to Features with NLTK

In [5]:
word_features=list(all_words.keys())[:3000]

def find_features(document):
    words=set(document)
    features={}
    for w in word_features:
        features[w]=(w in words)
    return features

In [6]:
#print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

In [7]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Naive Bayes Classifier with NLTK

In [8]:
training_set=featuresets[:1900]
testing_set=featuresets[1900:]
print(training_set[0])



In [9]:
classifier=nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes Algo accuracy percent", nltk.classify.accuracy(classifier,testing_set))

Naive Bayes Algo accuracy percent 0.67


In [10]:
classifier.show_most_informative_features(15)

Most Informative Features
             fascination = True              pos : neg    =     11.0 : 1.0
                 symbols = True              pos : neg    =      7.0 : 1.0
               balancing = True              pos : neg    =      7.0 : 1.0
                 amateur = True              neg : pos    =      7.0 : 1.0
                  coyote = True              neg : pos    =      7.0 : 1.0
             magnificent = True              pos : neg    =      6.6 : 1.0
              ineptitude = True              neg : pos    =      6.3 : 1.0
              annoyingly = True              neg : pos    =      6.3 : 1.0
                  turkey = True              neg : pos    =      5.9 : 1.0
            thematically = True              pos : neg    =      5.7 : 1.0
                  argues = True              pos : neg    =      5.7 : 1.0
                     liu = True              neg : pos    =      5.7 : 1.0
                  skimpy = True              neg : pos    =      5.7 : 1.0

# Saving classifier with nltk using pickle

In [11]:
import pickle

save_classifier=open("naivebayes.pickle","wb")
pickle.dump(classifier,save_classifier)
save_classifier.close()



# retriving classifier

In [12]:
classifier_f=open("naivebayes.pickle","rb")
classifier=pickle.load(classifier_f)
classifier_f.close()
classifier.show_most_informative_features(15)
print("Naive Bayes Algo accuracy percent", 100*nltk.classify.accuracy(classifier,testing_set))

Most Informative Features
             fascination = True              pos : neg    =     11.0 : 1.0
                 symbols = True              pos : neg    =      7.0 : 1.0
               balancing = True              pos : neg    =      7.0 : 1.0
                 amateur = True              neg : pos    =      7.0 : 1.0
                  coyote = True              neg : pos    =      7.0 : 1.0
             magnificent = True              pos : neg    =      6.6 : 1.0
              ineptitude = True              neg : pos    =      6.3 : 1.0
              annoyingly = True              neg : pos    =      6.3 : 1.0
                  turkey = True              neg : pos    =      5.9 : 1.0
            thematically = True              pos : neg    =      5.7 : 1.0
                  argues = True              pos : neg    =      5.7 : 1.0
                     liu = True              neg : pos    =      5.7 : 1.0
                  skimpy = True              neg : pos    =      5.7 : 1.0

# Scikit-Learn Sklearn with nltk

In [13]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB



In [14]:

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)


MNB_classifier accuracy percent: 71.0


In [15]:

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)


BernoulliNB_classifier accuracy percent: 68.0


In [16]:
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC


In [17]:
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

LogisticRegression_classifier accuracy percent: 67.0


In [18]:
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)


SGDClassifier_classifier accuracy percent: 70.0


In [19]:
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)


SVC_classifier accuracy percent: 49.0


In [20]:
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)


LinearSVC_classifier accuracy percent: 69.0


In [21]:

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

NuSVC_classifier accuracy percent: 69.0


# Combining Algorithm

In [22]:
from nltk.classify import ClassifierI
from statistics import mode
class VoteClassifier(ClassifierI):
    def __init__(self,*classifiers):
        self._classifiers=classifiers
    def classify(self,features):
        votes=[]
        for c in self._classifiers:
            v=c.classify(features)
            votes.append(v)
        return mode(votes)
    def confidence(self,features):
        votes=[]
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes=votes.count(mode(votes))
        conf=choice_votes/len(votes)
        return conf



In [23]:
voted_classifier=VoteClassifier(classifier,
                                NuSVC_classifier,
                                LinearSVC_classifier,
                                SGDClassifier_classifier,
                                MNB_classifier,
                                BernoulliNB_classifier,
                                LogisticRegression_classifier)

In [24]:
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100)

voted_classifier accuracy percent: 71.0
Classification: pos Confidence %: 71.42857142857143
Classification: pos Confidence %: 100.0
Classification: pos Confidence %: 100.0
Classification: pos Confidence %: 57.14285714285714
Classification: neg Confidence %: 57.14285714285714
Classification: pos Confidence %: 100.0
