##### Importing new data sets - how to do it

In [1]:
#if you want to save from webpage uncomment this

from bs4 import BeautifulSoup
import urllib.request
import pickle

def saveFile(webpage, filename):
    websource = urllib.request.urlopen(webpage)
    soup = BeautifulSoup(websource.read(), "html.parser")
    lines = soup.get_text()
    fh = open(filename,"w")
    fh.writelines(lines)
    fh.close() 

#Negative sentiment examples
saveFile(webpage = "https://pythonprogramming.net/static/downloads/short_reviews/negative.txt", 
         filename = "negative.txt")
#Postive sentiment examples
saveFile(webpage = "https://pythonprogramming.net/static/downloads/short_reviews/positive.txt", 
         filename = "positive.txt")


In [2]:

short_pos  = open("positive.txt", "r").read()
short_neg  = open("negative.txt", "r").read()

documents = []

for line in short_pos.split("\n"):
    documents.append((line, "pos"))

    
for line in short_neg.split("\n"):
    documents.append((line, "neg"))



In [3]:
import nltk

In [4]:
short_pos_words  = nltk.word_tokenize(short_pos)
short_neg_words  = nltk.word_tokenize(short_neg)


In [5]:
len(short_pos_words)
type(short_pos)
len(short_pos)

562832

In [6]:
all_words = []

for w in short_pos_words:
    all_words.append(w.lower())

for w in short_neg_words:
    all_words.append(w.lower())

In [28]:
all_words = nltk.FreqDist(all_words)
#We want to use top 5000 words
word_features = list(all_words.keys())[:5000] 

word_features_f = open("pickled_algos/word_features.pickle", "wb")
pickle.dump(word_features, word_features_f)
word_features_f.close()


In [8]:
import random
def find_features(document):
    words = nltk.word_tokenize(document)
    features = {}
    for w in word_features:  #note that word_features contains top 3000 words only(above)
        features[w] = (w in words)
    return(features)

featuresets = [(find_features(doc), category) for (doc, category) in documents]

random.shuffle(featuresets)

In [9]:
save_documents = open("pickled_algos/documents.pickle", "wb")
pickle.dump(documents, save_documents)
save_documents.close()

In [10]:
training_set = featuresets[:10000] #firs 1900
testing_set  = featuresets[10000:] #rest starting 1900th 

In [11]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
nltk.classify.accuracy(classifier, testing_set)

save_classifier = open("pickled_algos/naivebayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()


In [12]:
import pickle
save_classifier = open("pickled_algos/naivebayes_pos_neg.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

classifier_f = open("pickled_algos/naivebayes_pos_neg.pickle", "rb")
classifier_fromPickle = pickle.load(classifier_f)
classifier_f.close()

In [13]:
##Accuracy
nltk.classify.accuracy(classifier_fromPickle, testing_set)

0.7874015748031497

### Using other classfiers

In [14]:
from nltk.classify.scikitlearn import SklearnClassifier #wrapper of nltk over scikit learn
import pickle

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.preprocessing import LabelEncoder

In [15]:
mnb_classifier = SklearnClassifier(MultinomialNB())
mnb_classifier.train(training_set)
##Accuracy
nltk.classify.accuracy(mnb_classifier, testing_set)


save_classifier = open("pickled_algos/MultinomialNB.pickle", "wb")
pickle.dump(mnb_classifier, save_classifier)
save_classifier.close()


In [16]:
##Some Errors in this 

#gaussian_classifier = SklearnClassifier(GaussianNB())
#gaussian_classifier.train(training_set)
#gaussian_classifier.train(training_set)

##Accuracy
#nltk.classify.accuracy(gaussian_classifier, testing_set)


In [17]:
bernoulli_classifier = SklearnClassifier(BernoulliNB())
bernoulli_classifier.train(training_set)
##Accuracy
nltk.classify.accuracy(bernoulli_classifier, testing_set)

save_classifier = open("pickled_algos/BernoulliNB.pickle", "wb")
pickle.dump(bernoulli_classifier, save_classifier)
save_classifier.close()

### Other scikitlearn classiers

#LogisticRegression, SGDClassifier
#SVC, LinearSVC, NuSVC


In [18]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [19]:
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
##Accuracy
nltk.classify.accuracy(LogisticRegression_classifier, testing_set)


save_classifier = open("pickled_algos/LogisticRegression.pickle", "wb")
pickle.dump(LogisticRegression_classifier, save_classifier)
save_classifier.close()

In [20]:
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
##Accuracy
nltk.classify.accuracy(SGDClassifier_classifier, testing_set)


save_classifier = open("pickled_algos/SGDClassifier.pickle", "wb")
pickle.dump(SGDClassifier_classifier, save_classifier)
save_classifier.close()



In [21]:
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
##Accuracy
nltk.classify.accuracy(SVC_classifier, testing_set)


save_classifier = open("pickled_algos/SVC.pickle", "wb")
pickle.dump(SVC_classifier, save_classifier)
save_classifier.close()

In [22]:
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
##Accuracy
nltk.classify.accuracy(LinearSVC_classifier, testing_set)


save_classifier = open("pickled_algos/LinearSVC.pickle", "wb")
pickle.dump(LinearSVC_classifier, save_classifier)
save_classifier.close()

In [23]:
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
##Accuracy
nltk.classify.accuracy(NuSVC_classifier, testing_set)


save_classifier = open("pickled_algos/NuSVC.pickle", "wb")
pickle.dump(NuSVC_classifier, save_classifier)
save_classifier.close()

#### Note

All of above have their own paramaters which you can customize to imporve the acuuracy

In [24]:
from nltk.classify import ClassifierI
from statistics import mode

In [25]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return(mode(votes))
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
            
        choice_votes = votes.count(mode(votes))
        conf = choice_votes/len(votes)
        return conf


In [26]:
voted_classfier = VoteClassifier(mnb_classifier,
                                 bernoulli_classifier,
                                 classifier_fromPickle,
                                 SVC_classifier,
                                 NuSVC_classifier)

In [27]:
nltk.classify.accuracy(voted_classfier, testing_set)

0.7874015748031497