In [124]:
### NAIVE BAYES CLASS
from nltk import word_tokenize
import numpy as np
import re
class naive_classifier:
    def __init__(self):
        self.trained = False
        self.ndoc = 0
        self.classes = ["positive", "negative"]
        self.nclasses = len(self.classes)
        self.classCounts = [0 for i in range(self.nclasses)]
        self.wordCounts = {c : dict() for c in range(self.nclasses) }
        self.likelihoods = {c : dict() for c in range(self.nclasses) }
        self.priors = [0 for i in range(self.nclasses)]
        self.vocabulary = []

    def _train(self, corpus):
        self.ndoc = len(corpus)
        for document in corpus:
            review = document[0]
            label = document[-1]
            self.classCounts[label] += 1
            for word in review:
                if word in self.wordCounts[label].keys():
                    self.wordCounts[label][word] += 1
                else:
                    self.wordCounts[label][word] = 1
                    
        for index in range(len(self.classes)):
            self.priors[index] = np.log(self.classCounts[index]/self.ndoc)
            self.vocabulary += list(self.wordCounts[index].keys())
        for index in range(len(self.classes)):
            for word in self.vocabulary:
                if word in self.wordCounts[index]:
                    self.likelihoods[index][word] = np.log((self.wordCounts[index][word]+1)/(self.classCounts[index]+1))
                else:
                    self.likelihoods[index][word] = np.log((1)/(self.classCounts[index]+1))
                    
        print (len(self.wordCounts[0]),len(self.wordCounts[1]))
                
            
                           
            
        
        
    def _read(self, document):
        toReturn = []
        with open(document) as f:
            for line in f.readlines():
                pair = line.split('\n')
                pair = pair[0].split('\t')
                review = re.sub(r"[,/?!-()*&^%|',]","",pair[0])
                bag = word_tokenize(pair[0].lower())
                label = int(pair[1])
                toReturn.append((bag,label))
        return toReturn
    
    def train(self, documents, test=False, split_ratio=0.3):
        """
        Takes txt inputs and trains the classifier
        """
        corpus = []
        for doc in documents:
            print ("reading: ",doc)
            for review in self._read(doc):
                corpus.append(review)
        if test:
            np.random.shuffle(corpus)
            split_point = int(len(corpus) * split_ratio)
            test_data = corpus[:split_point]
            train_data = corpus[split_point:]
            self._train(train_data)
            test_acc = self._test(test_data)
            train_acc = self._test(train_data)
            print (len(train_data)," training items")
            print (len(test_data)," testing items")
            print ("Training done")
            print ("Train accuracy: ",train_acc)
            print ("Test accuracy: ",test_acc)
        else:
            self._train(corpus)
            print ("Training done")
        self.trained = True
        
    def _predict(self, sentence):
        import operator
        """
        Takes tokenized input and outputs numerical class
        """
        sumc = dict()
        for c in range(self.nclasses):
            sumc[c] = self.priors[c]
            for word in sentence:
                if word in self.vocabulary:
                    sumc[c] += self.likelihoods[c][word]
        return max(sumc.items(), key=operator.itemgetter(1))[0]
    
    def predict(self, text):
        """
        Tokenize sentence, predicts and output class
        """
        sentence = word_tokenize(text)
        return self._predict(sentence)
    
    def _test(self, data):
        n_items = len(data)
        n_correct = 0
        for document in data:
            review = document[0]
            label = document[-1]
            c = self._predict(review)
            if (c == label): n_correct += 1
        return n_correct / n_items
            
            

In [125]:
#from my_naive_bayes import naive_classifier

classifier = naive_classifier()
classifier.train(["./sentiment_labelled_sentences/amazon_cells_labelled.txt",
                  "./sentiment_labelled_sentences/imdb_labelled.txt",
                  "./sentiment_labelled_sentences/yelp_labelled.txt"],
                 test=True,
                 split_ratio=0.2)

reading:  ./sentiment_labelled_sentences/amazon_cells_labelled.txt
reading:  ./sentiment_labelled_sentences/imdb_labelled.txt
reading:  ./sentiment_labelled_sentences/yelp_labelled.txt
2920 3049
2400  training items
600  testing items
Training done
Train accuracy:  0.9416666666666667
Test accuracy:  0.805


In [102]:
testS = "This experience was not bad"
print (classifier.predict(testS))

0
