In [74]:
### NAIVE BAYES CLASS
from nltk import word_tokenize
import numpy as np
import re
class naive_classifier:
    def __init__(self):
        self.trained = False
        self.classes = ["positive", "negative"]
        self.nclasses = len(self.classes)
        
        self.likelihoods = {c : dict() for c in range(self.nclasses) }
        self.priors = [0 for i in range(self.nclasses)]
        self.priorsBi = [0 for i in range(self.nclasses)]
        self.vocabulary = []
        self.vocabularyBi =[]

    def _train(self, corpus):
        classCounts = [0 for i in range(self.nclasses)]
        ndoc = len(corpus)
        wordCounts = {c : dict() for c in range(self.nclasses)}
        for document in corpus:
            review = document[0]
            label = document[-1]
            classCounts[label] += 1
            for word in review:
                if word in wordCounts[label].keys():
                    wordCounts[label][word] += 1
                else:
                    wordCounts[label][word] = 1
            for biagram in self.biagramize(review):
                if biagram in wordCounts[label].keys():
                    wordCounts[label][biagram] += 1
                else:
                    wordCounts[label][biagram] = 1
                    
        for index in range(len(self.classes)):
            self.vocabularyBi += list(wordCounts[index].keys())
            self.priorsBi[index] = np.log(classCounts[index]/ndoc)
            self.vocabulary += list(wordCounts[index].keys())
            self.priors[index] = np.log(classCounts[index]/ndoc)
        self.vocabularyBi = set(self.vocabularyBi)
        self.vocabulary = set(self.vocabulary)
            
        for index in range(len(self.classes)):
            for word in self.vocabularyBi:
                if word in wordCounts[index]:
                    self.likelihoods[index][word] = np.log((wordCounts[index][word]+1)/(sum(wordCounts[index].values())+len(self.vocabulary)))
                else:
                    self.likelihoods[index][word] = np.log((1)/(sum(wordCounts[index].values())+len(self.vocabulary)))     

            for word in self.vocabulary:
                if word in wordCounts[index]:
                    self.likelihoods[index][word] = np.log((wordCounts[index][word]+1)/(sum(wordCounts[index].values())+len(self.vocabulary)))
                else:
                    self.likelihoods[index][word] = np.log((1)/(sum(wordCounts[index].values())+len(self.vocabulary)))     

            
        
        
    def _read(self, document):
        toReturn = []
        with open(document) as f:
            for line in f.readlines():
                pair = line.split('\n')
                pair = pair[0].split('\t')
                review = re.sub(r"[,/?!-()*&^%|'.,]","",pair[0])
                bag = word_tokenize(pair[0].lower())
                label = int(pair[1])
                toReturn.append((bag,label))
        return toReturn
    
    def train(self, documents, test=False, split_ratio=0.3):
        """
        Takes txt inputs and trains the classifier
        """
        corpus = []
        for doc in documents:
            print ("reading: ",doc)
            for review in self._read(doc):
                corpus.append(review)
                
        if test:
            np.random.shuffle(corpus)
            split_point = int(len(corpus) * split_ratio)
            test_data = corpus[:split_point]
            train_data = corpus[split_point:]
            self._train(train_data)
            test_acc = self._test(test_data)
            train_acc = self._test(train_data)
            print (len(train_data)," training items")
            print (len(test_data)," testing items")
            print ("Training done")
            print ("Train accuracy: ",train_acc)
            print ("Test accuracy: ",test_acc)
        else:
            self._train(corpus)
            print ("Training done")
        self.trained = True
        
    def _predict(self, sentence):
        import operator
        """
        Takes tokenized input and outputs numerical class
        """
        sumc = dict()
        for c in range(self.nclasses):
            sumc[c] = self.priorsBi[c]
            for word in sentence:
                if word in self.vocabularyBi:
                    sumc[c] += self.likelihoods[c][word]
                elif '<>' in word:
                    unigrams = word.split('<>')
                    for unigram in unigrams:
                        if unigram in self.vocabulary:
                            sumc[c] += self.likelihoods[c][unigram]
        return max(sumc.items(), key=operator.itemgetter(1))[0]
    
    def predict(self, text):
        """
        Tokenize sentence, predicts and output class
        """
        sentence = self.biagramize(word_tokenize(text))
        return self._predict(sentence)
    
    def _test(self, data):
        n_items = len(data)
        n_correct = 0
        for document in data:
            review = document[0]
            label = document[-1]
            c = self._predict(self.biagramize(review))
            if (c == label): n_correct += 1
        return n_correct / n_items
    
    def test_file(self, file_name):
        """
        Tests with a file and outputs a file of labels
        """
        labels = []
        with open(file_name) as f:
            for line in f.readlines():
                print(line,self.predict(line))
                labels.append(self.predict(line))
        
        with open('output.txt', 'w') as f:
            for label in labels:
                f.write(str(label)+"\n")
                
        print ("Results from ",file_name," printed to: output.txt")
        
        
                
    def biagramize(self, words):
        """
        Turns unigrams into biagrams
        """
        toReturn = []
        temp = ['<s>'] + words + ['</s>']
        for ind in range(len(temp)-1):
            toReturn.append(temp[ind]+'<>'+temp[ind+1])
        return toReturn
                
        
            
        
    
    def export(self, name):
        import json
        
        toExport = {
            "likelihoods":self.likelihoods,
            "priors":self.priors,
            "vocabulary":self.vocabulary
        }
        
        with open(name,'w') as f:
            json.dump(toExport, f)
            
    def load(self, name):
        import json
        
        with open(name, 'r') as f:
            loaded = json.load(f)
            
        self.likelihoods = loaded["likelihoods"]
        self.priors = loaded["priors"]
        self.vocabulary = loaded["vocabulary"]
            
            

In [76]:
#from my_naive_bayes import naive_classifier

classifier = naive_classifier()
classifier.train(["./sentiment_labelled_sentences/amazon_cells_labelled.txt",
                  "./sentiment_labelled_sentences/imdb_labelled.txt",
                  "./sentiment_labelled_sentences/yelp_labelled.txt"],
                 test=True,
                 split_ratio=0.3)

reading:  ./sentiment_labelled_sentences/amazon_cells_labelled.txt
reading:  ./sentiment_labelled_sentences/imdb_labelled.txt
reading:  ./sentiment_labelled_sentences/yelp_labelled.txt
2100  training items
900  testing items
Training done
Train accuracy:  0.9961904761904762
Test accuracy:  0.8366666666666667


In [78]:
classifier.predict("This product was not bad")

0

In [80]:
classifier.vocabularyBi

{"'s<>character",
 'turkey<>in',
 'nice<>color',
 'even<>flash',
 'deliciously',
 'above<>...',
 'phone<>,',
 'diving',
 'covers<>were',
 '<s><>earbud',
 'phone<>when',
 'i<>wo',
 'has<>indeed',
 'went<>for',
 'sequel',
 'sorely<>disappointed',
 'restaurant',
 'flag<>in',
 'score<>is',
 'them<>were',
 'are<>emotionally',
 'recognition',
 'the<>meal',
 'storyline',
 'time<>and',
 'blare<>out',
 'wonderful<>parts',
 'genius<>.',
 'involved<>must',
 'drinking<>just',
 'phoenix<>magazine',
 'fisted<>.',
 'balls<>of',
 'talk<>on',
 'black<>was',
 'junk',
 'similar<>complaints',
 'battery<>would',
 'regardless<>,',
 'entertaining',
 '12',
 'ceases',
 'is<>eggplant',
 'film-maker<>takes',
 'drama',
 'the<>toilet',
 'future',
 'give<>it',
 'on<>the',
 'defect<>,',
 'trouble<>with',
 'the<>end',
 'controls<>are',
 'is<>actually',
 'town<>by',
 'by<>the',
 'road',
 'the<>verge',
 'ordered<>burger',
 '<s><>lewis',
 'concrete',
 'good<>addition',
 'vandiver',
 'presentation',
 'four',
 'one<>line'

In [81]:
classifier.test_file("test_sentences.txt")

This GPS tracker works like a charm.
 1
When I opened the box the product was not in the cutouts snd the protective cover was not on the unit
 0
Everyone should have one who owns a computer
 0
Buy something else
 1
Pure junk do not buy ever the greatest load of junk I have ever purchased ever
 0
The DataVac was used and full of dust and dirt
 0
Not so great...bought to clean the bobbin case area of my Brother and Baby Lock Quilting and Embroidery machines
 1
It is a great size, I keep it in my desk drawer at work and beause I teach wood shop it's going to get a lot of use
 1
I just bought this Vacuum. It's just good for nothing
 0
This is just perfect for vacuuming out the lint from my sewing machine
 1
I use it mostly to vacuum threads on the sewing machine. It is just the right size for this task.
 1
I have found this mini vac. to be everything it is said to be
 1
I ordered the Pork Prime Rib Chop it was beautiful, scrumptious and totally tender.
 1
A bastion of fine dining in The Ci

In [21]:
classifier.test_file("test_sentences.txt")

This GPS tracker works like a charm.
 1
When I opened the box the product was not in the cutouts snd the protective cover was not on the unit
 0
Everyone should have one who owns a computer
 0
Buy something else
 1
Pure junk do not buy ever the greatest load of junk I have ever purchased ever
 0
The DataVac was used and full of dust and dirt
 0
Not so great...bought to clean the bobbin case area of my Brother and Baby Lock Quilting and Embroidery machines
 1
It is a great size, I keep it in my desk drawer at work and beause I teach wood shop it's going to get a lot of use
 1
I just bought this Vacuum. It's just good for nothing
 0
This is just perfect for vacuuming out the lint from my sewing machine
 1
I use it mostly to vacuum threads on the sewing machine. It is just the right size for this task.
 1
I have found this mini vac. to be everything it is said to be
 0
I ordered the Pork Prime Rib Chop it was beautiful, scrumptious and totally tender.
 1
A bastion of fine dining in The Ci

In [12]:
classifier2 = naive_classifier()
classifier2.load("model.json")

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [10]:
testS = "I was disappointed"
print (classifier.predict(testS))

0


In [5]:
classifier.likelihoods

{0: {'however': -4.5529194956442138,
  ',': -0.88470667833883632,
  'there': -2.9434815832101129,
  'was': -1.4912292542984251,
  'so': -3.0589944703319576,
  'much': -3.731938943574383,
  'garlic': -6.2576675878826391,
  'in': -2.1801301439769194,
  'the': -0.45554921250557578,
  'fondue': -6.2576675878826391,
  'it': -1.3231936547519469,
  'barely': -5.004904619387271,
  'edible': -5.8522024797744745,
  '.': -0.1049348931785344,
  'this': -1.5942284937705715,
  'allows': -6.2576675878826391,
  'possibility': -6.2576675878826391,
  'of': -1.5571872220902225,
  'double': -6.2576675878826391,
  'booking': -6.2576675878826391,
  'for': -2.130533202837547,
  'same': -4.4659081186545837,
  'date': -6.2576675878826391,
  'and': -1.1762632228981758,
  'time': -3.0589944703319576,
  'after': -4.0063757892761434,
  'first': -4.0063757892761434,
  'ironically': -6.2576675878826391,
  'i': -1.005394159836009,
  'mostly': -6.2576675878826391,
  'find': -5.8522024797744745,
  'his': -4.38586541098