**Follow the naïve Bayes examples from chapter 4 of Harrington's *Machine Learning in Action***

**4.5.1** *Prepare: making word vectors from text*

In [142]:
from numpy import *

def loadDataSet():
    """Return the toy forum-post corpus together with its class labels.

    Returns:
        A pair ``(posts, labels)``. ``posts`` is a list of six tokenized
        posts (each post is the list of its whitespace-separated words) and
        ``labels`` holds one integer per post: 1 for abusive, 0 otherwise.
    """
    # Each entry pairs a raw post with its label (1 = abusive, 0 = not).
    labelledPosts = (
        ('my dog has flea problems help please', 0),
        ('maybe not take him to dog park stupid', 1),
        ('my dalmation is so cute I love him', 0),
        ('you should stop posting stupid worthless garbage', 1),
        ('mr licks ate my steak how do I stop him', 0),
        ('quit buying worthless dog food stupid', 1),
    )
    tokenizedPosts = []
    classVector = []
    for text, label in labelledPosts:
        tokenizedPosts.append(text.split())
        classVector.append(label)
    return tokenizedPosts, classVector

def createVocabList(dataSet):
    """Build the vocabulary of a tokenized corpus.

    Args:
        dataSet: iterable of documents, each an iterable of word strings.

    Returns:
        A list of the unique, lower-cased words found in ``dataSet``,
        sorted alphabetically.
    """
    # A single set comprehension avoids allocating a fresh union set for
    # every document.
    vocabSet = {word.lower() for doc in dataSet for word in doc}
    # Set iteration order is arbitrary, and the position of a word in this
    # list decides which feature-vector column it occupies; sorting makes
    # that column order reproducible from run to run.
    return sorted(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    """Convert a list of words into a binary presence vector.

    Each vocabulary word is one feature, so a document becomes a point in
    an N-dimensional space (N = ``len(vocabList)``); documents that use a
    similar vocabulary end up close together in that space.

    Args:
        vocabList: list of vocabulary words; a word's position in this list
            is its column in the returned vector.
        inputSet: iterable of word strings; each is lower-cased before it is
            looked up.

    Returns:
        A list of ``len(vocabList)`` integers holding 1 where the
        vocabulary word occurs in ``inputSet`` and 0 elsewhere. Words that
        are not in the vocabulary are reported on stdout and skipped.
    """
    # Build the word -> column map once instead of scanning the list twice
    # (``in`` + ``index``) for every input word. ``setdefault`` keeps the
    # first position of a repeated entry, exactly like ``list.index``.
    columnOf = {}
    for position, vocabWord in enumerate(vocabList):
        columnOf.setdefault(vocabWord, position)

    returnVec = [0]*len(vocabList)
    for word in inputSet:
        word = word.lower()
        position = columnOf.get(word)
        if position is None:
            print("The word: %s is not in my vocabulary!" % word)
        else:
            returnVec[position] = 1
    return returnVec

def trainNB0(trainMatrix,trainCategory):
    """Estimate the naive Bayes parameters for a two-class problem.

    Args:
        trainMatrix: sequence of per-document word vectors, all of the
            same length.
        trainCategory: sequence of class labels, one per document; a label
            equal to 1 selects class 1, anything else counts as class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the per-word log conditional
        probabilities for class 0 and class 1, and the fraction of
        documents labelled 1.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])

    # Slot 0 accumulates class 0, slot 1 accumulates class 1. Word counts
    # start at one and the totals at two, so a word never seen in a class
    # does not end up with probability zero.
    wordCounts = [ones(vocabSize), ones(vocabSize)]
    classTotals = [2.0, 2.0]
    for docIdx, row in enumerate(trainMatrix):
        slot = 1 if trainCategory[docIdx] == 1 else 0
        wordCounts[slot] += row
        classTotals[slot] += sum(row)

    pAbusive = sum(trainCategory)/float(docCount)
    # Logs keep the later product of many small probabilities from
    # underflowing; the classifier adds them instead of multiplying.
    p1Vect = log(wordCounts[1]/classTotals[1])
    p0Vect = log(wordCounts[0]/classTotals[0])
    return p0Vect, p1Vect, pAbusive

def classifyNB(vecToClassify, p0Vec, p1Vec, pClass1):
    """Pick the more likely of two classes for a word vector.

    Args:
        vecToClassify: word vector of the document to classify.
        p0Vec: per-word log probabilities for class 0.
        p1Vec: per-word log probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 log score is strictly greater than the class-0
        log score, otherwise 0 (ties go to class 0).
    """
    logPrior1 = log(pClass1)
    logPrior0 = log(1.0 - pClass1)
    # Summing log probabilities is the log of their product.
    score1 = sum(vecToClassify * p1Vec) + logPrior1
    score0 = sum(vecToClassify * p0Vec) + logPrior0
    return 1 if score1 > score0 else 0

def testingNB(testEntry):
    """Train on the toy forum posts and print the class of ``testEntry``.

    Args:
        testEntry: a post as a single string; it is split on whitespace
            before being converted to a word vector.

    The model is rebuilt from ``loadDataSet()`` on every call and the
    result is printed, not returned.
    """
    corpus, labels = loadDataSet()
    vocabulary = createVocabList(corpus)

    # One presence vector per training post forms the training matrix.
    trainingRows = [setOfWords2Vec(vocabulary, post) for post in corpus]
    p0V, p1V, pAb = trainNB0(array(trainingRows), array(labels))

    # Vectorize the query with the same vocabulary, then classify it.
    queryVec = array(setOfWords2Vec(vocabulary, testEntry.split()))
    predicted = classifyNB(queryVec, p0V, p1V, pAb)
    classNames = ['Supportive','Abusive']
    print(testEntry, 'classified as: ', classNames[predicted])

In [141]:
# Classify two sample posts with the toy forum classifier; each call
# prints the post together with its predicted class name.
testingNB('I love my dalmation')
testingNB('Dog is stupid garbage')

('I love my dalmation', 'classified as: ', 'Supportive')
('Dog is stupid garbage', 'classified as: ', 'Abusive')


**4.6** *Example: classifying spam email with naïve Bayes*

In [None]:
def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a list of words into a bag-of-words count vector.

    Unlike a presence vector, this keeps track of multiple occurrences:
    each column holds how many times the vocabulary word appears.

    Args:
        vocabList: list of vocabulary words; a word's position in this list
            is its column in the returned vector.
        inputSet: iterable of words. Lookup is exact (case-sensitive), so
            callers are expected to normalize case beforehand.

    Returns:
        A list of ``len(vocabList)`` integer counts. Words that are not in
        the vocabulary are silently ignored.
    """
    # Build the word -> column map once instead of scanning the list twice
    # (``in`` + ``index``) for every input word. ``setdefault`` keeps the
    # first position of a repeated entry, exactly like ``list.index``.
    columnOf = {}
    for position, vocabWord in enumerate(vocabList):
        columnOf.setdefault(vocabWord, position)

    returnVec = [0]*len(vocabList)
    for word in inputSet:
        position = columnOf.get(word)
        if position is not None:
            returnVec[position] += 1
    return returnVec

In [151]:
# Load the text of one of the provided ham emails and split it into tokens.
import re

# ``regEx`` was used here without ever being defined. '\W+' splits on runs
# of non-word characters; a pattern that can match the empty string (such
# as '\W*') would split between every character on Python 3.7+.
regEx = re.compile(r'\W+')

# The with-block closes the file handle as soon as the text has been read.
with open('./SourceMaterial/Ch04/email/ham/6.txt') as emailFile:
    emailText = emailFile.read()
listOfTokens = regEx.split(emailText)

In [152]:
def textParse(bigString):
    """Split raw text into lower-cased tokens longer than two characters.

    Args:
        bigString: the text to tokenize.

    Returns:
        A list of lower-cased tokens, in order of appearance, keeping only
        those with more than two characters.
    """
    import re
    # Split on runs of one or more non-word characters. The previous
    # pattern '\W*' can match the empty string, and since Python 3.7
    # re.split also splits on empty matches, which chopped the text into
    # single characters that the length filter below then discarded.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

In [203]:
def spamTest():
    """Run one random hold-out evaluation of the spam classifier.

    Reads emails 1-25 from each of the spam and ham folders, holds out 10
    randomly chosen documents as a test set, trains on the remaining ones
    and classifies the held-out documents.

    Returns:
        The fraction of held-out documents that were misclassified.
    """
    # (path template, label) for the two folders; spam is class 1.
    sources = (('./SourceMaterial/Ch04/email/spam/%d.txt', 1),
               ('./SourceMaterial/Ch04/email/ham/%d.txt', 0))

    docList = []; classList = []
    for i in range(1,26):
        for pathTemplate, label in sources:
            # The with-block closes each file as soon as it has been read.
            with open(pathTemplate % i) as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)
    vocabList = createVocabList(docList)

    # This must be a real list: indices are deleted from it below, and a
    # Python 3 ``range`` object does not support item deletion.
    trainingSet = list(range(len(docList))); testSet = []

    # Move 10 random indices from the training set to the test set.
    # ``random`` is the name the file has in scope through its
    # ``from numpy import *`` line.
    for i in range(10):
        randIdx = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIdx])
        del(trainingSet[randIdx])

    # Build the training matrix from the documents that were not held out.
    trainMat=[]; trainClasses = []
    for docIdx in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIdx]))
        trainClasses.append(classList[docIdx])
    p0V,p1V,pSpam = trainNB0(array(trainMat), array(trainClasses))

    # Count the held-out documents the classifier gets wrong.
    errorCount = 0
    for docIdx in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIdx])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIdx]:
            errorCount += 1

    errorRate = float(errorCount)/len(testSet)
    return errorRate

In [229]:
# A single spamTest() call is one random hold-out split, so its error rate
# is noisy. Averaging the rate over many independent splits gives a more
# realistic estimate.

n_runs = 100
totalError = 0.0
for n in range(n_runs):
    totalError += spamTest()

print("The average error rate is: %.02f percent." % (100*totalError/float(n_runs)))


The average error rate is: 7.60 percent.


**Reminder:**

I should compare the performance (accuracy and efficiency) of the naïve Bayes classifier with that of the decision tree from chapter 3.