# This notebook contains the code needed to run the examples in Chapter 04 'Classifying with Probability Theory - Naive Bayes'

Naïve Bayes
Pros: Works with a small amount of data, handles multiple classes Cons: Sensitive to how the input data is prepared
Works with: Nominal values


General approach to naïve Bayes
1. Collect: Any method. We’ll use RSS feeds in this chapter.
2. Prepare: Numeric or Boolean values are needed.
3. Analyze: With many features, plotting features isn’t helpful. Looking at histo- grams is a better idea.
4. Train: Calculate the conditional probabilities of the independent features.
5. Test: Calculate the error rate.
6. Use: One common application of naïve Bayes is document classification. You can use naïve Bayes in any classification setting. It doesn’t have to be text.

In [2]:
def loadDataSet(): #create sample data to experiment with 
            postingList=[['my', 'dog', 'has', 'flea','problems', 'help', 'please'],
                         ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                         ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                         ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                         ['mr', 'licks', 'ate', 'my', 'steak', 'how','to', 'stop', 'him'],
                         ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
            classVec = [0,1,0,1,0,1]
            return postingList,classVec

def createVocabList(dataSet): #gather all unique words in all docs 
    vocabSet = set([]) #create an empty set 
    for document in dataSet:
#1 is abusive, 0 not
        vocabSet = vocabSet | set(document) #create union of 2 sets 
    return list(vocabSet)

In [3]:
def setOfWords2Vec(vocabList, inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else: print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

In [39]:
def bagOfWords2VecMN(vocabList, inputSet):
            returnVec = [0]*len(vocabList)
            for word in inputSet:
                if word in vocabList:
                    returnVec[vocabList.index(word)] += 1
            return returnVec

In [4]:
listOPosts,listClasses = loadDataSet()

In [43]:
print(sum(listClasses))

3


In [5]:
myVocabList = createVocabList(listOPosts)

In [6]:
print(myVocabList)

['cute', 'mr', 'to', 'buying', 'flea', 'him', 'worthless', 'how', 'I', 'garbage', 'has', 'maybe', 'stop', 'ate', 'love', 'so', 'problems', 'park', 'dog', 'not', 'is', 'please', 'licks', 'stupid', 'posting', 'my', 'food', 'quit', 'steak', 'take', 'dalmation', 'help']


In [7]:
setOfWords2Vec(myVocabList, listOPosts[0])

[0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1]

In [8]:
setOfWords2Vec(myVocabList, listOPosts[3])

[0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [9]:
from numpy import *

In [25]:
#Naive Bayes Classifier Training Function 
def trainNB0(trainMatrix,trainCategory): #trainMatrix -- list of docs , trainCategory - list of classes
    numTrainDocs = len(trainMatrix) #no of docs 
    numWords = len(trainMatrix[0]) #no of words 
    pAbusive = sum(trainCategory)/float(numTrainDocs) #initialize probabilities sum(trainCategory) basically sums the 1's
    p0Num = ones(numWords); p1Num = ones(numWords)  #initialize to 1 instead of 0 , these are of type ndarray
    p0Denom = 2.0; p1Denom = 2.0
    for i in range(numTrainDocs): #loop over every document in training set 
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i] #vector addition i.e Every time a word appears in a document, the count for that word (p1Num or p0Num) gets incremented,
            p1Denom += sum(trainMatrix[i]) #the total number of words for a document gets summed up over all the documents.
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num/p1Denom)   
    p0Vect = log(p0Num/p0Denom)          #change to log()
    return p0Vect,p1Vect,pAbusive

In [50]:
p0Num = zeros(32)
print(type(p0Num))
print(p0Num)



<class 'numpy.ndarray'>
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [26]:
myVocabList = createVocabList(listOPosts)

In [42]:
trainMat=[]
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
print(len(trainMat))
print(trainMat)
print(len(trainMat[0]))


6
[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0]]
32


In [51]:
print(trainMat[0])
print(sum(trainMat[0]))

[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1]
7


In [53]:
p1Num = ones(32)
print(p1Num)
p1Num += trainMat[0]
print(p1Num)

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
[ 1.  1.  1.  1.  2.  1.  1.  1.  1.  1.  2.  1.  1.  1.  1.  1.  2.  1.
  2.  1.  1.  2.  1.  1.  1.  2.  1.  1.  1.  1.  1.  2.]


In [28]:
p0V,p1V,pAb=trainNB0(trainMat,listClasses)

In [54]:
#the probabilities of the words from our vocabulary given the document class.
print(p0V) # prob of word given class 0 - Non Abusive

[-2.56494936 -2.56494936 -2.56494936 -3.25809654 -2.56494936 -2.15948425
 -3.25809654 -2.56494936 -2.56494936 -3.25809654 -2.56494936 -3.25809654
 -2.56494936 -2.56494936 -2.56494936 -2.56494936 -2.56494936 -3.25809654
 -2.56494936 -3.25809654 -2.56494936 -2.56494936 -2.56494936 -3.25809654
 -3.25809654 -1.87180218 -3.25809654 -3.25809654 -2.56494936 -3.25809654
 -2.56494936 -2.56494936]


In [55]:
print(p1V) # prob of word given class 1 - Abusive

[-3.04452244 -3.04452244 -2.35137526 -2.35137526 -3.04452244 -2.35137526
 -1.94591015 -3.04452244 -3.04452244 -2.35137526 -3.04452244 -2.35137526
 -2.35137526 -3.04452244 -3.04452244 -3.04452244 -3.04452244 -2.35137526
 -1.94591015 -2.35137526 -3.04452244 -3.04452244 -3.04452244 -1.65822808
 -2.35137526 -3.04452244 -2.35137526 -2.35137526 -3.04452244 -2.35137526
 -3.04452244 -3.04452244]


In [31]:
print(pAb)

0.5


In [37]:
#Naive Bayes Classify Function 

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

def testingNB():
    listOPosts,listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat=[]
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses))
    testEntry = ['love', 'my', 'dalmation']        
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb))

In [38]:
testingNB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1


In [56]:
mySent='This book is the best book on Python or M.L. I have ever laid my eyes upon.'

In [57]:
mySent.split()

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M.L.',
 'I',
 'have',
 'ever',
 'laid',
 'my',
 'eyes',
 'upon.']

In [58]:
import re 
regEx = re.compile('\\W*')
listOfTokens = regEx.split(mySent)
print(listOfTokens)

['This', 'book', 'is', 'the', 'best', 'book', 'on', 'Python', 'or', 'M', 'L', 'I', 'have', 'ever', 'laid', 'my', 'eyes', 'upon', '']


  app.launch_new_instance()


In [59]:
[tok for tok in listOfTokens if len(tok)>0]

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'my',
 'eyes',
 'upon']

In [74]:
emailText = open('email/ham/6.txt',encoding='UTF-8',errors='ignore').read()

In [75]:
listOfTokens=regEx.split(emailText)

  if __name__ == '__main__':


In [76]:
print(listOfTokens)

['Hello', 'Since', 'you', 'are', 'an', 'owner', 'of', 'at', 'least', 'one', 'Google', 'Groups', 'group', 'that', 'uses', 'the', 'customized', 'welcome', 'message', 'pages', 'or', 'files', 'we', 'are', 'writing', 'to', 'inform', 'you', 'that', 'we', 'will', 'no', 'longer', 'be', 'supporting', 'these', 'features', 'starting', 'February', '2011', 'We', 'made', 'this', 'decision', 'so', 'that', 'we', 'can', 'focus', 'on', 'improving', 'the', 'core', 'functionalities', 'of', 'Google', 'Groups', 'mailing', 'lists', 'and', 'forum', 'discussions', 'Instead', 'of', 'these', 'features', 'we', 'encourage', 'you', 'to', 'use', 'products', 'that', 'are', 'designed', 'specifically', 'for', 'file', 'storage', 'and', 'page', 'creation', 'such', 'as', 'Google', 'Docs', 'and', 'Google', 'Sites', 'For', 'example', 'you', 'can', 'easily', 'create', 'your', 'pages', 'on', 'Google', 'Sites', 'and', 'share', 'the', 'site', 'http', 'www', 'google', 'com', 'support', 'sites', 'bin', 'answer', 'py', 'hl', 'en',

In [82]:
def textParse(bigString): #takes a  big string and parses into list of strings 
            import re
            listOfTokens = re.split(r'\W*', bigString)
            return [tok.lower() for tok in listOfTokens if len(tok) > 2] #eliminates the strings less than 2 characters 
        
def spamTest():
            docList=[]; classList = []; fullText =[]
            for i in range(1,26): #since there are 25 files 
                wordList = textParse(open('email/spam/%d.txt' % i,encoding='UTF-8',errors='ignore').read())
                docList.append(wordList)
                fullText.extend(wordList)
                classList.append(1)
                wordList = textParse(open('email/ham/%d.txt' % i,encoding='UTF-8',errors='ignore').read())
                docList.append(wordList)
                fullText.extend(wordList)
                classList.append(0)
            vocabList = createVocabList(docList)
            trainingSet = list(range(50)); testSet=[]
            for i in range(10): #select 10 random files for testset
                randIndex = int(random.uniform(0,len(trainingSet)))
                testSet.append(trainingSet[randIndex])
                del(trainingSet[randIndex]) #remove the file added to test set from training set 
            trainMat=[]; trainClasses = []
            for docIndex in trainingSet:
                trainMat.append(setOfWords2Vec(vocabList, docList[docIndex])) #convert training emails to vectors 
                trainClasses.append(classList[docIndex]) #store the class category 
            p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses)) #calc probabilities needed for classification
            errorCount = 0
            for docIndex in testSet: #check how email is classified from the test set 
                wordVector = setOfWords2Vec(vocabList, docList[docIndex])
                if classifyNB(array(wordVector),p0V,p1V,pSpam) !=classList[docIndex]:
                    errorCount += 1 #increment error count if it is not classified correctly 
            print('the error rate is: ',float(errorCount)/len(testSet)) #calc error percentage 

In [83]:
spamTest()

the error rate is:  0.0


  return _compile(pattern, flags).split(string, maxsplit)


In [92]:
spamTest()

the error rate is:  0.0


  return _compile(pattern, flags).split(string, maxsplit)


Example: using naïve Bayes to find locally used words
1. Collect: Collect from RSS feeds. We’ll need to build an interface to the RSS feeds.
2. Prepare: Parse text into token vectors.
3. Analyze: Inspect the tokens to make sure parsing was done correctly.
4. Train: Use trainNB0() that we created earlier.
5. Test: We’ll look at the error rate to make sure this is actually working. We can make modifications to the tokenizer to improve the error rate and results.
6. Use: We’ll build a complete program to wrap everything together. It will display the most common words given in two RSS feeds.

In [93]:
import feedparser

In [94]:
ny=feedparser.parse('http://newyork.craigslist.org/stp/index.rss')


In [95]:
ny['entries']

[{'dc_source': 'http://newyork.craigslist.org/que/stp/6077242127.html',
  'dc_type': 'text',
  'id': 'http://newyork.craigslist.org/que/stp/6077242127.html',
  'language': 'en-us',
  'link': 'http://newyork.craigslist.org/que/stp/6077242127.html',
  'links': [{'href': 'http://newyork.craigslist.org/que/stp/6077242127.html',
    'rel': 'alternate',
    'type': 'text/html'}],
  'published': '2017-04-06T18:11:32-04:00',
  'published_parsed': time.struct_time(tm_year=2017, tm_mon=4, tm_mday=6, tm_hour=22, tm_min=11, tm_sec=32, tm_wday=3, tm_yday=96, tm_isdst=0),
  'rights': 'copyright 2017 craiglist',
  'rights_detail': {'base': 'https://newyork.craigslist.org/search/stp?format=rss',
   'language': None,
   'type': 'text/plain',
   'value': 'copyright 2017 craiglist'},
  'summary': 'Open to making a buddy. Anybody drive like this? Enjoy adrenaline? Hit me up. So we can discuss. Can hopefully make it worth your time.',
  'summary_detail': {'base': 'https://newyork.craigslist.org/search/stp?

In [96]:
len(ny['entries'])

25

In [110]:
def calcMostFreq(vocabList,fullText):
    import operator
    freqDict = {}
    for token in vocabList:
         freqDict[token]=fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1),reverse=True)
    return sortedFreq[:30]

def localWords(feed1,feed0):
    import feedparser
    docList=[]; classList = []; fullText =[]
    minLen = min(len(feed1['entries']),len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList,fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])
    trainingSet = list(range(2*minLen)); testSet=[]
    for i in range(20):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat=[]; trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ',float(errorCount)/len(testSet))
    return vocabList,p0V,p1V

In [105]:
sf=feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')

In [106]:
sf['entries']

[{'dc_source': 'http://sfbay.craigslist.org/sfc/stp/6062120100.html',
  'dc_type': 'text',
  'enc_enclosure': {'resource': 'https://images.craigslist.org/00K0K_47YOfabxKOa_300x300.jpg',
   'type': 'image/jpeg'},
  'id': 'http://sfbay.craigslist.org/sfc/stp/6062120100.html',
  'language': 'en-us',
  'link': 'http://sfbay.craigslist.org/sfc/stp/6062120100.html',
  'links': [{'href': 'http://sfbay.craigslist.org/sfc/stp/6062120100.html',
    'rel': 'alternate',
    'type': 'text/html'}],
  'published': '2017-04-06T21:04:15-07:00',
  'published_parsed': time.struct_time(tm_year=2017, tm_mon=4, tm_mday=7, tm_hour=4, tm_min=4, tm_sec=15, tm_wday=4, tm_yday=97, tm_isdst=0),
  'rights': 'copyright 2017 craiglist',
  'rights_detail': {'base': 'https://sfbay.craigslist.org/search/stp?format=rss',
   'language': None,
   'type': 'text/plain',
   'value': 'copyright 2017 craiglist'},
  'summary': 'Hello, \nI am looking to find a buddy to keep me motivated. \nYou should be fit / athlete, around my 

In [111]:
vocabList,pSF,pNY=localWords(ny,sf)

the error rate is:  0.2


  return _compile(pattern, flags).split(string, maxsplit)


In [112]:
vocabList,pSF,pNY=localWords(ny,sf)

the error rate is:  0.4


  return _compile(pattern, flags).split(string, maxsplit)


In [114]:
def getTopWords(ny,sf):
    import operator
    vocabList,p0V,p1V=localWords(ny,sf)
    topNY=[]; topSF=[]
    for i in range(len(p0V)):
        if p0V[i] > -6.0 : topSF.append((vocabList[i],p0V[i]))
        if p1V[i] > -6.0 : topNY.append((vocabList[i],p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY **")
    for item in sortedNY:
        print(item[0])

In [115]:
getTopWords(ny,sf)

the error rate is:  0.4
SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**
perhaps
there
guy
friends
going
hello
single
interested
hang
around
serious
fit
nice
more
email
bay
professional
over
easy
friendship
how
side
been
pretty
some
post
text
send
thanks
feet
sense
life
drama
athlete
young
near
eye
visiting
curious
interests
each
emails
only
cuddled
need
great
mam
posting
supporting
brunch
point
walks
put
happy
kinky
female
convo
joke
its
nothing
exercising
friend
always
transient
dinner
untrained
work
person
lost
70s
couple
thought
desires
musica
richmond
wish
whose
waste
respect
see
show
any
hippies
santa
erotic
tall
once
platonic
running
open
path
doing
runs
bot
hear
develop
almost
seem
bike
ever
cool
hopefully
simply
hoping
real
other
explore
gym
now
kissing
away
fantasy
nature
apa
mixed
educated
pastels
also
seems
sexual
amp
keep
clubs
approximately
sad
host
live
pornographic
motivated
try
made
per
before
maf
mature
hiking
told
giving
draw
chatting
making
hidden
should
fre

  return _compile(pattern, flags).split(string, maxsplit)
