In [1]:
import numpy as np
import matplotlib.pyplot as plt

% matplotlib inline

In [2]:
def loadDataSet():
    """Return the toy message-board corpus used throughout the notebook.

    Returns:
        (postingList, classVec): a list of six tokenized posts and a parallel
        list of labels, where 1 marks an abusive post and 0 a normal one.
    """
    # Each entry pairs a label (1 = abusive, 0 = not) with the post's tokens.
    labelledPosts = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    posts = [tokens for _label, tokens in labelledPosts]
    labels = [label for label, _tokens in labelledPosts]
    return posts, labels

def createVocabList(dataSet):
    """Collect every distinct token that appears in any document.

    Args:
        dataSet: iterable of documents, each an iterable of tokens.

    Returns:
        A list of the unique tokens. It is built from a set, so the order
        is arbitrary.
    """
    uniqueTokens = set()
    for tokens in dataSet:
        uniqueTokens.update(tokens)
    return list(uniqueTokens)

def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over the vocabulary.

    Args:
        vocabList: list of vocabulary tokens; position i of the result
            corresponds to vocabList[i].
        inputSet: iterable of tokens making up the document.

    Returns:
        A list of len(vocabList) ints, 1 where the token occurs in the
        document and 0 elsewhere. Tokens missing from the vocabulary are
        reported on stdout and otherwise ignored.
    """
    presence = [0 for _ in vocabList]
    for token in inputSet:
        if token not in vocabList:
            print ("the word: %s is not in my Vocabulary!" % token)
            continue
        presence[vocabList.index(token)] = 1
    return presence

In [3]:
# Load the toy corpus and build the vocabulary from all of its posts.
postingList, classVec = loadDataSet()
vocabSet = createVocabList(postingList)  # note: a list, despite the name
print(vocabSet)
# Encode the first post as a 0/1 presence vector aligned with the vocabulary.
vecDoc1 = setOfWords2Vec(vocabSet, postingList[0])
print(vecDoc1)

['cute', 'love', 'help', 'garbage', 'quit', 'I', 'problems', 'is', 'park', 'stop', 'flea', 'dalmation', 'licks', 'food', 'not', 'him', 'buying', 'posting', 'has', 'worthless', 'ate', 'to', 'maybe', 'please', 'dog', 'how', 'stupid', 'so', 'take', 'mr', 'steak', 'my']
[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1]


## NB classifier                                                                                        

### train

In [4]:
def trainNB2Class(trainMatrix, trainClass):
    """Fit a two-class naive Bayes model and return log-likelihood vectors.

    Args:
        trainMatrix: sequence of equal-length word vectors, one per document.
        trainClass: parallel sequence of labels; a label equal to 1 counts
            towards class 1, anything else towards class 0.

    Returns:
        (p0Vect, p1Vect, pAbusive): per-word log conditional probabilities
        for class 0 and class 1, and the prior of class 1 computed as
        sum(trainClass) / number of documents.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    priorClass1 = sum(trainClass) / float(docCount)

    # Slot 0 accumulates class 0, slot 1 accumulates class 1.
    # Word counts start at 1 and totals at 2.0 so that a word never seen in a
    # class does not force the product p(w0|c)p(w1|c)...p(wn|c) to zero.
    # (Unsmoothed variant, handy for debugging: start from np.zeros / 0.0 and
    # return the plain ratios instead of their logs.)
    wordCounts = [np.ones(vocabSize), np.ones(vocabSize)]
    totals = [2.0, 2.0]

    for idx, row in enumerate(trainMatrix):
        slot = 1 if trainClass[idx] == 1 else 0
        wordCounts[slot] += row
        totals[slot] += sum(row)

    # Work in log space: multiplying many small probabilities would underflow.
    logP0 = np.log(wordCounts[0] / totals[0])
    logP1 = np.log(wordCounts[1] / totals[1])

    return logP0, logP1, priorClass1

# The unsmoothed, non-log variant computes the actual probabilities, which is useful for debugging.
# The smoothed, log-space version is the one to use in real cases.

In [5]:
# Rebuild the toy corpus and vocabulary, then encode every post as a 0/1 word vector.
postingList, classVec = loadDataSet()
vocabList = createVocabList(postingList)
trainMatrix = []
for doc in postingList:
    trainMatrix.append(setOfWords2Vec(vocabList, doc))

# Fit the two-class model and print the per-word log-probabilities and the class-1 prior.
p0Vect, p1Vect, pAbusive = trainNB2Class(trainMatrix, classVec)
print(p0Vect)
print(p1Vect)
print(pAbusive)

[-2.56494936 -2.56494936 -2.56494936 -3.25809654 -3.25809654 -2.56494936
 -2.56494936 -2.56494936 -3.25809654 -2.56494936 -2.56494936 -2.56494936
 -2.56494936 -3.25809654 -3.25809654 -2.15948425 -3.25809654 -3.25809654
 -2.56494936 -3.25809654 -2.56494936 -2.56494936 -3.25809654 -2.56494936
 -2.56494936 -2.56494936 -3.25809654 -2.56494936 -3.25809654 -2.56494936
 -2.56494936 -1.87180218]
[-3.04452244 -3.04452244 -3.04452244 -2.35137526 -2.35137526 -3.04452244
 -3.04452244 -3.04452244 -2.35137526 -2.35137526 -3.04452244 -3.04452244
 -3.04452244 -2.35137526 -2.35137526 -2.35137526 -2.35137526 -2.35137526
 -3.04452244 -1.94591015 -3.04452244 -2.35137526 -2.35137526 -3.04452244
 -1.94591015 -3.04452244 -1.65822808 -3.04452244 -2.35137526 -3.04452244
 -3.04452244 -3.04452244]
0.5


### classify

In [6]:
def classifyNB(instVec, p0Vect, p1Vect, pClass1):
    """Pick the more likely of two classes for one word vector.

    Args:
        instVec: word vector of the document to classify (numpy array).
        p0Vect: per-word log-probabilities for class 0.
        p1Vect: per-word log-probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 if the class-1 log score is strictly greater than the class-0
        score, otherwise 0 (ties go to class 0).
    """
    # log(p(w|c) * p(c)) = sum_i w_i * log p(w_i|c) + log p(c)
    scoreClass1 = sum(instVec * p1Vect) + np.log(pClass1)
    scoreClass0 = sum(instVec * p0Vect) + np.log(1-pClass1)
    return 1 if scoreClass1 > scoreClass0 else 0

In [7]:
def testingNB(): # convenience function to reduce input operation
    """Train on the toy corpus and print predictions for two sample posts.

    Builds the vocabulary and set-of-words training matrix from
    loadDataSet(), fits trainNB2Class, then classifies a benign and an
    abusive example and prints each entry with its predicted label.
    """
    docList, classVec = loadDataSet()
    vocabList = createVocabList(docList)
    docMatrix = [setOfWords2Vec(vocabList, doc) for doc in docList]

    p0Vect, p1Vect, pClass1 = trainNB2Class(docMatrix, classVec)

    sampleEntries = (
        ['love', 'my', 'dalmation'],
        ['stupid', 'garbage'],
    )
    for testEntry in sampleEntries:
        testVect = np.array(setOfWords2Vec(vocabList, testEntry))
        predLabel = classifyNB(testVect, p0Vect, p1Vect, pClass1)
        print(testEntry, 'classified as: ', predLabel)

# Run the toy-corpus smoke test; it prints one prediction per sample entry.
testingNB()

(['love', 'my', 'dalmation'], 'classified as: ', 0)
(['stupid', 'garbage'], 'classified as: ', 1)


In [8]:
def bagOfWord2Vect(vocabList, doc):
    """Encode a document as a word-count vector over the vocabulary.

    Args:
        vocabList: list of vocabulary tokens; position i of the result
            corresponds to vocabList[i].
        doc: iterable of tokens.

    Returns:
        A list of len(vocabList) ints holding how many times each
        vocabulary token occurs in doc. Tokens outside the vocabulary are
        silently skipped.
    """
    counts = [0 for _ in vocabList]
    for token in doc:
        if token not in vocabList:
            continue
        slot = vocabList.index(token)
        counts[slot] = counts[slot] + 1
    return counts

## A real case : filter spam email

In [9]:
def textParse(bigString):
    """Split raw text into lowercase word tokens longer than two characters.

    Args:
        bigString: the text to tokenize.

    Returns:
        A list of lowercased tokens, obtained by splitting on runs of
        non-word characters and dropping tokens of length <= 2.
    """
    import re
    # Split on one-or-more non-word characters. The previous pattern r'\W*'
    # can match the empty string; since Python 3.7 re.split honours empty
    # matches and would break the text into single characters, all of which
    # the length filter below would then discard.
    listOfTokens = re.split(r'\W+', bigString)
    return [ token.lower() for token in listOfTokens if len(token) > 2 ]

def spamTest():
    """Train and evaluate the naive Bayes spam filter on a random hold-out split.

    Reads data/email/spam/<i>.txt and data/email/ham/<i>.txt for i = 1..25,
    tokenizes each file with textParse, holds out 10 randomly chosen
    documents for testing, trains trainNB2Class on the rest using
    set-of-words vectors, and prints the hold-out error rate.

    The split is drawn from NumPy's global random state, so the reported
    error rate varies between runs unless that state is seeded beforehand.
    """
    dirName = "data/email/"
    docList = []
    classList = []

    for i in range(1, 26):
        # Spam first, then ham, so documents alternate spam/ham in docList.
        for subDir, label in (("spam/", 1), ("ham/", 0)):
            # The context manager closes each file promptly instead of
            # leaving 50 handles for the garbage collector.
            with open(dirName + subDir + "{}.txt".format(i)) as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)

    vocabList = createVocabList(docList)

    # Randomly split the document indices into a training and a test set.
    # list(...) is required: entries are deleted below, and a Python 3 range
    # object does not support item deletion.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int( np.random.uniform(0, len(trainingSet)) )
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    trainMatrix = []
    trainClass = []
    for docIndex in trainingSet:
        trainMatrix.append( setOfWords2Vec(vocabList, docList[docIndex]) )
        trainClass.append(classList[docIndex])
    p0Vect, p1Vect, pSpam = trainNB2Class(trainMatrix, trainClass)

    errorCount = 0
    for docIndex in testSet:
        testVect = np.array(setOfWords2Vec(vocabList, docList[docIndex]))
        preLabel = classifyNB(testVect, p0Vect, p1Vect, pSpam)
        if preLabel != classList[docIndex]:
            errorCount += 1
    print("the error rate is : ", float(errorCount) / len(testSet))

In [10]:
# Run one random train/test split of the spam filter; prints the hold-out error rate.
spamTest()

('the error rate is : ', 0.0)


## A real case : infer regional tendencies from advertisements

In [11]:
import feedparser

In [12]:
# Fetch and parse the New York craigslist "stp" RSS feed (requires network access).
ny = feedparser.parse("http://newyork.craigslist.org/stp/index.rss")

In [13]:
# Dump the parsed feed entries, then display how many entries were fetched.
print(ny['entries'])
len(ny['entries'])

[{'dc_source': u'http://newyork.craigslist.org/que/stp/6131968239.html', 'summary_detail': {'base': u'https://newyork.craigslist.org/search/stp?format=rss', 'type': u'text/html', 'value': u'I am an Italian man looking for a woman for friendship.... \nI am polite good manners! \nNO DRUGS AND SMOKE!! \n:)', 'language': None}, 'published_parsed': time.struct_time(tm_year=2017, tm_mon=5, tm_mday=15, tm_hour=2, tm_min=20, tm_sec=28, tm_wday=0, tm_yday=135, tm_isdst=0), 'updated_parsed': time.struct_time(tm_year=2017, tm_mon=5, tm_mday=15, tm_hour=2, tm_min=20, tm_sec=28, tm_wday=0, tm_yday=135, tm_isdst=0), 'links': [{'href': u'http://newyork.craigslist.org/que/stp/6131968239.html', 'type': u'text/html', 'rel': u'alternate'}], 'title': u'Italian Man Looking for a Woman for frienship..... - m4w (Queens)', 'rights': u'copyright 2017 craiglist', 'updated': u'2017-05-14T22:20:28-04:00', 'summary': u'I am an Italian man looking for a woman for friendship.... \nI am polite good manners! \nNO DRUG

25

In [14]:
def getTopKFreqWords(vocabList, fullText, k):
    """Return the k vocabulary words that occur most often in fullText.

    Args:
        vocabList: iterable of candidate tokens (duplicates are collapsed).
        fullText: list of all tokens of the corpus, with repetitions.
        k: maximum number of entries to return.

    Returns:
        A list of at most k (token, count) tuples sorted by descending
        count. Vocabulary tokens absent from fullText get a count of 0.
    """
    import operator
    from collections import Counter

    # Count the corpus once instead of calling fullText.count() for every
    # vocabulary word, which rescans the whole text each time.
    occurrences = Counter(fullText)
    freDict = {}
    for token in vocabList:
        freDict[token] = occurrences[token]
    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
    sortedFreDict = sorted(freDict.items(), key = operator.itemgetter(1), reverse = True)
    return sortedFreDict[:k]

def localWords(feed1, feed0, isRemoveTopKFreqWords):
    """Train and evaluate a two-class naive Bayes model on two RSS feeds.

    Entries of feed1 are labelled class 1 and entries of feed0 class 0; the
    same number of entries (the shorter feed's length) is taken from each.
    Twenty randomly chosen documents are held out for testing, the rest are
    used for training with bag-of-words vectors, and the hold-out error rate
    is printed.

    Args:
        feed1: parsed feed whose feed1['entries'][i]['summary'] is text;
            labelled class 1.
        feed0: parsed feed with the same structure; labelled class 0.
        isRemoveTopKFreqWords: when truthy, the 30 most frequent words are
            removed from the vocabulary before training.

    Returns:
        (vocabList, top30Words, p0Vect, p1Vect, pClass1): the vocabulary
        actually used, the 30 most frequent (word, count) pairs, the
        per-word log-probability vectors for class 0 and class 1, and the
        class-1 prior.

    The split is drawn from NumPy's global random state, so results vary
    between calls unless that state is seeded beforehand.
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # Class-1 entry first, then class-0, so labels alternate 1, 0, 1, 0, ...
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    vocabList = createVocabList(docList)

    top30Words = getTopKFreqWords(vocabList, fullText, 30)
    if isRemoveTopKFreqWords:
        for word, _count in top30Words:
            if word in vocabList:
                vocabList.remove(word)

    # list(...) is required: entries are deleted below, and a Python 3 range
    # object does not support item deletion.
    trainingSet = list(range(2*minLen))
    testSet = []
    for _ in range(20):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        # Store the document id found at the random position, not the
        # position itself; otherwise held-out documents can repeat and can
        # still be present in the training set.
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        doc = docList[docIndex]
        trainMat.append(bagOfWord2Vect(vocabList, doc))
        trainClasses.append(classList[docIndex])

    p0Vect, p1Vect, pClass1 = trainNB2Class(trainMat, trainClasses)

    errCount = 0.0
    for docIndex in testSet:
        doc = docList[docIndex]
        wordVect = np.array(bagOfWord2Vect(vocabList, doc))
        preLabel = classifyNB(wordVect, p0Vect, p1Vect, pClass1)
        if preLabel != classList[docIndex]:
            errCount += 1

    print ("the error rate is: ", errCount / len(testSet))

    return vocabList, top30Words, p0Vect, p1Vect, pClass1

In [15]:
# Fetch and parse the New York and SF Bay craigslist "stp" RSS feeds (requires network access).
ny = feedparser.parse("http://newyork.craigslist.org/stp/index.rss")
sf = feedparser.parse("http://sfbay.craigslist.org/stp/index.rss")

In [16]:
# Ten random splits WITH the 30 most frequent words removed from the vocabulary.
# localWords prints the error rate on every call; the returned values are
# overwritten each iteration, so only the last run's top words are printed.
for i in range(10):
    vocabList,topkWords, p0Vect, p1Vect, pClass1 = localWords(ny, sf, True)
print(topkWords)

('the error rate is: ', 0.35)
('the error rate is: ', 0.15)
('the error rate is: ', 0.25)
('the error rate is: ', 0.15)
('the error rate is: ', 0.1)
('the error rate is: ', 0.3)
('the error rate is: ', 0.15)
('the error rate is: ', 0.15)
('the error rate is: ', 0.2)
('the error rate is: ', 0.35)
[(u'and', 63), (u'for', 48), (u'looking', 31), (u'the', 30), (u'you', 27), (u'not', 17), (u'who', 14), (u'have', 14), (u'with', 13), (u'out', 11), (u'get', 11), (u'are', 11), (u'like', 11), (u'been', 10), (u'guy', 9), (u'single', 9), (u'can', 8), (u'someone', 8), (u'going', 8), (u'good', 8), (u'just', 8), (u'time', 8), (u'from', 7), (u'this', 7), (u'there', 7), (u'please', 7), (u'know', 7), (u'your', 7), (u'chat', 7), (u'interested', 7)]


In [17]:
# Ten random splits with the full vocabulary (most frequent words kept), for comparison.
# As above, only the values returned by the last run survive the loop.
for i in range(10):
    vocabList,topkWords, p0Vect, p1Vect, pClass1 = localWords(ny, sf, False)
print(topkWords)

('the error rate is: ', 0.05)
('the error rate is: ', 0.2)
('the error rate is: ', 0.1)
('the error rate is: ', 0.1)
('the error rate is: ', 0.15)
('the error rate is: ', 0.2)
('the error rate is: ', 0.0)
('the error rate is: ', 0.1)
('the error rate is: ', 0.15)
('the error rate is: ', 0.05)
[(u'and', 63), (u'for', 48), (u'looking', 31), (u'the', 30), (u'you', 27), (u'not', 17), (u'who', 14), (u'have', 14), (u'with', 13), (u'out', 11), (u'get', 11), (u'are', 11), (u'like', 11), (u'been', 10), (u'guy', 9), (u'single', 9), (u'can', 8), (u'someone', 8), (u'going', 8), (u'good', 8), (u'just', 8), (u'time', 8), (u'from', 7), (u'this', 7), (u'there', 7), (u'please', 7), (u'know', 7), (u'your', 7), (u'chat', 7), (u'interested', 7)]


In [18]:
## analyze the data

def getTopWords(ny, sf):
    """Print the 20 highest-scoring vocabulary words for each feed.

    Trains via localWords(ny, sf, True), which labels ny as class 1 and sf
    as class 0 and removes the 30 most frequent words. Words are then ranked
    by descending log-probability: the class-0 ranking is printed under an
    "SF" banner, followed by the class-1 ranking under an "NY" banner.

    Args:
        ny: parsed feed used as class 1.
        sf: parsed feed used as class 0.
    """
    vocabList, _topkWords, p0Vect, p1Vect, _pClass1 = localWords(ny, sf, True)

    # (banner text, per-word log-probabilities), printed in this order.
    sections = (("SF**", p0Vect), ("NY**", p1Vect))
    for banner, logProbs in sections:
        scored = [(vocabList[i], logProbs[i]) for i in range(len(p0Vect))]
        scored.sort(key = lambda pair : pair[1], reverse = True)
        print(banner*10)
        for word, _score in scored[:20]:
            print(word)


In [19]:
# Train once on the two feeds and print the top 20 words for SF, then for NY.
getTopWords(ny, sf)

('the error rate is: ', 0.2)
SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**
dancing
nice
enjoy
awesome
fit
anyone
between
some
pass
here
active
would
want
dinner
maybe
personal
white
also
don
watch
NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**
new
friendship
chill
amp
easy
into
friends
all
smoke
meet
male
man
seeking
girl
friend
don
clean
make
hang
italian
