In [2]:
import numpy as np

def loadDataSet():
    postingList = [
        ['my','dog','has','flea','problems','help','please'],
        ['maybe','not','take','him','to','dog','park','stupid'],
        ['my','dalmation','is','so','cute','I','love','him'],
        ['stop','posting','stupid','worthless','garbage'],
        ['mr','licks','ate','my','steak','how','to','stop','him'],
        ['quit','buying','worthless','dog','food','stupid']
    ]
    classVec = [0,1,0,1,0,1]
    return postingList, classVec


def createVocabList(dataSet):#统计词表
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet|set(document)
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):#以one-hot的形式将文章转变成向量，出现的单词在向量中为1，未出现的为0
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: {} is not in my Vocabulary!".format(word))
    return returnVec


def trainNB0(trainMatrix,trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)#计算各类的概率分布
    p0Num = np.zeros(numWords)
    p1Num = np.zeros(numWords)
    p0Denom = 2.0  #为了避免除数为0的情况
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += np.sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += np.sum(trainMatrix[i])
    #print(p1Num,p0Num)
    p1Vect = np.log(p1Num/p1Denom)
    #这里其实是有问题的，为了避免小的数乘起来越来越小以至于无法计算的问题，用ln的形式表示
    #但是会有大量的log0的情况出现
    
    # p1V = p1Num.copy()
    # for i in range(len(p1Num)):
    #     if p1Num[i] != 0:
    #         p1Num[i] = np.log(p1Num[i] / p1Denom)
    #     else: p1Num[i] = 0
    p0Vect = np.log(p0Num/p0Denom)
 #    p0V = p0Num.copy()
 #    for i in range(len(p0Num)):
 #        if p0Num[i] != 0:
 #            p0Num[i] = np.log(p0Num[i] / p0Denom)
 #        else: p0Num[i] = 0
    return p0Vect, p1Vect, pAbusive


def classfyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1) #取ln后，用加法代替乘法
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0


def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V ,pAb = trainNB0(np.array(trainMat), np.array(listClasses))
    testEntry = ['love','my','dalmation']
    thisDoc = np.array(setOfWords2Vec(myVocabList,testEntry))
    print(testEntry,'classified as:',classfyNB(thisDoc,p0V,p1V,pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList,testEntry))
    print(testEntry, 'classified as:', classfyNB(thisDoc, p0V, p1V, pAb))


def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W+', bigString)#这里是有问题的，'\W*'会报匹配空串的warning
    return [tok for tok in listOfTokens if len(tok)>2]


def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1,26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())#这里问题也很多，有一个文件23.txt里面多了一个？导致报错
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))#这里也有问题，range函数返回的是range序列，需要转换成list
    testSet = []
    for i in range(10):#这里做了hold-out cross validation
        randIndex = int(np.random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(np.array(trainMat),np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList,docList[docIndex])
        if classfyNB(np.array(wordVector),p0V,p1V,pSpam) !=classList[docIndex]:
            errorCount += 1
    print("the error rate is:",float(errorCount)/len(testSet))


# listOPosts, listClass = loadDataSet()
# myVocabList = createVocabList(listOPosts)
# print(myVocabList)
# print(setOfWords2Vec(myVocabList,listOPosts[0]))
# print(setOfWords2Vec(myVocabList,listOPosts[3]))
spamTest()

the error rate is: 0.5


