# 基于概率论的分类方法：朴素贝叶斯
1. 优点：在数据较少的情况下仍然有效，可以处理多类别问题
2. 缺点：对于输入数据的准备方式较为敏感
3. 适用数据类型：标称型数据

## 朴素贝叶斯的一般过程
1. 收集数据：可以使用任何方法。本章使用RSS源
2. 准备数据：需要数值型或者布尔型数据
3. 分析数据：有大量特征时，绘制特征作用不大，此时使用直方图效果更好
4. 训练算法：计算不同的独立特征的条件概率
5. 测试算法：计算错误率
6. 使用算法：一个常见的朴素贝叶斯应用是文档分类，可以在任意的分类场景中使用朴素贝叶斯分类器，不一定非要是文本

## 使用Python进行文本分类
### 1. 准备数据：从文本中构建词向量

In [1]:
# Helpers for converting word lists into vectors
def loadDataSet():
    """Return the small hand-made corpus used throughout this notebook.

    Returns:
        A pair ``(postingList, classVec)``: ``postingList`` is a list of six
        tokenized posts (each a list of words) and ``classVec`` holds the
        matching 0/1 class label for every post.
    """
    # Each entry pairs the raw text of a post with its class label.
    labelledPosts = [
        ('my dog has flea problems help please', 0),
        ('maybe not take him to dog park stupid', 1),
        ('my dalmation is so cute I love him', 0),
        ('stop posting stupid worthless garbage', 1),
        ('mr licks ate my steak how to stop him', 0),
        ('quit buying worthless dog food stupid', 1),
    ]
    postingList = [text.split() for text, _ in labelledPosts]
    classVec = [label for _, label in labelledPosts]
    return postingList,classVec

# Build a list of the distinct words that occur anywhere in the documents
def createVocabList(dataSet):
    """Return every distinct word found in ``dataSet`` as a list.

    Args:
        dataSet: iterable of documents, each an iterable of hashable words.

    Returns:
        A list with one entry per distinct word. The list comes from a set,
        so the order of its elements is not specified.
    """
    # The union of all documents, starting from an empty set, collapses duplicates.
    return list(set().union(*dataSet))

# Set-of-words model: given the vocabulary and a document, flag with 1 every
# vocabulary position whose word occurs in the document.
def setOfWords2Vec(vocabList,inputSet):
    """Encode ``inputSet`` as a 0/1 presence vector over ``vocabList``.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up one document.

    Returns:
        A list of ``len(vocabList)`` integers, 1 where the word occurs in the
        document and 0 elsewhere. Words missing from the vocabulary are
        reported with a printed message and otherwise ignored.
    """
    presence = [0] * len(vocabList)
    for token in inputSet:
        if token not in vocabList:
            print("the word:%s is not in my Vocabulary!" % token)
            continue
        # Repeated occurrences just set the same flag again.
        presence[vocabList.index(token)] = 1
    return presence

In [4]:
# Load the sample posts with their labels and build the vocabulary from them.
listOPosts,listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
# Display the vocabulary (notebook cell output).
myVocabList

['ate',
 'mr',
 'please',
 'stop',
 'quit',
 'I',
 'is',
 'stupid',
 'steak',
 'has',
 'how',
 'so',
 'him',
 'park',
 'flea',
 'problems',
 'help',
 'posting',
 'buying',
 'dalmation',
 'my',
 'garbage',
 'cute',
 'worthless',
 'take',
 'licks',
 'dog',
 'food',
 'maybe',
 'to',
 'not',
 'love']

In [14]:
# 朴素贝叶斯分类器训练函数
from numpy import * 
def trainNB0(trainMatrix,trainCategory):
    """Estimate naive Bayes parameters from word vectors (unsmoothed version).

    Args:
        trainMatrix: sequence of documents, each a numeric word vector of the
            same length.
        trainCategory: sequence of class labels, one per document; a label
            equal to 1 goes to class 1, anything else to class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the per-word relative frequencies
        within class 0 and within class 1, and the fraction of documents
        labelled 1. No smoothing is applied, so words never seen in a class
        get probability 0.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    # Class-1 prior: share of the training documents labelled 1.
    pAbusive = sum(trainCategory) / float(docCount)
    # Per-class word-count vectors and per-class total word counts.
    counts0, counts1 = zeros(vocabSize), zeros(vocabSize)
    total0 = total1 = 0.0
    for idx, row in enumerate(trainMatrix):
        if trainCategory[idx] != 1:
            counts0 += row
            total0 += sum(row)
        else:
            counts1 += row
            total1 += sum(row)
    return counts0 / total0, counts1 / total1, pAbusive

In [22]:
# Convert every post into a set-of-words vector over the vocabulary.
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
# Display the resulting training matrix (notebook cell output).
trainMat
# sum(trainMat[0])

[[0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0],
 [0,
  0,
  1,
  1,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0],
 [0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  0]]

In [17]:
# Train on the word vectors and their labels.
p0V,p1V,pAb = trainNB0(trainMat,listClasses)
# Probability of the abusive class
pAb

0.5

In [18]:
# Probability of each word within the non-abusive documents, i.e. the
# conditional probabilities for the non-abusive class
p0V

array([0.04166667, 0.04166667, 0.        , 0.04166667, 0.125     ,
       0.        , 0.04166667, 0.        , 0.        , 0.        ,
       0.04166667, 0.04166667, 0.04166667, 0.04166667, 0.04166667,
       0.04166667, 0.        , 0.08333333, 0.04166667, 0.        ,
       0.        , 0.04166667, 0.04166667, 0.04166667, 0.        ,
       0.04166667, 0.04166667, 0.        , 0.04166667, 0.        ,
       0.04166667, 0.04166667])

In [23]:
# Probability of each word within the abusive documents, i.e. the
# conditional probabilities for the abusive class
p1V

array([0.        , 0.        , 0.05263158, 0.10526316, 0.        ,
       0.05263158, 0.        , 0.05263158, 0.05263158, 0.05263158,
       0.        , 0.05263158, 0.        , 0.        , 0.        ,
       0.        , 0.05263158, 0.05263158, 0.        , 0.05263158,
       0.15789474, 0.        , 0.        , 0.        , 0.10526316,
       0.05263158, 0.        , 0.05263158, 0.        , 0.05263158,
       0.        , 0.        ])

## 测试算法：根据现实情况修改分类器
1. 利用贝叶斯分类器对文档进行分类时，要计算多个概率的乘积以获得文档属于某个类别的概率  
   即计算p(w0|1)p(w1|1)p(w2|1)。如果其中一个概率值为0，那么最后的乘积也为0。为降低这种影响，  
   可以将所有词的出现数初始化为1，并将分母初始化为2。
2. 下溢出：由于太多很小的数相乘造成的(在Python中将很多很小的数相乘，结果四舍五入后会得到0)。  
   解决方法：对乘积取自然对数。在代数中有ln(a*b) = ln(a)+ln(b)，通过对数可以避免下溢或者浮点数舍入导致的错误，  
   并且f(x)与ln(f(x))两个函数的增减性一致，在相同点上取到极值

In [2]:
from numpy import * 
# Revised trainer: smoothed counts and log-probabilities
def trainNB0(trainMatrix,trainCategory):
    """Estimate naive Bayes parameters with smoothing, in log space.

    Args:
        trainMatrix: sequence of documents, each a numeric word vector of the
            same length.
        trainCategory: sequence of class labels, one per document; a label
            equal to 1 goes to class 1, anything else to class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: natural logs of the smoothed per-word
        frequencies for class 0 and class 1, and the fraction of documents
        labelled 1.
    """
    docCount = len(trainMatrix)
    vocabSize = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(docCount)
    # Index 0 / 1 of these lists holds the statistics of class 0 / class 1.
    # Every word count starts at 1 and every denominator at 2, so that no
    # word ends up with probability 0 in either class.
    wordCounts = [ones(vocabSize), ones(vocabSize)]
    wordTotals = [2.0, 2.0]
    for docIdx in range(docCount):
        cls = 1 if trainCategory[docIdx] == 1 else 0
        wordCounts[cls] += trainMatrix[docIdx]
        wordTotals[cls] += sum(trainMatrix[docIdx])
    # Logs turn later products of many small numbers into sums, which avoids
    # underflow and rounding to 0.
    p1Vect = log(wordCounts[1] / wordTotals[1])
    p0Vect = log(wordCounts[0] / wordTotals[0])
    return p0Vect,p1Vect,pAbusive

In [3]:
# Naive Bayes classification function
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    """Pick the more likely class for one document vector.

    Args:
        vec2Classify: word vector of the document to classify.
        p0Vec: per-word log-probabilities for class 0.
        p1Vec: per-word log-probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    # Score = log prior + sum of the log-likelihoods of the words present.
    logScore1 = log(pClass1) + sum(vec2Classify * p1Vec)
    logScore0 = log(1.0 - pClass1) + sum(vec2Classify * p0Vec)
    return 1 if logScore1 > logScore0 else 0
    
def testingNB():
    """Train on the sample posts and print the class of two test entries.

    Builds the vocabulary and set-of-words training matrix from
    ``loadDataSet()``, trains with ``trainNB0`` and prints, for each of the
    two fixed test entries, the entry followed by the label returned by
    ``classifyNB``.
    """
    listOPosts,listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList,post) for post in listOPosts]
    p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses))
    for testEntry in (['love','my','dalmation'],['stupid','garbage']):
        thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
        print(testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V,pAb))

In [4]:
# thisDoc = array(setOfWords2Vec(myVocabList,['love','my','dalmation']))
# Run the end-to-end check; it prints the predicted class of two sample entries.
testingNB()

['love', 'my', 'dalmation'] classified as:  0
['stupid', 'garbage'] classified as:  1


## 准备数据：文档词袋模型
1. 词集模型：将每个词的出现与否作为一个特征
2. 词袋模型：如果一个词在文档中出现不止一次，这意味着包含了“该词是否出现在文档中”所不能表达的某种信息

In [5]:
## Naive Bayes bag-of-words model
def bagOfWords2VecMN(vocabList,inputSet):
    """Encode ``inputSet`` as a vector of word counts over ``vocabList``.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up one document.

    Returns:
        A list of ``len(vocabList)`` integers giving how many times each
        vocabulary word occurs in the document. Words missing from the
        vocabulary are silently skipped.
    """
    counts = [0] * len(vocabList)
    for token in inputSet:
        if token in vocabList:
            slot = vocabList.index(token)
            counts[slot] = counts[slot] + 1
    return counts

## 使用朴素贝叶斯过滤垃圾邮件
1. 收集数据：提供文本文件
2. 准备数据：将文本文件解析成词条向量
3. 分析数据：检查词条确保解析的正确性
4. 训练算法：使用我们之前建立的trainNB0()函数
5. 测试算法：使用classifyNB()，并且构建一个新的测试函数来计算文档集的错误率
6. 使用算法：构建一个完整的程序对一组文档进行分类，将错分的文档输出到屏幕上

In [6]:
## Tokenizing text
import re
# Split on runs of one or more non-word characters (anything other than
# letters, digits and underscore). The pattern must be ``\W+`` rather than
# ``\W*``: ``\W*`` can match the empty string, and since Python 3.7 re.split
# also splits on empty matches, which would cut the sentence into single
# characters.
regEx= re.compile(r'\W+')
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
listOfTokens = regEx.split(mySent)
# Display the tokens; the trailing '' comes from the final full stop.
listOfTokens

  


['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon',
 '']

In [10]:
# The empty strings have to be removed: check each token's length and keep
# only the tokens whose length is greater than 0
[tok for tok in listOfTokens if len(tok) > 0]

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

In [7]:
## The first word of the sentence is capitalised; the text is treated purely
## as a bag of words here, so every word must be in a uniform form (lower case)
[tok.lower() for tok in listOfTokens if len(tok) > 0]

['this',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'python',
 'or',
 'm',
 'l',
 'i',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

## 测试算法：使用朴素贝叶斯进行交叉验证
将文本解析器集成到一个完整分类器中

In [7]:
## 文件解析及完整的垃圾邮件测试函数
def textParse(bigString):
    """Split a text into lower-cased tokens longer than two characters.

    Args:
        bigString: the text to tokenize.

    Returns:
        A list of the lower-cased tokens of ``bigString`` that have more than
        two characters, in order of appearance.
    """
    import re
    # Split on runs of non-word characters. ``\W+`` is required: ``\W*`` also
    # matches the empty string, and since Python 3.7 re.split splits on empty
    # matches, leaving only single characters that the length filter below
    # would discard entirely.
    listOfTokens = re.split(r'\W+',bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

def spamTest():
    """Train and evaluate the spam classifier on a random hold-out split.

    Reads ``email/spam/1.txt`` .. ``25.txt`` (label 1) and
    ``email/ham/1.txt`` .. ``25.txt`` (label 0), tokenizes them with
    ``textParse``, holds out 10 randomly chosen documents for testing, trains
    ``trainNB0`` on set-of-words vectors of the remaining documents and
    prints every misclassified document followed by the error rate.

    Returns:
        None; results are printed.
    """
    docList = []
    classList = []
    for i in range(1,26):
        for pathPattern,label in (('email/spam/%d.txt',1),('email/ham/%d.txt',0)):
            # The files are encoded as ISO-8859-1. ``with`` closes each file
            # as soon as it has been read instead of leaking the handle.
            with open(pathPattern % i,encoding='ISO-8859-1') as emailFile:
                docList.append(textParse(emailFile.read()))
            classList.append(label)
    vocabList = createVocabList(docList)
    # In Python 3 range() returns a range object, so build a real list that
    # elements can be deleted from.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # Move one randomly chosen document index from training to test.
        # NOTE(review): ``random`` is whatever the session has bound; with
        # ``from numpy import *`` that is numpy.random — confirm.
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errCount += 1
            print('classification error',docList[docIndex])
    print('the error rate is: ',float(errCount)/len(testSet))

In [8]:
# Run the spam test; the hold-out documents are picked at random inside spamTest.
spamTest()

  return _compile(pattern, flags).split(string, maxsplit)


classification error ['home', 'based', 'business', 'opportunity', 'knocking', 'your', 'door', 'don', 'rude', 'and', 'let', 'this', 'chance', 'you', 'can', 'earn', 'great', 'income', 'and', 'find', 'your', 'financial', 'life', 'transformed', 'learn', 'more', 'here', 'your', 'success', 'work', 'from', 'home', 'finder', 'experts']
the error rate is:  0.1


## 使用朴素贝叶斯分类器从个人广告中获取区域倾向
#### 使用朴素贝叶斯来发现地域相关的用词
1. 收集数据：从RSS源收集内容，这里需要对RSS源构建一个接口
2. 准备数据：将文本文件解析成词条向量
3. 分析数据：检查词条确保解析的正确性
4. 训练算法：使用之前建立的trainNB0()函数
5. 测试算法：观察错误率，确保分类器可用。可以修改切分程序，以降低错误率，提高分类结果
6. 使用算法：构建一个完整的程序，封装所有内容，给定两个RSS源，该程序会显示最常用的公共词

### 收集数据：导入RSS源
需要一个RSS阅读器，Universal Feed Parser是python中最常用的RSS程序库


In [15]:
# Fetch an RSS feed with feedparser and look at its entries.
import feedparser
# NOTE(review): the variable is named ``ny`` but the URL is the sfbay feed — confirm which one was intended.
ny = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
ny['entries']
# Number of entries in the parsed feed.
len(ny['entries'])

0

In [18]:
## RSS源分类器及高频词去除函数
# Count how often each vocabulary word occurs and return the most frequent ones
def calcMostFreq(vocabList,fullText,topN=30):
    """Return the ``topN`` most frequent vocabulary words with their counts.

    Args:
        vocabList: iterable of vocabulary words to rank.
        fullText: list of all tokens (hashable) from all documents.
        topN: how many entries to return; defaults to 30.

    Returns:
        A list of at most ``topN`` ``(word, count)`` tuples sorted by count in
        descending order; words with equal counts keep their ``vocabList``
        order. Vocabulary words absent from ``fullText`` have count 0.
    """
    import operator
    from collections import Counter
    # Count all tokens in a single pass instead of calling fullText.count()
    # once per vocabulary word.
    occurrences = Counter(fullText)
    freqDict = {token: occurrences[token] for token in vocabList}
    sortedFreq = sorted(freqDict.items(),key=operator.itemgetter(1),reverse=True)
    return sortedFreq[:topN]

def localWords(feed1,feed0):
    """Train and test a bag-of-words naive Bayes classifier on two feeds.

    Takes the same number of entries from both feeds, tokenizes each entry's
    ``'summary'`` with ``textParse``, drops the most frequent words reported
    by ``calcMostFreq`` from the vocabulary, holds out 20 random documents
    for testing, trains ``trainNB0`` on the rest and prints the error rate.

    Args:
        feed1: parsed feed whose ``feed1['entries'][i]['summary']`` texts are
            labelled class 1.
        feed0: parsed feed of the same structure, labelled class 0.

    Returns:
        ``(vocabList, p0V, p1V)``: the pruned vocabulary and the two vectors
        returned by ``trainNB0`` for class 0 and class 1.
    """
    docList = []
    classList = []
    fullText = []
    # Use the same number of entries from each feed.
    minLen = min(len(feed1['entries']),len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList,fullText)
    ## Remove the most frequently occurring words from the vocabulary
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    ## Build the training set and the test set
    # NOTE(review): the loop below removes 20 documents, so the two feeds
    # must provide more than 20 documents in total.
    trainingSet = list(range(2*minLen))
    testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        # Delete the index that was just drawn; the original referenced the
        # undefined name ``index`` here, which raised NameError.
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    print("the error rate is:",float(errorCount)/len(testSet))
    return vocabList,p0V,p1V
        

In [20]:
# The RSS feeds could not be accessed
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
# Number of entries in the parsed New York feed.
len(ny['entries'])
# vocabList,pSF,pNY = localWords(ny,sf)

0

## 分析数据：显示地域相关的用词

In [None]:
## Display the most characteristic words of each feed
def getTopWords(ny,sf):
    """Print the words whose log-probability exceeds -6.0 for each feed.

    Trains with ``localWords(ny, sf)``, so ``ny`` is class 1 and ``sf`` is
    class 0, then prints the qualifying words of each class from the highest
    value to the lowest.

    Args:
        ny: parsed feed passed to ``localWords`` as class 1.
        sf: parsed feed passed to ``localWords`` as class 0.

    Returns:
        None; the words are printed.
    """
    vocabList,p0V,p1V = localWords(ny,sf)
    topNY = []
    topSF = []
    for word,logP0,logP1 in zip(vocabList,p0V,p1V):
        # Keep (word, value) tuples whose value is above the fixed threshold.
        if logP0 > -6.0:
            topSF.append((word,logP0))
        if logP1 > -6.0:
            topNY.append((word,logP1))
    # Sort by value in descending order
    sortedSF = sorted(topSF,key=lambda pair:pair[1],reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF")
    for item in sortedSF:
        # Words come out from the highest value to the lowest
        print(item[0])
    sortedNY = sorted(topNY,key=lambda pair:pair[1],reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY")
    for item in sortedNY:
        print(item[0])

## cookie：

In [8]:
# The | operator computes the union of two sets
vocabSet = set(['my','dog','has','flea','problems','help','please'])
document = set(['maybe','not','take','him','to','dog','park','stupid'])
a = vocabSet | document
a

{'dog',
 'flea',
 'has',
 'help',
 'him',
 'maybe',
 'my',
 'not',
 'park',
 'please',
 'problems',
 'stupid',
 'take',
 'to'}

In [22]:
# Scratch check: appending a (vocabulary item, value) tuple to a list
SF=[]
vocabList=[1,2,3]
p0V=['wo']
SF.append((vocabList[0],p0V[0]))
SF

[(1, 'wo')]