## 词表到向量的转换函数

In [1]:
#词表到向量的转换函数
def loadDataSet():
    postingList = [
        ['my','dog','has','flea','problems','help','please'],
        ['mybe','not','take','him','to','dog','park','stupid'],
        ['my','dalmation','is','so','cute','I','love','him'],
        ['stop','posting','stupid','worthless','garbage'],
        ['mr','licks','ate','my','steak','how','to','stop','him'],
        ['qiut','buying','worthless','dog','food','stupid']
    ]
    classVec = [0,1,0,1,0,1]#1代表侮辱性文字，0代表正常言论
    return postingList,classVec

def createVocabList(dataSet):
    vacabSet = set([])
    for ducument in dataSet:
        vacabSet = vacabSet | set(ducument)
    return vacabSet

def setOfWords2Vec(vocabList,inputSet):
    vocabList = list(vocabList)
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" %word)
    return returnVec

In [2]:
listOPosts,listClasses = loadDataSet()

## 创建非重复词表

In [3]:
myVocabList = createVocabList(listOPosts)

In [4]:
# myVocabList

In [5]:
# setOfWords2Vec(myVocabList,listOPosts[3])

## 训练算法：从词向量计算概率

In [6]:
# 朴素贝叶斯分类训练函数
from numpy import *
def trainNBO(trainMatrix,trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    # 初始化概率
    p0Num = zeros(numWords)
    p1Num = zeros(numWords)
    p0Denom = 0.0
    p1Denom = 0.0
    # 向量相加
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = p1Num/p1Denom
    p0Vect = p0Num/p0Denom
    return p0Vect,p1Vect,pAbusive                                

In [7]:
# myVocabList

## 得到信息矩阵

In [8]:
trainMat = []

In [9]:
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList,postinDoc))

In [10]:
# trainMat

In [11]:
p0V,p1V,pAV = trainNBO(trainMat,listClasses)

In [12]:
p0V

array([0.04166667, 0.04166667, 0.04166667, 0.04166667, 0.125     ,
       0.        , 0.04166667, 0.04166667, 0.        , 0.        ,
       0.        , 0.08333333, 0.04166667, 0.04166667, 0.        ,
       0.04166667, 0.        , 0.04166667, 0.04166667, 0.        ,
       0.04166667, 0.        , 0.        , 0.04166667, 0.        ,
       0.04166667, 0.04166667, 0.04166667, 0.04166667, 0.        ,
       0.04166667, 0.04166667])

In [13]:
p1V

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.05263158, 0.        , 0.        , 0.05263158, 0.05263158,
       0.05263158, 0.05263158, 0.        , 0.        , 0.05263158,
       0.05263158, 0.05263158, 0.10526316, 0.        , 0.05263158,
       0.        , 0.05263158, 0.05263158, 0.        , 0.15789474,
       0.        , 0.        , 0.        , 0.05263158, 0.10526316,
       0.        , 0.        ])

In [14]:
pAV

0.5

In [15]:
myVocabList

{'I',
 'ate',
 'buying',
 'cute',
 'dalmation',
 'dog',
 'flea',
 'food',
 'garbage',
 'has',
 'help',
 'him',
 'how',
 'is',
 'licks',
 'love',
 'mr',
 'my',
 'mybe',
 'not',
 'park',
 'please',
 'posting',
 'problems',
 'qiut',
 'so',
 'steak',
 'stop',
 'stupid',
 'take',
 'to',
 'worthless'}

## 朴素贝叶斯分类函数

In [16]:
# 朴素贝叶斯分类函数
from math import log
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    # 这里的相乘指的是两个向量相称的结果，这里的相乘是指对应元素相乘，即先将两个向量中的第一个元素相乘
    # 然后再将第二个元素相乘，依次类推
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

from numpy import array
def testingNB():
    list0posts,listClasses = loadDataSet()
    myVocabList = createVocabList(list0posts)
    trainMat = []
    for postinDoc in list0posts:
        trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
    p0V,p1V,pAb = trainNBO(array(trainMat),array(listClasses))
    testEntry = ['love','my','dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
    print(testEntry,"classify as: ",classifyNB(thisDoc,p0V,p1V,pAb))

    testEntry = ['stupid','garbage']
    thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
    print(testEntry,"classify as: ",classifyNB(thisDoc,p0V,p1V,pAb))

## 测试数据集

In [17]:
testingNB()

['love', 'my', 'dalmation'] classify as:  0
['stupid', 'garbage'] classify as:  1


## 切分数据

In [18]:
mySent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'

In [19]:
import re
regEx = re.compile(r'\W*')
# mySent.split()

In [20]:
listOfTokens = regEx.split(mySent)

  """Entry point for launching an IPython kernel.


In [21]:
listOfTokens

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon',
 '']

## 删除空字符串

In [22]:
[tok for tok in listOfTokens if len(tok)>0]

['This',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'Python',
 'or',
 'M',
 'L',
 'I',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

## 将字符串全部转换成小写

In [23]:
[tok.lower() for tok in listOfTokens if len(tok)>0]

['this',
 'book',
 'is',
 'the',
 'best',
 'book',
 'on',
 'python',
 'or',
 'm',
 'l',
 'i',
 'have',
 'ever',
 'laid',
 'eyes',
 'upon']

## 集中一封完整的电子邮件的实际处理结果

In [24]:
emailText = open(r'F:\Deeplearning\机器学习实战10月25日\MLAdata\Ch04\email\ham\6.txt').read()

In [25]:
emailText

'Hello,\n\nSince you are an owner of at least one Google Groups group that uses the customized welcome message, pages or files, we are writing to inform you that we will no longer be supporting these features starting February 2011. We made this decision so that we can focus on improving the core functionalities of Google Groups -- mailing lists and forum discussions.  Instead of these features, we encourage you to use products that are designed specifically for file storage and page creation, such as Google Docs and Google Sites.\n\nFor example, you can easily create your pages on Google Sites and share the site (http://www.google.com/support/sites/bin/answer.py?hl=en&answer=174623) with the members of your group. You can also store your files on the site by attaching files to pages (http://www.google.com/support/sites/bin/answer.py?hl=en&answer=90563) on the site. If you抮e just looking for a place to upload your files so that your group members can download them, we suggest you try G

## 测试算法：使用朴素贝叶斯进行交叉验证

##  文件解析及完整的垃圾邮件测试函数

In [28]:
# 数据切分和转换
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W*',bigString)
    return [tok.lower() for tok in  listOfTokens if len(tok) > 2]

In [39]:
# 样本检测
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1,26):
        wordList = textParse(open(r'F:\Deeplearning\机器学习实战10月25日\MLAdata\Ch04\email\spam\%d.txt' %i).read())
        docList.append(wordList)
        fullText.append(wordList)
        classList.append(1)
        wordList = textParse(open(r'F:\Deeplearning\机器学习实战10月25日\MLAdata\Ch04\email\ham\%d.txt' %i).read())
        docList.append(wordList)
        fullText.append(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    import random
    for i in range(10):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in  trainingSet:
        trainMat.append(setOfWords2Vec(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNBO(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    print("the error rate is : ",float(errorCount)/len(testSet))

In [40]:
spamTest()

the error rate is :  0.2


  return _compile(pattern, flags).split(string, maxsplit)


In [41]:
pip install feedparser


The following command must be run outside of the IPython shell:

    $ pip install feedparser

The Python package manager (pip) can only be used from outside of IPython.
Please reissue the `pip` command in a separate terminal or command prompt.

See the Python documentation for more information on how to install packages:

    https://docs.python.org/3/installing/
