## 從Python入手+演算法-鄭捷 Ch02 中文文字分類

#### jieba分詞簡單範例

若要存成 .py 檔執行（Python 2），請在檔案開頭加入下列程式碼：
```
import sys  
import os
import jieba

# Python 2 only: make UTF-8 the interpreter-wide default string encoding.
# sys must be reloaded first because sys.setdefaultencoding is not reachable
# on the sys module as imported at startup.
reload(sys)
sys.setdefaultencoding('utf-8')
```

In [1]:
import jieba
# Segment the sentence with cut_all=False passed explicitly.
seg_list = jieba.cut("小明1995年畢業於北京清華大學", cut_all=False)
print "Default Mode:", " ".join(seg_list)  # default mode

# Same sentence with cut_all left at jieba's default value.
seg_list = jieba.cut("小明1995年畢業於北京清華大學")
print "  ".join(seg_list)

# Same sentence with cut_all=True.
seg_list = jieba.cut("小明1995年畢業於北京清華大學", cut_all=True)
print "Full Mode:", "/ ".join(seg_list)  # full mode

seg_list = jieba.cut_for_search("小明碩士畢業於中國科學院計算所，後在日本京都大學深造")  # search-engine mode
print "/  ".join(seg_list)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/_y/yxtl0q3x3qq4r0f00gk36xjw0000gn/T/jieba.cache


Default Mode:

Loading model cost 0.380 seconds.
Prefix dict has been built succesfully.


 小明 1995 年 畢業 於 北京 清華大學
小明  1995  年  畢業  於  北京  清華大學
Full Mode: 小/ 明/ 1995/ 年/ 畢/ 業/ 於/ 北京/ 清/ 華/ 大/ 學
小明/  碩士/  畢業/  於/  中國/  科學院/  計算/  所/  ，/  後/  在/  日本/  京都/  大學/  深造


#### 將目錄中的文件轉成分詞檔案

In [4]:
import sys  
import os
# 保存至文件
def savefile(savepath,content):
    """Write ``content`` to ``savepath``, replacing any existing file.

    Args:
        savepath: Path of the file to create or overwrite.
        content: Payload to store. Text (``unicode`` on Python 2, ``str`` on
            Python 3) is encoded as UTF-8 first; any other value is handed
            to ``write()`` unchanged.
    """
    if isinstance(content, type(u"")):
        # The file is opened in binary mode: Python 3 rejects text outright
        # and Python 2 would fall back to an implicit ASCII encode.
        content = content.encode("utf-8")
    # The with-block closes the handle even when write() raises.
    with open(savepath, "wb") as fp:
        fp.write(content)
    
# 讀取文件
def readfile(path):
    """Return the whole content of the file at ``path``.

    The file is opened in binary mode (``"rb"``), so the data is returned as
    stored on disk, without decoding.

    Args:
        path: Path of the file to read.

    Returns:
        The file content as a byte string.
    """
    # The with-block closes the handle even when read() raises.
    with open(path, "rb") as fp:
        return fp.read()

In [2]:
import sys  
import os
# 保存至文件
def savefile(savepath,content):
    """Write ``content`` to ``savepath``, replacing any existing file.

    Args:
        savepath: Path of the file to create or overwrite.
        content: Payload to store. Text (``unicode`` on Python 2, ``str`` on
            Python 3) is encoded as UTF-8 first; any other value is handed
            to ``write()`` unchanged.
    """
    if isinstance(content, type(u"")):
        # The file is opened in binary mode: Python 3 rejects text outright
        # and Python 2 would fall back to an implicit ASCII encode.
        content = content.encode("utf-8")
    # The with-block closes the handle even when write() raises.
    with open(savepath, "wb") as fp:
        fp.write(content)
    
# 讀取文件
def readfile(path):
    """Return the whole content of the file at ``path``.

    The file is opened in binary mode (``"rb"``), so the data is returned as
    stored on disk, without decoding.

    Args:
        path: Path of the file to read.

    Returns:
        The file content as a byte string.
    """
    # The with-block closes the handle even when read() raises.
    with open(path, "rb") as fp:
        return fp.read()
    
corpus_path = "train_corpus_small/"  # 未分詞分類語料庫路徑
seg_path = "train_corpus_seg/"      # 分詞後分類語料庫路徑

catelist = os.listdir(corpus_path)  # 獲取corpus_path下的所有子目錄

# 獲取每個目錄下所有的文件
for mydir in catelist:
    class_path = corpus_path+mydir+"/"    # 拼出分類子目錄的路徑
    seg_dir = seg_path+mydir+"/"          # 拼出分詞後語料分類目錄
    if not os.path.exists(seg_dir):       # 是否存在目錄，如果沒有創建
            os.makedirs(seg_dir)	
    file_list = os.listdir(class_path)    # 獲取class_path下的所有文件
    for file_path in file_list:           # 遍歷類別目錄下文件
        fullname = class_path + file_path   # 拼出文件名全路徑
        content = readfile(fullname).strip()  # 讀取文件內容
        content = content.replace("\r\n","") # 刪除換行和多餘的空格
        content_seg = jieba.cut(content.strip())		# 為文件內容分詞
        savefile(seg_dir+file_path," ".join(content_seg))  # 將處理後的文件保存到分詞後語料目錄

print "中文語料分詞結束！！！"

OSError: [Errno 2] No such file or directory: 'train_corpus_small/'

### 產生向量空間，使用 sklearn 的Bunch 資料結構

In [5]:
import cPickle as pickle
from sklearn.datasets.base import Bunch

# Bunch類提供一種key,value的對象形式
# target_name:所有分類集名稱列表
# label:每個文件的分類標籤列表
# filenames:文件路徑
# contents:分詞後文件詞向量形式
bunch = Bunch(target_name=[],label=[],filenames=[],contents=[])	

wordbag_path = "train_word_bag/train_set.dat"  # 未分詞分類語料庫路徑
seg_path = "train_corpus_seg/"      # 分詞後分類語料庫路徑

catelist = os.listdir(seg_path)  # 獲取seg_path下的所有子目錄
bunch.target_name.extend(catelist)
# 獲取每個目錄下所有的文件
for mydir in catelist:
    class_path = seg_path+mydir+"/"    # 拼出分類子目錄的路徑
    file_list = os.listdir(class_path)    # 獲取class_path下的所有文件
    for file_path in file_list:           # 遍歷類別目錄下文件
        fullname = class_path + file_path   # 拼出文件名全路徑
        bunch.label.append(mydir)
        bunch.filenames.append(fullname)
        bunch.contents.append(readfile(fullname).strip())		# 讀取文件內容

#對象持久化                                                                                              
file_obj = open(wordbag_path, "wb")
pickle.dump(bunch,file_obj)                      
file_obj.close()

print "構建文本對象結束！！！"

構建文本對象結束！！！


### tf-idf詞向量空間創建

In [10]:
#引入持久化類
import cPickle as pickle
from sklearn import feature_extraction  
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.feature_extraction.text import TfidfVectorizer  

# 讀取文件
def readfile(path):
    """Return the whole content of the file at ``path``.

    The file is opened in binary mode (``"rb"``), so the data is returned as
    stored on disk, without decoding.

    Args:
        path: Path of the file to read.

    Returns:
        The file content as a byte string.
    """
    # The with-block closes the handle even when read() raises.
    with open(path, "rb") as fp:
        return fp.read()

#計算訓練語料的tfidf權值並持久化為詞袋

#讀取bunch對象
def readbunchobj(path):
    """Load and return the object pickled in the file at ``path``.

    Args:
        path: Path of a file previously written with ``pickle.dump``.

    Returns:
        The unpickled object.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that this project produced itself.
    # The with-block closes the handle even when unpickling fails.
    with open(path, "rb") as file_obj:
        return pickle.load(file_obj)

#寫入bunch對象
def writebunchobj(path,bunchobj):
    """Pickle ``bunchobj`` into the file at ``path``, replacing any existing file.

    Args:
        path: Destination file path.
        bunchobj: Object to serialise with ``pickle.dump``.
    """
    # The with-block closes the handle even when pickling fails.
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)

# 1. Read the stop-word list, one entry per line.
# NOTE(review): readfile() opens the file in binary mode, so these entries are
# byte strings — confirm TfidfVectorizer matches them against its tokens.
stopword_path = "train_word_bag/hlt_stop_words.txt"
stpwrdlst = readfile(stopword_path).splitlines()

# 2. Load the pickled Bunch of segmented training documents
path = "train_word_bag/train_set.dat"        # input: pickled Bunch of segmented training documents
bunch = readbunchobj(path)

# 3. Create the Bunch for the tf-idf space; tdm and vocabulary are filled in below
tfidfspace = Bunch(target_name=bunch.target_name,label=bunch.label,filenames=bunch.filenames,tdm=[],vocabulary={})

# 4. Configure the vectorizer that builds the vector space model
vectorizer = TfidfVectorizer(stop_words=stpwrdlst,sublinear_tf = True,max_df = 0.5)
transformer=TfidfTransformer() # NOTE(review): never used in this cell
# Fit the vectorizer on the training documents and keep both the resulting matrix and the learned vocabulary
tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
tfidfspace.vocabulary = vectorizer.vocabulary_

# Persist the tf-idf space
space_path = "train_word_bag/tfdifspace.dat"        # output: pickled tf-idf space
writebunchobj(space_path,tfidfspace)

print "if-idf詞向量空間創建成功！！！"

if-idf詞向量空間創建成功！！！


### 構建test_set文本對象

In [7]:
# Bunch類提供一種key,value的對象形式
# target_name:所有分類集名稱列表
# label:每個文件的分類標籤列表
# filenames:文件路徑
# contents:分詞後文件詞向量形式
bunch = Bunch(target_name=[],label=[],filenames=[],contents=[])

wordbag_path = "test_word_bag/test_set.dat"  # 未分詞分類語料庫路徑
seg_path = "test_corpus_seg/"      # 分詞後分類語料庫路徑

catelist = os.listdir(seg_path)  # 獲取seg_path下的所有子目錄
bunch.target_name.extend(catelist)
# 獲取每個目錄下所有的文件
for mydir in catelist:
    class_path = seg_path+mydir+"/"    # 拼出分類子目錄的路徑
    file_list = os.listdir(class_path)    # 獲取class_path下的所有文件
    for file_path in file_list:           # 遍歷類別目錄下文件
        fullname = class_path + file_path   # 拼出文件名全路徑
        bunch.label.append(mydir)
        bunch.filenames.append(fullname)
        bunch.contents.append(readfile(fullname).strip())# 讀取文件內容

#對象持久化                                                                                              
file_obj = open(wordbag_path, "wb")
pickle.dump(bunch,file_obj)                      
file_obj.close()

print "構建文本對象結束！！！"

構建文本對象結束！！！


In [8]:
bunch

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


## 使用貝氏分類

In [6]:
import numpy as np

def loadDataSet():
    """Return the built-in toy corpus of six tokenised posts with their labels.

    Returns:
        A ``(postingList, classVec)`` pair. ``postingList`` is a list of six
        word lists; ``classVec`` is the parallel list of labels, where 1
        marks an abusive post and 0 a non-abusive one.
    """
    # Each entry pairs one post (words separated by single spaces) with its label.
    labelled_posts = (
        ('my dog has flea problems help please', 0),
        ('maybe not take him to dog park stupid', 1),
        ('my dalmation is so cute I love him my', 0),
        ('stop posting stupid worthless garbage', 1),
        ('mr licks ate my steak how to stop him', 0),
        ('quit buying worthless dog food stupid', 1),
    )
    postingList = [text.split() for text, _ in labelled_posts]
    classVec = [flag for _, flag in labelled_posts]
    return postingList,classVec


class NBayes(object):
    """Small bag-of-words text classifier built on NumPy.

    Typical use: ``train_set(docs, labels)``, then ``map2vocab(doc)`` followed
    by ``predict(self.testset)``.

    Labels may be any hashable, mutually orderable values (ints, strings...).
    Row ``i`` of ``tdm`` belongs to the ``i``-th label in sorted order, so
    integer labels ``0..k-1`` keep the layout ``tdm[label]``.
    """

    def __init__(self):
        self.vocabulary = []  # sorted list of the distinct training words
        self.idf = 0          # 1 x vocablen: document frequencies (calc_wordfreq) or log-idf (calc_tfidf)
        self.tf = 0           # doclength x vocablen weight matrix of the training set
        self.tdm = 0          # categories x vocablen matrix: P(x|yi)
        self.Pcates = {}      # label -> P(yi)
        self.labels = []      # label of each training document
        self.doclength = 0    # number of training documents
        self.vocablen = 0     # number of words in the vocabulary
        self.testset = 0      # 1 x vocablen count vector built by map2vocab

    def train_set(self,trainset,classVec):
        """Build the vocabulary, the term counts and P(x|yi) from a labelled corpus.

        Args:
            trainset: Sequence of documents, each a sequence of words.
            classVec: Sequence with one label per document.

        Raises:
            ValueError: If ``trainset`` and ``classVec`` differ in length.
        """
        if len(trainset) != len(classVec):
            raise ValueError("trainset has %d documents but classVec has %d labels"
                             % (len(trainset), len(classVec)))
        self.cate_prob(classVec)   # class priors P(yi)
        self.doclength = len(trainset)
        # Sorting makes the column order reproducible from run to run.
        self.vocabulary = sorted(set(word for doc in trainset for word in doc))
        self.vocablen = len(self.vocabulary)
        # Raw counts are used as weights; calc_tfidf(trainset) is the tf-idf alternative.
        self.calc_wordfreq(trainset)
        self.build_tdm()           # per-class word distribution P(x|yi)

    def _word_index(self):
        """Return a dict mapping each vocabulary word to its column index."""
        index = {}
        for col, word in enumerate(self.vocabulary):
            index.setdefault(word, col)  # first position wins, like list.index
        return index

    def _count_words(self, trainset):
        """Return ``(tf, df)``: per-document word counts and per-word document frequencies."""
        col_of = self._word_index()  # O(1) lookups instead of list.index per word
        tf = np.zeros([self.doclength, self.vocablen])
        df = np.zeros([1, self.vocablen])
        for row in range(self.doclength):
            doc = trainset[row]
            for word in doc:
                if word not in col_of:
                    raise ValueError("word %r is not in the vocabulary" % (word,))
                tf[row, col_of[word]] += 1
            for word in set(doc):  # each document counts once per distinct word
                df[0, col_of[word]] += 1
        return tf, df

    def calc_tfidf(self,trainset):
        """Store length-normalised tf-idf weights in ``self.tf`` and log-idf in ``self.idf``.

        Raises:
            ValueError: If a document contains a word missing from the vocabulary.
        """
        tf, df = self._count_words(trainset)
        for row in range(self.doclength):
            doc_len = len(trainset[row])
            if doc_len:  # an empty document keeps an all-zero row instead of 0/0
                tf[row] = tf[row] / float(doc_len)  # remove the bias of document length
        self.idf = np.log(float(self.doclength) / df)
        self.tf = np.multiply(tf, self.idf)  # broadcast the idf row over every document

    def calc_wordfreq(self,trainset):
        """Store raw word counts in ``self.tf`` and document frequencies in ``self.idf``.

        Raises:
            ValueError: If a document contains a word missing from the vocabulary.
        """
        self.tf, self.idf = self._count_words(trainset)

    def cate_prob(self,classVec):
        """Store the labels and compute the prior P(yi) of every category."""
        self.labels = list(classVec)
        total = float(len(self.labels))
        counts = {}
        for label in self.labels:
            counts[label] = counts.get(label, 0) + 1
        # Rebuilt from scratch so that retraining drops categories of an earlier run.
        self.Pcates = dict((label, count / total) for label, count in counts.items())

    def build_tdm(self):
        """Accumulate ``self.tf`` per category and normalise each row into P(x|yi)."""
        row_of = dict((cate, row) for row, cate in enumerate(sorted(self.Pcates)))
        self.tdm = np.zeros([len(row_of), self.vocablen])  # categories x vocabulary
        for indx in range(self.doclength):
            self.tdm[row_of[self.labels[indx]]] += self.tf[indx]
        sumlist = self.tdm.sum(axis=1, keepdims=True)  # total weight of each category
        sumlist[sumlist == 0] = 1.0  # a category without any weight keeps a zero row
        self.tdm = self.tdm / sumlist

    def map2vocab(self,testdata):
        """Count the words of ``testdata`` into ``self.testset`` (1 x vocablen).

        Words that are not in the vocabulary are ignored.
        """
        col_of = self._word_index()
        self.testset = np.zeros([1,self.vocablen])
        for word in testdata:
            col = col_of.get(word)
            if col is not None:
                self.testset[0, col] += 1

    def predict(self,testset):
        """Return the label with the highest score for ``testset``.

        The score of a category is ``sum(testset * P(x|yi) * P(yi))``.

        Args:
            testset: Count vector with ``vocablen`` columns (1-D or 1 x vocablen).

        Returns:
            The best-scoring label, or ``""`` when every score is 0.

        Raises:
            ValueError: If the column count differs from the vocabulary size.
        """
        if np.ndim(testset) < 2:
            testset = np.atleast_2d(testset)
        if np.shape(testset)[1] != self.vocablen:
            raise ValueError("輸入錯誤: testset has %d columns but the vocabulary has %d words"
                             % (np.shape(testset)[1], self.vocablen))
        predvalue = 0
        predclass = ""
        # sorted() reproduces the row order chosen in build_tdm.
        for tdm_vect,keyclass in zip(self.tdm,sorted(self.Pcates)):
            temp = np.sum(testset*tdm_vect*self.Pcates[keyclass])  # P(x|yi)P(yi)
            if temp > predvalue:
                predvalue = temp
                predclass = keyclass
        return predclass


In [7]:
# Train on the toy corpus, then classify its fourth post (index 3), which
# loadDataSet() labels 1.
dataSet,listClasses = loadDataSet()
nb = NBayes()
nb.train_set(dataSet,listClasses)
nb.map2vocab(dataSet[3])
print nb.predict(nb.testset)

1


### test詞向量空間創建

In [12]:
from sklearn import feature_extraction  
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.feature_extraction.text import TfidfVectorizer  

# 1. Read the stop-word list, one entry per line
stopword_path = "train_word_bag/hlt_stop_words.txt"
stpwrdlst = readfile(stopword_path).splitlines()

# 2. Load the pickled Bunch of segmented test documents
path = "test_word_bag/test_set.dat"        # input: pickled Bunch of segmented test documents
bunch = readbunchobj(path)

# 3. Create the Bunch for the test-set tf-idf space; tdm and vocabulary start empty
testspace = Bunch(target_name=bunch.target_name,label=bunch.label,filenames=bunch.filenames,tdm=[],vocabulary={})



In [13]:
# 4. 導入訓練集的詞袋
trainbunch = readbunchobj("train_word_bag/tfdifspace.dat")
# 5. 使用TfidfVectorizer初始化向量空間模型 
vectorizer = TfidfVectorizer(stop_words=stpwrdlst,sublinear_tf = True,max_df = 0.5,vocabulary=trainbunch.vocabulary)
transformer = TfidfTransformer() # 該類會統計每個詞語的tf-idf權值

# 文本轉為tf-idf矩陣,單獨保存字典文件 
testspace.tdm = vectorizer.fit_transform(bunch.contents)
testspace.vocabulary = trainbunch.vocabulary

# 創建詞袋的持久化
space_path = "test_word_bag/testspace.dat"        # 詞向量空間保存路徑
writebunchobj(space_path,testspace)

print "test詞向量空間創建成功！！！"

test詞向量空間創建成功！！！


### 預測

In [18]:
from sklearn import feature_extraction  
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.naive_bayes import MultinomialNB #導入多項式貝葉斯算法
from sklearn import metrics


#計算分類精度：
def metrics_result(actual,predict):
    """Print macro-averaged precision, recall and F1, in that order.

    Args:
        actual: True labels, passed as the first argument to each scorer.
        predict: Predicted labels, passed as the second argument to each scorer.
    """
    # average='macro' was added by the notebook author; without it the
    # scorers raised an error.
    report = (
        ('精度:{0:.3f}', metrics.precision_score),
        ('召回:{0:0.3f}', metrics.recall_score),
        ('f1-score:{0:.3f}', metrics.f1_score),
    )
    for template, scorer in report:
        print(template.format(scorer(actual,predict,average='macro')))
    
# Load the training tf-idf space
trainpath = "train_word_bag/tfdifspace.dat"
train_set = readbunchobj(trainpath)

# Load the test tf-idf space
testpath = "test_word_bag/testspace.dat"
test_set = readbunchobj(testpath)
# Fit a multinomial naive Bayes classifier
# 1. Inputs: the tf-idf matrix and the label of each training document
# NOTE(review): the original note tied a smaller alpha to more iterations;
# per the scikit-learn docs alpha is the additive smoothing parameter — confirm.
clf = MultinomialNB(alpha = 0.001).fit(train_set.tdm, train_set.label)

# Predict the category of every test document
predicted = clf.predict(test_set.tdm)
total = len(predicted);rate = 0
# rate counts the documents whose predicted category differs from the actual one
for flabel,file_name,expct_cate in zip(test_set.label,test_set.filenames,predicted):
    if flabel != expct_cate:
        rate += 1
        print file_name,": 實際類別:",flabel," -->預測類別:",expct_cate
# Percentage of misclassified test documents
print "error rate:",float(rate)*100/float(total),"%"
print "預測完畢!!!"

metrics_result(test_set.label,predicted)

test_corpus_seg/art/3143.txt : 實際類別: art  -->預測類別: education
error rate: 1.40845070423 %
預測完畢!!!
精度:0.987
召回:0.987
f1-score:0.986
