# 对于文本分类来说，除了深度学习，最好就用朴素贝叶斯

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Three toy sentences used to illustrate bag-of-words encoding
sample = [
    "Machine learning is fascinating, it is wonderful",
    "Machine learning is a sensational techonology",
    "Elsa is a popular character",
]

# Encode the text as word-count vectors: one row per sentence, one column per
# distinct word in the learned vocabulary
vec = CountVectorizer()
X = vec.fit_transform(sample)

# Quick check of the result: 3 rows (three sentences) x 11 columns (11 features)
X

<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [6]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases
feature_names = (
    vec.get_feature_names_out()
    if hasattr(vec, "get_feature_names_out")
    else vec.get_feature_names()
)
# Dense table of word counts: one row per sentence, one column per vocabulary word
CVresult = pd.DataFrame(X.toarray(), columns=feature_names)

In [7]:
# Display the word-count table (one row per sentence, one column per word)
CVresult

Unnamed: 0,character,elsa,fascinating,is,it,learning,machine,popular,sensational,techonology,wonderful
0,0,0,1,2,1,1,1,0,0,0,1
1,0,0,0,1,0,1,1,0,1,1,0
2,1,1,0,1,0,0,0,1,0,0,0


+ 问题：句子越长，贡献的特征显然越多，但同时含有无效信息的可能性也越大。分类器在预测时会倾向于长句子，故需要用L2范数对每个样本的向量进行归一化，使权重更平衡
+ 问题2：存在无意义的单词（如 is、it），它们虽然占有很大的权重，但对分类没有帮助
+ 故更常用的不是该模块，而是用单词在句子中出现的比例（而不是出现的次数）来衡量权重

TF-IDF：一个词越常见（在越多的文档中出现），它的权重就越小，以此来压制无意义单词

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

# Re-encode the same sentences with TF-IDF weights instead of raw counts
vec = TFIDF()
X = vec.fit_transform(sample)

X  # sparse matrix: one column per word, values are TF-IDF weights

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases
feature_names = (
    vec.get_feature_names_out()
    if hasattr(vec, "get_feature_names_out")
    else vec.get_feature_names()
)
# Dense table of TF-IDF weights with the vocabulary words as column names
TFIDFresult = pd.DataFrame(X.toarray(), columns=feature_names)

TFIDFresult

Unnamed: 0,character,elsa,fascinating,is,it,learning,machine,popular,sensational,techonology,wonderful
0,0.0,0.0,0.424396,0.50131,0.424396,0.322764,0.322764,0.0,0.0,0.0,0.424396
1,0.0,0.0,0.0,0.315444,0.0,0.406192,0.406192,0.0,0.534093,0.534093,0.0
2,0.546454,0.546454,0.0,0.322745,0.0,0.0,0.0,0.546454,0.0,0.0,0.0


In [9]:
# Does TF-IDF encoding lower the weight of frequently occurring words?
# Baseline for the comparison: each word's share of the total raw count
count_totals = CVresult.sum(axis=0)
count_totals / count_totals.sum()

character      0.0625
elsa           0.0625
fascinating    0.0625
is             0.2500
it             0.0625
learning       0.1250
machine        0.1250
popular        0.0625
sensational    0.0625
techonology    0.0625
wonderful      0.0625
dtype: float64

In [10]:
# Same share, computed on the TF-IDF weights: words that occurred often have
# their weight compressed, words that occurred rarely have theirs increased
tfidf_totals = TFIDFresult.sum(axis=0)
tfidf_totals / tfidf_totals.sum()

character      0.083071
elsa           0.083071
fascinating    0.064516
is             0.173225
it             0.064516
learning       0.110815
machine        0.110815
popular        0.083071
sensational    0.081192
techonology    0.081192
wonderful      0.064516
dtype: float64

### 接下来都用TFIDF，实例

In [11]:
from sklearn.datasets import fetch_20newsgroups

# The dataset is downloaded the first time it is fetched
data = fetch_20newsgroups(subset="train")  # "train" is the default subset
# List the newsgroup categories, i.e. the available class labels
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [12]:
import numpy as np
import pandas as pd

# fetch_20newsgroups accepts parameters; 20 classes are too many for this
# example, so only 4 of them are selected
categories = [
    "sci.space",              # science - space
    "rec.sport.hockey",       # sport - hockey
    "talk.politics.guns",     # politics - guns
    "talk.politics.mideast",  # politics - Middle East
]

# Load the training and test splits restricted to the chosen categories
train, test = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ("train", "test")
)

In [13]:
type(train)  # note the type: a dictionary-like structure

sklearn.utils.Bunch

In [17]:
train.data  # a list; each element is one article
# Names of the selected categories
train.target_names

['rec.sport.hockey',
 'sci.space',
 'talk.politics.guns',
 'talk.politics.mideast']

In [18]:
# Distinct class labels present in the training targets
np.unique(train.target)

array([0, 1, 2, 3], dtype=int64)

In [20]:
# Check for class imbalance: print each label and its share of the training set.
# The labels come from the data instead of being hard-coded, so the check keeps
# working if the selected categories change; np.unique also counts in one
# vectorized pass instead of summing a boolean array element by element.
labels, counts = np.unique(train.target, return_counts=True)
for i, count in zip(labels, counts):
    print(i, count / len(train.target))
# In the recorded run every share is close to 0.25, so there is no imbalance
# problem and complement naive Bayes is not needed for that reason

0 0.26052974381241856
1 0.25749023013460703
2 0.23708206686930092
3 0.24489795918367346


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

# Documents (features) and labels (targets) for the training and test splits
Xtrain, Ytrain = train.data, train.target
Xtest, Ytest = test.data, test.target

In [26]:
# Fit the vectorizer on the training documents only, then use that same fitted
# model to transform both splits. Xtest must not go through fit_transform on
# its own: the resulting structure would differ (a different number of features).
tfidf = TFIDF()
tfidf.fit(Xtrain)
Xtrain_, Xtest_ = (tfidf.transform(docs) for docs in (Xtrain, Xtest))

In [27]:
# Display the transformed training matrix
Xtrain_

<2303x40725 sparse matrix of type '<class 'numpy.float64'>'
	with 430306 stored elements in Compressed Sparse Row format>

In [28]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)
# NOTE: toarray() materializes the whole matrix densely (rows x vocabulary size),
# which can take a lot of memory when the vocabulary is large
tosee = pd.DataFrame(Xtrain_.toarray(), columns=feature_names)

# Peek at the first rows of the TF-IDF table
tosee.head()

Unnamed: 0,00,000,0000,00000,000000,000021,000062david42,000152,000246,000256,...,zwrm,zx,zx6wre,zxp,zxqi,zy,zyg,zz,zz_g9q3,zzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.058046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.metrics import log_loss

# Display names, kept in the same order as `models`
names = ["Multinomial", "Complement", "Bernoulli"]
# Note: Gaussian naive Bayes does not accept sparse matrices, so it is left out
models = [MultinomialNB(), ComplementNB(), BernoulliNB()]

# Use a distinct loop variable so the list of names is not overwritten while
# iterating, and print the label so each pair of scores can be identified
for name, clf in zip(names, models):
    clf.fit(Xtrain_, Ytrain)
    proba = clf.predict_proba(Xtest_)
    score = clf.score(Xtest_, Ytest)  # accuracy on the test split
    Lscore = log_loss(Ytest, proba)   # log loss of the predicted probabilities
    print(name)
    print("\tLscore:{:.3f}".format(Lscore))
    print("\tAccuracy:{:.3f}".format(score))
    print("\n")
# In the recorded run ComplementNB reaches the highest accuracy, while
# BernoulliNB does poorly

	Lscore:0.268
	Accuracy:0.975


	Lscore:0.331
	Accuracy:0.986


	Lscore:2.083
	Accuracy:0.902




## 采用概率校准

In [43]:
from sklearn.calibration import CalibratedClassifierCV

# Display names, kept in the same order as `models`
names = [
    "Multinomial",
    "Multinomial + Isotonic",
    "Multinomial + Sigmoid",
    "Complement",
    "Complement + Isotonic",
    "Complement + Sigmoid",
    "Bernoulli",
    "Bernoulli + Isotonic",
    "Bernoulli + Sigmoid",
]

# Each naive Bayes variant is evaluated on its own and wrapped in
# CalibratedClassifierCV (cv=2) with isotonic and with sigmoid calibration
models = [
    MultinomialNB(),
    CalibratedClassifierCV(MultinomialNB(), cv=2, method='isotonic'),
    CalibratedClassifierCV(MultinomialNB(), cv=2, method='sigmoid'),
    ComplementNB(),
    CalibratedClassifierCV(ComplementNB(), cv=2, method='isotonic'),
    CalibratedClassifierCV(ComplementNB(), cv=2, method='sigmoid'),
    BernoulliNB(),
    CalibratedClassifierCV(BernoulliNB(), cv=2, method='isotonic'),
    CalibratedClassifierCV(BernoulliNB(), cv=2, method='sigmoid'),
]

# Use a distinct loop variable so the list of names is not overwritten while
# iterating
for name, clf in zip(names, models):
    clf.fit(Xtrain_, Ytrain)
    proba = clf.predict_proba(Xtest_)
    score = clf.score(Xtest_, Ytest)  # accuracy on the test split
    Lscore = log_loss(Ytest, proba)   # log loss of the predicted probabilities
    print(name)
    print("\tLscore:{:.3f}".format(Lscore))
    print("\tAccuracy:{:.3f}".format(score))
# Final choice: "Complement + Sigmoid", the best performer in the recorded run

Multinomial
	Lscore:0.268
	Accuracy:0.975
Multinomial + Isotonic
	Lscore:0.140
	Accuracy:0.973
Multinomial + Sigmoid
	Lscore:0.091
	Accuracy:0.973
Complement
	Lscore:0.331
	Accuracy:0.986
Complement + Isotonic
	Lscore:0.071
	Accuracy:0.985
Complement + Sigmoid
	Lscore:0.071
	Accuracy:0.986
Bernoulli
	Lscore:2.083
	Accuracy:0.902
Bernoulli + Isotonic
	Lscore:0.229
	Accuracy:0.937
Bernoulli + Sigmoid
	Lscore:0.443
	Accuracy:0.879
