In [1]:
"""
朴素贝叶斯进行文本分类
:return: None
"""
from sklearn.datasets import fetch_20newsgroups #新闻数据
import numpy as np

news = fetch_20newsgroups(subset='all', data_home='../MachineData/Data')

print(len(news.data))  #样本数，包含的特征
print('-'*50)
print(news.data[0]) #第一个样本 特征
print('-'*50)

print(news.target) #18846个样本分别是什么标签
print(np.unique(news.target))#对所有标签去重，一共有多少个标签，也就是多少个种类
print(news.target_names)

18846
--------------------------------------------------
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!


----------------------------------------

In [2]:
from sklearn.model_selection import train_test_split #数据分割
from sklearn.feature_extraction.text import TfidfVectorizer #特征抽取

print('-'*50)
# 进行数据分割
x_train, x_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.25, random_state=1)


print('-'*50)
print(type(x_train))
print('-'*50)
print(x_train[0])
print('-'*50)
print(x_train[1])
print('-'*50)

# 对数据集进行特征抽取
tf = TfidfVectorizer()
print('-'*50)

# 以训练集当中的词的列表进行每篇文章重要性统计['a','b','c','d']
#x_train经过tf.fit transform之后，变成了一个稀疏矩阵
#横坐标是文章的序号，纵坐标是词的序号，值是词的重要性
x_train = tf.fit_transform(x_train)
print(type(x_train))
print(x_train.shape)
print('-'*50)

#针对特征内容，可以自行打印
#tf.get_feature_names()是所有词（特征）的列表
print(len(tf.get_feature_names_out()))
print(tf.get_feature_names_out()[0:10])



--------------------------------------------------
--------------------------------------------------
<class 'list'>
--------------------------------------------------
From: feustel@netcom.com (David Feustel)
Subject: Re: BATF/FBI Murders Almost Everyone in Waco Today! 4/19
Organization: DAFCO: OS/2 Software Support & Consulting
Lines: 10

It's truly unfortunate that we don't have the Japanese tradition of
Hari-Kari for public officials to salvage some tatters of honor after
they commit offenses against humanity like were perpetrated in Waco,
Texas today.
-- 
Dave Feustel N9MYI <feustel@netcom.com>

I'm beginning to look forward to reaching the %100 allocation of taxes
to pay for the interest on the national debt. At that point the
federal government will be will go out of business for lack of funds.

--------------------------------------------------
From: donyee@athena.mit.edu (Donald Yee)
Subject: S3 86c805 w/2MB = 1024x768x32k colors = Orchid Pipe Dream?
Organization: Massachusetts

In [5]:
# 下面这段代码请注意，不要重复运行，第一次运行时，可以跑通，如果重复运行。
# 会报如下错误：
#  AttributeError: 'csr_matrix' object has no attribute 'lower'

In [6]:
from sklearn.naive_bayes import MultinomialNB
import time
start=time.time()
#测试集的特征值经过transform之后，变成了一个稀疏矩阵
x_test = tf.transform(x_test)  #特征数目不发生改变
print(len(tf.get_feature_names_out()))


# 进行朴素贝叶斯算法的预测,alpha是拉普拉斯平滑系数，分子和分母加上一个系数，分母加alpha*特征词数目
mlt = MultinomialNB(alpha=1.0)

print(x_train.toarray())#稀疏矩阵转换为数组
# 训练
mlt.fit(x_train, y_train)#训练集的特征值和目标值

end=time.time()
end-start

AttributeError: 'csr_matrix' object has no attribute 'lower'

In [7]:
start=time.time()
y_predict = mlt.predict(x_test) #y_predict是预测的目标值，拿x_test去预测(mlt,是训练好的模型)

print("预测的文章类别为：", y_predict)

# 得出准确率,这个是很难提高准确率，为什么呢？
print("准确率为：", mlt.score(x_test, y_test))
end=time.time()
end-start

预测的文章类别为： [16 19 18 ... 13  7 14]
准确率为： 0.8518675721561969


0.03490042686462402

In [8]:
from sklearn.metrics import classification_report# 准确率，召回率，F1分数
# 目前这个场景我们不需要召回率，support是真实的为那个类别的有多少个样本
print("每个类别的精确率和召回率：", classification_report(y_test, y_predict,
                                             target_names=news.target_names))


每个类别的精确率和召回率：                           precision    recall  f1-score   support

             alt.atheism       0.91      0.77      0.83       199
           comp.graphics       0.83      0.79      0.81       242
 comp.os.ms-windows.misc       0.89      0.83      0.86       263
comp.sys.ibm.pc.hardware       0.80      0.83      0.81       262
   comp.sys.mac.hardware       0.90      0.88      0.89       234
          comp.windows.x       0.92      0.85      0.88       230
            misc.forsale       0.96      0.67      0.79       257
               rec.autos       0.90      0.87      0.88       265
         rec.motorcycles       0.90      0.95      0.92       251
      rec.sport.baseball       0.89      0.96      0.93       226
        rec.sport.hockey       0.95      0.98      0.96       262
               sci.crypt       0.76      0.97      0.85       257
         sci.electronics       0.84      0.80      0.82       229
                 sci.med       0.97      0.86      0.91      

In [9]:
from sklearn.metrics import roc_auc_score
# 把0-19总计20个分类，变为0和1
# 5是可以改为0到19的
y_test1 = np.where(y_test == 5, 1, 0)
print(y_test1.sum())
y_predict1 = np.where(y_predict == 5, 1, 0)
print(y_predict1.sum())
# roc_auc_score的y_test只能是二分类,针对多分类如何计算AUC
print("AUC指标：", roc_auc_score(y_test1, y_predict1))

230
214
AUC指标： 0.924078924393225


In [10]:
del news
del x_train
del x_test
del y_test
del y_predict
del tf


