题目二：以新闻数据分类为例，学习朴素贝叶斯

In [6]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 newsgroups corpus; subset='all' requests the complete corpus
# rather than only the training or the test portion.
data = fetch_20newsgroups(subset='all')

# Summarise what was loaded: the category names, the number of posts
# (one entry in `filenames` per post) and the dataset's bundled description.
print("数据集包含的类别：", data.target_names)
print("数据集的大小：", len(data.filenames))
print("数据集的描述：", data.DESCR)

# Show the raw text of the first post as a sample.
print("样本：", data.data[0])


数据集包含的类别： ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
数据集的大小： 18846
数据集的描述： .. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Convert the raw posts into a bag-of-words representation: a CountVectorizer
# with default settings is fitted on every document in `data` and the same
# documents are transformed in one call, giving the count matrix X.
# NOTE(review): the vectorizer is fitted on the whole corpus here, i.e. before
# any train/test split is made -- confirm this is intended.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data.data)
# Print the matrix after its label; the recorded output lists entries in the
# form "(row, column)  count".
print("词袋模型：", X)



词袋模型：   (0, 76869)	1
  (0, 106418)	1
  (0, 63606)	1
  (0, 133114)	1
  (0, 112961)	1
  (0, 39368)	2
  (0, 55748)	2
  (0, 68505)	2
  (0, 148466)	1
  (0, 124549)	6
  (0, 73535)	2
  (0, 133565)	1
  (0, 121189)	1
  (0, 127449)	1
  (0, 119906)	1
  (0, 52263)	1
  (0, 108761)	1
  (0, 126093)	1
  (0, 122831)	1
  (0, 102149)	1
  (0, 4705)	1
  (0, 117497)	1
  (0, 127481)	1
  (0, 86318)	1
  (0, 126850)	1
  :	:
  (18845, 56601)	1
  (18845, 31329)	1
  (18845, 118065)	1
  (18845, 159779)	1
  (18845, 160722)	1
  (18845, 145242)	1
  (18845, 40304)	1
  (18845, 149286)	1
  (18845, 145822)	1
  (18845, 59471)	2
  (18845, 90305)	1
  (18845, 69174)	1
  (18845, 74960)	3
  (18845, 127185)	1
  (18845, 142556)	1
  (18845, 90774)	1
  (18845, 54399)	3
  (18845, 143483)	2
  (18845, 42827)	1
  (18845, 137745)	1
  (18845, 138334)	1
  (18845, 62419)	1
  (18845, 73892)	1
  (18845, 38933)	1
  (18845, 127183)	1


In [8]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the samples as the TEST set (test_size=0.2), leaving 80% for
# training; random_state=42 fixes the shuffle so the split is reproducible.
# NOTE(review): X was vectorized on the full corpus before this split, so the
# vocabulary also covers the held-out documents -- consider splitting the raw
# text first if a strictly leak-free evaluation is required.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    data.target,
    test_size=0.2,
    random_state=42,
)

# Create a multinomial naive Bayes classifier with default settings and fit it
# on the training split (fit returns the fitted estimator itself).
nb = MultinomialNB().fit(X_train, y_train)

# Predict a class label for every held-out sample.
y_pred = nb.predict(X_test)

# Evaluate with accuracy, recall and F1; recall and F1 are macro-averaged,
# i.e. computed per class and then averaged without class weighting.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred, average='macro')
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')

print(f"Accuracy: {accuracy:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Confusion matrix: rows correspond to the actual classes and columns to the
# predicted classes (scikit-learn's convention for confusion_matrix).
# NOTE(review): the exercise asks for a plot with Predicted on the x-axis and
# Actual on the y-axis; this cell only prints the raw matrix as text.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:")
print(matrix)


Accuracy: 0.85
Recall: 0.85
F1 Score: 0.84
Confusion Matrix:
[[134   0   0   0   0   0   0   0   0   0   0   0   0   0   2   8   1   1
    3   2]
 [  0 184   0   5   0   3   0   0   0   0   0   2   0   2   2   2   0   2
    0   0]
 [  1  51  43  48   7  27   0   3   0   0   0   4   3   0   0   1   1   0
    6   0]
 [  0  11   0 158   6   1   0   1   0   0   1   2   0   1   1   0   0   0
    1   0]
 [  0   2   0  11 179   1   0   1   0   0   0   3   4   0   1   0   0   1
    2   0]
 [  0  23   0   6   1 178   0   0   1   1   0   1   0   0   3   0   1   0
    0   0]
 [  0   5   0  27   2   0 119  12   1   2   2   6   8   3   2   0   1   2
    1   0]
 [  0   1   0   0   0   0   2 183   2   0   0   0   2   1   0   0   2   0
    3   0]
 [  0   0   0   1   0   0   3   4 154   0   0   0   0   0   0   0   4   0
    2   0]
 [  0   0   0   0   0   0   0   0   1 202   5   0   0   0   0   2   0   1
    0   0]
 [  0   1   1   1   0   0   0   0   1   1 190   0   0   0   0   1   0   1
    1   0]
 [  