In [1]:
# Load the 20 Newsgroups text corpus; subset='all' requests the complete
# collection rather than only the predefined train or test portion.
# NOTE(review): per the scikit-learn docs the data is downloaded and cached on
# first use, and headers/footers/quotes are kept unless `remove=` is passed,
# which can inflate scores -- confirm that keeping this metadata is intended.
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all')

In [2]:
from sklearn.model_selection import train_test_split

# Hold out a random 25% of the documents as the test set; the fixed seed keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    news.data,
    news.target,
    test_size=0.25,
    random_state=33,
)


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer only considers how often each word occurs within a given
# training document.

# Very common words such as "the" and "a" contribute little information, so it
# is sometimes useful to filter them out.
# Such words are known as stop words.

In [4]:
# Build the count-based vectorizer with its default settings.
count_vec = CountVectorizer()
# By default, stop words are NOT removed.

In [5]:
# Convert the raw training and test texts into feature vectors.
# The vectorizer is fitted on the training texts only (fit_transform); the test
# texts are then encoded with that same fitted vectorizer (transform).
x_count_train = count_vec.fit_transform(X_train)
x_count_test = count_vec.transform(X_test)

In [6]:
from sklearn.naive_bayes import MultinomialNB
# Import the (multinomial) naive Bayes classifier.

In [7]:
# Naive Bayes classifier for the raw count features, created with default
# hyperparameters.
mnb_count = MultinomialNB()

In [8]:
# Fit the classifier on the training samples. Because this call is the last
# expression in the cell, the returned estimator is echoed as the cell output.
mnb_count.fit(x_count_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [9]:
# Print the classifier's score on the held-out test set (for scikit-learn
# classifiers, `score` is the mean accuracy).
print(mnb_count.score(x_count_test, y_test))

0.8397707979626485


In [None]:
# Other evaluation metrics are not covered here; look them up as needed (e.g. in sklearn.metrics).

In [10]:
# Next, use a different vectorizer for comparison.
from sklearn.feature_extraction.text import TfidfVectorizer
# In addition to term frequency, TfidfVectorizer also takes into account the
# inverse of the number of documents that contain the term.
# The more documents there are, the more useful this weighting becomes.
# It does a better job of surfacing informative words and down-weighting
# stop words.
tfidf_vec = TfidfVectorizer()
# With default settings, neither vectorizer removes stop words.

In [11]:
# Build TF-IDF features: fit the vectorizer on the training texts only, then
# encode the test texts with that same fitted vectorizer.
x_tfidf_train = tfidf_vec.fit_transform(X_train)
x_tfidf_test = tfidf_vec.transform(X_test)

In [12]:
# Train a fresh naive Bayes model on the TF-IDF features (fit returns the
# estimator itself, so construction and fitting can be chained), then report
# its score on the held-out test set.
mnb_tfidf = MultinomialNB().fit(x_tfidf_train, y_train)
print(mnb_tfidf.score(x_tfidf_test, y_test))

0.8463497453310697


In [13]:
# Next, see what happens when English stop words are filtered out: build one
# vectorizer of each kind with the built-in 'english' stop-word list.
count_filter_vec = CountVectorizer(analyzer='word', stop_words='english')
tfidf_filter_vec = TfidfVectorizer(analyzer='word', stop_words='english')

In [14]:
# Count features with stop words removed: fit on the training texts, then
# encode the test texts with the same fitted vectorizer.
x_count_filter_train = count_filter_vec.fit_transform(X_train)
x_count_filter_test = count_filter_vec.transform(X_test)

In [15]:
# TF-IDF features with stop words removed: fit on the training texts, then
# encode the test texts with the same fitted vectorizer.
# NOTE(review): no later cell visible here consumes these two matrices.
x_tfidf_filter_train = tfidf_filter_vec.fit_transform(X_train)
x_tfidf_filter_test = tfidf_filter_vec.transform(X_test)

In [16]:
# Train a fresh naive Bayes model on the stop-word-filtered count features
# (fit returns the estimator itself, so construction and fitting can be
# chained), then report its score on the held-out test set.
mnb_count_filter = MultinomialNB().fit(x_count_filter_train, y_train)
print(mnb_count_filter.score(x_count_filter_test, y_test))

0.8637521222410866


In [None]:
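# The stop-word-filtered TF-IDF features (x_tfidf_filter_train / x_tfidf_filter_test) can be evaluated in exactly the same way; that is left undone here.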
