In [3]:
from sklearn.datasets import fetch_20newsgroups


def _load_and_count(subset):
    """Fetch one split of the 20 Newsgroups corpus and print its document count.

    Args:
        subset: Split name passed straight to ``fetch_20newsgroups``
            ('train', 'test' or 'all').

    Returns:
        The object returned by ``fetch_20newsgroups`` for that split.
    """
    bunch = fetch_20newsgroups(subset=subset)
    print(len(bunch.data))
    return bunch


newsgroups_train = _load_and_count('train')  # training split
newsgroups_test = _load_and_count('test')    # test split
news = _load_and_count('all')                # both splits together

11314
7532
18846


In [3]:
# Show the category names of the training split.
label_names = newsgroups_train.target_names
print(label_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [10]:
# Work with four of the newsgroups only.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training split restricted to those categories; shuffling with a fixed
# random_state keeps the document order the same on every run.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [9]:
# Feature extraction, step 1: term-frequency counts.
# Counting words requires the text to be split into tokens first; for this
# English corpus CountVectorizer does the splitting itself, so no separate
# word-segmentation pass is needed.

from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training documents and count, per document,
# how often each vocabulary term occurs.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

# To inspect the learned vocabulary, print count_vect.get_feature_names_out()
# (count_vect.get_feature_names() on older scikit-learn releases).

In [13]:
# Feature extraction, step 2: TF-IDF weighting of the raw counts.
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the weighting on the training counts and apply it in one step, giving
# each term a weight per document.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Dense preview of the matrix. Note: toarray() materializes the full
# documents x vocabulary array, so it is done only once here.
print(X_train_tfidf.toarray())

# Total weight of each term over all documents. The sum is taken on the sparse
# matrix directly instead of building a second dense copy just to reduce it;
# np.asarray(...).ravel() flattens the (1, n_terms) result to the same 1-D
# shape that the dense column sum produced.
print(np.asarray(X_train_tfidf.sum(axis=0)).ravel())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[4.67345222 3.27795162 0.01121216 ... 0.21430177 0.10555802 0.11587165]


In [14]:
# Train a naive Bayes classifier and try it on two hand-written sentences.

from sklearn.naive_bayes import MultinomialNB

# Fit the model on the TF-IDF features of the training documents.
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)

# New documents go through the already-fitted vectorizer and transformer
# (transform only, no re-fitting) before being classified.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Map each predicted integer label back to its category name and report it.
predicted_names = [twenty_train.target_names[label] for label in predicted]
for doc, name in zip(docs_new, predicted_names):
    print('%r => %s' % (doc, name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [30]:

# jieba: a Chinese word-segmentation tool.
import jieba

# jieba offers three segmentation modes: precise, full and search-engine.
# This cell calls cut() with no mode flags.
s = u'我想和女朋友一起去北京故宫博物院参观和闲逛。'
cut = jieba.cut(s)

# Print the tokens separated by commas (this consumes the `cut` iterable).
print(*cut, sep=',')

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\XIAOZH~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.880 seconds.
Prefix dict has been built succesfully.


我,想,和,女朋友,一起,去,北京故宫博物院,参观,和,闲逛,。


In [17]:
# Evaluate the trained model on the test split.

from sklearn import metrics
import numpy as np  # no longer used in this cell; kept in case later cells rely on `np`

# Same four categories, shuffle flag and seed as used for the training subset.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

# Reuse the fitted vectorizer and TF-IDF transformer (transform only) so the
# test features share the vocabulary and weights learned from training data.
X_test_counts = count_vect.transform(docs_test)            # term counts
X_test_tfidf = tfidf_transformer.transform(X_test_counts)  # TF-IDF weights
predicted = clf.predict(X_test_tfidf)

# Fraction of test documents whose predicted label equals the true label,
# computed with the library metric instead of a hand-rolled mean.
# The printed prefix is Chinese for "accuracy".
accuracy = metrics.accuracy_score(twenty_test.target, predicted)
print('准确率-->' + str(accuracy))

准确率-->0.8348868175765646
