In [6]:
#导入数据
from sklearn import datasets
news_dataset_train = datasets.fetch_20newsgroups(subset = 'train')
news_dataset_test = datasets.fetch_20newsgroups(subset = 'test')
X_train = news_dataset_train.data
y_train = news_dataset_train.target
X_test = news_dataset_test.data
y_test = news_dataset_test.target

In [62]:
#预处理：CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer_transformer = CountVectorizer()
count_vectorizer_transformer.fit(X_train)
X_train_transformed_count = count_vectorizer_transformer.transform(X_train)
X_test_transformed_count = count_vectorizer_transformer.transform(X_test)

In [63]:
#构建、评估分类器
from sklearn.naive_bayes import MultinomialNB
multinomialnb_clf = MultinomialNB()
from sklearn.model_selection import cross_val_score
scores = cross_val_score(multinomialnb_clf,X_train_transformed_count,y_train,scoring = 'accuracy')
import numpy as np
print('the average score of {0:.1f}%'.format(np.mean(scores)*100))

the average score of 81.2%


In [64]:
#预处理：TfidfTransformer(将单词出现次数转化为单词出现频率)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_transformed_count)
X_train_transformed_tfidf = tfidf_transformer.transform(X_train_transformed_count)
X_test_transformed_tfidf = tfidf_transformer.transform(X_test_transformed_count)

In [39]:
#构建、评估分类器
from sklearn.naive_bayes import MultinomialNB
multinomialnb_clf = MultinomialNB()
from sklearn.model_selection import cross_val_score
scores = cross_val_score(multinomialnb_clf,X_train_transformed_tfidf,y_train,scoring = 'accuracy')
import numpy as np
print('the average score of {0:.1f}%'.format(np.mean(scores)*100))

the average score of 82.7%


In [58]:
#预处理：TfidfVectorizer(直接将原数据转化为单词出现频率)
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfvectorizer_transformer = TfidfVectorizer()
tfidfvectorizer_transformer.fit(X_train)
X_train_transformed_tfidfvect = tfidfvectorizer_transformer.transform(X_train)
X_test_transformed_tfidfvect = tfidfvectorizer_transformer.transform(X_test)

In [40]:
#构建、评估分类器
from sklearn.naive_bayes import MultinomialNB
multinomialnb_clf = MultinomialNB()
from sklearn.model_selection import cross_val_score
scores = cross_val_score(multinomialnb_clf,X_train_transformed_tfidfvect,y_train,scoring = 'accuracy')
import numpy as np
print('the average score of {0:.1f}%'.format(np.mean(scores)*100))

the average score of 82.7%


In [68]:
#预处理：
from sklearn.feature_extraction.text import HashingVectorizer
hashingvectorizer_transformer = HashingVectorizer(non_negative = True)
hashingvectorizer_transformer.fit(X_train)
X_train_transformed_hash = hashingvectorizer_transformer.transform(X_train)
X_test_transformed_hash = hashingvectorizer_transformer.transform(X_test)

In [69]:
#构建、评估分类器
from sklearn.naive_bayes import MultinomialNB
multinomialnb_clf = MultinomialNB()
from sklearn.model_selection import cross_val_score
scores = cross_val_score(multinomialnb_clf,X_train_transformed_hash,y_train,scoring = 'accuracy')
import numpy as np
print('the average score of {0:.1f}%'.format(np.mean(scores)*100))

the average score of 73.4%


In [71]:
#测试分类器
from sklearn.naive_bayes import MultinomialNB
multinomialnb_clf = MultinomialNB()
multinomialnb_clf.fit(X_train_transformed_count,y_train)
y_predicted = multinomialnb_clf.predict(X_test_transformed_count)
import numpy as np
print(np.mean(y_test == y_predicted))

0.772835900159


In [72]:
#测试分类器
from sklearn.naive_bayes import MultinomialNB
multinomialnb_clf = MultinomialNB()
multinomialnb_clf.fit(X_train_transformed_tfidf,y_train)
y_predicted = multinomialnb_clf.predict(X_test_transformed_tfidf)
import numpy as np
print(np.mean(y_test == y_predicted))

0.77389803505


In [73]:
#测试分类器
from sklearn.naive_bayes import MultinomialNB
multinomialnb_clf = MultinomialNB()
multinomialnb_clf.fit(X_train_transformed_tfidfvect,y_train)
y_predicted = multinomialnb_clf.predict(X_test_transformed_tfidfvect)
import numpy as np
print(np.mean(y_test == y_predicted))

0.77389803505


In [74]:
#测试分类器
from sklearn.naive_bayes import MultinomialNB
multinomialnb_clf = MultinomialNB()
multinomialnb_clf.fit(X_train_transformed_hash,y_train)
y_predicted = multinomialnb_clf.predict(X_test_transformed_hash)
import numpy as np
print(np.mean(y_test == y_predicted))

0.701540095592


In [89]:
#构建pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer,HashingVectorizer
countvectorizer_multinomialnb_pipeline = Pipeline([('count_vectorizer_transformer',CountVectorizer()),
                                                  ('multinomialnb_clf',MultinomialNB())])
tfidf_multinomialnb_pipeline = Pipeline([('tfidf_transformer',TfidfTransformer()),
                                         ('multinomialnb_clf',MultinomialNB())])
tfidfvectorizer_multinomialnb_pipeline = Pipeline([('tfidfvectorizer_transformer',TfidfVectorizer()),
                                                   ('multinomialnb_clf',MultinomialNB())])
hashingvectorizer_multinomialnb_pipeline = Pipeline([('hashingvectorizer_transformer',HashingVectorizer(non_negative = True)),
                                                  ('multinomialnb_clf',MultinomialNB())])

In [82]:
#评估countvectorizer_multinomialnb_pipeline
from sklearn.model_selection import cross_val_score
scores_countvectorizer_multinomialnb_pipeline = cross_val_score(countvectorizer_multinomialnb_pipeline,
                                                               X_train,y_train,scoring = 'accuracy')
print('the average score is {0:.1f}%'.format(np.mean(scores_countvectorizer_multinomialnb_pipeline)*100))

the average score is 82.1%


In [84]:
#评估tfidf_multinomialnb_pipeline 
from sklearn.model_selection import cross_val_score
scores_tfidf_multinomialnb_pipeline = cross_val_score(tfidf_multinomialnb_pipeline,X_train_transformed_count,y_train,scoring = 'accuracy')
print('the average score is {0:.1f}%'.format(np.mean(scores_tfidf_multinomialnb_pipeline)*100))

the average score is 82.8%


In [85]:
#评估tfidfvectorizer_multinomialnb_pipeline
from sklearn.model_selection import cross_val_score
scores_tfidfvectorizer_multinomialnb_pipeline = cross_val_score(tfidfvectorizer_multinomialnb_pipeline,
                                                               X_train,y_train,scoring = 'accuracy')
print('the average score is {0:.1f}%'.format(np.mean(scores_tfidfvectorizer_multinomialnb_pipeline)*100))

the average score is 83.0%


In [90]:
#评估hashingvectorizer_multinomialnb_pipeline
from sklearn.model_selection import cross_val_score
scores_hashingvectorizer_multinomialnb_pipeline = cross_val_score(hashingvectorizer_multinomialnb_pipeline,
                                                               X_train,y_train,scoring = 'accuracy')
print('the average score is {0:.1f}%'.format(np.mean(scores_hashingvectorizer_multinomialnb_pipeline)*100))

the average score is 73.4%


In [92]:
#测试pipeline
countvectorizer_multinomialnb_pipeline.fit(X_train,y_train)
tfidf_multinomialnb_pipeline.fit(X_train_transformed_count,y_train)
tfidfvectorizer_multinomialnb_pipeline.fit(X_train,y_train)
hashingvectorizer_multinomialnb_pipeline.fit(X_train,y_train)
y_predicted_countvectorizer = countvectorizer_multinomialnb_pipeline.predict(X_test)
y_predicted_tfidf = tfidf_multinomialnb_pipeline.predict(X_test_transformed_count)
y_predicted_tfidfvect = tfidfvectorizer_multinomialnb_pipeline.predict(X_test)
y_predicted_hashingvectorizer = hashingvectorizer_multinomialnb_pipeline.predict(X_test)
print(np.mean(y_predicted_countvectorizer == y_test))
print(np.mean(y_predicted_tfidf == y_test))
print(np.mean(y_predicted_tfidfvect == y_test))
print(np.mean(y_predicted_hashingvectorizer == y_test))

0.772835900159
0.77389803505
0.77389803505
0.701540095592


In [94]:
#三层pipeline
countvectorizer_tfidf_multinomialnb_pipeline = Pipeline([('count_vectorizer_transformer',CountVectorizer()),
                                                        ('tfidf_transformer',TfidfTransformer()),
                                                        ('multinomialnb_clf',MultinomialNB())])
countvectorizer_tfidf_multinomialnb_pipeline.fit(X_train,y_train)
y_predicted_countvectorizer_tfidf = countvectorizer_tfidf_multinomialnb_pipeline.predict(X_test)
print(np.mean(y_predicted_countvectorizer_tfidf == y_test))

0.77389803505


In [100]:
#结果报告
from sklearn import metrics
print(metrics.classification_report(y_test,y_predicted_countvectorizer_tfidf,target_names = news_dataset.target_names))
print(metrics.confusion_matrix(y_test,y_predicted_countvectorizer_tfidf))

                          precision    recall  f1-score   support

             alt.atheism       0.80      0.52      0.63       319
           comp.graphics       0.81      0.65      0.72       389
 comp.os.ms-windows.misc       0.82      0.65      0.73       394
comp.sys.ibm.pc.hardware       0.67      0.78      0.72       392
   comp.sys.mac.hardware       0.86      0.77      0.81       385
          comp.windows.x       0.89      0.75      0.82       395
            misc.forsale       0.93      0.69      0.80       390
               rec.autos       0.85      0.92      0.88       396
         rec.motorcycles       0.94      0.93      0.93       398
      rec.sport.baseball       0.92      0.90      0.91       397
        rec.sport.hockey       0.89      0.97      0.93       399
               sci.crypt       0.59      0.97      0.74       396
         sci.electronics       0.84      0.60      0.70       393
                 sci.med       0.92      0.74      0.82       396
         

In [103]:
#使用梯度搜索寻找最优参数
#备选参数组成字典，键为pipeline中的转换器或分类器的名字加两个下划线加参数名，值为备选的参数值
#GridSearchCV返回一个分类器，对训练数据拟合后产生最优参数best_params_
from sklearn.model_selection import GridSearchCV
parameters = {'count_vectorizer_transformer__ngram_range':[(1, 1), (1, 2)],
              'tfidf_transformer__use_idf':(True,False),
              'multinomialnb_clf__alpha':(1,0.3,0.1,0.03,0.01,0.003,0.001),}
gs_clf = GridSearchCV(countvectorizer_tfidf_multinomialnb_pipeline,parameters,n_jobs = -1)
gs_clf.fit(X_train[:400],y_train[:400])

In [112]:
print(gs_clf.best_score_)
print(gs_clf.best_params_)
for item in gs_clf.best_params_.items():
    print(item)

0.6
{'count_vectorizer_transformer__ngram_range': (1, 1), 'multinomialnb_clf__alpha': 0.003, 'tfidf_transformer__use_idf': True}
('count_vectorizer_transformer__ngram_range', (1, 1))
('multinomialnb_clf__alpha', 0.003)
('tfidf_transformer__use_idf', True)


In [113]:
#注：
#一、CountVectorizer,TfidfVectorizer,HashingVectorizer都有stop_words参数，用于忽略一些不想列入计算的单词，比如
#频繁出现的单词
#二、HashingVectorizer有non_negative参数，需要设为True
#三、MultinomialNB有alpha参数，用于指定拉普拉斯平滑系数，默认为1
#四、词根还原（stemming）与词形还原（lemmatization）可以提高分类器准确率
#五、关于特征提取的详细介绍http://blog.csdn.net/u013719780/article/details/51743867