# 新闻分类——朴素贝叶斯分类器
20newsgroups数据集是用于文本分类、文本挖掘和信息检索研究的国际标准数据集之一。数据集收集了大约20,000篇新闻组文档，均匀分为20个不同主题的新闻组集合。
在sklearn中，该数据集有两种加载方式：
第一种是sklearn.datasets.fetch_20newsgroups，返回一个可以被文本特征提取器（sklearn.feature_extraction.text.CountVectorizer）自定义参数提取特征的原始文本序列；
第二种是sklearn.datasets.fetch_20newsgroups_vectorized，返回一个已提取特征的文本序列，即不需要使用特征提取器。

## 导入工具包

In [1]:
import sys

# Python 2 only: force the implicit str<->unicode codec to UTF-8 so the
# non-ASCII text handled by this notebook does not raise UnicodeDecodeError.
# `reload` is not a builtin and `sys.setdefaultencoding` does not exist on
# Python 3 (whose default encoding is already UTF-8), so the hack is skipped
# there instead of failing with NameError.
if sys.version_info[0] < 3:
    # site.py removes setdefaultencoding at startup; reloading sys restores it.
    reload(sys)
    sys.setdefaultencoding('utf-8')

import pandas as pd
import numpy as np
from sklearn import metrics

import matplotlib.pyplot as plt
%matplotlib inline

## 读取数据

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 newsgroups corpus with the loader's default arguments.
twenty_news = fetch_20newsgroups()

# Raw documents become the features, the integer topic ids the labels.
X, y = twenty_news.data, twenty_news.target
#n_samples = len(twenty_news.data)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vectorizer: English stop words removed, unigrams plus
# bigrams, min_df=3.  The on/off switches are real booleans rather than 1
# because recent scikit-learn releases validate parameter types and reject
# integers where a bool is expected; True behaves identically on old releases.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

# NOTE(review): the vectorizer is fitted on the full corpus, before the
# train/test split, so the vocabulary and IDF weights also see the test
# documents -- consider fitting on the training split only.
# Extract the features; this step is slow.
X = tfv.fit_transform(X)

In [4]:
# Split the vectorised corpus into a training part and a test part
from sklearn.model_selection import train_test_split

# 20% of the samples are drawn at random for testing, the rest is used for
# training; random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=33,
)
X_train.shape

(9051, 155785)

In [5]:
# Show the shape of the held-out test matrix.
X_test.shape

(2263, 155785)

## 模型训练

In [6]:
# Multinomial naive Bayes classifier with default hyper-parameters
from sklearn.naive_bayes import MultinomialNB

MNB = MultinomialNB()
MNB.fit(X_train, y_train) # the feature matrix is fed in directly

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## 测试

In [7]:
# Predict one class label per test sample.
# Alternative kept for reference: output per-class probabilities instead.
#y_test_pred = MNB.predict_proba(X_test)
y_test_pred = MNB.predict(X_test)

## 性能

In [8]:
# Per-class classification report for the current predictions, labelled with
# the newsgroup names taken from the dataset object.
print(metrics.classification_report(y_test, y_test_pred, target_names=twenty_news.target_names))
#print(metrics.confusion_matrix(y_test, y_test_pred))

In [9]:
# Overall accuracy of the current predictions on the test split.
print(metrics.accuracy_score(y_test, y_test_pred))

## ComplementNB

In [10]:
# Complement naive Bayes (ComplementNB), fitted and scored on the same split
# as the MultinomialNB model.
# NOTE(review): in the recorded run this import failed with ImportError;
# ComplementNB is documented as new in scikit-learn 0.20, so the installed
# release was presumably older -- confirm the installed version.
from sklearn.naive_bayes import ComplementNB

CNB = ComplementNB()
CNB.fit(X_train, y_train) 

# Predictions are stored under a separate name so y_test_pred is not overwritten.
y_test_pred_cmb = CNB.predict(X_test)
print(metrics.classification_report(y_test, y_test_pred_cmb, target_names=twenty_news.target_names))
print(metrics.accuracy_score(y_test, y_test_pred_cmb))

ImportError: cannot import name ComplementNB

In [10]:
# Comparison baseline: logistic regression tuned with a grid search
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: penalty type x inverse regularisation strength C
penaltys = ['l1','l2']
Cs = [ 0.1, 1, 10, 100, 1000]
tuned_parameters = dict(penalty = penaltys, C = Cs)

# Logistic-regression estimator.  The solver is pinned to liblinear: it was
# the library default when this notebook was written and it accepts both the
# 'l1' and 'l2' penalties in the grid, whereas the newer default solver
# (lbfgs) rejects 'l1'.
lr_penalty= LogisticRegression(tol=0.0001, solver='liblinear')

# 5-fold cross-validated grid search scored by negative log-loss
grid= GridSearchCV(lr_penalty, tuned_parameters, cv=5, scoring='neg_log_loss',n_jobs = 4,verbose=5)

# Fit every parameter combination on the training split
grid.fit(X_train,y_train)

# Report the mean/std cross-validation score of each setting.  cv_results_ is
# read because grid_scores_ was removed from scikit-learn, and print() is
# called with a single string so the line runs on Python 2 and Python 3.
for mean, std, params in zip(grid.cv_results_['mean_test_score'],
                             grid.cv_results_['std_test_score'],
                             grid.cv_results_['params']):
    print("mean: %.5f, std: %.5f, params: %s" % (mean, std, params))

In [11]:
# Display (mean score, std, params) for every setting of the fitted search.
# cv_results_ is used because grid_scores_ was removed from scikit-learn.
list(zip(grid.cv_results_['mean_test_score'], grid.cv_results_['std_test_score'], grid.cv_results_['params']))

[mean: -2.68736, std: 0.00647, params: {'penalty': 'l1', 'C': 0.1},
 mean: -2.59736, std: 0.00379, params: {'penalty': 'l2', 'C': 0.1},
 mean: -1.22203, std: 0.01919, params: {'penalty': 'l1', 'C': 1},
 mean: -1.35552, std: 0.01238, params: {'penalty': 'l2', 'C': 1},
 mean: -0.48638, std: 0.01420, params: {'penalty': 'l1', 'C': 10},
 mean: -0.54781, std: 0.01026, params: {'penalty': 'l2', 'C': 10},
 mean: -0.43411, std: 0.02614, params: {'penalty': 'l1', 'C': 100},
 mean: -0.31979, std: 0.01257, params: {'penalty': 'l2', 'C': 100},
 mean: -0.48533, std: 0.03688, params: {'penalty': 'l1', 'C': 1000},
 mean: -0.28822, std: 0.01710, params: {'penalty': 'l2', 'C': 1000}]

### 最佳超参数在搜索范围边界，扩大超参数搜索范围

In [14]:
# Widened hyper-parameter grid: larger C values for both penalty types
penaltys = ['l1','l2']
Cs = [10000,100000]
tuned_parameters = dict(penalty = penaltys, C = Cs)

# Logistic-regression estimator.  The solver is pinned to liblinear: it was
# the library default when this notebook was written and it accepts both the
# 'l1' and 'l2' penalties in the grid, whereas the newer default solver
# (lbfgs) rejects 'l1'.
lr_penalty= LogisticRegression(tol=0.0001, solver='liblinear')

# 5-fold cross-validated grid search scored by negative log-loss
# NOTE: this rebinds `grid`, replacing any previously fitted search object.
grid= GridSearchCV(lr_penalty, tuned_parameters, cv=5, scoring='neg_log_loss',n_jobs = 4,verbose=5)

# Fit every parameter combination on the training split
grid.fit(X_train,y_train)

# Report the mean/std cross-validation score of each setting.  cv_results_ is
# read because grid_scores_ was removed from scikit-learn, and print() is
# called with a single string so the line runs on Python 2 and Python 3.
for mean, std, params in zip(grid.cv_results_['mean_test_score'],
                             grid.cv_results_['std_test_score'],
                             grid.cv_results_['params']):
    print("mean: %.5f, std: %.5f, params: %s" % (mean, std, params))

In [15]:
# Display (mean score, std, params) for every setting of the fitted search.
# cv_results_ is used because grid_scores_ was removed from scikit-learn.
list(zip(grid.cv_results_['mean_test_score'], grid.cv_results_['std_test_score'], grid.cv_results_['params']))

[mean: -0.38240, std: 0.02941, params: {'penalty': 'l1', 'C': 10000},
 mean: -0.31564, std: 0.02151, params: {'penalty': 'l2', 'C': 10000},
 mean: -0.37915, std: 0.03106, params: {'penalty': 'l1', 'C': 100000},
 mean: -0.35333, std: 0.02678, params: {'penalty': 'l2', 'C': 100000}]

## 测试

In [12]:
# Predict test labels with the refitted best estimator of the grid search.
# NOTE(review): `grid` is whichever search was fitted last.  In file order that
# is the widened C = [10000, 100000] search, but the execution counters
# (In [12] ran before In [14]) suggest this cell was originally run against
# the first search -- confirm which model is intended.
y_test_pred = grid.best_estimator_.predict(X_test)

In [13]:
# Per-class classification report and overall accuracy for the current
# y_test_pred on the test split.
print(metrics.classification_report(y_test, y_test_pred, target_names=twenty_news.target_names))
print(metrics.accuracy_score(y_test, y_test_pred))