In [31]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [26]:
# Load the training and test texts from the after_preprocess_* files.
# Every line of a file becomes one list entry with surrounding whitespace removed.
with open('./after_preprocess_traindata.txt', encoding='utf-8') as train_file:
    train_data = [row.strip() for row in train_file]

with open('./after_preprocess_testdata.txt', encoding='utf-8') as test_file:
    test_data = [row.strip() for row in test_file]

In [27]:
# Read the cnews train/test splits as tab-separated (label, content) tables and
# turn the string labels into integer class ids.
df1 = pd.read_csv(
    '../dataset/cnews/cnews.train.txt',
    sep='\t',
    names=['label','content'],
    encoding='UTF-8',
    engine='python',
)
df2 = pd.read_csv(
    '../dataset/cnews/cnews.test.txt',
    sep='\t',
    names=['label','content'],
    encoding='UTF-8',
    engine='python',
)

# The label -> id mapping is learned from the training labels only and then
# reused unchanged for the test labels.
encoder = LabelEncoder()
encoder.fit(df1['label'])
train_y = encoder.transform(df1['label'])
test_y = encoder.transform(df2['label'])

### TF-IDF 特征：TfidfVectorizer(max_features=6000)，按语料中的词频保留前 6000 个词

In [28]:
# Build the TF-IDF vectorizer. max_features=6000 restricts the vocabulary to
# the 6000 terms with the highest term frequency across the training corpus.
tfidf_transformer = TfidfVectorizer(max_features=6000)

# Learn the vocabulary / IDF weights on the training texts and vectorize them
# in a single pass. This yields the same matrix as fit() followed by
# transform(), without tokenizing the training corpus twice.
train_x = tfidf_transformer.fit_transform(train_data)

# Vectorize the test texts with the vocabulary learned from the training data.
test_x = tfidf_transformer.transform(test_data)

In [29]:
# Train Gaussian naive Bayes on the TF-IDF features. The sparse matrix is
# converted to a dense array before it is handed to fit().
gs = GaussianNB()
gs.fit(X=train_x.toarray(), y=train_y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [32]:
# Evaluate the TF-IDF + GaussianNB model on the test set.
from sklearn.metrics import classification_report, confusion_matrix

pred = gs.predict(test_x.toarray())
print(pred.shape,test_y.shape, test_x.shape)

# Per-class precision / recall / F1 summary.
report = classification_report(test_y,pred)
# Confusion matrix of true labels (first argument) against predictions.
mat = confusion_matrix(test_y,pred)
# Overall accuracy: fraction of predictions that equal the true label.
acc = np.mean(pred == test_y)

print(report)

print(mat)

print('acc', acc)

(10000,) (10000,) (10000, 6000)
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1000
           1       0.89      0.94      0.92      1000
           2       0.90      0.39      0.55      1000
           3       0.64      0.85      0.73      1000
           4       0.84      0.88      0.86      1000
           5       0.91      0.91      0.91      1000
           6       0.89      0.87      0.88      1000
           7       0.94      0.94      0.94      1000
           8       0.90      0.97      0.94      1000
           9       0.88      0.96      0.92      1000

   micro avg       0.87      0.87      0.87     10000
   macro avg       0.88      0.87      0.86     10000
weighted avg       0.88      0.87      0.86     10000

[[984   6   0   2   1   0   0   4   2   1]
 [  0 944   0   1  12  24   1  18   0   0]
 [  0  13 394 381  53  42  28   7  36  46]
 [  0  12   8 846  29   6  37   1   5  56]
 [  0  18   4   4 881   4  34   9  29

### 下面是先转成tfidf，然后用互信息选取6000个特征。但是筛选的速度很慢，最终预测结果也比直接tfidf筛选略低

In [52]:
# Second feature pipeline: TF-IDF over the full vocabulary, then keep the 6000
# features with the highest mutual information with the class label.
# The '--N--' prints are progress markers; this cell is slow.
tfidf_transformer2 = TfidfVectorizer()
print('--1--')
# Learn the vocabulary and vectorize the training texts in one pass
# (same result as fit() + transform(), but the corpus is tokenized only once).
train2_x = tfidf_transformer2.fit_transform(train_data)
print('--2--')
test2_x = tfidf_transformer2.transform(test_data)
print('--3--')
# NOTE(review): train2_x is sparse, and per the scikit-learn docs
# mutual_info_classif then defaults to treating features as discrete, i.e. each
# distinct TF-IDF value is handled as its own category. This may explain part
# of the lower score; chi2 is a common, faster alternative -- confirm before
# drawing conclusions from this comparison.
selector = SelectKBest(mutual_info_classif, k=6000)
print('--4--')
# Scores are computed on the training split only; the fitted selector is
# applied to the test matrix later.
new_train_x = selector.fit_transform(train2_x,train_y)
print(new_train_x.shape)

--1--
--2--
--3--
--4--
(50000, 6000)


In [53]:
# Fit a second GaussianNB on the mutual-information-selected training features
# (converted to a dense array first).
gs2 = GaussianNB()
gs2.fit(X=new_train_x.toarray(), y=train_y)

# Reduce the test matrix to the same selected columns, then predict.
new_test_x = selector.transform(test2_x)
pred = gs2.predict(new_test_x.toarray())

# Per-class summary, confusion matrix and overall accuracy for this model.
report = classification_report(test_y,pred)
mat = confusion_matrix(test_y,pred)
acc = np.mean(pred == test_y)

print(report)

print(mat)

print('acc', acc)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1000
           1       0.89      0.93      0.91      1000
           2       0.87      0.31      0.46      1000
           3       0.62      0.84      0.71      1000
           4       0.81      0.86      0.83      1000
           5       0.91      0.91      0.91      1000
           6       0.89      0.88      0.89      1000
           7       0.92      0.94      0.93      1000
           8       0.91      0.96      0.93      1000
           9       0.86      0.95      0.90      1000

   micro avg       0.86      0.86      0.86     10000
   macro avg       0.87      0.86      0.85     10000
weighted avg       0.87      0.86      0.85     10000

[[984   6   0   0   2   0   1   5   2   0]
 [  0 932   0   3  15  26   2  17   5   0]
 [  3   8 314 421  83  34  25  15  35  62]
 [  0  11  11 839  31   2  49   1   2  54]
 [  1  25   3   5 860  11  25  22  31  17]
 [  0  38  24   1  18 912 

### 使用SVM来进行分类

In [None]:
# Train a support vector classifier on the 6000-feature TF-IDF matrix.
# NOTE(review): the scikit-learn docs state that SVC fit time scales at least
# quadratically with the number of samples; with this training set the cell may
# take very long. LinearSVC is the documented alternative for large datasets --
# confirm which model is intended.
svc_model = SVC()
print('start fit.')
svc_model.fit(train_x,train_y)
print('end fit.')

# Predict on the sparse test matrix directly. The model was fitted on sparse
# input, so passing test_x.toarray() only allocated a large dense array that
# was not needed.
pred = svc_model.predict(test_x)

# Per-class precision / recall / F1 summary.
report = classification_report(test_y,pred)
print(report)

# Confusion matrix of true labels (first argument) against predictions.
mat = confusion_matrix(test_y,pred)
print(mat)

# Overall accuracy: fraction of predictions that equal the true label.
acc = np.sum(pred == test_y)/len(test_y)
print('acc', acc)