In [1]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import pickle
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# 获取训练数据，测试数据
train_data = []
with open('./after_preprocess_traindata.txt',encoding='utf-8') as f:
    for line in f:
        train_data.append(line.strip())

test_data = []
with open('./after_preprocess_testdata.txt',encoding='utf-8') as f:
    for line in f:
        test_data.append(line.strip())

In [3]:
# 获取标签编码
df1 = pd.read_csv('../dataset/cnews/cnews.train.txt',sep='\t',names=['label','content'],encoding='UTF-8',engine='python')
df2 = pd.read_csv('../dataset/cnews/cnews.test.txt',sep='\t',names=['label','content'],encoding='UTF-8',engine='python')
encoder = LabelEncoder()

train_y = encoder.fit_transform(df1['label'])
test_y = encoder.transform(df2['label'])

### 接着我们要把词转化为词频向量（词袋），注意由于LDA是基于词频统计的，因此一般不用TF-IDF来做文档特征。

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
bow = cv.fit_transform(train_data)
# print(bow)

### 这里是根据词袋，计算与标签的互信息，取相关性最大的前5000个词

In [6]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
selector = SelectKBest(mutual_info_classif, k=5000)
new_train_x = selector.fit_transform(bow,train_y)
print(new_train_x.shape)

#### 保存剩余筛选器模型

In [12]:
pkl = pickle.dumps(selector)
with open('./selector_model.pkl','wb') as f:
    f.write(pkl)

#### 测试读取筛选器

In [5]:
with open('./selector_model.pkl','rb') as f:
    model = pickle.loads(f.read())
new_train_x = model.transform(bow)
print(new_train_x.shape)

(50000, 5000)


### 开始创建LDA模型

In [None]:
lda = LatentDirichletAllocation(n_components=100,max_iter=800,random_state=1)
lda.fit(new_train_x)

In [6]:
def load_lda(path):
    with open(path,'rb') as f:
        lda = pickle.loads(f.read())
    return lda

In [7]:
lda = load_lda('./lda_model.pkl')

In [9]:
lda_feature = lda.transform(new_train_x)
print(lda_feature[0])

[6.02409639e-05 6.02409639e-05 6.02409639e-05 6.02409639e-05
 6.02409639e-05 5.24025550e-02 6.02409639e-05 6.02409639e-05
 6.02409639e-05 6.02409639e-05 6.02409639e-05 6.02409639e-05
 6.02409639e-05 6.02409639e-05 6.02409639e-05 6.02409639e-05
 6.02409639e-05 6.02409639e-05 6.02409639e-05 6.02409639e-05
 6.02409639e-05 6.02409639e-05 2.14631940e-01 6.02409639e-05
 6.02409639e-05 6.02409639e-05 6.02409639e-05 6.02409639e-05
 6.02409639e-05 6.02409639e-05 6.02409639e-05 6.02409639e-05
 6.02409639e-05 6.02409639e-05 6.02409639e-05 6.02409639e-05
 1.41775938e-01 6.02409639e-05 6.02409639e-05 6.02409639e-05
 6.02409639e-05 6.02409639e-05 6.02409639e-05 6.02409639e-05
 6.02409639e-05 6.02409639e-05 5.42019779e-02 6.02409639e-05
 6.02409639e-05 6.02409639e-05 6.02409639e-05 6.02409639e-05
 6.02409639e-05 6.02409639e-05 6.02409639e-05 6.02409639e-05
 6.02409639e-05 6.02409639e-05 6.02409639e-05 6.02409639e-05
 6.02409639e-05 6.02409639e-05 6.02409639e-05 6.02409639e-05
 6.02409639e-05 6.024096

In [10]:
print(lda_feature.shape)

(50000, 100)


In [11]:
train_X = np.concatenate((new_train_x.toarray(),lda_feature),axis=1)
print(train_X.shape)

(50000, 5100)


In [12]:
from sklearn.naive_bayes import GaussianNB

In [13]:
gs = GaussianNB()
gs.fit(train_X,train_y)
print(gs.score(train_X,train_y))

0.93752


In [14]:
test_x_bag = cv.transform(test_data)
test_x_bag = model.transform(test_x_bag)
test_x_lda = lda.transform(test_x_bag)
test_X = np.concatenate((test_x_bag.toarray(),test_x_lda),axis=1)
print(test_X.shape)

(10000, 5100)


In [15]:
pred_y = gs.predict(test_X)
acc = np.sum(pred_y == test_y)/len(test_y)
print('acc', acc)

acc 0.868


In [16]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print(classification_report(test_y,pred_y))
print(confusion_matrix(test_y,pred_y))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1000
           1       0.91      0.93      0.92      1000
           2       0.82      0.45      0.58      1000
           3       0.64      0.80      0.71      1000
           4       0.90      0.82      0.86      1000
           5       0.88      0.92      0.90      1000
           6       0.84      0.89      0.87      1000
           7       0.95      0.95      0.95      1000
           8       0.91      0.97      0.94      1000
           9       0.86      0.97      0.91      1000

   micro avg       0.87      0.87      0.87     10000
   macro avg       0.87      0.87      0.86     10000
weighted avg       0.87      0.87      0.86     10000

[[994   4   0   0   0   0   1   0   1   0]
 [  0 928   3   1   4  41   0  17   5   1]
 [  0   6 450 369  27  33  30   9  28  48]
 [  0   8  37 795  18   7  59   1   7  68]
 [  3  24   5   7 817  11  66  15  31  21]
 [  1  31  35   2   3 920 

In [17]:
# 不拼接lda特征
train_X = new_train_x.toarray()
gs = GaussianNB()
gs.fit(train_X,train_y)
test_x_bag = cv.transform(test_data)
test_x_bag = model.transform(test_x_bag)
test_X = test_x_bag.toarray()
print(test_X.shape)
print(gs.score(train_X,train_y))
print(gs.score(test_X,test_y))
print()

(10000, 5000)
0.9327
0.8616


### 采用tfidf+lda+朴素贝叶斯

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf_train_x = transformer.fit_transform(new_train_x)

tfidf_train_X = np.concatenate((tfidf_train_x.toarray(),lda_feature),axis=1)
# tfidf_train_X = tfidf_train_x.toarray()

gs2 = GaussianNB()
gs2.fit(tfidf_train_X,train_y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [21]:
pred = gs2.predict(tfidf_train_X)
print(gs2.score(tfidf_train_X,train_y))
report = classification_report(train_y,pred)
print(report)

0.95844
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5000
           1       0.97      0.99      0.98      5000
           2       0.95      0.97      0.96      5000
           3       0.92      0.95      0.94      5000
           4       0.96      0.89      0.93      5000
           5       0.97      0.96      0.97      5000
           6       0.92      0.96      0.94      5000
           7       0.99      0.98      0.99      5000
           8       0.94      0.96      0.95      5000
           9       0.96      0.92      0.94      5000

   micro avg       0.96      0.96      0.96     50000
   macro avg       0.96      0.96      0.96     50000
weighted avg       0.96      0.96      0.96     50000



In [23]:
test_x_tfidf = transformer.fit_transform(test_x_bag)
test_x_tfidf_lda = lda.transform(test_x_tfidf)
test_tfidf_X = np.concatenate((test_x_tfidf.toarray(),test_x_tfidf_lda),axis=1)
# test_tfidf_X = test_x_tfidf.toarray()
print(gs2.score(test_tfidf_X,test_y))
pred = gs2.predict(test_tfidf_X)
report = classification_report(test_y,pred)
print(report)

0.878
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1000
           1       0.91      0.95      0.93      1000
           2       0.88      0.40      0.55      1000
           3       0.66      0.85      0.74      1000
           4       0.86      0.89      0.87      1000
           5       0.92      0.92      0.92      1000
           6       0.89      0.89      0.89      1000
           7       0.94      0.95      0.94      1000
           8       0.90      0.98      0.94      1000
           9       0.88      0.97      0.92      1000

   micro avg       0.88      0.88      0.88     10000
   macro avg       0.88      0.88      0.87     10000
weighted avg       0.88      0.88      0.87     10000

acc 0.878


bow-->互信息筛选5000个词--->bow <br>

|model|train|test|
|--|---|---|
|bow + lda + gs|  0.93752 |0.868|
|bow  + gs|0.9327 |0.8616 |
|tfidf + gs|0.95678|0.875|
|tfidf + lda + gs|0.95844|0.878|