In [1]:
from scipy import sparse
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,precision_recall_fscore_support,confusion_matrix
import pandas as pd
import numpy as np

In [None]:
def tag_to_coarse(tags): #粗分类标签映射
    tag_map={0:4,1:0,2:0,3:0,4:0,5:0,6:4,7:1,8:1,9:1,10:1,11:2,12:2,13:2,14:2,15:2,16:3,17:3,18:3,19:3}
    tags_coarse=np.array([tag_map[tag] for tag in tags])
    return tags_coarse

In [None]:
def evaluate_MultinomialNB(train_docs,train_tags,test_docs,test_tags,tagnames): #测试MultinomialNB分类效果，以下测试同理（训练集网格搜索交叉验证得到最佳参数和模型，测试集查看分类得分、宏平均得分、混淆矩阵）
    parameters={'alpha':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]}
    clf=MultinomialNB()
    GS=GridSearchCV(clf,parameters,cv=10,scoring='f1_macro',n_jobs=-1) #网格搜索十折交叉验证
    dim_nums=[1000,2000,5000,8000,10000] #不同top-k维度，因dim已从高到低排序，可直接切片比较
    scores=[] #保存每种top-k维度的最佳模型的交叉验证得分
    parameter_dicts=[] #保存每种top-k维度的最佳模型的参数
    models=[] #保存每种top-k维度的最佳模型
    for dim_num in dim_nums:
        train_docs_sliced=train_docs[:,:dim_num]
        GS.fit(train_docs_sliced,train_tags)
        scores.append(GS.best_score_)
        parameter_dicts.append(GS.best_params_)
        models.append(GS.best_estimator_)
    best_score=max(scores)
    best_idx=scores.index(best_score)
    best_dim=dim_nums[best_idx]
    best_parameters=parameter_dicts[best_idx]
    report_str=f'''MultinomialNB最佳模型f1 macro(on train): {best_score}\n(alpha: {best_parameters['alpha']}; 特征数量(TopNbyFreq): {best_dim})'''
    print(report_str)
    with open('./data_home/report/parameters_scores.txt','a',encoding='utf-8') as outfile:
        outfile.write(report_str+'\n\n')
    best_model=models[best_idx] #最佳top-k维度的最佳模型（依据得分）
    test_docs_sliced=test_docs[:,:best_dim] #测试集保持与训练输入相同的最佳top-k维度
    predict_tags=best_model.predict(test_docs_sliced)
    classification_report_dict=classification_report(test_tags,predict_tags,target_names=tagnames,output_dict=True) #分类报告
    classification_report_df=pd.DataFrame(classification_report_dict).transpose()
    print(f'''MultinomialNB分类报告：{classification_report_df}''')
    classification_report_df.to_csv('./data_home/report/MultinomialNB_report.csv',encoding='utf-8',float_format='%.2f') #保存分类报告
    macro_prf=precision_recall_fscore_support(test_tags,predict_tags,average='macro')[:3] #宏平均precision、recall、f1
    with open('./data_home/report/clfs_prf.txt','a',encoding='utf-8') as outfile: #保存宏平均precision、recall、f1
        outfile.write(f'''MultinomialNB\t{macro_prf[0]}\t{macro_prf[1]}\t{macro_prf[2]}'''+'\n')
    cm=confusion_matrix(test_tags,predict_tags) #混淆矩阵
    cm_df=pd.DataFrame(cm,index=tagnames,columns=tagnames)
    cm_df.to_csv('./data_home/report/MultinomialNB_cmatrix.csv',encoding='utf-8') #保存混淆矩阵


In [None]:
def evaluate_GaussianNB(train_docs,train_tags,test_docs,test_tags,tagnames):
    parameters={'var_smoothing':[1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}
    clf=GaussianNB()
    GS=GridSearchCV(clf,parameters,cv=10,scoring='f1_macro',n_jobs=-1)
    dim_nums=[1000,2000,5000,8000,10000]
    scores=[]
    parameter_dicts=[]
    models=[]
    for dim_num in dim_nums:
        train_docs_sliced=train_docs[:,:dim_num].toarray()
        GS.fit(train_docs_sliced,train_tags)
        scores.append(GS.best_score_)
        parameter_dicts.append(GS.best_params_)
        models.append(GS.best_estimator_)
    best_score=max(scores)
    best_idx=scores.index(best_score)
    best_dim=dim_nums[best_idx]
    best_parameters=parameter_dicts[best_idx]
    report_str=f'''GaussianNB最佳模型f1 macro(on train): {best_score}\n(var_smoothing: {best_parameters['var_smoothing']}; 特征数量(TopNbyFreq): {best_dim})'''
    print(report_str)
    with open('./data_home/report/parameters_scores.txt','a',encoding='utf-8') as outfile:
        outfile.write(report_str+'\n\n')
    best_model=models[best_idx]
    test_docs_sliced=test_docs[:,:best_dim].toarray()
    predict_tags=best_model.predict(test_docs_sliced)
    classification_report_dict=classification_report(test_tags,predict_tags,target_names=tagnames,output_dict=True)
    classification_report_df=pd.DataFrame(classification_report_dict).transpose()
    print(f'''GaussianNB分类报告：{classification_report_df}''')
    classification_report_df.to_csv('./data_home/report/GaussianNB_report.csv',encoding='utf-8',float_format='%.2f')
    macro_prf=precision_recall_fscore_support(test_tags,predict_tags,average='macro')[:3]
    with open('./data_home/report/clfs_prf.txt','a',encoding='utf-8') as outfile:
        outfile.write(f'''GaussianNB\t{macro_prf[0]}\t{macro_prf[1]}\t{macro_prf[2]}'''+'\n')
    cm=confusion_matrix(test_tags,predict_tags)
    cm_df=pd.DataFrame(cm,index=tagnames,columns=tagnames)
    cm_df.to_csv('./data_home/report/GaussianNB_cmatrix.csv',encoding='utf-8')

In [None]:
def evaluate_DecisionTreeClassifier(train_docs,train_tags,test_docs,test_tags,tagnames):
    parameters={'criterion':["gini", "entropy", "log_loss"]}
    clf=DecisionTreeClassifier(random_state=42) #random_state均设为42
    GS=GridSearchCV(clf,parameters,cv=10,scoring='f1_macro',n_jobs=-1)
    dim_nums=[1000,2000,5000,8000,10000]
    scores=[]
    parameter_dicts=[]
    models=[]
    for dim_num in dim_nums:
        train_docs_sliced=train_docs[:,:dim_num]
        GS.fit(train_docs_sliced,train_tags)
        scores.append(GS.best_score_)
        parameter_dicts.append(GS.best_params_)
        models.append(GS.best_estimator_)
    best_score=max(scores)
    best_idx=scores.index(best_score)
    best_dim=dim_nums[best_idx]
    best_parameters=parameter_dicts[best_idx]
    report_str=f'''DecisionTreeClassifier最佳模型f1 macro(on train): {best_score}\n(criterion: {best_parameters['criterion']}; 特征数量(TopNbyFreq): {best_dim})'''
    print(report_str)
    with open('./data_home/report/parameters_scores.txt','a',encoding='utf-8') as outfile:
        outfile.write(report_str+'\n\n')
    best_model=models[best_idx]
    test_docs_sliced=test_docs[:,:best_dim]
    predict_tags=best_model.predict(test_docs_sliced)
    classification_report_dict=classification_report(test_tags,predict_tags,target_names=tagnames,output_dict=True)
    classification_report_df=pd.DataFrame(classification_report_dict).transpose()
    print(f'''DecisionTreeClassifier分类报告：{classification_report_df}''')
    classification_report_df.to_csv('./data_home/report/DecisionTreeClassifier_report.csv',encoding='utf-8',float_format='%.2f')
    macro_prf=precision_recall_fscore_support(test_tags,predict_tags,average='macro')[:3]
    with open('./data_home/report/clfs_prf.txt','a',encoding='utf-8') as outfile:
        outfile.write(f'''DecisionTreeClassifier\t{macro_prf[0]}\t{macro_prf[1]}\t{macro_prf[2]}'''+'\n')
    cm=confusion_matrix(test_tags,predict_tags)
    cm_df=pd.DataFrame(cm,index=tagnames,columns=tagnames)
    cm_df.to_csv('./data_home/report/DecisionTreeClassifier_cmatrix.csv',encoding='utf-8')

In [8]:
def evaluate_KNN(train_docs,train_tags,test_docs,test_tags,tagnames):
    parameters=[{"n_neighbors":[1,3,5,7],"p":[2],"metric":["minkowski"]},
                {"n_neighbors":[1,3,5,7],"metric":["cosine"]},
                {"n_neighbors":[1,3,5,7],"p":[1],"metric":["minkowski"]}]
    clf=KNeighborsClassifier()
    GS=GridSearchCV(clf,parameters,cv=10,scoring='f1_macro',n_jobs=-1)
    dim_nums=[1000,2000,5000,8000,10000]
    scores=[]
    parameter_dicts=[]
    models=[]
    for dim_num in dim_nums:
        train_docs_sliced=train_docs[:,:dim_num]
        GS.fit(train_docs_sliced,train_tags)
        scores.append(GS.best_score_)
        parameter_dicts.append(GS.best_params_)
        models.append(GS.best_estimator_)
    best_score=max(scores)
    best_idx=scores.index(best_score)
    best_dim=dim_nums[best_idx]
    best_parameters=parameter_dicts[best_idx]
    report_str=f'''KNN最佳模型f1 macro(on train): {best_score}\n(n_neighbors: {best_parameters['n_neighbors']}; metric: {best_parameters['metric']}; p: {best_parameters.get('p','None')}; 特征数量(TopNbyFreq): {best_dim})'''
    print(report_str)
    with open('./data_home/report/parameters_scores.txt','a',encoding='utf-8') as outfile:
        outfile.write(report_str+'\n\n')
    best_model=models[best_idx]
    test_docs_sliced=test_docs[:,:best_dim]
    predict_tags=best_model.predict(test_docs_sliced)
    classification_report_dict=classification_report(test_tags,predict_tags,target_names=tagnames,output_dict=True)
    classification_report_df=pd.DataFrame(classification_report_dict).transpose()
    print(f'''KNN分类报告：{classification_report_df}''')
    classification_report_df.to_csv('./data_home/report/KNN_report.csv',encoding='utf-8',float_format='%.2f')
    macro_prf=precision_recall_fscore_support(test_tags,predict_tags,average='macro')[:3]
    with open('./data_home/report/clfs_prf.txt','a',encoding='utf-8') as outfile:
        outfile.write(f'''KNN\t{macro_prf[0]}\t{macro_prf[1]}\t{macro_prf[2]}'''+'\n')
    cm=confusion_matrix(test_tags,predict_tags)
    cm_df=pd.DataFrame(cm,index=tagnames,columns=tagnames)
    cm_df.to_csv('./data_home/report/KNN_cmatrix.csv',encoding='utf-8')

In [9]:
def evaluate_SVC(train_docs,train_tags,test_docs,test_tags,tagnames):
    parameters={"kernel":['linear','poly'],"degree":[2,3,5]}
    clf=SVC(random_state=42)
    GS=GridSearchCV(clf,parameters,cv=10,scoring='f1_macro',n_jobs=-1)
    dim_nums=[1000,2000,5000,8000,10000]
    scores=[]
    parameter_dicts=[]
    models=[]
    for dim_num in dim_nums:
        train_docs_sliced=train_docs[:,:dim_num]
        GS.fit(train_docs_sliced,train_tags)
        scores.append(GS.best_score_)
        parameter_dicts.append(GS.best_params_)
        models.append(GS.best_estimator_)
    best_score=max(scores)
    best_idx=scores.index(best_score)
    best_dim=dim_nums[best_idx]
    best_parameters=parameter_dicts[best_idx]
    report_str=f'''SVC最佳模型f1 macro(on train): {best_score}\n(kernel: {best_parameters['kernel']}; degree: {best_parameters['degree']}; 特征数量(TopNbyFreq): {best_dim})'''
    print(report_str)
    with open('./data_home/report/parameters_scores.txt','a',encoding='utf-8') as outfile:
        outfile.write(report_str+'\n\n')
    best_model=models[best_idx]
    test_docs_sliced=test_docs[:,:best_dim]
    predict_tags=best_model.predict(test_docs_sliced)
    classification_report_dict=classification_report(test_tags,predict_tags,target_names=tagnames,output_dict=True)
    classification_report_df=pd.DataFrame(classification_report_dict).transpose()
    print(f'''SVC分类报告：{classification_report_df}''')
    classification_report_df.to_csv('./data_home/report/SVC_report.csv',encoding='utf-8',float_format='%.2f')
    macro_prf=precision_recall_fscore_support(test_tags,predict_tags,average='macro')[:3]
    with open('./data_home/report/clfs_prf.txt','a',encoding='utf-8') as outfile:
        outfile.write(f'''SVC\t{macro_prf[0]}\t{macro_prf[1]}\t{macro_prf[2]}'''+'\n')
    cm=confusion_matrix(test_tags,predict_tags)
    cm_df=pd.DataFrame(cm,index=tagnames,columns=tagnames)
    cm_df.to_csv('./data_home/report/SVC_cmatrix.csv',encoding='utf-8')

In [None]:
def evaluate_LogisticRegression(train_docs,train_tags,test_docs,test_tags,tagnames):
    parameters={"tol":[1e-3,1e-4,1e-5],"C":[0.1,0.5,1.0,2.5]}
    clf=LogisticRegression(solver='saga',random_state=42,max_iter=1000) #saga更适合处理大规模文本数据，max_iter为1000确保收敛
    GS=GridSearchCV(clf,parameters,cv=10,scoring='f1_macro',n_jobs=-1)
    dim_nums=[1000,2000,5000,8000,10000]
    scores=[]
    parameter_dicts=[]
    models=[]
    for dim_num in dim_nums:
        train_docs_sliced=train_docs[:,:dim_num]
        GS.fit(train_docs_sliced,train_tags)
        scores.append(GS.best_score_)
        parameter_dicts.append(GS.best_params_)
        models.append(GS.best_estimator_)
    best_score=max(scores)
    best_idx=scores.index(best_score)
    best_dim=dim_nums[best_idx]
    best_parameters=parameter_dicts[best_idx]
    report_str=f'''LogisticRegression最佳模型f1 macro(on train): {best_score}\n(tol: {best_parameters['tol']}; C: {best_parameters['C']}; 特征数量(TopNbyFreq): {best_dim})'''
    print(report_str)
    with open('./data_home/report/parameters_scores.txt','a',encoding='utf-8') as outfile:
        outfile.write(report_str+'\n\n')
    best_model=models[best_idx]
    test_docs_sliced=test_docs[:,:best_dim]
    predict_tags=best_model.predict(test_docs_sliced)
    classification_report_dict=classification_report(test_tags,predict_tags,target_names=tagnames,output_dict=True)
    classification_report_df=pd.DataFrame(classification_report_dict).transpose()
    print(f'''LogisticRegression分类报告：{classification_report_df}''')
    classification_report_df.to_csv('./data_home/report/LogisticRegression_report.csv',encoding='utf-8',float_format='%.2f')
    macro_prf=precision_recall_fscore_support(test_tags,predict_tags,average='macro')[:3]
    with open('./data_home/report/clfs_prf.txt','a',encoding='utf-8') as outfile:
        outfile.write(f'''LogisticRegression\t{macro_prf[0]}\t{macro_prf[1]}\t{macro_prf[2]}'''+'\n')
    cm=confusion_matrix(test_tags,predict_tags)
    cm_df=pd.DataFrame(cm,index=tagnames,columns=tagnames)
    cm_df.to_csv('./data_home/report/LogisticRegression_cmatrix.csv',encoding='utf-8')

In [None]:
def create_clf_macro_report(): #生成宏平均得分报告
    clf_macro_report_df=pd.read_csv('./data_home/report/clfs_prf.txt',sep='\t',index_col=0,header=None,encoding='utf-8')
    clf_macro_report_df.columns=['MacroAvg-Precision','MacroAvg-Recall','MacroAvg-F1']
    clf_macro_report_df.index.name='Classifiers'
    print(clf_macro_report_df)
    clf_macro_report_df.to_csv('./data_home/report/clf_macro_report.csv',encoding='utf-8',float_format='%.4f')

In [None]:
train_docs=sparse.load_npz('./data_home/news_train_docs_vectorized.npz') #加载本地向量
test_docs=sparse.load_npz('./data_home/news_test_docs_vectorized.npz')
train,test=fetch_20newsgroups(subset='train',data_home='data_home'),fetch_20newsgroups(subset='test',data_home='data_home')
train_tags,test_tags=train.target,test.target
tagnames=test.target_names
tagnames_coarse=['comp','rec','sci','talk','other']
print(f'''细分类tagnames为{tagnames}\n粗分类tagnames为{tagnames_coarse}''')
train_tags_coarse,test_tags_coarse=tag_to_coarse(train_tags),tag_to_coarse(test_tags) #映射为粗分类


细分类tagnames为['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
粗分类tagnames为['comp', 'rec', 'sci', 'talk', 'other']


In [None]:
evaluate_MultinomialNB(train_docs,train_tags_coarse,test_docs,test_tags_coarse,tagnames_coarse) #测试模型

MultinomialNB最佳模型f1 macro(on train): 0.8909886135522541
(alpha: {'alpha': 0.01}; 特征数量(TopNbyFreq): 10000)
MultinomialNB分类报告：              precision    recall  f1-score      support
comp           0.851681  0.945780  0.896268  1955.000000
rec            0.941213  0.956604  0.948846  1590.000000
sci            0.847071  0.862924  0.854924  1977.000000
talk           0.878399  0.893928  0.886095  1301.000000
other          0.899263  0.516220  0.655914   709.000000
accuracy       0.876925  0.876925  0.876925     0.876925
macro avg      0.883525  0.835091  0.848409  7532.000000
weighted avg   0.878465  0.876925  0.872133  7532.000000


In [6]:
evaluate_GaussianNB(train_docs,train_tags_coarse,test_docs,test_tags_coarse,tagnames_coarse)

GaussianNB最佳模型f1 macro(on train): 0.8426573635826211
(var_smoothing: {'var_smoothing': 1e-05}; 特征数量(TopNbyFreq): 10000)
GaussianNB分类报告：              precision    recall  f1-score      support
comp           0.868593  0.814834  0.840855  1955.000000
rec            0.894835  0.904403  0.899593  1590.000000
sci            0.737312  0.793627  0.764434  1977.000000
talk           0.781448  0.854727  0.816446  1301.000000
other          0.594444  0.452750  0.514011   709.000000
accuracy       0.800982  0.800982  0.800982     0.800982
macro avg      0.775326  0.764068  0.767068  7532.000000
weighted avg   0.798815  0.800982  0.798213  7532.000000


In [12]:
evaluate_DecisionTreeClassifier(train_docs,train_tags_coarse,test_docs,test_tags_coarse,tagnames_coarse)

DecisionTreeClassifier最佳模型f1 macro(on train): 0.6856712891697478
(criterion: {'criterion': 'gini'}; 特征数量(TopNbyFreq): 10000)
DecisionTreeClassifier分类报告：              precision    recall  f1-score      support
comp           0.678829  0.723274  0.700347  1955.000000
rec            0.719199  0.723270  0.721229  1590.000000
sci            0.586404  0.571573  0.578893  1977.000000
talk           0.637670  0.614143  0.625685  1301.000000
other          0.583582  0.551481  0.567078   709.000000
accuracy       0.648433  0.648433  0.648433     0.648433
macro avg      0.641137  0.636748  0.638646  7532.000000
weighted avg   0.647016  0.648433  0.647435  7532.000000


In [13]:
evaluate_KNN(train_docs,train_tags_coarse,test_docs,test_tags_coarse,tagnames_coarse)

KNN最佳模型f1 macro(on train): 0.8569925179508393
(n_neighbors: 1; metric: cosine; p: None; 特征数量(TopNbyFreq): 10000)
KNN分类报告：              precision    recall  f1-score      support
comp           0.804151  0.772890  0.788211  1955.000000
rec            0.688508  0.859119  0.764410  1590.000000
sci            0.808948  0.676783  0.736987  1977.000000
talk           0.703566  0.727902  0.715527  1301.000000
other          0.633782  0.598025  0.615385   709.000000
accuracy       0.741636  0.741636  0.741636     0.741636
macro avg      0.727791  0.726944  0.724104  7532.000000
weighted avg   0.747587  0.741636  0.740918  7532.000000


In [14]:
evaluate_SVC(train_docs,train_tags_coarse,test_docs,test_tags_coarse,tagnames_coarse)

SVC最佳模型f1 macro(on train): 0.9012221299961748
(kernel: linear; degree: 2; 特征数量(TopNbyFreq): 10000)
SVC分类报告：              precision    recall  f1-score      support
comp           0.848078  0.936573  0.890131  1955.000000
rec            0.951380  0.910692  0.930591  1590.000000
sci            0.839303  0.853313  0.846250  1977.000000
talk           0.877971  0.851653  0.864612  1301.000000
other          0.853195  0.696756  0.767081   709.000000
accuracy       0.872013  0.872013  0.872013     0.872013
macro avg      0.873986  0.849797  0.859733  7532.000000
weighted avg   0.873227  0.872013  0.871164  7532.000000


In [15]:
evaluate_LogisticRegression(train_docs,train_tags_coarse,test_docs,test_tags_coarse,tagnames_coarse)

LogisticRegression最佳模型f1 macro(on train): 0.9031495758358445
(tol: 0.0001; C: 2.5; 特征数量(TopNbyFreq): 10000)
LogisticRegression分类报告：              precision    recall  f1-score      support
comp           0.880173  0.935550  0.907017  1955.000000
rec            0.946168  0.939623  0.942884  1590.000000
sci            0.846723  0.868993  0.857713  1977.000000
talk           0.871461  0.875480  0.873466  1301.000000
other          0.868275  0.660085  0.750000   709.000000
accuracy       0.882634  0.882634  0.882634     0.882634
macro avg      0.882560  0.855946  0.866216  7532.000000
weighted avg   0.882700  0.882634  0.881072  7532.000000


In [None]:
create_clf_macro_report() #生成宏平均得分报告

                        MacroAvg-Precision  MacroAvg-Recall  MacroAvg-F1
Classifiers                                                             
MultinomialNB                     0.883525         0.835091     0.848409
GaussianNB                        0.775326         0.764068     0.767068
DecisionTreeClassifier            0.641137         0.636748     0.638646
KNN                               0.727791         0.726944     0.724104
SVC                               0.873986         0.849797     0.859733
LogisticRegression                0.882560         0.855946     0.866216
