# 库


In [1]:
import csv
import os
import time

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

plt.style.use('ggplot')
%matplotlib inline

#数据预处理相关
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder,OrdinalEncoder,MinMaxScaler

#模型训练相关
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix, classification_report
from sklearn.metrics import cohen_kappa_score,zero_one_loss,hamming_loss

#分类器相关
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,IsolationForest,ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC,OneClassSVM
#警告
from sklearn.exceptions import UndefinedMetricWarning
import warnings

#多标签
from skmultilearn.problem_transform import ClassifierChain,LabelPowerset,BinaryRelevance
from mllearn.problem_transform import CalibratedLabelRanking
import scipy.sparse as sp
from mllearn.metrics import accuracy

# 函数定义

## hamming_score

In [2]:
def hamming_score(y_true, y_pred):
    """Return the mean per-sample Jaccard similarity of two label matrices.

    For every row the score is ``|true AND pred| / |true OR pred|``; the
    function returns the average of those row scores.

    Args:
        y_true: 2-D array-like of label indicators, one row per sample.
            Any non-zero entry is treated as a positive label.
        y_pred: 2-D array-like with the same shape as ``y_true``.

    Returns:
        The average row score, a float in ``[0, 1]``. A row in which neither
        ``y_true`` nor ``y_pred`` contains a positive label counts as a
        perfect match (1.0) instead of producing ``0 / 0 = NaN``. Returns
        0.0 when there are no rows at all.
    """
    true_mask = np.asarray(y_true).astype(bool)
    pred_mask = np.asarray(y_pred).astype(bool)

    n_samples = true_mask.shape[0]
    if n_samples == 0:
        # Nothing to average; avoid dividing by zero.
        return 0.0

    intersection = np.logical_and(true_mask, pred_mask).sum(axis=1)
    union = np.logical_or(true_mask, pred_mask).sum(axis=1)

    # Start from 1.0 so rows with an empty union (no labels on either side)
    # are scored as exact matches; only rows with labels are divided.
    scores = np.ones(n_samples, dtype=float)
    has_labels = union > 0
    scores[has_labels] = intersection[has_labels] / union[has_labels]
    return scores.mean()

## binary_classifier

In [3]:
def binary_classifier(model,method,other,x_train,y_train,x_test,y_test,
                      output_path='./output/allclass_binary_model_result.csv'):
    """Fit a binary classifier, evaluate it on the test set and log the metrics.

    The model is fitted on the training data, used to predict the test data,
    and scored with accuracy, precision, recall, F1 and ROC AUC. The confusion
    matrix, the classification report and every metric are printed, and one
    result row is appended to the CSV file at ``output_path`` (a header row is
    written first when the file is empty).

    Args:
        model: Estimator exposing ``fit`` and ``predict`` plus either
            ``predict_proba`` or ``decision_function``. It is fitted in place.
        method: Free-form value stored in the ``Method`` column.
        other: Free-form value stored in the ``Other`` column.
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: True test labels.
        output_path: CSV file the result row is appended to. Its parent
            directory is created when missing.

    Returns:
        Tuple ``(clf, y_pred_class, columns, values)``: the fitted model, the
        predicted test labels, the CSV column names and the matching values.
    """
    clf = model

    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    fit_start = time.perf_counter()
    clf.fit(x_train, y_train)
    train_time = time.perf_counter() - fit_start

    predict_start = time.perf_counter()
    y_pred_class = clf.predict(x_test)
    predict_time = time.perf_counter() - predict_start

    # Continuous score for ROC AUC (not included in the prediction timing).
    # Estimators without predict_proba (e.g. SVC without probability=True)
    # fall back to decision_function, which roc_auc_score also accepts.
    if hasattr(clf, 'predict_proba'):
        # Second column of predict_proba: probability of the positive class (1).
        y_pred_score = clf.predict_proba(x_test)[:, 1]
    else:
        y_pred_score = clf.decision_function(x_test)

    # Evaluation on the test set.
    cm_ontest = confusion_matrix(y_true=y_test, y_pred=y_pred_class)
    precision_ontest = precision_score(y_true=y_test, y_pred=y_pred_class)
    recall_ontest = recall_score(y_true=y_test, y_pred=y_pred_class)
    accuracy_ontest = accuracy_score(y_true=y_test, y_pred=y_pred_class)
    f1_ontest = f1_score(y_true=y_test, y_pred=y_pred_class)
    roc_auc_ontest = roc_auc_score(y_true=y_test, y_score=y_pred_score)
    cls_report_ontest = classification_report(y_true=y_test, y_pred=y_pred_class)

    print('Confusion Matrix:\n', cm_ontest)
    print('Classification Report:\n', cls_report_ontest)

    # (column name, value, console label); a label of None means "record only".
    recorded = [
        ('accuracy', accuracy_ontest, 'The accuracy score on the test set: '),
        ('precision', precision_ontest, 'The precision score on the test set: '),
        ('recall', recall_ontest, 'The recall score on the test set: '),
        ('f1', f1_ontest, 'The f1 score on the test set: '),
        ('auc', roc_auc_ontest, 'The auc score on the test set: '),
        # The confusion matrix array is stored as-is in the row.
        ('Matrix', cm_ontest, None),
        ('train time', train_time, "训练时间（秒）："),
        ('predict time', predict_time, "预测时间（秒）："),
    ]

    columns = ['Model', 'Method', 'Other']
    values = [str(model), method, other]
    for column, value, label in recorded:
        if label is not None:
            print(label, value)
        columns.append(column)
        values.append(value)
    print()

    # Make sure the target directory exists before appending to the file.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # Append mode positions the stream at the end of the file, so a
        # position of 0 means the file is empty and still needs its header.
        if csvfile.tell() == 0:
            writer.writerow(columns)
        writer.writerow(values)
    return clf,y_pred_class,columns,values

## multi_label_classifier

In [4]:
def multi_label_classifier(model,method,other,x_train,y_train,x_test,y_test,
                           target_names=None,
                           output_path='./output/allclass_real_binary+multilabel_model_result.csv'):
    """Fit a multi-label classifier, evaluate it and log the metrics.

    The model is fitted on the training data and used to predict the test
    data. Sample-averaged precision/recall/F1, exact-match accuracy, Hamming
    loss, Hamming score and zero-one loss are computed and printed. The
    per-label classification report is printed and saved to its own CSV, and
    one summary row is appended to the CSV at ``output_path`` (a header row is
    written first when the file is empty).

    Args:
        model: Estimator exposing ``fit`` and ``predict``. It is fitted in place.
        method: Free-form value stored in the ``Method`` column.
        other: Free-form value stored in the ``Other`` column; its string form
            is also used as the suffix of the per-label report file name.
        x_train: Training features.
        y_train: Training label matrix.
        x_test: Test features.
        y_test: True test label matrix (ndarray, DataFrame, sparse matrix or
            other array-like).
        target_names: Label names for the classification report. When omitted,
            the notebook-level ``y_col`` variable is used, as before.
            NOTE(review): ``y_col`` is defined outside this function; confirm
            it is assigned before the first call.
        output_path: CSV file the summary row is appended to. Its parent
            directory is created when missing. The per-label report is written
            next to it as ``<output_path stem><other><extension>``.

    Returns:
        Tuple ``(clf, y_pred_class, columns, values)``: the fitted model, the
        dense predicted label matrix, the CSV column names and matching values.
    """
    def _as_dense(labels):
        # Sparse matrices expose toarray(); everything else goes through numpy.
        return labels.toarray() if hasattr(labels, 'toarray') else np.asarray(labels)

    clf = model

    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    fit_start = time.perf_counter()
    clf.fit(x_train, y_train)
    train_time = time.perf_counter() - fit_start

    predict_start = time.perf_counter()
    y_pred_class = clf.predict(x_test)
    predict_time = time.perf_counter() - predict_start

    if not isinstance(y_pred_class, np.ndarray):
        print('y_pred_class to array')
        y_pred_class = _as_dense(y_pred_class)
    # Densify the ground truth once so every metric receives the same input.
    y_true = _as_dense(y_test)

    if target_names is None:
        # Backward compatibility: fall back to the notebook-level label list.
        target_names = y_col

    # Silence UndefinedMetricWarning only while the metrics are computed,
    # instead of changing the process-wide warning filters permanently.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
        precision_ontest = precision_score(y_true=y_true, y_pred=y_pred_class, average='samples')
        recall_ontest = recall_score(y_true=y_true, y_pred=y_pred_class, average='samples')
        # On a label matrix accuracy_score is the exact-match ratio.
        accuracy_ontest = accuracy_score(y_true=y_true, y_pred=y_pred_class)
        f1_ontest = f1_score(y_true=y_true, y_pred=y_pred_class, average='samples')
        hloss_ontest = hamming_loss(y_true, y_pred_class)
        hscore_ontest = hamming_score(y_true, y_pred_class)
        zloss_ontest = zero_one_loss(y_true, y_pred_class)
        cls_report_ontest = classification_report(
            y_true=y_true, y_pred=y_pred_class, digits=6, target_names=target_names)
        report_dict = classification_report(
            y_true=y_true, y_pred=y_pred_class, digits=6, target_names=target_names,
            output_dict=True)

    print('Classification Report:\n', cls_report_ontest)

    # Make sure the target directory exists before any file is written.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Per-label report file: "<stem><other><ext>". An empty suffix would make
    # this path identical to the summary CSV and overwrite the accumulated
    # results, so a distinct "_report" suffix is used in that case.
    report_root, report_ext = os.path.splitext(output_path)
    report_suffix = '' if other is None else str(other)
    if not report_suffix:
        report_suffix = '_report'
    report_path = report_root + report_suffix + (report_ext or '.csv')
    pd.DataFrame(report_dict).T.to_csv(report_path, index=True)

    # (column name, value, console label)
    recorded = [
        ('accuracy', accuracy_ontest, '绝对匹配率（Exact Match Ratio）:'),
        ('precision', precision_ontest, '精确率（Precision）:'),
        ('recall', recall_ontest, '召回率（Recall）：'),
        ('f1', f1_ontest, 'F1值（F1Measure）：'),
        ('Hamming Loss', hloss_ontest, 'Hamming Loss：'),
        ('Hamming Score', hscore_ontest, 'Hamming Score：'),
        ('Zero-One Loss', zloss_ontest, '0-1损失（Zero-One Loss）：'),
        ('train time', train_time, "训练时间（秒）："),
        ('predict time', predict_time, "预测时间（秒）："),
    ]

    columns = ['Model', 'Method', 'Other']
    values = [str(model), method, other]
    for column, value, label in recorded:
        print(label, value)
        columns.append(column)
        values.append(value)
    print()

    with open(output_path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # Append mode positions the stream at the end of the file, so a
        # position of 0 means the file is empty and still needs its header.
        if csvfile.tell() == 0:
            writer.writerow(columns)
        writer.writerow(values)
    return clf,y_pred_class,columns,values