# 一、指标统计
### 1、分段统计函数
* `segment_statistic(y_test=None, prob_y=None, bins=None)`

### 2、二分类评估指标
* `binary_classifier_metrics(test_labels, predict_labels, predict_prob, show_flag=True)`
* 简单版

### 3、其他
* `show_feature_importance` LightGBM特征重要性
* `plt_feature_importance` LightGBM特征重要性可视化
* `plot_heatmap` 相关系数热力图
* `create_roc` AUC可视化

In [None]:
# import package


In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score, confusion_matrix, average_precision_score
# 二分类评估指标
# roc_auc_score = roc_curve + auc
# test_labels: true labels
# predict_labels: predicted labels
# predict_prob: predicted probabilities
def binary_classifier_metrics(test_labels, predict_labels, predict_prob, show_flag=True):
    """Compute binary-classification metrics and optionally print a report.

    Accuracy, precision, recall, F1 and the confusion matrix are computed
    from the hard labels; ROC AUC (``roc_curve`` + ``auc`` with
    ``pos_label=1``) and average precision are computed from the scores.

    Args:
        test_labels: True labels.
        predict_labels: Predicted hard labels.
        predict_prob: Predicted scores / probabilities.
        show_flag: The report is printed only when this is exactly ``True``.

    Returns:
        Tuple ``(recall, precision, f1_measure)``.

    NOTE(review): a second function with the same name is defined further
    down in this file and replaces this one once both have been executed.
    """
    acc = accuracy_score(test_labels, predict_labels)
    prec = precision_score(test_labels, predict_labels)
    rec = recall_score(test_labels, predict_labels)
    f1 = f1_score(test_labels, predict_labels)
    matrix = confusion_matrix(test_labels, predict_labels)
    false_pos_rate, true_pos_rate, _ = roc_curve(test_labels, predict_prob, pos_label=1)
    roc_area = auc(false_pos_rate, true_pos_rate)
    avg_precision = average_precision_score(test_labels, predict_prob)

    # sklearn layout: rows are true classes, columns are predicted classes.
    tn, fp = matrix[0, 0], matrix[0, 1]
    fn, tp = matrix[1, 0], matrix[1, 1]

    if show_flag is not True:
        return rec, prec, f1

    total = len(test_labels)
    predicted_pos = tp + fp  # samples predicted as positive
    actual_pos = tp + fn     # samples whose true label is positive
    wide_rule = "------------------------- "
    narrow_rule = "----------------------- "
    report = (
        wide_rule,
        "row: precision | col: recall ",
        "confusion matrix:",
        wide_rule,
        "| TP: %5d | FP: %5d | P: %5d |" % (tp, fp, predicted_pos),
        narrow_rule,
        "| FN: %5d | TN: %5d | R: %.3f|" % (fn, tn, predicted_pos / total),
        narrow_rule,
        "| T: %5d  | R: %.3f | N: %5d |" % (actual_pos, actual_pos / total, total),
        " ------------------------- ",
        "Accuracy:       %.2f%%" % (acc * 100),
        "Precision:      %.2f%%" % (prec * 100),
        "Recall:         %.2f%%" % (rec * 100),
        "F1-measure:     %.2f%%" % (f1_measure_pct(f1) if False else f1 * 100),
        "AUC:            %.2f%%" % (roc_area * 100),
        "MAP:            %.2f%%" % (avg_precision * 100),
        wide_rule,
    )
    for line in report:
        print(line)
    return rec, prec, f1

def binary_classifier_metrics(test_labels, predict_labels, predict_prob, show_flag=True):
    """Simple version: compute and optionally print the headline binary metrics.

    This definition replaces the earlier same-named function once both have
    been executed, so it accepts the same ``show_flag`` argument and returns
    the same tuple; callers written against either variant keep working.

    Args:
        test_labels: True labels.
        predict_labels: Predicted hard labels.
        predict_prob: Predicted scores / probabilities (used for ROC AUC).
        show_flag: When truthy (default), print the metrics as percentages.

    Returns:
        Tuple ``(recall, precision, f1_measure)``.
    """
    accuracy = accuracy_score(test_labels, predict_labels)
    precision = precision_score(test_labels, predict_labels)
    recall = recall_score(test_labels, predict_labels)
    f1_measure = f1_score(test_labels, predict_labels)
    # Named roc_auc so the imported sklearn `auc` function is not shadowed here.
    roc_auc = roc_auc_score(test_labels, predict_prob)
    if show_flag:
        print("Accuracy:       %.2f%%" % (accuracy * 100))
        print("Precision:      %.2f%%" % (precision * 100))
        print("Recall:         %.2f%%" % (recall * 100))
        print("F1-measure:     %.2f%%" % (f1_measure * 100))
        print("AUC:            %.2f%%" % (roc_auc * 100))
    return recall, precision, f1_measure

In [1]:
# 分段统计函数
# y_test: true label
# prob_y: predicted probability
def segment_statistic(y_test=None, prob_y=None, bins=None):
    """Bucket predictions by score and report cumulative recall / precision.

    Samples are cut into score bins, the bins are ordered from the highest
    score range to the lowest, and cumulative statistics are accumulated in
    that order.

    Args:
        y_test: True labels. They are summed per bin, so this presumably
            expects 0/1 labels -- confirm with callers.
        prob_y: Predicted probabilities, same length as ``y_test``.
        bins: Bin edges passed to ``pandas.cut``. Defaults to ten 0.1-wide
            bins covering [0, 1].

    Returns:
        DataFrame indexed by bin (highest first) with columns:
        ``pred_cnt`` (samples in the bin), ``real_unsat_cnt`` (sum of
        ``y_test`` in the bin), ``pred_unsat_cnt_p`` (cumulative
        ``pred_cnt``), ``recall_preson`` (cumulative share of all samples),
        ``real_unsat_cnt_tp`` (cumulative ``real_unsat_cnt``), ``recall``
        and ``precision``.

    Raises:
        ValueError: If ``y_test`` or ``prob_y`` is not provided.
    """
    if y_test is None or prob_y is None:
        raise ValueError("segment_statistic requires both y_test and prob_y")
    if bins is None:
        bins = np.arange(0, 1.1, 0.1)

    # Plain arrays pair labels and scores by position; two Series with
    # different indexes would otherwise be aligned by label and yield NaN rows.
    new_df = pd.DataFrame({
        'y_true': np.asarray(y_test).ravel(),
        'prob_y': np.asarray(prob_y).ravel(),
    })
    # include_lowest=True keeps scores equal to the first edge (e.g. 0.0);
    # with the default right-closed bins they would fall outside every bin.
    # The first interval label therefore starts slightly below the first edge.
    new_df['bins'] = pd.cut(new_df['prob_y'], bins, include_lowest=True)

    # observed=False keeps empty bins in the table and avoids depending on the
    # changing pandas default for categorical groupers.
    grouped = new_df.groupby('bins', observed=False)
    stra_df = pd.DataFrame({
        'pred_cnt': grouped.size(),
        'real_unsat_cnt': grouped['y_true'].sum(),
    })

    # Highest score bins first, so the cumulative sums read as
    # "everything scored at or above this bin".
    stra_df = stra_df.sort_index(ascending=False)
    stra_df['pred_unsat_cnt_p'] = stra_df['pred_cnt'].cumsum()
    stra_df['recall_preson'] = stra_df['pred_unsat_cnt_p'] / stra_df['pred_cnt'].sum()
    stra_df['real_unsat_cnt_tp'] = stra_df['real_unsat_cnt'].cumsum()
    stra_df['recall'] = stra_df['real_unsat_cnt_tp'] / stra_df['real_unsat_cnt'].sum()
    stra_df['precision'] = stra_df['real_unsat_cnt_tp'] / stra_df['pred_unsat_cnt_p']
    return stra_df

In [None]:
# 1. the importance of feature
# Averages the per-feature importance rows held in `importances` and lists
# the features whose mean gain exceeds 20, highest gain first.
import pandas as pd
pd.set_option('display.max_rows', None)  # do not truncate displayed frames
feature_imp = list()
# NOTE(review): the divisor 5 is presumably the number of CV folds whose
# importance tables were concatenated into `importances` -- confirm.
for i in range(0, importances.shape[0] // 5):
    m_df = [importances.iloc[i, :].feature]
    # .loc[[i]] selects every row carrying index label i. numeric_only=True is
    # required because those rows also hold the string `feature` column, which
    # DataFrame.mean() refuses to average on pandas >= 2.0.
    m_df += list(importances.loc[[i]].mean(numeric_only=True).values)
    feature_imp.append(m_df)
imp_df = pd.DataFrame(feature_imp, columns=['feature', 'split', 'gain', 'fold'])
sort_imp_df = imp_df.sort_values(by=['gain'], ascending=False)
sort_imp_df[sort_imp_df.gain > 20]

# 2. the importance of feature
def show_feature_importance(model, usage_col):
    """Tabulate a model's split and gain importances, highest gain first.

    Args:
        model: Object exposing ``feature_importance(importance_type=...)``;
            the file header describes this as a LightGBM model.
        usage_col: Feature names, in the same order as the importances.

    Returns:
        DataFrame with columns ``feature``, ``split`` and ``gain``, sorted by
        ``gain`` in descending order.
    """
    split_scores = model.feature_importance()
    gain_scores = model.feature_importance(importance_type='gain')
    table = pd.DataFrame({
        'feature': usage_col,
        'split': split_scores,
        'gain': gain_scores,
    })
    return table.sort_values(by=['gain'], ascending=False)

def plt_feature_importance(model, ax=None):
    """Plot the ten most important features of a LightGBM model by gain.

    The original body passed an ``ax`` name that was never defined inside the
    function, so it only worked when a global ``ax`` happened to exist. The
    axes are now an explicit optional argument.

    Args:
        model: Model accepted by ``lgb.plot_importance``.
        ax: Target matplotlib Axes. When ``None`` (default) the value is
            forwarded as-is and LightGBM chooses the axes itself.

    Returns:
        Whatever ``lgb.plot_importance`` returns (the Axes holding the plot).
    """
    return lgb.plot_importance(model, ax=ax,
                               title='Feature Importance',
                               xlabel='Information Gain',
                               ylabel='Feature Name',
                               importance_type='gain',
                               max_num_features=10,
                               grid=False,
                               precision=3,
                               height=0.6)

In [None]:
# 解决余弦相似度计算值缺失的问题
# 去中心化的余弦相似度计算
# 1. 相关系数取值一般在-1~1之间
# 2. 绝对值越接近1说明变量之间的线性关系越强，绝对值越接近0说明变量间线性关系越弱。
# 3. ≥0.8高度相关，0.5~0.8中度相关，0.3~0.5低度相关，＜0.3相关关系极弱可视为不相关。
def plot_heatmap(dataframe):
    """Draw a lower-triangle heatmap of Spearman correlations between columns.

    Args:
        dataframe: DataFrame whose columns are correlated pairwise via
            ``DataFrame.corr(method="spearman")``.

    Returns:
        The Axes returned by ``sns.heatmap``.
    """
    mcorr = dataframe.corr(method="spearman")

    # plt.subplots returns (figure, axes); draw on these axes explicitly so
    # the large canvas size is the one actually used.
    _, ax = plt.subplots(figsize=(30, 25))
    # Builtin bool instead of np.bool: the alias was removed in NumPy 1.24.
    mask = np.zeros_like(mcorr, dtype=bool)
    # Hide the diagonal and the upper triangle, which mirror the lower one.
    mask[np.triu_indices_from(mask)] = True
    cmap = sns.diverging_palette(220, 10, as_cmap=True)  # diverging colour map
    # annot=True writes each coefficient into its cell, one decimal place.
    ax = sns.heatmap(mcorr, mask=mask, cmap=cmap, annot=True, fmt='.1f', ax=ax)
    return ax

In [None]:
# visualization AUC
# AUC可视化
import matplotlib.pyplot as plt
def create_roc(y_test, y_proba):
    """Plot the ROC curve for the given labels and scores, then show it.

    Args:
        y_test: True labels passed to ``roc_curve``.
        y_proba: Predicted scores / probabilities passed to ``roc_curve``.

    The area under the curve is shown in the legend with two decimals, and a
    dashed red diagonal is drawn between (0, 0) and (1, 1) for reference.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, y_proba)
    area = auc(false_pos_rate, true_pos_rate)

    plt.title('ROC Validation')
    plt.plot(false_pos_rate, true_pos_rate, 'b', label='AUC = %0.2f' % area)
    plt.legend(loc='lower right')
    # Reference diagonal.
    plt.plot([0, 1], [0, 1], 'r--')
    # Both axes are clipped to the unit interval.
    for set_limits in (plt.xlim, plt.ylim):
        set_limits([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()