# Import Package

In [5]:
import os
import shap
import time
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier, plot_importance
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import roc_curve, auc, mean_squared_error
from sklearn import metrics

from loguru import logger

os.chdir(r'D:\Projects\MachineLearning\textrnn')
plt.switch_backend('agg')

# Some Functions

In [6]:
def plot_roc(labels, predict_prob, save_prefix=''):
    false_positive_rate, true_positive_rate, thresholds = roc_curve(labels, predict_prob)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.clf()
    plt.title('ROC')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.ylabel('TPR')
    plt.xlabel('FPR')
    if save_prefix:
        plt.savefig(f'{save_prefix}_roc.jpg')
    # plt.show()


def plot_learning_curve(algo, X_train, X_test, y_train, y_test, save_prefix=''):
    """绘制学习曲线：只需要传入算法(或实例对象)、X_train、X_test、y_train、y_test"""
    """当使用该函数时传入算法，该算法的变量要进行实例化，如：PolynomialRegression(degree=2)，变量 degree 要进行实例化"""
    train_score = []
    test_score = []
    for i in range(1, len(X_train) + 1):
        algo.fit(X_train[:i], y_train[:i])

        y_train_predict = algo.predict(X_train[:i])
        train_score.append(mean_squared_error(y_train[:i], y_train_predict))

        y_test_predict = algo.predict(X_test)
        test_score.append(mean_squared_error(y_test, y_test_predict))

    plt.clf()
    plt.plot([i for i in range(1, len(X_train) + 1)],
             np.sqrt(train_score), label="train")
    plt.plot([i for i in range(1, len(X_train) + 1)],
             np.sqrt(test_score), label="test")

    plt.legend()
    plt.axis([0, len(X_train) + 1, 0, 4])
    if save_prefix:
        plt.savefig(f'{save_prefix}_learning_curve.jpg')


def train_model(classifier, feature_vector_train, label, feature_vector_valid, valid_label, save_prefix=''):
    plot_learning_curve(classifier, feature_vector_train, feature_vector_valid, label, valid_label, save_prefix)
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    # ROC curve
    plot_roc(valid_label, predictions, save_prefix)
    #
    # # Feature importance
    plt.clf()
    fig, ax = plt.subplots(figsize=(15, 15))
    plot_importance(classifier,
                    height=0.5,
                    ax=ax,
                    max_num_features=64)
    # plt.show()
    if save_prefix:
        plt.savefig(f'{save_prefix}_feature_importance.jpg')

    return classifier, metrics.accuracy_score(predictions, valid_label), metrics.classification_report(predictions,
                                                                                                       valid_label)


def clock(time_start):
    time_end = time.time()
    return time_end - time_start

# Some Params

In [7]:
data_load_time = []
ngram_time = []
smote_time = []
train_with_all_feature_time = []
shap_pre_train_time = []
shap_time = []
shap_train_time = []

acc_all = []
acc_part = []
acc_shap = []

file = pd.read_csv('data.csv')
df = pd.DataFrame(file)
_header = ['underflow', 'overflow', 'callstack', 'tod', 'timestamp', 'reentrancy']

# Training

In [8]:
for i in _header:
    logger.info(f'start: {i}')
    X = []
    Y = []

    # 数据加载
    time_start = time.time()
    for x, y in zip(df.iloc[:, df.columns == 'opcode']['opcode'], df.iloc[:, df.columns == i][i]):
        X.append(x)
        Y.append(y)
    Y = np.array(Y)
    time_used = clock(time_start)
    logger.info(f'load data: {time_used}s')
    data_load_time.append(time_used)

    # Ngram
    time_start = time.time()
    ngram_vectorizer = CountVectorizer(ngram_range=(2, 2), decode_error="ignore", min_df=1)
    X = ngram_vectorizer.fit_transform(X)
    time_used = clock(time_start)
    logger.info(f'ngram: {time_used}s')
    ngram_time.append(time_used)

    # Smote
    time_start = time.time()
    sm = SMOTE(random_state=2)
    X_train_res, y_train_res = sm.fit_resample(X, Y.ravel())
    time_used = clock(time_start)
    logger.info(f'smote: {time_used}s')
    smote_time.append(time_used)

    all_df = pd.DataFrame(X_train_res.todense(), columns=ngram_vectorizer.get_feature_names_out())
    logger.info("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
    logger.info("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0)))

    # 使用源数据集训练
    logger.info("{:*^50}".format("Train with all data"))
    X_train, X_test, y_train, y_test = train_test_split(all_df, y_train_res, test_size=0.3, random_state=0)
    model = XGBClassifier(learning_rate=0.2, max_depth=12, verbosity=0)
    time_start = time.time()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = metrics.accuracy_score(predictions, y_test)
    confusion_matrix = metrics.classification_report(predictions, y_test)
    logger.info(f"Xgb, Accuracy: {accuracy}")
    acc_all.append(accuracy)
    print(confusion_matrix)
    time_used = clock(time_start)
    logger.info(f'train with all data: {time_used}s')
    train_with_all_feature_time.append(time_used)
    logger.info("{:*^50}".format(""))

    # 使用十分之一的数据集先进行训练
    # for j in [10]:
    logger.info("{:*^50}".format("PreTrain with 10% data"))
    _, test_df_x, _, test_df_y = train_test_split(all_df, y_train_res, test_size=0.1, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(test_df_x, test_df_y, test_size=0.3, random_state=0)
    model = XGBClassifier()
    time_start = time.time()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = metrics.accuracy_score(predictions, y_test)
    confusion_matrix = metrics.classification_report(predictions, y_test)
    logger.info(f"Xgb, Accuracy: {accuracy}")
    acc_part.append(accuracy)
    print(confusion_matrix)
    time_used = clock(time_start)
    logger.info(f'train with {10}% of data: {time_used}s')
    shap_pre_train_time.append(time_used)
    logger.info("{:*^50}".format(""))
    # break

    # Shap解释
    logger.info("{:*^50}".format("SHAP Explain"))
    time_start = time.time()
    explainer = shap.TreeExplainer(model)
    shap_values_train = explainer.shap_values(X_train)
    time_used = clock(time_start)
    logger.info(f'ngram: {time_used}s')
    shap_time.append(time_used)
    logger.info("{:*^50}".format(""))

    txt_dense_df = pd.DataFrame(X_train.loc[::], columns=ngram_vectorizer.get_feature_names_out())
    shap_values_train_df = pd.DataFrame(shap_values_train, columns=txt_dense_df.columns)

    # 特征筛选
    logger.info("{:*^50}".format("Feature selection"))
    tmp = shap_values_train_df.apply(np.sum, axis=0) != 0
    new_df = all_df.reindex(columns=list(filter(lambda x: tmp[x], tmp.keys())))
    logger.info(f'useful features: {len(new_df.columns)}')
    logger.info(f'all features: {len(all_df.columns)}')
    logger.info("{:*^50}".format(""))
    # continue
    # break
    # 新数据集训练
    logger.info("{:*^50}".format("Final Train"))
    X_train, X_test, y_train, y_test = train_test_split(new_df, y_train_res, test_size=0.1, random_state=0)
    model = XGBClassifier(learning_rate=0.2, max_depth=12)
    time_start = time.time()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = metrics.accuracy_score(predictions, y_test)
    confusion_matrix = metrics.classification_report(predictions, y_test)
    logger.info(f"Xgb, Accuracy: {accuracy}")
    acc_shap.append(accuracy)
    print(confusion_matrix)
    time_used = clock(time_start)
    logger.info(f'train after shap: {time_used}s')
    shap_train_time.append(time_used)
    logger.info("{:*^50}".format(""))

    # 绘制曲线
    # plot_learning_curve(model, X_train, X_test, y_train, y_test, save_prefix=f'pic/{i}')
    # # ROC curve
    # plot_roc(y_test, predictions,  save_prefix=f'pic/{i}')
    # #
    # # # Feature importance
    # logger.info('--------start plot feature importance')
    # plt.clf()
    # fig, ax = plt.subplots(figsize=(15, 15))
    # plot_importance(model,
    #                 height=0.5,
    #                 ax=ax,
    #                 max_num_features=64)
    # plt.savefig(f'pic/{i}_feature_importance.jpg')

2022-06-02 17:48:23.052 | INFO     | __main__:<module>:2 - start: underflow
2022-06-02 17:48:23.082 | INFO     | __main__:<module>:13 - load data: 0.02263188362121582s
2022-06-02 17:49:04.321 | INFO     | __main__:<module>:21 - ngram: 41.23729968070984s
2022-06-02 17:50:54.149 | INFO     | __main__:<module>:29 - smote: 109.8266761302948s
2022-06-02 17:50:54.623 | INFO     | __main__:<module>:33 - After OverSampling, counts of label '1': 22030
2022-06-02 17:50:54.708 | INFO     | __main__:<module>:34 - After OverSampling, counts of label '0': 22030
2022-06-02 17:50:54.709 | INFO     | __main__:<module>:37 - ***************Train with all data****************
2022-06-02 17:52:15.496 | INFO     | __main__:<module>:45 - Xgb, Accuracy: 0.9268421848993796
2022-06-02 17:52:15.497 | INFO     | __main__:<module>:49 - train with all data: 80.37347888946533s
2022-06-02 17:52:15.498 | INFO     | __main__:<module>:51 - **************************************************
2022-06-02 17:52:15.498 | INFO

              precision    recall  f1-score   support

           0       0.93      0.93      0.93      6665
           1       0.93      0.93      0.93      6553

    accuracy                           0.93     13218
   macro avg       0.93      0.93      0.93     13218
weighted avg       0.93      0.93      0.93     13218



2022-06-02 17:52:20.901 | INFO     | __main__:<module>:64 - Xgb, Accuracy: 0.8963691376701967
2022-06-02 17:52:20.902 | INFO     | __main__:<module>:68 - train with 10% of data: 4.767791032791138s
2022-06-02 17:52:20.902 | INFO     | __main__:<module>:70 - **************************************************
2022-06-02 17:52:20.903 | INFO     | __main__:<module>:74 - *******************SHAP Explain*******************


              precision    recall  f1-score   support

           0       0.91      0.89      0.90       676
           1       0.89      0.91      0.90       646

    accuracy                           0.90      1322
   macro avg       0.90      0.90      0.90      1322
weighted avg       0.90      0.90      0.90      1322



2022-06-02 17:52:21.640 | INFO     | __main__:<module>:79 - ngram: 0.7372868061065674s
2022-06-02 17:52:21.641 | INFO     | __main__:<module>:81 - **************************************************
2022-06-02 17:52:21.654 | INFO     | __main__:<module>:87 - ****************Feature selection*****************
2022-06-02 17:52:22.217 | INFO     | __main__:<module>:90 - useful features: 332
2022-06-02 17:52:22.218 | INFO     | __main__:<module>:91 - all features: 3249
2022-06-02 17:52:22.218 | INFO     | __main__:<module>:92 - **************************************************
2022-06-02 17:52:22.218 | INFO     | __main__:<module>:96 - *******************Final Train********************
2022-06-02 17:52:35.203 | INFO     | __main__:<module>:104 - Xgb, Accuracy: 0.9355424421243759
2022-06-02 17:52:35.204 | INFO     | __main__:<module>:108 - train after shap: 12.945550203323364s
2022-06-02 17:52:35.204 | INFO     | __main__:<module>:110 - **************************************************
202

              precision    recall  f1-score   support

           0       0.94      0.93      0.94      2233
           1       0.93      0.94      0.93      2173

    accuracy                           0.94      4406
   macro avg       0.94      0.94      0.94      4406
weighted avg       0.94      0.94      0.94      4406



2022-06-02 17:53:16.674 | INFO     | __main__:<module>:21 - ngram: 41.42358756065369s
2022-06-02 17:53:54.775 | INFO     | __main__:<module>:29 - smote: 38.09956169128418s
2022-06-02 17:53:55.357 | INFO     | __main__:<module>:33 - After OverSampling, counts of label '1': 28682
2022-06-02 17:53:55.467 | INFO     | __main__:<module>:34 - After OverSampling, counts of label '0': 28682
2022-06-02 17:53:55.467 | INFO     | __main__:<module>:37 - ***************Train with all data****************
2022-06-02 17:55:44.741 | INFO     | __main__:<module>:45 - Xgb, Accuracy: 0.9268448576409064
2022-06-02 17:55:44.742 | INFO     | __main__:<module>:49 - train with all data: 108.78346395492554s
2022-06-02 17:55:44.743 | INFO     | __main__:<module>:51 - **************************************************
2022-06-02 17:55:44.743 | INFO     | __main__:<module>:55 - **************PreTrain with 10% data**************


              precision    recall  f1-score   support

           0       0.91      0.94      0.93      8461
           1       0.94      0.91      0.93      8749

    accuracy                           0.93     17210
   macro avg       0.93      0.93      0.93     17210
weighted avg       0.93      0.93      0.93     17210



2022-06-02 17:55:52.738 | INFO     | __main__:<module>:64 - Xgb, Accuracy: 0.859465737514518
2022-06-02 17:55:52.738 | INFO     | __main__:<module>:68 - train with 10% of data: 7.052079916000366s
2022-06-02 17:55:52.739 | INFO     | __main__:<module>:70 - **************************************************
2022-06-02 17:55:52.739 | INFO     | __main__:<module>:74 - *******************SHAP Explain*******************


              precision    recall  f1-score   support

           0       0.84      0.88      0.86       848
           1       0.88      0.84      0.86       874

    accuracy                           0.86      1722
   macro avg       0.86      0.86      0.86      1722
weighted avg       0.86      0.86      0.86      1722



2022-06-02 17:55:53.784 | INFO     | __main__:<module>:79 - ngram: 1.0432264804840088s
2022-06-02 17:55:53.784 | INFO     | __main__:<module>:81 - **************************************************
2022-06-02 17:55:53.798 | INFO     | __main__:<module>:87 - ****************Feature selection*****************
2022-06-02 17:55:54.495 | INFO     | __main__:<module>:90 - useful features: 375
2022-06-02 17:55:54.496 | INFO     | __main__:<module>:91 - all features: 3249
2022-06-02 17:55:54.496 | INFO     | __main__:<module>:92 - **************************************************
2022-06-02 17:55:54.497 | INFO     | __main__:<module>:96 - *******************Final Train********************
2022-06-02 17:56:15.319 | INFO     | __main__:<module>:104 - Xgb, Accuracy: 0.9351577479518912
2022-06-02 17:56:15.320 | INFO     | __main__:<module>:108 - train after shap: 20.74999499320984s
2022-06-02 17:56:15.321 | INFO     | __main__:<module>:110 - **************************************************
2022

              precision    recall  f1-score   support

           0       0.92      0.95      0.93      2814
           1       0.95      0.92      0.94      2923

    accuracy                           0.94      5737
   macro avg       0.94      0.94      0.94      5737
weighted avg       0.94      0.94      0.94      5737



2022-06-02 17:56:57.386 | INFO     | __main__:<module>:21 - ngram: 42.018487215042114s
2022-06-02 17:56:58.271 | INFO     | __main__:<module>:29 - smote: 0.8839643001556396s
2022-06-02 17:56:59.066 | INFO     | __main__:<module>:33 - After OverSampling, counts of label '1': 43123
2022-06-02 17:56:59.232 | INFO     | __main__:<module>:34 - After OverSampling, counts of label '0': 43123
2022-06-02 17:56:59.232 | INFO     | __main__:<module>:37 - ***************Train with all data****************
2022-06-02 17:59:47.713 | INFO     | __main__:<module>:45 - Xgb, Accuracy: 0.9971399860864189
2022-06-02 17:59:47.719 | INFO     | __main__:<module>:49 - train with all data: 167.81923651695251s
2022-06-02 17:59:47.720 | INFO     | __main__:<module>:51 - **************************************************
2022-06-02 17:59:47.720 | INFO     | __main__:<module>:55 - **************PreTrain with 10% data**************


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12720
           1       1.00      1.00      1.00     13154

    accuracy                           1.00     25874
   macro avg       1.00      1.00      1.00     25874
weighted avg       1.00      1.00      1.00     25874



2022-06-02 17:59:58.635 | INFO     | __main__:<module>:64 - Xgb, Accuracy: 0.9783616692426584
2022-06-02 17:59:58.637 | INFO     | __main__:<module>:68 - train with 10% of data: 9.147855997085571s
2022-06-02 17:59:58.637 | INFO     | __main__:<module>:70 - **************************************************
2022-06-02 17:59:58.638 | INFO     | __main__:<module>:74 - *******************SHAP Explain*******************


              precision    recall  f1-score   support

           0       0.96      0.99      0.98      1258
           1       0.99      0.96      0.98      1330

    accuracy                           0.98      2588
   macro avg       0.98      0.98      0.98      2588
weighted avg       0.98      0.98      0.98      2588



2022-06-02 17:59:59.946 | INFO     | __main__:<module>:79 - ngram: 1.3080761432647705s
2022-06-02 17:59:59.947 | INFO     | __main__:<module>:81 - **************************************************
2022-06-02 17:59:59.955 | INFO     | __main__:<module>:87 - ****************Feature selection*****************
2022-06-02 18:00:00.831 | INFO     | __main__:<module>:90 - useful features: 370
2022-06-02 18:00:00.832 | INFO     | __main__:<module>:91 - all features: 3249
2022-06-02 18:00:00.833 | INFO     | __main__:<module>:92 - **************************************************
2022-06-02 18:00:00.833 | INFO     | __main__:<module>:96 - *******************Final Train********************
2022-06-02 18:00:29.863 | INFO     | __main__:<module>:104 - Xgb, Accuracy: 0.9977971014492754
2022-06-02 18:00:29.864 | INFO     | __main__:<module>:108 - train after shap: 28.918010473251343s
2022-06-02 18:00:29.865 | INFO     | __main__:<module>:110 - **************************************************
202

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4277
           1       1.00      1.00      1.00      4348

    accuracy                           1.00      8625
   macro avg       1.00      1.00      1.00      8625
weighted avg       1.00      1.00      1.00      8625



2022-06-02 18:01:11.690 | INFO     | __main__:<module>:21 - ngram: 41.75332427024841s
2022-06-02 18:01:15.143 | INFO     | __main__:<module>:29 - smote: 3.4534034729003906s
2022-06-02 18:01:15.921 | INFO     | __main__:<module>:33 - After OverSampling, counts of label '1': 39958
2022-06-02 18:01:16.091 | INFO     | __main__:<module>:34 - After OverSampling, counts of label '0': 39958
2022-06-02 18:01:16.092 | INFO     | __main__:<module>:37 - ***************Train with all data****************
2022-06-02 18:03:51.999 | INFO     | __main__:<module>:45 - Xgb, Accuracy: 0.9790615224191866
2022-06-02 18:03:52.000 | INFO     | __main__:<module>:49 - train with all data: 155.1601264476776s
2022-06-02 18:03:52.001 | INFO     | __main__:<module>:51 - **************************************************
2022-06-02 18:03:52.001 | INFO     | __main__:<module>:55 - **************PreTrain with 10% data**************


              precision    recall  f1-score   support

           0       0.97      0.99      0.98     11610
           1       0.99      0.97      0.98     12365

    accuracy                           0.98     23975
   macro avg       0.98      0.98      0.98     23975
weighted avg       0.98      0.98      0.98     23975



2022-06-02 18:04:01.442 | INFO     | __main__:<module>:64 - Xgb, Accuracy: 0.932443703085905
2022-06-02 18:04:01.443 | INFO     | __main__:<module>:68 - train with 10% of data: 8.227896928787231s
2022-06-02 18:04:01.443 | INFO     | __main__:<module>:70 - **************************************************
2022-06-02 18:04:01.444 | INFO     | __main__:<module>:74 - *******************SHAP Explain*******************


              precision    recall  f1-score   support

           0       0.90      0.95      0.93      1082
           1       0.96      0.91      0.94      1316

    accuracy                           0.93      2398
   macro avg       0.93      0.93      0.93      2398
weighted avg       0.93      0.93      0.93      2398



2022-06-02 18:04:02.810 | INFO     | __main__:<module>:79 - ngram: 1.3667628765106201s
2022-06-02 18:04:02.810 | INFO     | __main__:<module>:81 - **************************************************
2022-06-02 18:04:02.838 | INFO     | __main__:<module>:87 - ****************Feature selection*****************
2022-06-02 18:04:03.606 | INFO     | __main__:<module>:90 - useful features: 384
2022-06-02 18:04:03.606 | INFO     | __main__:<module>:91 - all features: 3249
2022-06-02 18:04:03.607 | INFO     | __main__:<module>:92 - **************************************************
2022-06-02 18:04:03.608 | INFO     | __main__:<module>:96 - *******************Final Train********************
2022-06-02 18:04:33.126 | INFO     | __main__:<module>:104 - Xgb, Accuracy: 0.9834834834834835
2022-06-02 18:04:33.127 | INFO     | __main__:<module>:108 - train after shap: 29.420985460281372s
2022-06-02 18:04:33.127 | INFO     | __main__:<module>:110 - **************************************************
202

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3860
           1       0.99      0.98      0.98      4132

    accuracy                           0.98      7992
   macro avg       0.98      0.98      0.98      7992
weighted avg       0.98      0.98      0.98      7992



2022-06-02 18:05:13.669 | INFO     | __main__:<module>:21 - ngram: 40.477155685424805s
2022-06-02 18:05:14.556 | INFO     | __main__:<module>:29 - smote: 0.886854887008667s
2022-06-02 18:05:15.371 | INFO     | __main__:<module>:33 - After OverSampling, counts of label '1': 42602
2022-06-02 18:05:15.528 | INFO     | __main__:<module>:34 - After OverSampling, counts of label '0': 42602
2022-06-02 18:05:15.528 | INFO     | __main__:<module>:37 - ***************Train with all data****************
2022-06-02 18:07:49.169 | INFO     | __main__:<module>:45 - Xgb, Accuracy: 0.9938189500039121
2022-06-02 18:07:49.170 | INFO     | __main__:<module>:49 - train with all data: 152.99942302703857s
2022-06-02 18:07:49.170 | INFO     | __main__:<module>:51 - **************************************************
2022-06-02 18:07:49.171 | INFO     | __main__:<module>:55 - **************PreTrain with 10% data**************


              precision    recall  f1-score   support

           0       0.99      1.00      0.99     12607
           1       1.00      0.99      0.99     12955

    accuracy                           0.99     25562
   macro avg       0.99      0.99      0.99     25562
weighted avg       0.99      0.99      0.99     25562



2022-06-02 18:07:58.877 | INFO     | __main__:<module>:64 - Xgb, Accuracy: 0.9655846695346109
2022-06-02 18:07:58.878 | INFO     | __main__:<module>:68 - train with 10% of data: 8.535696983337402s
2022-06-02 18:07:58.879 | INFO     | __main__:<module>:70 - **************************************************
2022-06-02 18:07:58.879 | INFO     | __main__:<module>:74 - *******************SHAP Explain*******************


              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1225
           1       0.98      0.96      0.97      1332

    accuracy                           0.97      2557
   macro avg       0.97      0.97      0.97      2557
weighted avg       0.97      0.97      0.97      2557



2022-06-02 18:08:00.185 | INFO     | __main__:<module>:79 - ngram: 1.3052122592926025s
2022-06-02 18:08:00.186 | INFO     | __main__:<module>:81 - **************************************************
2022-06-02 18:08:00.212 | INFO     | __main__:<module>:87 - ****************Feature selection*****************
2022-06-02 18:08:00.983 | INFO     | __main__:<module>:90 - useful features: 363
2022-06-02 18:08:00.984 | INFO     | __main__:<module>:91 - all features: 3249
2022-06-02 18:08:00.985 | INFO     | __main__:<module>:92 - **************************************************
2022-06-02 18:08:00.985 | INFO     | __main__:<module>:96 - *******************Final Train********************
2022-06-02 18:08:26.817 | INFO     | __main__:<module>:104 - Xgb, Accuracy: 0.9951883581739233
2022-06-02 18:08:26.818 | INFO     | __main__:<module>:108 - train after shap: 25.739964246749878s
2022-06-02 18:08:26.818 | INFO     | __main__:<module>:110 - **************************************************
202

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      4203
           1       1.00      0.99      1.00      4318

    accuracy                           1.00      8521
   macro avg       1.00      1.00      1.00      8521
weighted avg       1.00      1.00      1.00      8521



2022-06-02 18:09:07.040 | INFO     | __main__:<module>:21 - ngram: 40.17670679092407s
2022-06-02 18:09:07.695 | INFO     | __main__:<module>:29 - smote: 0.6544938087463379s
2022-06-02 18:09:08.481 | INFO     | __main__:<module>:33 - After OverSampling, counts of label '1': 43654
2022-06-02 18:09:08.644 | INFO     | __main__:<module>:34 - After OverSampling, counts of label '0': 43654
2022-06-02 18:09:08.644 | INFO     | __main__:<module>:37 - ***************Train with all data****************
2022-06-02 18:11:40.054 | INFO     | __main__:<module>:45 - Xgb, Accuracy: 0.9988164776848777
2022-06-02 18:11:40.062 | INFO     | __main__:<module>:49 - train with all data: 150.6295599937439s
2022-06-02 18:11:40.063 | INFO     | __main__:<module>:51 - **************************************************
2022-06-02 18:11:40.063 | INFO     | __main__:<module>:55 - **************PreTrain with 10% data**************


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12922
           1       1.00      1.00      1.00     13271

    accuracy                           1.00     26193
   macro avg       1.00      1.00      1.00     26193
weighted avg       1.00      1.00      1.00     26193



2022-06-02 18:11:49.939 | INFO     | __main__:<module>:64 - Xgb, Accuracy: 0.9923664122137404
2022-06-02 18:11:49.940 | INFO     | __main__:<module>:68 - train with 10% of data: 8.622464895248413s
2022-06-02 18:11:49.941 | INFO     | __main__:<module>:70 - **************************************************
2022-06-02 18:11:49.941 | INFO     | __main__:<module>:74 - *******************SHAP Explain*******************


              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1284
           1       1.00      0.99      0.99      1336

    accuracy                           0.99      2620
   macro avg       0.99      0.99      0.99      2620
weighted avg       0.99      0.99      0.99      2620



2022-06-02 18:11:51.073 | INFO     | __main__:<module>:79 - ngram: 1.1311311721801758s
2022-06-02 18:11:51.074 | INFO     | __main__:<module>:81 - **************************************************
2022-06-02 18:11:51.105 | INFO     | __main__:<module>:87 - ****************Feature selection*****************
2022-06-02 18:11:51.899 | INFO     | __main__:<module>:90 - useful features: 337
2022-06-02 18:11:51.899 | INFO     | __main__:<module>:91 - all features: 3249
2022-06-02 18:11:51.900 | INFO     | __main__:<module>:92 - **************************************************
2022-06-02 18:11:51.900 | INFO     | __main__:<module>:96 - *******************Final Train********************
2022-06-02 18:12:14.815 | INFO     | __main__:<module>:104 - Xgb, Accuracy: 0.9988546558240752
2022-06-02 18:12:14.816 | INFO     | __main__:<module>:108 - train after shap: 22.81123971939087s
2022-06-02 18:12:14.816 | INFO     | __main__:<module>:110 - **************************************************


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4291
           1       1.00      1.00      1.00      4440

    accuracy                           1.00      8731
   macro avg       1.00      1.00      1.00      8731
weighted avg       1.00      1.00      1.00      8731



In [9]:
logger.info(f'data_load_time = {data_load_time}')
logger.info(f'ngram_time = {ngram_time}')
logger.info(f'smote_time = {smote_time}')
logger.info(f'train_with_all_feature_time = {train_with_all_feature_time}')
logger.info(f'shap_pre_train_time = {shap_pre_train_time}')
logger.info(f'shap_time = {shap_time}')
logger.info(f'shap_train_time = {shap_train_time}')
logger.info(f'acc_all = {acc_all}')
logger.info(f'acc_part = {acc_part}')
logger.info(f'acc_shap = {acc_shap}')

2022-06-02 20:30:40.799 | INFO     | __main__:<module>:1 - data_load_time = [0.02263188362121582, 0.03789877891540527, 0.03757882118225098, 0.06781840324401855, 0.0549163818359375, 0.037899017333984375]
2022-06-02 20:30:40.801 | INFO     | __main__:<module>:2 - ngram_time = [41.23729968070984, 41.42358756065369, 42.018487215042114, 41.75332427024841, 40.477155685424805, 40.17670679092407]
2022-06-02 20:30:40.802 | INFO     | __main__:<module>:3 - smote_time = [109.8266761302948, 38.09956169128418, 0.8839643001556396, 3.4534034729003906, 0.886854887008667, 0.6544938087463379]
2022-06-02 20:30:40.802 | INFO     | __main__:<module>:4 - train_with_all_feature_time = [80.37347888946533, 108.78346395492554, 167.81923651695251, 155.1601264476776, 152.99942302703857, 150.6295599937439]
2022-06-02 20:30:40.803 | INFO     | __main__:<module>:5 - shap_pre_train_time = [4.767791032791138, 7.052079916000366, 9.147855997085571, 8.227896928787231, 8.535696983337402, 8.622464895248413]
2022-06-02 20:3