# 特征筛选，词频特征使用V2重要性前300的特征

### v2p1验证集结果：0.6373
### 使用全部特征V2: 0.6290

In [25]:
# -*- coding:utf-8 -*-

import pickle
import sys
import os
import pandas as pd
import warnings
import logging
from logging import handlers
from datetime import datetime, date
import numpy as np
import xgboost as xgb
import traceback
import re
import random


## 日志格式设置
# 日志级别关系映射
level_relations = {
    'debug': logging.DEBUG,
    'info': logging.INFO,
    'warning': logging.WARNING,
    'error': logging.ERROR,
    'crit': logging.CRITICAL
}
def get_logger(filename, level='info'):
    # 创建日志对象
    log = logging.getLogger(filename)
    # 设置日志级别
    log.setLevel(level_relations.get(level))
    # 日志输出格式
    fmt = logging.Formatter('%(asctime)s %(thread)d %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
    # 输出到控制台
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(fmt)
    # 输出到文件
    # 日志文件按天进行保存，每天一个日志文件
    file_handler = handlers.TimedRotatingFileHandler(filename=filename, when='D', backupCount=1, encoding='utf-8')
    # 按照大小自动分割日志文件，一旦达到指定的大小重新生成文件
    # file_handler = handlers.RotatingFileHandler(filename=filename, maxBytes=1*1024*1024*1024, backupCount=1, encoding='utf-8')
    file_handler.setFormatter(fmt)
    if not log.handlers:
        log.addHandler(console_handler)
        log.addHandler(file_handler)
    return log

# sn分组后，本次报错和上次报错之间的日志匹配到本次报错
def divideLogByFaultTime(log_label_df: pd.DataFrame):
    log_correspond_label_df = pd.DataFrame(columns=['sn', 'fault_time', 'msg', 'time', 'server_model', 'label'])
    no_label_log_list = []
    log_label_df = log_label_df.reset_index(drop=True)

    for sn, log in log_label_df.groupby('sn'):
        if len(log[log['label'] != '']) == 0:
            no_label_log_list.append(log)
        elif len(log[log['label'] != '']) == 1:
            msg_df = log[log['label'] == '']
            msg_df['label'] = log[log['label'] != '']['label'].iloc[0]
            msg_df['fault_time'] = log[log['label'] != '']['time'].iloc[0]
            log_correspond_label_df = pd.concat([log_correspond_label_df, msg_df])
        else:
            # 使用index的顺序取数时，要注意index必须按所需的顺序排列
            cutoff_index = [-1] + log.loc[log['label'] != ''].index.tolist() + [log.index.tolist()[-1] + 1]
            for kth in range(len(cutoff_index) - 1):
                temp_log = log.loc[(log.index <= cutoff_index[kth + 1]) & (log.index > cutoff_index[kth])]
                if len(temp_log) > 0:
                    if len(temp_log[temp_log['label'] != '']) == 0:
                        no_label_log_list.append(temp_log)
                    # 只有标签，没有日志的数据，把标签的部分数据直接作为日志
                    elif len(temp_log) == 1:
                        msg_df = temp_log
                        msg_df['fault_time'] = temp_log[temp_log['label'] != '']['time'].iloc[0]
                        log_correspond_label_df = pd.concat([log_correspond_label_df, msg_df])
                    else:
                        msg_df = temp_log[temp_log['label'] == '']
                        msg_df['label'] = temp_log[temp_log['label'] != '']['label'].iloc[0]
                        msg_df['fault_time'] = temp_log[temp_log['label'] != '']['time'].iloc[0]
                        log_correspond_label_df = pd.concat([log_correspond_label_df, msg_df])
    return log_correspond_label_df, no_label_log_list

# 计算统计特征
def calculateStatisticFeature(log_correspond_label_df: pd.DataFrame) -> pd.DataFrame:
    use_log_label_df = log_correspond_label_df

    use_log_label_df['msg_hour'] = use_log_label_df['time'].apply(
        lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S").hour)
    use_log_label_df['msg_minute'] = use_log_label_df['time'].apply(
        lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S").minute)
    use_log_label_df['fault_hour'] = use_log_label_df['fault_time'].apply(
        lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S").hour)
    use_log_label_df['fault_minute'] = use_log_label_df['fault_time'].apply(
        lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S").minute)

    # 0408新增
    # 最近一次日志时间距报错时间间隔，单位秒
    nearest_msg_fault_time_delta_list = []
    # 日志不去重时长度1,2,3,4日志数量统计
    all_msg_1_cnt_list = []
    all_msg_2_cnt_list = []
    all_msg_3_cnt_list = []
    all_msg_4_cnt_list = []

    fault_minute_list = []
    msg_1_cnt_list = []
    msg_2_cnt_list = []
    msg_3_cnt_list = []
    msg_4_cnt_list = []
    msg_hour_max_list = []
    msg_hour_min_list = []
    msg_hour_avg_list = []
    msg_hour_median_list = []
    msg_hour_mode_list = []
    msg_minute_max_list = []
    msg_minute_min_list = []
    msg_minute_avg_list = []
    msg_minute_median_list = []
    msg_minute_mode_list = []

    sn_list = []
    server_model_list = []
    msg_log_list = []
    msg_cnt_list = []
    fault_hour_list = []
    label_list = []
    fault_time_list = []
    for msg_log_df in use_log_label_df.groupby(['sn', 'fault_time', 'label']):
        msg_log_str = ''
        all_msg_1_cnt = 0
        all_msg_2_cnt = 0
        all_msg_3_cnt = 0
        all_msg_4_cnt = 0
        msg_1_cnt = 0
        msg_2_cnt = 0
        msg_3_cnt = 0
        msg_4_cnt = 0
        for info in msg_log_df[1]['msg']:
            if info == info:
                if len(info.split('|')) == 1:
                    all_msg_1_cnt += 1
                elif len(info.split('|')) == 2:
                    all_msg_2_cnt += 1
                elif len(info.split('|')) == 3:
                    all_msg_3_cnt += 1
                else:
                    all_msg_4_cnt += 1
        for info in msg_log_df[1]['msg'].drop_duplicates():
            if info == info:
                msg_log_str = msg_log_str + info.lower() + '.'
                if len(info.split('|')) == 1:
                    msg_1_cnt += 1
                elif len(info.split('|')) == 2:
                    msg_2_cnt += 1
                elif len(info.split('|')) == 3:
                    msg_3_cnt += 1
                else:
                    msg_4_cnt += 1
        nearest_msg_fault_time_delta = abs(datetime.strptime(msg_log_df[1].iloc[-1]['time'], '%Y-%m-%d %H:%M:%S'
                                                             ) - datetime.strptime(msg_log_df[0][1],
                                                                                   '%Y-%m-%d %H:%M:%S'))
        nearest_msg_fault_time_delta = nearest_msg_fault_time_delta.days * 24 * 3600 + nearest_msg_fault_time_delta.seconds
        sm = int(msg_log_df[1].iloc[0]['server_model'][2:])

        sn_list.append(msg_log_df[0][0])
        fault_time_list.append(msg_log_df[0][1])
        label_list.append(msg_log_df[0][2])

        nearest_msg_fault_time_delta_list.append(nearest_msg_fault_time_delta)
        server_model_list.append(sm)
        msg_log_list.append(msg_log_str)
        msg_cnt_list.append(len(msg_log_df[1]))

        fault_hour_list.append(msg_log_df[1].iloc[0]['fault_hour'])
        fault_minute_list.append(msg_log_df[1].iloc[0]['fault_minute'])

        all_msg_1_cnt_list.append(all_msg_1_cnt)
        all_msg_2_cnt_list.append(all_msg_2_cnt)
        all_msg_3_cnt_list.append(all_msg_3_cnt)
        all_msg_4_cnt_list.append(all_msg_4_cnt)

        msg_1_cnt_list.append(msg_1_cnt)
        msg_2_cnt_list.append(msg_2_cnt)
        msg_3_cnt_list.append(msg_3_cnt)
        msg_4_cnt_list.append(msg_4_cnt)

        msg_hour_max_list.append(msg_log_df[1]['msg_hour'].max())
        msg_hour_min_list.append(msg_log_df[1]['msg_hour'].min())
        msg_hour_avg_list.append(msg_log_df[1]['msg_hour'].mean())
        msg_hour_median_list.append(msg_log_df[1]['msg_hour'].median())
        msg_hour_mode_list.append(msg_log_df[1]['msg_hour'].mode()[0])

        msg_minute_max_list.append(msg_log_df[1]['msg_minute'].max())
        msg_minute_min_list.append(msg_log_df[1]['msg_minute'].min())
        msg_minute_avg_list.append(msg_log_df[1]['msg_minute'].mean())
        msg_minute_median_list.append(msg_log_df[1]['msg_minute'].median())
        msg_minute_mode_list.append(msg_log_df[1]['msg_minute'].mode()[0])

    msg_log_label_df = pd.DataFrame(
        {
            'sn': sn_list,
            'fault_time': fault_time_list,
            'server_model': server_model_list,
            'msg_cnt': msg_cnt_list,
            'fault_hour': fault_hour_list,
            'fault_minute': fault_minute_list,
            'nearest_msg_fault_time_delta': nearest_msg_fault_time_delta_list,
            'all_msg_1_cnt': all_msg_1_cnt_list,
            'all_msg_2_cnt': all_msg_2_cnt_list,
            'all_msg_3_cnt': all_msg_3_cnt_list,
            'all_msg_4_cnt': all_msg_4_cnt_list,
            'msg_1_cnt': msg_1_cnt_list,
            'msg_2_cnt': msg_2_cnt_list,
            'msg_3_cnt': msg_3_cnt_list,
            'msg_4_cnt': msg_4_cnt_list,
            'msg_hour_max': msg_hour_max_list,
            'msg_hour_min': msg_hour_min_list,
            'msg_hour_avg': msg_hour_avg_list,
            'msg_hour_median': msg_hour_median_list,
            'msg_hour_mode': msg_hour_mode_list,
            'msg_minute_max': msg_minute_max_list,
            'msg_minute_min': msg_minute_min_list,
            'msg_minute_avg': msg_minute_avg_list,
            'msg_minute_median': msg_minute_median_list,
            'msg_minute_mode': msg_minute_mode_list,
            'msg_log': msg_log_list,
            'label': label_list
        }
    )
    return msg_log_label_df

# 计算特征函数
def caculateFeature(log_df: pd.DataFrame, label_df: pd.DataFrame, word_list: list) -> pd.DataFrame:
    warnings.filterwarnings("ignore")
    logger = get_logger('./user_data/logs/{}_info.log'.format(date.today()), 'info')
    logger_error = get_logger('./user_data/logs/{}_error.log'.format(date.today()), 'error')

    logger.info('开始拼接日志和标签数据')
    log_df['label'] = ''
    label_df['time'] = label_df['fault_time']
    label_df['msg'] = ''
    label_df['server_model'] = label_df['sn'].map(dict(zip(log_df['sn'], log_df['server_model'])))
    label_df = label_df[['sn', 'time', 'msg', 'server_model', 'label']]
    log_label_df = pd.concat([log_df, label_df], axis=0).sort_values(by='time')
    log_label_df['fault_time'] = ''
    log_label_df = log_label_df[['sn', 'fault_time', 'msg', 'time', 'server_model', 'label']]
    logger.info('拼接日志和标签数据结束')

    logger.info('开始匹配日志和标签')
    logger.info('使用报错时间截断进行划分')
    # 使用报错时间截断进行划分
    # FaultTime_log_correspond_label_df, FaultTime_no_label_log_list = divideLogByFaultTime(log_label_df)
    # FaultTime_log_correspond_label_df.to_csv('./user_data/tmp_data/FaultTime_log_correspond_label_df.csv', index = None)
    FaultTime_log_correspond_label_df = pd.read_csv('./user_data/tmp_data/FaultTime_log_correspond_label_df.csv')
    logger.info('匹配日志和标签结束')

    logger.info('开始计算统计特征')
    # 使用报错时间截断进行划分
    msg_log_label_df = calculateStatisticFeature(FaultTime_log_correspond_label_df)
    logger.info('计算统计特征结束')

    msg_log_list = list(msg_log_label_df['msg_log'])
    label_list = list(msg_log_label_df['label'])

    # 计算词频向量
    logger.info('开始计算词频特征')
    frequency_vector_list = []
    tag = 0
    for word in word_list:
        if tag % 100 == 0:
            print(tag, datetime.now())
        pattern = re.compile(word)
        frequency_vector = [len(re.findall(pattern, log)) for log in msg_log_list]
        frequency_vector_list.append(frequency_vector)
        tag += 1
    logger.info('计算词频特征结束')

    frequency_vector_df = pd.DataFrame(frequency_vector_list)
    frequency_vector_df = frequency_vector_df.T
    frequency_vector_df.columns = word_list
    statistic_feature_list = list(msg_log_label_df.columns)[2:-2]
    feature_df = frequency_vector_df
    feature_df[statistic_feature_list] = msg_log_label_df[statistic_feature_list]

    feature_df['label'] = label_list
    feature_df[['sn', 'fault_time']] = msg_log_label_df[['sn', 'fault_time']]
    logger.info('最后3列为: label, sn, fault_time, 其余列均为特征')
    logger.info('数据条数: {}, 特征个数: {}'.format(feature_df.shape[0], feature_df.shape[1]-3))
    return feature_df


# xgb模型参数
xgb_params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',  # 多分类问题
    'num_class': 4,  # 类别数，与multi softmax并用
    'gamma': 0.1,  # 用于控制是否后剪枝的参数，越大越保守，一般0.1 0.2的样子
    'max_depth': 6,  # 构建树的深度，越大越容易过拟合
    'lambda': 2,  # 控制模型复杂度的权重值的L2 正则化项参数，参数越大，模型越不容易过拟合
    'subsample': 1,  # 随机采样训练样本
    'colsample_bytree': 1,  # 这个参数默认为1，是每个叶子里面h的和至少是多少
    # 对于正负样本不均衡时的0-1分类而言，假设h在0.01附近，min_child_weight为1
    # 意味着叶子节点中最少需要包含100个样本。这个参数非常影响结果，
    # 控制叶子节点中二阶导的和的最小值，该参数值越小，越容易过拟合
    'silent': 0,  # 设置成1 则没有运行信息输入，最好是设置成0
    'eta': 0.3,  # 如同学习率
    'seed': 1000,
    'nthread': 16,  # CPU线程数
    # 'eval_metric':'auc'
}

# 指标评估
def macro_f1(label_df: pd.DataFrame, prediction_df: pd.DataFrame) -> float:
    """
    计算得分
    :param label_df: [sn,fault_time,label]
    :param prediction_df: [sn,fault_time,label]
    :return:
    """
    warnings.filterwarnings("ignore")
    logger = get_logger('./user_data/logs/{}_info.log'.format(date.today()), 'info')
    logger_error = get_logger('./user_data/logs/{}_error.log'.format(date.today()), 'error')

    prediction_df.columns = ['sn', 'fault_time', 'prediction']
    outcome_df = pd.merge(label_df, prediction_df ,how = 'left', on = ['sn', 'fault_time'])
    weights = [5 / 11, 4 / 11, 1 / 11, 1 / 11]
    macro_F1 = 0.
    for i in range(len(weights)):
        TP = len(outcome_df[(outcome_df['label'] == i) & (outcome_df['prediction'] == i)])
        FP = len(outcome_df[(outcome_df['label'] != i) & (outcome_df['prediction'] == i)])
        FN = len(outcome_df[(outcome_df['label'] == i) & (outcome_df['prediction'] != i)])
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        F1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        macro_F1 += weights[i] * F1
        logger.info('Label {}:   Precision {: .2f}, Recall {: .2f}, F1 {: .2f}'.format(i, precision, recall, F1))
    logger.info('macro_f1: {}\n'.format(macro_F1))

    return macro_F1

# 模型训练函数
def xgbTrain(feature_df: pd.DataFrame) -> xgb.XGBModel:
    '''
    feature_df: 要求最后3列为: label, sn, fault_time, 其余列均为特征
    '''
    warnings.filterwarnings("ignore")
    logger = get_logger('./user_data/logs/{}_info.log'.format(date.today()), 'info')
    logger_error = get_logger('./user_data/logs/{}_error.log'.format(date.today()), 'error')

    feature_name_list = list(feature_df.columns)[0:-3]
    feature = np.array(feature_df[feature_name_list])
    label = np.array(feature_df['label'])
    label_df = feature_df[['sn', 'fault_time', 'label']]
    prediction_df = feature_df[['sn', 'fault_time']]

    train_data = xgb.DMatrix(feature, label=label)
    train_feature = xgb.DMatrix(feature)
    logger.info('开始训练xgb模型')
    xgb_model = xgb.train(xgb_params, train_data, num_boost_round=500)
    logger.info('训练xgb模型结束')
    # 训练集指标评估
    prediction = xgb_model.predict(train_feature)
    prediction_df['label'] = prediction
    logger.info('训练集评估效果: ')
    macro_f1(label_df, prediction_df)

    return xgb_model

# xgb模型预测函数
def xgbPredict(model: xgb.XGBModel, feature_df: pd.DataFrame, label_df = None) -> pd.DataFrame:
    '''
        feature_df: 要求最后3列为: label, sn, fault_time, 其余列均为特征
    '''
    warnings.filterwarnings("ignore")
    logger = get_logger('./user_data/logs/{}_info.log'.format(date.today()), 'info')
    logger_error = get_logger('./user_data/logs/{}_error.log'.format(date.today()), 'error')

    if label_df is None:
        feature_name_list = list(feature_df.columns)[0:-3]
        feature = np.array(feature_df[feature_name_list])
        prediction_df = feature_df[['sn', 'fault_time']]

        test_feature = xgb.DMatrix(feature)
        logger.info('开始xgb模型预测')
        prediction = model.predict(test_feature)
        logger.info('xgb模型预测结束')
        prediction_df['label'] = prediction
        prediction_df['label'] = prediction_df['label'].apply(lambda x: int(x))

    else:
        feature_name_list = list(feature_df.columns)[0:-3]
        feature = np.array(feature_df[feature_name_list])
        prediction_df = feature_df[['sn', 'fault_time']]

        test_feature = xgb.DMatrix(feature)
        logger.info('开始xgb模型预测')
        prediction = model.predict(test_feature)
        logger.info('xgb模型预测结束')
        # 测试集指标评估
        prediction_df['label'] = prediction
        prediction_df['label'] = prediction_df['label'].apply(lambda x: int(x))
        logger.info('测试集评估效果: ')
        macro_f1(label_df, prediction_df)

    return prediction_df


# xgb模型随机训练并投票预测
def xgbRandomTrainPredict(train_feature_df: pd.DataFrame, test_feature_df: pd.DataFrame, label_df = None) -> pd.DataFrame:
    ## 每个子模型样本均衡，利用投票规则生成最终预测
    random.seed(0)
    N = 100 # number of the models
    num_sample = 500 # number of samples for each label

    _label0_index_list = list(train_feature_df[train_feature_df['label'] == 0].index)
    _label1_index_list = list(train_feature_df[train_feature_df['label'] == 1].index)
    _label2_index_list = list(train_feature_df[train_feature_df['label'] == 2].index)
    _label3_index_list = list(train_feature_df[train_feature_df['label'] == 3].index)
    feature_name_list = list(train_feature_df.columns)[0:-3]
    test_feature = np.array(test_feature_df[feature_name_list])
    test_feature = xgb.DMatrix(test_feature)
    prediction_df = test_feature_df[['sn', 'fault_time']]

    for iter in np.arange(N):
        idx_0 = random.sample(_label0_index_list, num_sample)
        idx_1 = random.sample(_label1_index_list, num_sample)
        idx_2 = random.sample(_label2_index_list, num_sample)
        idx_3 = random.sample(_label3_index_list, num_sample)
        idx = np.hstack((idx_0, idx_1, idx_2, idx_3))
        random.shuffle(idx)
        sub_train_feature_df = train_feature_df[idx]
        sub_train_feature = np.array(sub_train_feature_df[feature_name_list])
        sub_train_label = np.array(sub_train_feature_df['label'])
        sub_train_data = xgb.DMatrix(sub_train_feature,label = sub_train_label)

        logger.info('开始第{}轮训练和预测'.format(iter))
        sub_xgb_model = xgb.train(xgb_params, sub_train_data, num_boost_round=500)
        sub_test_pred = sub_xgb_model.predict(test_feature)
        if iter == 0:
            val_pred = sub_test_pred
        else:
            val_pred = np.vstack((val_pred, sub_test_pred))
        logger.info('第{}轮训练和预测结束'.format(iter))

    # 训练集指标评估
    final_pred = [np.argmax(np.bincount(val_pred[:, i].astype(int))) for i in np.arange(val_pred.shape[1])]
    final_pred = np.array(final_pred).astype(int)
    prediction_df['label'] = final_pred
    logger.info('训练集评估效果: ')
    macro_f1(label_df, prediction_df)


In [26]:
os.chdir(os.path.dirname(sys.path[0]))
print(os.getcwd())
# 忽略warning
warnings.filterwarnings("ignore")
logger = get_logger('./user_data/logs/{}_info.log'.format(date.today()), 'info')
logger_error = get_logger('./user_data/logs/{}_error.log'.format(date.today()), 'error')


C:\workfile\python\Log-diagnosis\semi_final\v2_docker


In [28]:
# 读取数据
# 读取sel日志数据
sel_log_df = pd.read_csv('./data/preliminary_sel_log_dataset.csv').drop_duplicates()
# 读取训练标签数据：有重复数据！
train_label1 = pd.read_csv('./data/preliminary_train_label_dataset.csv')
train_label2 = pd.read_csv('./data/preliminary_train_label_dataset_s.csv')
train_label_df = pd.concat([train_label1,train_label2],axis=0).drop_duplicates()
# 读取初赛A榜测试集
preliminary_sel_log_dataset_a = pd.read_csv('./data/preliminary_sel_log_dataset_a.csv')
preliminary_submit_dataset_a = pd.read_csv('./data/preliminary_submit_dataset_a.csv')
preliminary_submit_dataset_a['label'] = -1
# 读取初赛B榜测试集
preliminary_sel_log_dataset_b = pd.read_csv('./data/preliminary_sel_log_dataset_b.csv')
preliminary_submit_dataset_b = pd.read_csv('./data/preliminary_submit_dataset_b.csv')
preliminary_submit_dataset_b['label'] = -1
# preliminary_submit_dataset_b_v1p8 = pd.read_csv('./user_data/tmp_data/preliminary_submit_dataset_b_v1p8.csv')
# 读取词列表
v1_word_list = list(pd.read_csv('./user_data/words/word_frequency_df.txt',sep='\t')['word'])
v1p1_word_list = list(pd.read_csv('./user_data/words/tags_incomplete.txt',sep='\t',names=['word'])['word'])
word_list = list(set(v1_word_list+v1p1_word_list))
important_word_list = list(pd.read_csv('./user_data/words/important_word_df.csv')['word'])

# 获取特征
# 计算特征
train_feature_df = caculateFeature(sel_log_df, train_label_df, important_word_list)
random.seed(0)
val_mask = [random.random() < 0.3 for _ in range(len(train_feature_df))]
train_mask = [not xx for xx in val_mask]
temp_feature_df = train_feature_df[train_mask]
val_feature_df = train_feature_df[val_mask]

# xgb训练
model = xgbTrain(temp_feature_df)
temp_prediction_df = xgbPredict(model, temp_feature_df, temp_feature_df[['sn', 'fault_time', 'label']])
temp_prediction_df.to_csv('./prediction_result/temp_prediction_df.csv',index=None)
val_prediction_df = xgbPredict(model, val_feature_df, val_feature_df[['sn', 'fault_time', 'label']])
val_prediction_df.to_csv('./prediction_result/val_prediction_df.csv',index=None)

2022-05-13 01:19:18,116 16580 2029871450.py[line:237] - INFO: 开始拼接日志和标签数据
2022-05-13 01:19:18,626 16580 2029871450.py[line:246] - INFO: 拼接日志和标签数据结束
2022-05-13 01:19:18,627 16580 2029871450.py[line:248] - INFO: 开始匹配日志和标签
2022-05-13 01:19:18,627 16580 2029871450.py[line:249] - INFO: 使用报错时间截断进行划分
2022-05-13 01:19:19,088 16580 2029871450.py[line:254] - INFO: 匹配日志和标签结束
2022-05-13 01:19:19,088 16580 2029871450.py[line:256] - INFO: 开始计算统计特征
2022-05-13 01:19:46,440 16580 2029871450.py[line:259] - INFO: 计算统计特征结束
2022-05-13 01:19:46,443 16580 2029871450.py[line:265] - INFO: 开始计算词频特征
0 2022-05-13 01:19:46.443824
100 2022-05-13 01:19:48.193824
200 2022-05-13 01:19:49.917413
2022-05-13 01:19:51,279 16580 2029871450.py[line:275] - INFO: 计算词频特征结束
2022-05-13 01:19:52,125 16580 2029871450.py[line:286] - INFO: 最后3列为: label, sn, fault_time, 其余列均为特征
2022-05-13 01:19:52,125 16580 2029871450.py[line:287] - INFO: 数据条数: 16604, 特征个数: 303
2022-05-13 01:19:52,219 16580 2029871450.py[line:357] - INFO: 开始训练xgb模型
P

In [29]:
# 获取特征
# 计算特征
train_feature_df = pd.read_csv('./user_data/feature_data/train_feature_df.csv')
random.seed(0)
val_mask = [random.random() < 0.3 for _ in range(len(train_feature_df))]
train_mask = [not xx for xx in val_mask]
temp_feature_df = train_feature_df[train_mask]
val_feature_df = train_feature_df[val_mask]

# xgb训练
model = xgbTrain(temp_feature_df)
temp_prediction_df = xgbPredict(model, temp_feature_df, temp_feature_df[['sn', 'fault_time', 'label']])
val_prediction_df = xgbPredict(model, val_feature_df, val_feature_df[['sn', 'fault_time', 'label']])

2022-05-13 01:24:15,433 16580 2029871450.py[line:357] - INFO: 开始训练xgb模型
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


2022-05-13 01:25:45,387 16580 2029871450.py[line:359] - INFO: 训练xgb模型结束
2022-05-13 01:25:45,439 16580 2029871450.py[line:363] - INFO: 训练集评估效果: 
2022-05-13 01:25:45,451 16580 2029871450.py[line:335] - INFO: Label 0:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-13 01:25:45,453 16580 2029871450.py[line:335] - INFO: Label 1:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-13 01:25:45,456 16580 2029871450.py[line:335] - INFO: Label 2:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-13 01:25:45,458 16580 2029871450.py[line:335] - INFO: Label 3:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-13 01:25:45

# 查看特征重要性，并选取重要性前300的特征

In [6]:
# 读取模型
xgb_model_v2 = xgb.Booster()
xgb_model_v2.load_model('./user_data/model_data/xgb_model_v2.json')
# 读取特征
train_feature_df = pd.read_csv('./user_data/feature_data/train_feature_df.csv')
# 读取词
v1_word_list = list(pd.read_csv('./user_data/words/word_frequency_df.txt',sep='\t')['word'])
v1p1_word_list = list(pd.read_csv('./user_data/words/tags_incomplete.txt',sep='\t',names=['word'])['word'])
word_list = list(set(v1_word_list+v1p1_word_list))
# 获取重要性前300的特征
total_gain_dict=xgb_model_v2.get_score(importance_type='total_gain')
total_gain_dict=dict(sorted(total_gain_dict.items(),key=lambda item:item[1],reverse=True))
feature_names=list(train_feature_df.columns[:-3])
feature_importance_top_300_dict={}
i=0
for key in total_gain_dict:
    if i<300:
        feature_name=feature_names[int(key[1:])]
        feature_importance_top_300_dict[feature_name]=total_gain_dict[key]
        i+=1
    else:
        break
feature_importance_top_300_dict

{'memory': 16532.66796875,
 'processor': 7672.80712890625,
 'nearest_msg_fault_time_delta': 5905.58056640625,
 'or': 5613.32373046875,
 'cpu': 3260.965576171875,
 'e': 3223.5634765625,
 'mcerr': 2194.917236328125,
 'server_model': 1791.451416015625,
 'fault_minute': 1128.4400634765625,
 'caterr': 926.9019775390625,
 'msg_minute_avg': 921.4864501953125,
 'msg_minute_max': 833.5463256835938,
 'msg_minute_min': 808.7095336914062,
 'ecc': 706.3648071289062,
 'fault_hour': 634.3880615234375,
 'msg_cnt': 634.0474853515625,
 'deassert': 622.1248779296875,
 'all_msg_3_cnt': 609.456298828125,
 'msg_hour_avg': 597.8284301757812,
 'msg_minute_mode': 543.718505859375,
 'msg_minute_median': 542.3955078125,
 'config': 502.8052062988281,
 'c': 481.6231384277344,
 'msg_hour_max': 454.46051025390625,
 'msg_hour_min': 438.2335205078125,
 'microcontroller/coprocessor': 402.3030090332031,
 'uncorrectable ecc': 375.8287353515625,
 'ec': 318.05926513671875,
 'power supply': 306.5587158203125,
 '0': 274.9622

In [13]:
# important_words_list = []
# for word in feature_importance_top_300_dict:
#     if word in word_list:
#         important_words_list.append(word)
# important_word_df = pd.DataFrame({
#     'word': important_words_list
# })
# important_word_df.to_csv('./user_data/words/important_word_df.csv', index = None)

In [18]:
important_word_list = list(pd.read_csv('./user_data/words/important_word_df.csv')['word'])

In [19]:
important_word_list

['memory',
 'processor',
 'or',
 'cpu',
 'e',
 'mcerr',
 'caterr',
 'ecc',
 'deassert',
 'config',
 'c',
 'microcontroller/coprocessor',
 'uncorrectable ecc',
 'ec',
 'power supply',
 '0',
 'sta',
 'asserted',
 'dimm',
 '0x3b',
 'device disabled',
 'fa',
 'system',
 'on',
 'state',
 '1',
 'status',
 'ierr',
 'uncorrectable',
 'in',
 'failure detected',
 '0x19',
 'f',
 'power',
 'up',
 'cpu_caterr',
 'detected',
 'cpu1',
 'correctable',
 '0x7d',
 'cpu0',
 'error',
 'slot',
 'chassis',
 'ac',
 'op',
 'unknown',
 'boot',
 'warm',
 'enabled',
 'cpu0_status',
 '2',
 'cpu1_status',
 'to',
 'ev',
 'microcontroller',
 '0xe0',
 'state asserted',
 'event',
 'is',
 '6',
 'bios_boo',
 'configuration',
 'trip',
 'bus',
 'system boot initiated',
 'os',
 'no',
 'reset',
 'check',
 '0x13',
 '4',
 'memory dimm',
 '5',
 'initiated by power up',
 '8',
 '0xe2',
 'temp',
 'ff',
 'device',
 '7',
 'cpu1b0_dimm_stat',
 'restart',
 'temperature cpu',
 'lt',
 'memory cpu',
 'cpu0e0_dimm_stat',
 '17',
 'hard res

In [21]:
FaultTime_log_correspond_label_df = pd.read_csv('./user_data/tmp_data/FaultTime_log_correspond_label_df.csv')

In [23]:
FaultTime_log_correspond_label_df[FaultTime_log_correspond_label_df['label'] == 0]

Unnamed: 0,sn,fault_time,msg,time,server_model,label
336,SERVER_10037,2020-01-11 22:10:00,Memory #0xe2 | Correctable ECC | Asserted,2020-01-10 16:36:29,SM50,0
337,SERVER_10037,2020-01-11 22:10:00,Memory #0xe2 | Correctable ECC | Asserted,2020-01-10 16:37:11,SM50,0
338,SERVER_10037,2020-01-11 22:10:00,Memory #0xe2 | Correctable ECC | Asserted,2020-01-10 16:37:42,SM50,0
339,SERVER_10037,2020-01-11 22:10:00,Memory #0xe2 | Correctable ECC | Asserted,2020-01-10 16:38:13,SM50,0
340,SERVER_10037,2020-01-11 22:10:00,Memory #0xe2 | Correctable ECC | Asserted,2020-01-10 16:38:45,SM50,0
...,...,...,...,...,...,...
482118,SERVER_9940,2020-08-02 15:51:00,Memory CPU1D0_DIMM_Stat | Correctable ECC | A...,2020-08-02 13:13:04,SM56,0
482119,SERVER_9940,2020-08-02 15:51:00,Memory CPU1D0_DIMM_Stat | Correctable ECC | A...,2020-08-02 14:06:49,SM56,0
482120,SERVER_9940,2020-08-02 15:51:00,Memory CPU1D0_DIMM_Stat | Correctable ECC | A...,2020-08-02 14:39:56,SM56,0
482121,SERVER_9940,2020-08-02 15:51:00,Memory CPU1D0_DIMM_Stat | Uncorrectable ECC |...,2020-08-02 14:47:32,SM56,0


In [None]:
# xgb模型随机训练并投票预测
def xgbRandomTrainPredict(train_feature_df: pd.DataFrame, test_feature_df: pd.DataFrame, label_df = None) -> pd.DataFrame:
    ## 每个子模型样本均衡，利用投票规则生成最终预测
    random.seed(0)
    N = 100 # number of the models
    num_sample = 500 # number of samples for each label

    _label0_index_list = [x for x,y in list(enumerate(train_label)) if y == 0]
    _label1_index_list = [x for x,y in list(enumerate(train_label)) if y == 1]
    _label2_index_list = [x for x,y in list(enumerate(train_label)) if y == 2]
    _label3_index_list = [x for x,y in list(enumerate(train_label)) if y == 3]

    for iter in np.arange(N):
        idx_0 = random.sample(list_0, num_sample)
        idx_1 = random.sample(list_1, num_sample)
        idx_2 = random.sample(list_2, num_sample)
        idx_3 = random.sample(list_3, num_sample)
        idx = np.hstack((idx_0,idx_1,idx_2,idx_3))
        random.shuffle(idx)
        sub_train_feature = train_feature[idx]
        sub_train_label = train_label[idx]
        sub_train_data = xgb.DMatrix(sub_train_feature,label = sub_train_label)
        sub_train_feature = xgb.DMatrix(sub_train_feature)
        sub_xgb_model = xgb.train(xgb_params,sub_train_data,num_boost_round=500)
        sub_val_pred = sub_xgb_model.predict(val_feature)
        if iter == 0:
            val_pred = sub_val_pred
        else:
            val_pred = np.vstack((val_pred,sub_val_pred))

    final_pred = [np.argmax(np.bincount(val_pred[:,i].astype(int))) for i in np.arange(val_pred.shape[1])]
    final_pred = np.array(final_pred).astype(float)
    macro_f1(val_label,final_pred)

In [6]:
train_feature_df = caculateFeature(sel_log_df, train_label_df, word_list)

2022-05-11 23:56:44,774 19440 2281365420.py[line:237] - INFO: 开始拼接日志和标签数据
2022-05-11 23:56:45,375 19440 2281365420.py[line:246] - INFO: 拼接日志和标签数据结束
2022-05-11 23:56:45,376 19440 2281365420.py[line:248] - INFO: 开始匹配日志和标签
2022-05-11 23:56:45,376 19440 2281365420.py[line:249] - INFO: 使用报错时间截断进行划分
2022-05-12 00:04:21,781 19440 2281365420.py[line:252] - INFO: 匹配日志和标签结束
2022-05-12 00:04:21,782 19440 2281365420.py[line:254] - INFO: 开始计算统计特征
2022-05-12 00:04:50,263 19440 2281365420.py[line:257] - INFO: 计算统计特征结束
2022-05-12 00:04:50,266 19440 2281365420.py[line:263] - INFO: 开始计算词频特征
0 2022-05-12 00:04:50.267585
100 2022-05-12 00:04:52.077583
200 2022-05-12 00:04:53.891583
300 2022-05-12 00:04:55.696583
400 2022-05-12 00:04:57.519583
500 2022-05-12 00:04:59.339604
600 2022-05-12 00:05:01.153634
700 2022-05-12 00:05:02.961634
800 2022-05-12 00:05:04.834649
900 2022-05-12 00:05:06.668162
1000 2022-05-12 00:05:08.482162
1100 2022-05-12 00:05:10.265185
1200 2022-05-12 00:05:12.024185
1300 2022-05-12 

In [7]:
train_feature_df

Unnamed: 0,2116,ps0_status,f2a,chassis,deasserted os stop/shutdown,power supply power limiting,74,f9d,26,bp3_hdd1_status,...,msg_hour_median,msg_hour_mode,msg_minute_max,msg_minute_min,msg_minute_avg,msg_minute_median,msg_minute_mode,label,sn,fault_time
0,0,0,0,0,0,0,0,0,0,0,...,8.0,8,59,0,38.333333,54.0,59,1,SERVER_10001,2020-05-01 10:04:00
1,0,0,0,0,0,0,0,0,0,0,...,9.0,9,48,45,46.444444,46.0,46,2,SERVER_10003,2020-03-28 09:48:00
2,0,0,0,0,0,0,0,0,0,0,...,15.0,15,52,51,51.600000,52.0,52,1,SERVER_10008,2020-02-25 16:12:00
3,0,0,0,0,0,0,0,0,0,0,...,16.0,16,52,46,47.750000,46.5,46,2,SERVER_10008,2020-03-11 18:04:00
4,0,0,0,0,0,0,0,0,0,0,...,16.0,16,7,7,7.000000,7.0,7,3,SERVER_10009,2020-05-08 16:37:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16599,0,0,0,0,0,0,0,0,0,0,...,20.5,19,52,46,49.000000,49.0,46,2,SERVER_9991,2020-08-04 22:49:00
16600,0,0,0,0,0,0,0,0,0,0,...,18.0,18,14,14,14.000000,14.0,14,2,SERVER_9991,2020-10-07 18:42:00
16601,0,0,0,0,0,0,0,0,0,0,...,23.0,23,48,43,45.500000,45.5,43,2,SERVER_9993,2020-05-14 23:50:00
16602,0,0,0,0,0,0,0,0,0,0,...,11.0,11,19,4,11.500000,11.5,4,2,SERVER_9998,2020-05-29 11:25:00


In [8]:
check_frequency_vector_df = pd.read_csv('./user_data/tmp_data/check_frequency_vector_df.csv')

In [9]:
check_frequency_vector_df

Unnamed: 0,cpu2,bp4_hdd15_status,front1_9_status,deasserted drive slot / bay disk stat,fan8_rpm,ac lost,mcerr,deasserted unknown cpu caterr,hdd76,asserted system firmware error system progress,...,msg_hour_median,msg_hour_mode,msg_minute_max,msg_minute_min,msg_minute_avg,msg_minute_median,msg_minute_mode,label,sn,fault_time
0,0,0,0,0,0,0,0,0,0,0,...,8.0,8,59,0,38.333333,54.0,59,1,SERVER_10001,2020-05-01 10:04:00
1,0,0,0,0,0,0,0,0,0,0,...,9.0,9,48,45,46.444444,46.0,46,2,SERVER_10003,2020-03-28 09:48:00
2,0,0,0,0,0,0,0,0,0,0,...,15.0,15,52,51,51.600000,52.0,52,1,SERVER_10008,2020-02-25 16:12:00
3,0,0,0,0,0,0,0,0,0,0,...,16.0,16,52,46,47.750000,46.5,46,2,SERVER_10008,2020-03-11 18:04:00
4,0,0,0,0,0,0,0,0,0,0,...,16.0,16,7,7,7.000000,7.0,7,3,SERVER_10009,2020-05-08 16:37:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16599,0,0,0,0,0,0,0,0,0,0,...,20.5,19,52,46,49.000000,49.0,46,2,SERVER_9991,2020-08-04 22:49:00
16600,0,0,0,0,0,0,0,0,0,0,...,18.0,18,14,14,14.000000,14.0,14,2,SERVER_9991,2020-10-07 18:42:00
16601,0,0,0,0,0,0,0,0,0,0,...,23.0,23,48,43,45.500000,45.5,43,2,SERVER_9993,2020-05-14 23:50:00
16602,0,0,0,0,0,0,0,0,0,0,...,11.0,11,19,4,11.500000,11.5,4,2,SERVER_9998,2020-05-29 11:25:00


In [70]:
check_frequency_vector_df['2116']

0        0
1        0
2        0
3        0
4        0
        ..
16599    0
16600    0
16601    0
16602    0
16603    0
Name: 2116, Length: 16604, dtype: int64

In [13]:
feature_list = train_feature_df.columns[:-3]
check_feature_list = check_frequency_vector_df.columns[:-3]

In [14]:
len(check_feature_list)

2110

In [28]:
for i in feature_list:
    if i not in check_feature_list:
        print(1)

In [47]:
for i in range(len(train_feature_df)):
    if train_feature_df['sn'].iloc[i] != check_frequency_vector_df['sn'].iloc[i]:
        break

In [49]:
i

16603

In [50]:
for i in range(len(train_feature_df)):
    if train_feature_df['fault_time'].iloc[i] != check_frequency_vector_df['fault_time'].iloc[i]:
        break

In [51]:
i

16603

In [52]:
for i in range(len(train_feature_df)):
    if train_feature_df['label'].iloc[i] != check_frequency_vector_df['label'].iloc[i]:
        break

In [53]:
i

16603

In [65]:
for name in feature_list:
    temp = np.array(train_feature_df[name]) - np.array(check_frequency_vector_df[name])
    for i in range(len(temp)):
        if temp[i] > 10**-6:
            break
    if temp[i] > 10**-6:
        break

In [66]:
name

'msg_minute_mode'

In [67]:
train_feature_df[name].iloc[i]

56

In [68]:
check_frequency_vector_df[name].iloc[i]

56

In [69]:
feature_list[-1]

'msg_minute_mode'

In [57]:
train_feature_df[name].iloc[i]

0.18181818181818182

In [58]:
check_frequency_vector_df[name].iloc[i]

0.1818181818181818

In [37]:
for iter in train_feature_df.iterrows():
    check_iter = check_frequency_vector_df[(check_frequency_vector_df['sn'] == iter[1]['sn']) & 
                                           (check_frequency_vector_df['fault_time'] == iter[1]['fault_time']) & 
                                           (check_frequency_vector_df['label'] == iter[1]['label'])]
    for name in feature_list:
        if abs(check_iter[name].iloc[0] - iter[1][name]) > 10**-5:
            break
    if check_iter[name].iloc[0] != iter[1][name]:
        break

KeyboardInterrupt: 

In [38]:
model = xgbTrain(train_feature_df)

2022-05-12 00:34:04,767 19440 2281365420.py[line:355] - INFO: 开始训练xgb模型
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


2022-05-12 00:37:03,518 19440 2281365420.py[line:357] - INFO: 训练xgb模型结束
2022-05-12 00:37:03,603 19440 2281365420.py[line:361] - INFO: 训练集评估效果: 
2022-05-12 00:37:03,622 19440 2281365420.py[line:333] - INFO: Label 0:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-12 00:37:03,624 19440 2281365420.py[line:333] - INFO: Label 1:   Precision  1.00, Recall  0.99, F1  1.00
2022-05-12 00:37:03,627 19440 2281365420.py[line:333] - INFO: Label 2:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-12 00:37:03,630 19440 2281365420.py[line:333] - INFO: Label 3:   Precision  1.00, Recall  0.99, F1  1.00
2022-05-12 00:37:03

In [40]:
check_model = xgb.Booster()
check_model.load_model('./user_data/tmp_data/check_xgb_model_v1p8.json')

In [41]:
test1 = xgbPredict(model, check_frequency_vector_df, check_frequency_vector_df[['sn', 'fault_time', 'label']])

2022-05-12 00:39:10,285 19440 2281365420.py[line:393] - INFO: 开始xgb模型预测
2022-05-12 00:39:10,361 19440 2281365420.py[line:395] - INFO: xgb模型预测结束
2022-05-12 00:39:10,395 19440 2281365420.py[line:399] - INFO: 测试集评估效果: 
2022-05-12 00:39:10,409 19440 2281365420.py[line:333] - INFO: Label 0:   Precision  0.93, Recall  0.06, F1  0.11
2022-05-12 00:39:10,411 19440 2281365420.py[line:333] - INFO: Label 1:   Precision  0.49, Recall  0.22, F1  0.31
2022-05-12 00:39:10,414 19440 2281365420.py[line:333] - INFO: Label 2:   Precision  0.60, Recall  0.58, F1  0.59
2022-05-12 00:39:10,417 19440 2281365420.py[line:333] - INFO: Label 3:   Precision  0.21, Recall  0.51, F1  0.30
2022-05-12 00:39:10,417 19440 2281365420.py[line:334] - INFO: macro_f1: 0.24173740905884014



In [42]:
test2 = xgbPredict(model, train_feature_df, train_feature_df[['sn', 'fault_time', 'label']])

2022-05-12 00:39:47,854 19440 2281365420.py[line:393] - INFO: 开始xgb模型预测
2022-05-12 00:39:47,918 19440 2281365420.py[line:395] - INFO: xgb模型预测结束
2022-05-12 00:39:47,950 19440 2281365420.py[line:399] - INFO: 测试集评估效果: 
2022-05-12 00:39:47,985 19440 2281365420.py[line:333] - INFO: Label 0:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-12 00:39:47,988 19440 2281365420.py[line:333] - INFO: Label 1:   Precision  1.00, Recall  0.99, F1  1.00
2022-05-12 00:39:47,991 19440 2281365420.py[line:333] - INFO: Label 2:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-12 00:39:47,995 19440 2281365420.py[line:333] - INFO: Label 3:   Precision  1.00, Recall  0.99, F1  1.00
2022-05-12 00:39:47,996 19440 2281365420.py[line:334] - INFO: macro_f1: 0.9980335422327866



In [44]:
test3 = xgbPredict(check_model, check_frequency_vector_df, check_frequency_vector_df[['sn', 'fault_time', 'label']])

2022-05-12 00:40:42,073 19440 2281365420.py[line:393] - INFO: 开始xgb模型预测
2022-05-12 00:40:42,146 19440 2281365420.py[line:395] - INFO: xgb模型预测结束
2022-05-12 00:40:42,180 19440 2281365420.py[line:399] - INFO: 测试集评估效果: 
2022-05-12 00:40:42,193 19440 2281365420.py[line:333] - INFO: Label 0:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-12 00:40:42,196 19440 2281365420.py[line:333] - INFO: Label 1:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-12 00:40:42,198 19440 2281365420.py[line:333] - INFO: Label 2:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-12 00:40:42,200 19440 2281365420.py[line:333] - INFO: Label 3:   Precision  1.00, Recall  0.99, F1  1.00
2022-05-12 00:40:42,201 19440 2281365420.py[line:334] - INFO: macro_f1: 0.9983509563194598



In [45]:
test4 = xgbPredict(check_model, train_feature_df, train_feature_df[['sn', 'fault_time', 'label']])

2022-05-12 00:41:19,424 19440 2281365420.py[line:393] - INFO: 开始xgb模型预测
2022-05-12 00:41:19,505 19440 2281365420.py[line:395] - INFO: xgb模型预测结束
2022-05-12 00:41:19,537 19440 2281365420.py[line:399] - INFO: 测试集评估效果: 
2022-05-12 00:41:19,552 19440 2281365420.py[line:333] - INFO: Label 0:   Precision  0.96, Recall  0.01, F1  0.03
2022-05-12 00:41:19,596 19440 2281365420.py[line:333] - INFO: Label 1:   Precision  0.50, Recall  0.20, F1  0.29
2022-05-12 00:41:19,599 19440 2281365420.py[line:333] - INFO: Label 2:   Precision  0.81, Recall  0.36, F1  0.50
2022-05-12 00:41:19,602 19440 2281365420.py[line:333] - INFO: Label 3:   Precision  0.18, Recall  0.82, F1  0.30
2022-05-12 00:41:19,603 19440 2281365420.py[line:334] - INFO: macro_f1: 0.191219143060124



In [74]:
test5 = xgbPredict(model, train_feature_df, check_frequency_vector_df[['sn', 'fault_time', 'label']])

2022-05-12 01:03:50,417 19440 2281365420.py[line:393] - INFO: 开始xgb模型预测
2022-05-12 01:03:50,487 19440 2281365420.py[line:395] - INFO: xgb模型预测结束
2022-05-12 01:03:50,519 19440 2281365420.py[line:399] - INFO: 测试集评估效果: 
2022-05-12 01:03:50,535 19440 2281365420.py[line:333] - INFO: Label 0:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-12 01:03:50,538 19440 2281365420.py[line:333] - INFO: Label 1:   Precision  1.00, Recall  0.99, F1  1.00
2022-05-12 01:03:50,540 19440 2281365420.py[line:333] - INFO: Label 2:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-12 01:03:50,542 19440 2281365420.py[line:333] - INFO: Label 3:   Precision  1.00, Recall  0.99, F1  1.00
2022-05-12 01:03:50,543 19440 2281365420.py[line:334] - INFO: macro_f1: 0.9980335422327866



In [77]:
test6 = xgbPredict(check_model, check_frequency_vector_df, train_feature_df[['sn', 'fault_time', 'label']])

2022-05-12 01:04:36,822 19440 2281365420.py[line:393] - INFO: 开始xgb模型预测
2022-05-12 01:04:36,895 19440 2281365420.py[line:395] - INFO: xgb模型预测结束
2022-05-12 01:04:36,928 19440 2281365420.py[line:399] - INFO: 测试集评估效果: 
2022-05-12 01:04:36,944 19440 2281365420.py[line:333] - INFO: Label 0:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-12 01:04:36,946 19440 2281365420.py[line:333] - INFO: Label 1:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-12 01:04:36,949 19440 2281365420.py[line:333] - INFO: Label 2:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-12 01:04:36,951 19440 2281365420.py[line:333] - INFO: Label 3:   Precision  1.00, Recall  0.99, F1  1.00
2022-05-12 01:04:36,952 19440 2281365420.py[line:334] - INFO: macro_f1: 0.9983509563194598



In [89]:
total_gain_dict=model.get_score(importance_type='total_gain')
total_gain_dict=dict(sorted(total_gain_dict.items(),key=lambda item:item[1],reverse=True))
feature_importance_top_200_dict={}
i=0
for key in total_gain_dict:
    if i<200:
        feature_name=feature_list[int(key[1:])]
        feature_importance_top_200_dict[feature_name]=total_gain_dict[key]
        i+=1
    else:
        break
feature_importance_top_200_dict

{'memory': 16509.923828125,
 'processor': 7686.3681640625,
 'nearest_msg_fault_time_delta': 5855.13427734375,
 'or': 5583.09521484375,
 'e': 3275.843505859375,
 'cpu': 3224.369140625,
 'mcerr': 2172.015380859375,
 'server_model': 1779.3798828125,
 'caterr': 1087.5068359375,
 'fault_minute': 1077.574462890625,
 'msg_minute_avg': 896.1079711914062,
 'msg_minute_min': 804.2904052734375,
 'msg_minute_max': 752.153564453125,
 'ecc': 702.378173828125,
 'msg_cnt': 673.3145141601562,
 'msg_hour_avg': 638.6115112304688,
 'fault_hour': 627.4557495117188,
 'all_msg_3_cnt': 625.0953369140625,
 'deasserted': 607.4459228515625,
 'msg_minute_median': 510.52691650390625,
 'msg_minute_mode': 503.4451599121094,
 'c': 485.1116943359375,
 'config': 481.6801452636719,
 'msg_hour_max': 448.94097900390625,
 'msg_hour_min': 413.9129333496094,
 'microcontroller/coprocessor': 392.5028381347656,
 'uncorrectable ecc': 381.94464111328125,
 'power supply': 331.1365051269531,
 'ec': 301.3660583496094,
 'dimm': 287.0

In [90]:
total_gain_dict=check_model.get_score(importance_type='total_gain')
total_gain_dict=dict(sorted(total_gain_dict.items(),key=lambda item:item[1],reverse=True))
check_feature_importance_top_200_dict={}
i=0
for key in total_gain_dict:
    if i<200:
        feature_name=check_feature_list[int(key[1:])]
        check_feature_importance_top_200_dict[feature_name]=total_gain_dict[key]
        i+=1
    else:
        break
check_feature_importance_top_200_dict

{'memory': 16678.73828125,
 'processor': 7667.82958984375,
 'nearest_msg_fault_time_delta': 5859.923828125,
 'or': 5416.6943359375,
 'e': 3332.02685546875,
 'cpu': 3314.033447265625,
 'mcerr': 2197.673095703125,
 'server_model': 1830.1949462890625,
 'fault_minute': 1095.7232666015625,
 'caterr': 992.7603149414062,
 'msg_minute_avg': 858.0697021484375,
 'msg_minute_max': 831.161376953125,
 'msg_minute_min': 807.8883666992188,
 'ecc': 688.5580444335938,
 'msg_cnt': 662.7244262695312,
 'fault_hour': 649.0639038085938,
 'msg_hour_avg': 634.6235961914062,
 'all_msg_3_cnt': 617.2780151367188,
 'deassert': 596.8054809570312,
 'msg_minute_mode': 538.3112182617188,
 'msg_minute_median': 505.2546691894531,
 'c': 494.27911376953125,
 'msg_hour_max': 442.40850830078125,
 'msg_hour_min': 435.636962890625,
 'microcontroller/coprocessor': 403.9117736816406,
 'uncorrectable ecc': 397.3321838378906,
 'ec': 320.33978271484375,
 'power supply': 308.3546142578125,
 '0': 299.7262878417969,
 'config': 284.4

In [91]:
for name in feature_importance_top_200_dict:
    if name not in check_feature_importance_top_200_dict:
        print(name)

deasserted
device enabled
initiated by warm reset
trip
check
restart
presence
running
specified
upper
16
failure
system boot initiated system restart
oem
working
bmc_boot_up
power_drop
57
chassis_control
legacy off state
cpu0_margin_temp
cpu0_temp
sel_status
ac lost
notice


In [96]:
top_200_feature_list = list(feature_importance_top_200_dict.keys())
top_200_feature_list += ['sn', 'fault_time', 'label']
check_top_200_feature_list = list(check_feature_importance_top_200_dict.keys())
check_top_200_feature_list += ['sn', 'fault_time', 'label']

In [99]:
train_feature_df[top_200_feature_list]

Unnamed: 0,memory,processor,nearest_msg_fault_time_delta,or,e,cpu,mcerr,server_model,caterr,fault_minute,...,ac lost,85,notice,critical interrupt,acpi_pwr_status,12,e9,sn,fault_time,label
0,0,4,3497,9,60,4,0,57,0,4,...,0,0,0,0,2,0,0,SERVER_10001,2020-05-01 10:04:00,1
1,1,0,16,2,6,1,0,57,0,48,...,0,0,0,0,0,0,0,SERVER_10003,2020-03-28 09:48:00,2
2,1,2,1172,5,14,2,0,53,0,12,...,0,0,0,0,0,0,0,SERVER_10008,2020-02-25 16:12:00,1
3,1,2,4611,5,20,2,0,53,0,4,...,0,0,0,0,0,0,0,SERVER_10008,2020-03-11 18:04:00,2
4,0,0,1757,0,18,0,0,53,0,37,...,0,0,0,0,0,0,0,SERVER_10009,2020-05-08 16:37:00,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16599,1,0,127,2,6,1,0,56,0,49,...,0,0,0,0,0,0,0,SERVER_9991,2020-08-04 22:49:00,2
16600,1,0,1659,2,6,1,0,56,0,42,...,0,0,0,0,0,0,0,SERVER_9991,2020-10-07 18:42:00,2
16601,1,0,119,2,6,1,0,57,0,50,...,0,0,0,0,0,0,0,SERVER_9993,2020-05-14 23:50:00,2
16602,2,0,324,4,13,2,0,57,0,25,...,0,0,0,0,0,0,0,SERVER_9998,2020-05-29 11:25:00,2


In [100]:
check_frequency_vector_df[check_top_200_feature_list]

Unnamed: 0,memory,processor,nearest_msg_fault_time_delta,or,e,cpu,mcerr,server_model,fault_minute,caterr,...,slot / connector,card,cpu1_vr_temp,subsystem,sensor,cpu1e0_dimm_stat,hard,sn,fault_time,label
0,0,4,3497,9,60,4,0,57,4,0,...,0,0,0,0,2,0,0,SERVER_10001,2020-05-01 10:04:00,1
1,1,0,16,2,6,1,0,57,48,0,...,0,0,0,0,0,0,0,SERVER_10003,2020-03-28 09:48:00,2
2,1,2,1172,5,14,2,0,53,12,0,...,0,0,0,0,0,0,0,SERVER_10008,2020-02-25 16:12:00,1
3,1,2,4611,5,20,2,0,53,4,0,...,0,0,0,0,0,0,0,SERVER_10008,2020-03-11 18:04:00,2
4,0,0,1757,0,18,0,0,53,37,0,...,0,0,0,0,0,0,0,SERVER_10009,2020-05-08 16:37:00,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16599,1,0,127,2,6,1,0,56,49,0,...,0,0,0,0,0,0,0,SERVER_9991,2020-08-04 22:49:00,2
16600,1,0,1659,2,6,1,0,56,42,0,...,0,0,0,0,0,0,0,SERVER_9991,2020-10-07 18:42:00,2
16601,1,0,119,2,6,1,0,57,50,0,...,0,0,0,0,0,0,0,SERVER_9993,2020-05-14 23:50:00,2
16602,2,0,324,4,13,2,0,57,25,0,...,0,0,0,0,0,0,0,SERVER_9998,2020-05-29 11:25:00,2


In [97]:
test7 = xgbPredict(model, train_feature_df[top_200_feature_list], train_feature_df[['sn', 'fault_time', 'label']])

2022-05-12 01:13:23,506 19440 2281365420.py[line:393] - INFO: 开始xgb模型预测
2022-05-12 01:13:23,532 19440 2281365420.py[line:395] - INFO: xgb模型预测结束
2022-05-12 01:13:23,565 19440 2281365420.py[line:399] - INFO: 测试集评估效果: 
2022-05-12 01:13:23,579 19440 2281365420.py[line:333] - INFO: Label 0:   Precision  0.00, Recall  0.00, F1  0.00
2022-05-12 01:13:23,582 19440 2281365420.py[line:333] - INFO: Label 1:   Precision  0.00, Recall  0.00, F1  0.00
2022-05-12 01:13:23,585 19440 2281365420.py[line:333] - INFO: Label 2:   Precision  0.56, Recall  1.00, F1  0.72
2022-05-12 01:13:23,587 19440 2281365420.py[line:333] - INFO: Label 3:   Precision  0.00, Recall  0.00, F1  0.00
2022-05-12 01:13:23,588 19440 2281365420.py[line:334] - INFO: macro_f1: 0.0653440850040827



In [98]:
test8 = xgbPredict(check_model, check_frequency_vector_df[check_top_200_feature_list], check_frequency_vector_df[['sn', 'fault_time', 'label']])

2022-05-12 01:14:15,667 19440 2281365420.py[line:393] - INFO: 开始xgb模型预测
2022-05-12 01:14:15,697 19440 2281365420.py[line:395] - INFO: xgb模型预测结束
2022-05-12 01:14:15,731 19440 2281365420.py[line:399] - INFO: 测试集评估效果: 
2022-05-12 01:14:15,745 19440 2281365420.py[line:333] - INFO: Label 0:   Precision  0.00, Recall  0.00, F1  0.00
2022-05-12 01:14:15,747 19440 2281365420.py[line:333] - INFO: Label 1:   Precision  0.00, Recall  0.00, F1  0.00
2022-05-12 01:14:15,750 19440 2281365420.py[line:333] - INFO: Label 2:   Precision  0.56, Recall  1.00, F1  0.72
2022-05-12 01:14:15,752 19440 2281365420.py[line:333] - INFO: Label 3:   Precision  0.00, Recall  0.00, F1  0.00
2022-05-12 01:14:15,752 19440 2281365420.py[line:334] - INFO: macro_f1: 0.0653440850040827



In [None]:
test9 = xgbPredict(model, check_frequency_vector_df, train_feature_df[['sn', 'fault_time', 'label']])

In [105]:
new_check_frequency_vector_df = check_frequency_vector_df[list(train_feature_df)]
new_train_feature_df = train_feature_df[list(check_frequency_vector_df)]

In [104]:
list(check_frequency_vector_df)

['cpu2',
 'bp4_hdd15_status',
 'front1_9_status',
 'deasserted drive slot / bay disk stat',
 'fan8_rpm',
 'ac lost',
 'mcerr',
 'deasserted unknown cpu caterr',
 'hdd76',
 'asserted system firmware error system progress',
 'asserted temperature cpu margin',
 'deasserted os stop/shutdown',
 'bp2_hdd5_status',
 'ipmi_watchdog',
 'aa170009028600c00026702000',
 'f7d',
 '95',
 'cpu0_a1',
 '5000',
 'front1_5_status',
 'os graceful shutdown',
 'asserted microcontroller',
 '44',
 'button buttonmm ev',
 'cpu3_status',
 'unit',
 'hdd62',
 'deasserted drive slot / bay hdd status',
 'hdd_r_3_status',
 'reached',
 'temp',
 '114',
 '000040000704',
 'asserted button button_pressed',
 'transition to running system boot initiated',
 'bp2_hdd14_status',
 'ocp_card_stat',
 'timer expired',
 '3',
 'hdd_r_22_status',
 'hdd_l_26_status',
 '200002610200',
 'unknown error',
 'fpga_pg',
 '2816',
 'dimm171_status',
 'hdd_l_13_status',
 'mem_chd0_status',
 'disk7',
 '245',
 'memory cpu',
 'log',
 'sig',
 'mem_ch

In [102]:
test10 = xgbPredict(model, new_check_frequency_vector_df, new_check_frequency_vector_df[['sn', 'fault_time', 'label']])

2022-05-12 01:19:34,301 19440 2281365420.py[line:393] - INFO: 开始xgb模型预测
2022-05-12 01:19:34,374 19440 2281365420.py[line:395] - INFO: xgb模型预测结束
2022-05-12 01:19:34,406 19440 2281365420.py[line:399] - INFO: 测试集评估效果: 
2022-05-12 01:19:34,419 19440 2281365420.py[line:333] - INFO: Label 0:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-12 01:19:34,422 19440 2281365420.py[line:333] - INFO: Label 1:   Precision  1.00, Recall  0.99, F1  1.00
2022-05-12 01:19:34,424 19440 2281365420.py[line:333] - INFO: Label 2:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-12 01:19:34,427 19440 2281365420.py[line:333] - INFO: Label 3:   Precision  1.00, Recall  0.99, F1  1.00
2022-05-12 01:19:34,427 19440 2281365420.py[line:334] - INFO: macro_f1: 0.9980335422327866



In [106]:
test11 = xgbPredict(check_model, new_train_feature_df, new_train_feature_df[['sn', 'fault_time', 'label']])

2022-05-12 01:23:43,184 19440 2281365420.py[line:393] - INFO: 开始xgb模型预测
2022-05-12 01:23:43,255 19440 2281365420.py[line:395] - INFO: xgb模型预测结束
2022-05-12 01:23:43,288 19440 2281365420.py[line:399] - INFO: 测试集评估效果: 
2022-05-12 01:23:43,302 19440 2281365420.py[line:333] - INFO: Label 0:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-12 01:23:43,304 19440 2281365420.py[line:333] - INFO: Label 1:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-12 01:23:43,307 19440 2281365420.py[line:333] - INFO: Label 2:   Precision  1.00, Recall  1.00, F1  1.00
2022-05-12 01:23:43,309 19440 2281365420.py[line:333] - INFO: Label 3:   Precision  1.00, Recall  0.99, F1  1.00
2022-05-12 01:23:43,309 19440 2281365420.py[line:334] - INFO: macro_f1: 0.9983509563194598



In [107]:
b_feature_df = caculateFeature(preliminary_sel_log_dataset_b, preliminary_submit_dataset_b, word_list)

2022-05-12 01:26:56,849 19440 2281365420.py[line:237] - INFO: 开始拼接日志和标签数据
2022-05-12 01:26:56,921 19440 2281365420.py[line:246] - INFO: 拼接日志和标签数据结束
2022-05-12 01:26:56,922 19440 2281365420.py[line:248] - INFO: 开始匹配日志和标签
2022-05-12 01:26:56,923 19440 2281365420.py[line:249] - INFO: 使用报错时间截断进行划分
2022-05-12 01:27:55,733 19440 2281365420.py[line:252] - INFO: 匹配日志和标签结束
2022-05-12 01:27:55,733 19440 2281365420.py[line:254] - INFO: 开始计算统计特征
2022-05-12 01:27:58,477 19440 2281365420.py[line:257] - INFO: 计算统计特征结束
2022-05-12 01:27:58,478 19440 2281365420.py[line:263] - INFO: 开始计算词频特征
0 2022-05-12 01:27:58.479919
100 2022-05-12 01:27:58.783919
200 2022-05-12 01:27:59.090837
300 2022-05-12 01:27:59.393837
400 2022-05-12 01:27:59.699837
500 2022-05-12 01:28:00.007837
600 2022-05-12 01:28:00.313837
700 2022-05-12 01:28:00.619836
800 2022-05-12 01:28:00.936837
900 2022-05-12 01:28:01.244837
1000 2022-05-12 01:28:01.550838
1100 2022-05-12 01:28:01.857837
1200 2022-05-12 01:28:02.163837
1300 2022-05-12 

In [108]:
b_feature_df

Unnamed: 0,2116,ps0_status,f2a,chassis,deasserted os stop/shutdown,power supply power limiting,74,f9d,26,bp3_hdd1_status,...,msg_hour_median,msg_hour_mode,msg_minute_max,msg_minute_min,msg_minute_avg,msg_minute_median,msg_minute_mode,label,sn,fault_time
0,0,0,0,0,0,0,0,0,0,0,...,23.0,23,48,47,47.300000,47.0,47,-1,0015fe530ad4,2020-05-01 23:48:17
1,0,0,0,0,0,0,0,0,0,0,...,7.0,7,16,16,16.000000,16.0,16,-1,00380f1435b0,2020-07-28 07:51:13
2,0,0,0,0,0,0,0,0,0,0,...,5.0,5,31,28,30.400000,31.0,31,-1,0045a71d0221,2020-07-02 06:33:54
3,0,0,0,0,0,0,0,0,0,0,...,8.0,8,13,7,8.200000,7.0,7,-1,004d5a7954e7,2020-08-24 08:27:55
4,0,0,0,0,0,0,0,0,0,0,...,9.0,9,42,42,42.000000,42.0,42,-1,004d5a7954e7,2020-08-24 09:42:45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3025,0,0,0,0,0,0,0,0,0,0,...,15.0,15,46,46,46.000000,46.0,46,-1,ff92904d80f2,2020-08-28 16:48:54
3026,0,0,0,0,0,0,0,0,0,0,...,6.0,6,28,28,28.000000,28.0,28,-1,ff9f7a9c5c7e,2020-04-14 08:13:55
3027,0,0,0,0,0,0,0,0,0,0,...,22.0,22,38,37,37.545455,38.0,38,-1,ffc2f37539a9,2020-02-21 22:52:54
3028,0,0,0,0,0,0,0,0,0,0,...,10.0,10,25,25,25.000000,25.0,25,-1,fff12c95cc99,2020-05-20 10:27:43


In [109]:
check_b_feature_df = pd.read_csv('./user_data/tmp_data/check_b_feature_df.csv')

In [110]:
check_b_feature_df

Unnamed: 0,cpu2,bp4_hdd15_status,front1_9_status,deasserted drive slot / bay disk stat,fan8_rpm,ac lost,mcerr,deasserted unknown cpu caterr,hdd76,asserted system firmware error system progress,...,msg_hour_avg,msg_hour_median,msg_hour_mode,msg_minute_max,msg_minute_min,msg_minute_avg,msg_minute_median,msg_minute_mode,sn,fault_time
0,0,0,0,0,0,0,0,0,0,0,...,23.0,23.0,23,48,47,47.300000,47.0,47,0015fe530ad4,2020-05-01 23:48:17
1,0,0,0,0,0,0,0,0,0,0,...,7.0,7.0,7,16,16,16.000000,16.0,16,00380f1435b0,2020-07-28 07:51:13
2,0,0,0,0,0,0,1,0,0,0,...,5.0,5.0,5,31,28,30.400000,31.0,31,0045a71d0221,2020-07-02 06:33:54
3,0,0,0,0,0,0,0,0,0,0,...,8.0,8.0,8,13,7,8.200000,7.0,7,004d5a7954e7,2020-08-24 08:27:55
4,0,0,0,0,0,0,0,0,0,0,...,9.0,9.0,9,42,42,42.000000,42.0,42,004d5a7954e7,2020-08-24 09:42:45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3025,0,0,0,0,0,0,0,0,0,0,...,15.0,15.0,15,46,46,46.000000,46.0,46,ff92904d80f2,2020-08-28 16:48:54
3026,0,0,0,0,0,0,0,0,0,0,...,6.0,6.0,6,28,28,28.000000,28.0,28,ff9f7a9c5c7e,2020-04-14 08:13:55
3027,4,0,0,0,0,0,0,0,0,0,...,22.0,22.0,22,38,37,37.545455,38.0,38,ffc2f37539a9,2020-02-21 22:52:54
3028,0,0,0,0,0,0,0,0,0,0,...,10.0,10.0,10,25,25,25.000000,25.0,25,fff12c95cc99,2020-05-20 10:27:43


In [111]:
for name in feature_list:
    temp = np.array(b_feature_df[name]) - np.array(check_b_feature_df[name])
    for i in range(len(temp)):
        if temp[i] > 10**-6:
            break
    if temp[i] > 10**-6:
        break

In [112]:
name

'msg_minute_mode'

In [113]:
b_prediction_df = xgbPredict(model, b_feature_df)

2022-05-12 01:33:45,992 19440 2281365420.py[line:381] - INFO: 开始xgb模型预测
2022-05-12 01:33:46,006 19440 2281365420.py[line:383] - INFO: xgb模型预测结束


In [115]:
check_b_prediction_df = pd.read_csv('./user_data/tmp_data/check_predictions.csv')

In [116]:
test12 = xgbPredict(model, b_feature_df, check_b_prediction_df)

2022-05-12 01:36:05,073 19440 2281365420.py[line:393] - INFO: 开始xgb模型预测
2022-05-12 01:36:05,087 19440 2281365420.py[line:395] - INFO: xgb模型预测结束
2022-05-12 01:36:05,113 19440 2281365420.py[line:399] - INFO: 测试集评估效果: 
2022-05-12 01:36:05,120 19440 2281365420.py[line:333] - INFO: Label 0:   Precision  0.83, Recall  0.79, F1  0.81
2022-05-12 01:36:05,122 19440 2281365420.py[line:333] - INFO: Label 1:   Precision  0.95, Recall  0.95, F1  0.95
2022-05-12 01:36:05,124 19440 2281365420.py[line:333] - INFO: Label 2:   Precision  0.99, Recall  0.99, F1  0.99
2022-05-12 01:36:05,126 19440 2281365420.py[line:333] - INFO: Label 3:   Precision  0.98, Recall  0.98, F1  0.98
2022-05-12 01:36:05,127 19440 2281365420.py[line:334] - INFO: macro_f1: 0.892644639309273



In [117]:
test13 = xgbPredict(check_model, check_b_feature_df, b_prediction_df)

2022-05-12 01:37:31,777 19440 2281365420.py[line:393] - INFO: 开始xgb模型预测
2022-05-12 01:37:31,795 19440 2281365420.py[line:395] - INFO: xgb模型预测结束
2022-05-12 01:37:31,822 19440 2281365420.py[line:399] - INFO: 测试集评估效果: 
2022-05-12 01:37:31,829 19440 2281365420.py[line:333] - INFO: Label 0:   Precision  0.81, Recall  0.68, F1  0.74
2022-05-12 01:37:31,831 19440 2281365420.py[line:333] - INFO: Label 1:   Precision  0.92, Recall  0.96, F1  0.94
2022-05-12 01:37:31,833 19440 2281365420.py[line:333] - INFO: Label 2:   Precision  1.00, Recall  0.99, F1  0.99
2022-05-12 01:37:31,835 19440 2281365420.py[line:333] - INFO: Label 3:   Precision  0.97, Recall  0.98, F1  0.98
2022-05-12 01:37:31,836 19440 2281365420.py[line:334] - INFO: macro_f1: 0.8565725545320657



In [None]:


# 获取特征
# 计算特征

train_feature_df.to_csv('./user_data/feature_data/train_feature_df.csv', index=None)
a_feature_df = caculateFeature(preliminary_sel_log_dataset_a, preliminary_submit_dataset_a, word_list)
a_feature_df.to_csv('./user_data/feature_data/a_feature_df.csv', index=None)
b_feature_df = caculateFeature(preliminary_sel_log_dataset_b, preliminary_submit_dataset_b, word_list)
b_feature_df.to_csv('./user_data/feature_data/b_feature_df.csv', index=None)
val_mask = [random.random() < 0.3 for _ in range(len(train_feature_df))]
train_mask = [not xx for xx in val_mask]
temp_feature_df = train_feature_df[train_mask]
val_feature_df = train_feature_df[val_mask]
temp_feature_df.to_csv('./user_data/feature_data/temp_feature_df.csv', index=None)
val_feature_df.to_csv('./user_data/feature_data/val_feature_df.csv', index=None)
读取特征
train_feature_df = pd.read_csv('./user_data/feature_data/train_feature_df.csv')
a_feature_df = pd.read_csv('./user_data/feature_data/a_feature_df.csv')
b_feature_df = pd.read_csv('./user_data/feature_data/b_feature_df.csv')
temp_feature_df = pd.read_csv('./user_data/feature_data/temp_feature_df.csv')
val_feature_df = pd.read_csv('./user_data/feature_data/val_feature_df.csv')

xgb训练
model = xgbTrain(train_feature_df)
model.save_model('./user_data/model_data/xgb_model_v2.json')
model_temp = xgbTrain(temp_feature_df)
model_temp.save_model('./user_data/model_data/temp_xgb_model_v2.json')
xgb预测
# 读取模型文件
model = xgb.Booster()
model.load_model('./user_data/model_data/xgb_model_v2.json')
# model.load_model('./user_data/model_data/temp_xgb_model_v2.json')
# 预测
train_prediction_df = xgbPredict(model, train_feature_df, train_feature_df[['sn', 'fault_time', 'label']])
train_prediction_df.to_csv('./prediction_result/train_prediction_df.csv',index=None)
temp_prediction_df = xgbPredict(model, temp_feature_df, temp_feature_df[['sn', 'fault_time', 'label']])
temp_prediction_df.to_csv('./prediction_result/temp_prediction_df.csv',index=None)
val_prediction_df = xgbPredict(model, val_feature_df, val_feature_df[['sn', 'fault_time', 'label']])
val_prediction_df.to_csv('./prediction_result/val_prediction_df.csv',index=None)
a_prediction_df = xgbPredict(model, a_feature_df, preliminary_submit_dataset_a_v1p6)
a_prediction_df.to_csv('./prediction_result/a_prediction_df.csv',index=None)
b_prediction_df = xgbPredict(model, b_feature_df, preliminary_submit_dataset_b_v1p8)
b_prediction_df.to_csv('./prediction_result/b_prediction_df.csv',index=None)


In [None]:
v1_word_list = list(pd.read_csv('./user_data/words/word_frequency_df.txt',sep='\t')['word'])
v1p1_word_list = list(pd.read_csv('./user_data/words/tags_incomplete.txt',sep='\t',names=['word'])['word'])
word_list = list(set(v1_word_list+v1p1_word_list))
final_a_feature_df = caculateFeature(final_log_a, final_label_a, word_list)
final_a_feature_df.to_csv('./user_data/feature_data/final_a_feature_df.csv', index=None)
model = xgb.Booster()
model.load_model('./user_data/model_data/xgb_model_v2.json')
final_prediction_df = xgbPredict(model, final_a_feature_df)
final_prediction_df.to_csv('./prediction_result/predictions.csv',index=None)