# 背景：在v1p7的基础上，使用B榜新增的数据集，作为B榜的baseline结果

## 导包、设置根目录

In [1]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import random
import pickle
import multiprocessing
import re
import pickle
from matplotlib import pyplot as plt

from collections import Counter
from itertools import chain
from datetime import datetime

# 更改工作目录为当前项目根目录
import sys
import os
os.chdir(os.path.dirname(os.path.dirname(sys.path[0])))
print(os.getcwd())

/Users/jincan02/Projects/Log-diagnosis


In [249]:
# 处理后数据的主键为 sn + fault_time + label !!!

# sn分组后，按照最近邻+时间间隔划分日志数据
def divideLogByNearestTime(log_label_df: pd.DataFrame):
    log_correspond_label_df = pd.DataFrame(columns=['sn', 'fault_time', 'msg', 'time', 'server_model', 'label'])
    no_label_log_list = []
    # 不设置截断时间，效果最好
    cutoff = 10 * 3600

    for sn, log in log_label_df.groupby('sn'):
        if len(log[log['label'] != '']) == 0:
            no_label_log_list.append(log)
        elif len(log[log['label'] != '']) == 1:
            # 没有日志的标签，直接将标签的可用信息作为日志
            if len(log) == 1:
                msg_df = log
                msg_df['fault_time'] = log['time'].iloc[0]
                log_correspond_label_df = pd.concat([log_correspond_label_df, msg_df])
            else:
                msg_df = log[log['label'] == '']
                msg_df['label'] = log[log['label'] != '']['label'].iloc[0]
                msg_df['fault_time'] = log[log['label'] != '']['time'].iloc[0]
                log_correspond_label_df = pd.concat([log_correspond_label_df, msg_df])
        else:
            label_df = log[log['label'] != '']
            msg_df = log[log['label'] == '']
            for msg_item in msg_df.iterrows():
                previous_delta_time = 1000 * 24 * 3600
                for label_item in label_df.iterrows():
                    now_delta_time = abs(datetime.strptime(label_item[1]['time'],'%Y-%m-%d %H:%M:%S'
                        ) - datetime.strptime(msg_item[1]['time'],'%Y-%m-%d %H:%M:%S'))
                    if now_delta_time.days * 24 * 3600 + now_delta_time.seconds < previous_delta_time:
                        previous_delta_time = now_delta_time.days * 24 * 3600 + now_delta_time.seconds
                        if previous_delta_time < cutoff:
                            msg_item[1]['fault_time'] = label_item[1]['time']
                            msg_item[1]['label'] = label_item[1]['label']
            log_correspond_label_df = pd.concat([log_correspond_label_df, msg_df])
            # 没有日志的标签，直接将标签的可用信息作为日志
            for label_item in label_df.iterrows():
                if len(msg_df[(msg_df['fault_time'] == label_item[1]['time']) & (
                    msg_df['label'] == label_item[1]['label'])]) == 0:
                    label_item[1]['fault_time'] = label_item[1]['time']
            log_correspond_label_df = pd.concat([log_correspond_label_df, label_df])
    if len(log_correspond_label_df[log_correspond_label_df['label'] == '']) > 0:
        no_label_log_list.append(log_correspond_label_df[log_correspond_label_df['label'] == ''])
    log_correspond_label_df = log_correspond_label_df[log_correspond_label_df['fault_time'] != '']
    return log_correspond_label_df, no_label_log_list



# sn分组后，本次报错和上次报错之间的日志匹配到本次报错
def divideLogByFaultTime(log_label_df: pd.DataFrame):
    log_correspond_label_df = pd.DataFrame(columns=['sn', 'fault_time', 'msg', 'time', 'server_model', 'label'])
    no_label_log_list = []
    log_label_df =  log_label_df.reset_index(drop = True)
    
    for sn, log in log_label_df.groupby('sn'):
        if len(log[log['label'] != '']) == 0:
            no_label_log_list.append(log)
        elif len(log[log['label'] != '']) == 1:
            msg_df = log[log['label'] == '']
            msg_df['label'] = log[log['label'] != '']['label'].iloc[0]
            msg_df['fault_time'] = log[log['label'] != '']['time'].iloc[0]
            log_correspond_label_df = pd.concat([log_correspond_label_df, msg_df])
        else:
            # 使用index的顺序取数时，要注意index必须按所需的顺序排列
            cutoff_index = [-1] + log.loc[log['label'] != ''].index.tolist() + [log.index.tolist()[-1]+1]
            for kth in range(len(cutoff_index)-1):
                temp_log = log.loc[(log.index <= cutoff_index[kth+1]) & (log.index > cutoff_index[kth])]
                if len(temp_log) > 0:
                    if len(temp_log[temp_log['label'] != '']) == 0:
                        no_label_log_list.append(temp_log)
                    # 只有标签，没有日志的数据，把标签的部分数据直接作为日志
                    elif len(temp_log) == 1:
                        msg_df = temp_log
                        msg_df['fault_time'] = temp_log[temp_log['label'] != '']['time'].iloc[0]
                        log_correspond_label_df = pd.concat([log_correspond_label_df, msg_df])
                    else:
                        msg_df = temp_log[temp_log['label'] == '']
                        msg_df['label'] = temp_log[temp_log['label'] != '']['label'].iloc[0]
                        msg_df['fault_time'] = temp_log[temp_log['label'] != '']['time'].iloc[0]
                        log_correspond_label_df = pd.concat([log_correspond_label_df, msg_df])
    return log_correspond_label_df, no_label_log_list



# 计算统计特征
def calculateStatisticFeature(log_correspond_label_df: pd.DataFrame):
    use_log_label_df = log_correspond_label_df

    use_log_label_df['msg_hour'] = use_log_label_df['time'].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S").hour)
    use_log_label_df['msg_minute'] = use_log_label_df['time'].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S").minute)
    use_log_label_df['fault_hour'] = use_log_label_df['fault_time'].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S").hour)
    use_log_label_df['fault_minute'] = use_log_label_df['fault_time'].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S").minute)

    # 0408新增
    # 最近一次日志时间距报错时间间隔，单位秒
    nearest_msg_fault_time_delta_list = []
    # 日志不去重时长度1,2,3,4日志数量统计
    all_msg_1_cnt_list=[]
    all_msg_2_cnt_list=[]
    all_msg_3_cnt_list=[]
    all_msg_4_cnt_list=[]
    
    fault_minute_list = []
    msg_1_cnt_list=[]
    msg_2_cnt_list=[]
    msg_3_cnt_list=[]
    msg_4_cnt_list=[]
    msg_hour_max_list=[]
    msg_hour_min_list=[]
    msg_hour_avg_list=[]
    msg_hour_median_list=[]
    msg_hour_mode_list=[]
    msg_minute_max_list=[]
    msg_minute_min_list=[]
    msg_minute_avg_list=[]
    msg_minute_median_list=[]
    msg_minute_mode_list=[]

    sn_list=[]
    day_list=[]
    server_model_list=[]
    msg_log_list=[]
    msg_cnt_list=[]
    fault_hour_list=[]
    label_list=[]
    fault_time_list=[]
    for msg_log_df in use_log_label_df.groupby(['sn','fault_time','label']):
        msg_log_str = ''
        all_msg_1_cnt = 0
        all_msg_2_cnt = 0
        all_msg_3_cnt = 0
        all_msg_4_cnt = 0
        msg_1_cnt = 0
        msg_2_cnt = 0
        msg_3_cnt = 0
        msg_4_cnt = 0
        for info in msg_log_df[1]['msg']:
            if info == info:
                if len(info.split('|')) == 1:
                    all_msg_1_cnt += 1
                elif len(info.split('|')) == 2:
                    all_msg_2_cnt += 1
                elif len(info.split('|')) == 3:
                    all_msg_3_cnt += 1
                else:
                    all_msg_4_cnt += 1
        for info in msg_log_df[1]['msg'].drop_duplicates():
            if info == info:
                msg_log_str=msg_log_str+info.lower()+'.'
                if len(info.split('|')) == 1:
                    msg_1_cnt += 1
                elif len(info.split('|')) == 2:
                    msg_2_cnt += 1
                elif len(info.split('|')) == 3:
                    msg_3_cnt += 1
                else:
                    msg_4_cnt += 1
        nearest_msg_fault_time_delta = abs(datetime.strptime(msg_log_df[1].iloc[-1]['time'],'%Y-%m-%d %H:%M:%S'
                        ) - datetime.strptime(msg_log_df[0][1],'%Y-%m-%d %H:%M:%S'))
        nearest_msg_fault_time_delta = nearest_msg_fault_time_delta.days * 24 * 3600 + nearest_msg_fault_time_delta.seconds
        sm=int(msg_log_df[1].iloc[0]['server_model'][2:])

        sn_list.append(msg_log_df[0][0])
        fault_time_list.append(msg_log_df[0][1])
        label_list.append(msg_log_df[0][2])

        nearest_msg_fault_time_delta_list.append(nearest_msg_fault_time_delta)
        server_model_list.append(sm)
        msg_log_list.append(msg_log_str)
        msg_cnt_list.append(len(msg_log_df[1]))

        fault_hour_list.append(msg_log_df[1].iloc[0]['fault_hour'])
        fault_minute_list.append(msg_log_df[1].iloc[0]['fault_minute'])

        all_msg_1_cnt_list.append(all_msg_1_cnt)
        all_msg_2_cnt_list.append(all_msg_2_cnt)
        all_msg_3_cnt_list.append(all_msg_3_cnt)
        all_msg_4_cnt_list.append(all_msg_4_cnt)    

        msg_1_cnt_list.append(msg_1_cnt)
        msg_2_cnt_list.append(msg_2_cnt)
        msg_3_cnt_list.append(msg_3_cnt)
        msg_4_cnt_list.append(msg_4_cnt)

        msg_hour_max_list.append(msg_log_df[1]['msg_hour'].max())
        msg_hour_min_list.append(msg_log_df[1]['msg_hour'].min())
        msg_hour_avg_list.append(msg_log_df[1]['msg_hour'].mean())
        msg_hour_median_list.append(msg_log_df[1]['msg_hour'].median())
        msg_hour_mode_list.append(msg_log_df[1]['msg_hour'].mode()[0])

        msg_minute_max_list.append(msg_log_df[1]['msg_minute'].max())
        msg_minute_min_list.append(msg_log_df[1]['msg_minute'].min())
        msg_minute_avg_list.append(msg_log_df[1]['msg_minute'].mean())
        msg_minute_median_list.append(msg_log_df[1]['msg_minute'].median())
        msg_minute_mode_list.append(msg_log_df[1]['msg_minute'].mode()[0])

    msg_log_label_df=pd.DataFrame(
        {
        'sn': sn_list,
        'fault_time': fault_time_list,
        'server_model': server_model_list,
        'msg_cnt': msg_cnt_list,
        'fault_hour': fault_hour_list,
        'fault_minute': fault_minute_list,
        'nearest_msg_fault_time_delta': nearest_msg_fault_time_delta_list,
        'all_msg_1_cnt': all_msg_1_cnt_list,
        'all_msg_2_cnt': all_msg_2_cnt_list,
        'all_msg_3_cnt': all_msg_3_cnt_list,
        'all_msg_4_cnt': all_msg_4_cnt_list,
        'msg_1_cnt': msg_1_cnt_list,
        'msg_2_cnt': msg_2_cnt_list,
        'msg_3_cnt': msg_3_cnt_list,
        'msg_4_cnt': msg_4_cnt_list,
        'msg_hour_max': msg_hour_max_list,
        'msg_hour_min': msg_hour_min_list,
        'msg_hour_avg': msg_hour_avg_list,
        'msg_hour_median': msg_hour_median_list,
        'msg_hour_mode': msg_hour_mode_list,
        'msg_minute_max': msg_minute_max_list,
        'msg_minute_min': msg_minute_min_list,
        'msg_minute_avg': msg_minute_avg_list,
        'msg_minute_median': msg_minute_median_list,
        'msg_minute_mode': msg_minute_mode_list,
        'msg_log': msg_log_list,
        'label': label_list
        }
    )
    return msg_log_label_df




## B榜最新数据的数据分析

之前的数据没变，新增测试集数据：

新增日志数据preliminary_sel_log_dataset_b.csv

新增报错数据preliminary_submit_dataset_b.csv

In [2]:
# 读取sel日志数据
sel_log_df = pd.read_csv('./pre_contest/dataset_B/preliminary_sel_log_dataset.csv').drop_duplicates()
# 读取训练标签数据：有重复数据！
train_label1 = pd.read_csv('./pre_contest/dataset_B/preliminary_train_label_dataset.csv')
train_label2 = pd.read_csv('./pre_contest/dataset_B/preliminary_train_label_dataset_s.csv')

# 
additional_sel_log_dataset = pd.read_csv('./pre_contest/dataset_B/additional_sel_log_dataset.csv')

#
preliminary_sel_log_dataset_a = pd.read_csv('./pre_contest/dataset_B/preliminary_sel_log_dataset_a.csv')

# 
preliminary_submit_dataset_a = pd.read_csv('./pre_contest/dataset_B/preliminary_submit_dataset_a.csv')

# 新增
preliminary_sel_log_dataset_b = pd.read_csv('./pre_contest/dataset_B/preliminary_sel_log_dataset_b.csv')

# 新增
preliminary_submit_dataset_b = pd.read_csv('./pre_contest/dataset_B/preliminary_submit_dataset_b.csv')

In [10]:
preliminary_submit_dataset_b

Unnamed: 0,sn,fault_time
0,0015fe530ad4,2020-05-01 23:48:17
1,00380f1435b0,2020-07-28 07:51:13
2,0045a71d0221,2020-07-02 06:33:54
3,004d5a7954e7,2020-08-24 08:27:55
4,004d5a7954e7,2020-08-24 09:42:45
...,...,...
3025,ff92904d80f2,2020-08-28 16:48:54
3026,ff9f7a9c5c7e,2020-04-14 08:13:55
3027,ffc2f37539a9,2020-02-21 22:52:54
3028,fff12c95cc99,2020-05-20 10:27:43


## 采用按照时间最近邻+截止时间间隔的方式匹配日志和标签

In [250]:
# 读取sel日志数据
sel_log_df = pd.read_csv('./pre_contest/dataset_B/preliminary_sel_log_dataset.csv').drop_duplicates()
# 读取训练标签数据：有重复数据！
train_label1 = pd.read_csv('./pre_contest/dataset_B/preliminary_train_label_dataset.csv')
train_label2 = pd.read_csv('./pre_contest/dataset_B/preliminary_train_label_dataset_s.csv')
train_label_df = pd.concat([train_label1,train_label2],axis=0).drop_duplicates()


# 合并日志和标签
sel_log_df['label'] = ''
train_label_df['time'] = train_label_df['fault_time']
train_label_df['msg'] = ''
train_label_df['server_model'] = train_label_df['sn'].map(dict(zip(sel_log_df['sn'],sel_log_df['server_model'])))
train_label_df = train_label_df[['sn', 'time', 'msg', 'server_model', 'label']]

log_label_df = pd.concat([sel_log_df,train_label_df], axis = 0).sort_values(by = 'time')
log_label_df['fault_time'] = ''
log_label_df = log_label_df[['sn', 'fault_time', 'msg', 'time', 'server_model', 'label']]

In [251]:
train_label_df

Unnamed: 0,sn,time,msg,server_model,label
0,SERVER_25698,2020-10-09 13:43:00,,SM0,0
1,SERVER_25699,2020-08-25 18:50:00,,SM3,0
2,SERVER_25712,2020-03-16 13:20:00,,SM4,0
3,SERVER_25708,2020-07-25 12:44:00,,SM4,0
4,SERVER_25711,2020-03-16 16:51:00,,SM4,0
...,...,...,...,...,...
4404,SERVER_24971,2020-03-04 21:09:00,,SM102,3
4405,SERVER_24971,2020-11-12 20:49:00,,SM102,3
4406,SERVER_24962,2020-09-12 12:18:00,,SM102,3
4407,SERVER_24971,2020-10-04 17:41:00,,SM102,3


In [252]:
NearestTime_log_correspond_label_df, NearestTime_no_label_log_list = divideLogByNearestTime(log_label_df)
# FaultTime_log_correspond_label_df, FaultTime_no_label_log_list = divideLogByFaultTime(log_label_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  msg_df['label'] = log[log['label'] != '']['label'].iloc[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  msg_df['fault_time'] = log[log['label'] != '']['time'].iloc[0]


In [183]:
NearestTime_log_correspond_label_df
# 476571 rows × 6 columns
NearestTime_log_correspond_label_df.groupby(['sn','fault_time','label']).size()

sn            fault_time           label
SERVER_10001  2020-05-01 10:04:00  1          9
SERVER_10003  2020-03-28 09:48:00  2        117
SERVER_10008  2020-02-25 16:12:00  1          5
              2020-03-11 18:04:00  2          4
SERVER_10009  2020-05-08 16:37:00  3          4
                                           ... 
SERVER_9991   2020-08-04 22:49:00  2          2
              2020-10-07 18:42:00  2          1
SERVER_9993   2020-05-14 23:50:00  2          2
SERVER_9998   2020-05-29 11:25:00  2          2
SERVER_9999   2020-10-13 02:57:00  2         16
Length: 16604, dtype: int64

In [182]:
FaultTime_log_correspond_label_df
# 481752 rows × 6 columns
FaultTime_log_correspond_label_df.groupby(['sn','fault_time','label']).size()

sn            fault_time           label
SERVER_10001  2020-05-01 10:04:00  1          9
SERVER_10003  2020-03-28 09:48:00  2        117
SERVER_10008  2020-02-25 16:12:00  1          5
              2020-03-11 18:04:00  2          4
SERVER_10009  2020-05-08 16:37:00  3          4
                                           ... 
SERVER_9991   2020-08-04 22:49:00  2          2
              2020-10-07 18:42:00  2          1
SERVER_9993   2020-05-14 23:50:00  2          2
SERVER_9998   2020-05-29 11:25:00  2          2
SERVER_9999   2020-10-13 02:57:00  2         16
Length: 16604, dtype: int64

In [173]:
len(NearestTime_no_label_log_list)

0

In [174]:
len(FaultTime_no_label_log_list)

135

## 计算时间和日志数的统计特征

In [253]:
# 使用最近邻时间进行划分
msg_log_label_df = calculateStatisticFeature(NearestTime_log_correspond_label_df)

In [192]:
# 使用报错时间截断进行划分
msg_log_label_df = calculateStatisticFeature(FaultTime_log_correspond_label_df)

In [254]:
msg_log_label_df
# 16014 rows × 27 columns

Unnamed: 0,sn,fault_time,server_model,msg_cnt,fault_hour,fault_minute,nearest_msg_fault_time_delta,all_msg_1_cnt,all_msg_2_cnt,all_msg_3_cnt,...,msg_hour_avg,msg_hour_median,msg_hour_mode,msg_minute_max,msg_minute_min,msg_minute_avg,msg_minute_median,msg_minute_mode,msg_log,label
0,SERVER_10001,2020-05-01 10:04:00,57,9,10,4,3497,0,0,9,...,8.333333,8.0,8,59,0,38.333333,54.0,59,processor cpu0_status | ierr | asserted. proc...,1
1,SERVER_10003,2020-03-28 09:48:00,57,117,9,48,16,0,0,117,...,9.000000,9.0,9,48,45,46.444444,46.0,46,memory cpu1d0_dimm_stat | correctable ecc | a...,2
2,SERVER_10008,2020-02-25 16:12:00,53,5,16,12,1172,0,0,5,...,15.000000,15.0,15,52,51,51.600000,52.0,52,processor cpu0_status | configuration error |...,1
3,SERVER_10008,2020-03-11 18:04:00,53,3,18,4,4611,0,0,3,...,16.000000,16.0,16,47,46,46.333333,46.0,46,memory dimm050_stat | uncorrectable ecc | ass...,2
4,SERVER_10009,2020-05-08 16:37:00,53,4,16,37,1757,0,0,4,...,16.000000,16.0,16,7,7,7.000000,7.0,7,drive slot hdd_l_14_status | drive fault | as...,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16599,SERVER_9991,2020-08-04 22:49:00,56,2,22,49,127,0,0,2,...,20.500000,20.5,19,52,46,49.000000,49.0,46,memory cpu0a0_dimm_stat | correctable ecc | a...,2
16600,SERVER_9991,2020-10-07 18:42:00,56,1,18,42,1659,0,0,1,...,18.000000,18.0,18,14,14,14.000000,14.0,14,memory cpu1a0_dimm_stat | correctable ecc | a...,2
16601,SERVER_9993,2020-05-14 23:50:00,57,2,23,50,119,0,0,2,...,23.000000,23.0,23,48,43,45.500000,45.5,43,memory cpu1f0_dimm_stat | correctable ecc | a...,2
16602,SERVER_9998,2020-05-29 11:25:00,57,2,11,25,324,0,0,2,...,11.000000,11.0,11,19,4,11.500000,11.5,4,memory cpu1e1_dimm_stat | correctable ecc | a...,2


## 读取日志和标签数据

In [255]:
# msg_log_label_df = pd.read_csv('./pre_contest/v1p8/msg_log_label_df.csv',sep=',')
msg_log_list=list(msg_log_label_df['msg_log'])
label_list=list(msg_log_label_df['label'])
msg_log_label_df

Unnamed: 0,sn,fault_time,server_model,msg_cnt,fault_hour,fault_minute,nearest_msg_fault_time_delta,all_msg_1_cnt,all_msg_2_cnt,all_msg_3_cnt,...,msg_hour_avg,msg_hour_median,msg_hour_mode,msg_minute_max,msg_minute_min,msg_minute_avg,msg_minute_median,msg_minute_mode,msg_log,label
0,SERVER_10001,2020-05-01 10:04:00,57,9,10,4,3497,0,0,9,...,8.333333,8.0,8,59,0,38.333333,54.0,59,processor cpu0_status | ierr | asserted. proc...,1
1,SERVER_10003,2020-03-28 09:48:00,57,117,9,48,16,0,0,117,...,9.000000,9.0,9,48,45,46.444444,46.0,46,memory cpu1d0_dimm_stat | correctable ecc | a...,2
2,SERVER_10008,2020-02-25 16:12:00,53,5,16,12,1172,0,0,5,...,15.000000,15.0,15,52,51,51.600000,52.0,52,processor cpu0_status | configuration error |...,1
3,SERVER_10008,2020-03-11 18:04:00,53,3,18,4,4611,0,0,3,...,16.000000,16.0,16,47,46,46.333333,46.0,46,memory dimm050_stat | uncorrectable ecc | ass...,2
4,SERVER_10009,2020-05-08 16:37:00,53,4,16,37,1757,0,0,4,...,16.000000,16.0,16,7,7,7.000000,7.0,7,drive slot hdd_l_14_status | drive fault | as...,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16599,SERVER_9991,2020-08-04 22:49:00,56,2,22,49,127,0,0,2,...,20.500000,20.5,19,52,46,49.000000,49.0,46,memory cpu0a0_dimm_stat | correctable ecc | a...,2
16600,SERVER_9991,2020-10-07 18:42:00,56,1,18,42,1659,0,0,1,...,18.000000,18.0,18,14,14,14.000000,14.0,14,memory cpu1a0_dimm_stat | correctable ecc | a...,2
16601,SERVER_9993,2020-05-14 23:50:00,57,2,23,50,119,0,0,2,...,23.000000,23.0,23,48,43,45.500000,45.5,43,memory cpu1f0_dimm_stat | correctable ecc | a...,2
16602,SERVER_9998,2020-05-29 11:25:00,57,2,11,25,324,0,0,2,...,11.000000,11.0,11,19,4,11.500000,11.5,4,memory cpu1e1_dimm_stat | correctable ecc | a...,2


## 读取v1_baseline用的词和v1p1的新词

In [256]:
v1_word_list=list(pd.read_csv('pre_contest/v1p2/word_frequency_df.txt',sep='\t')['word'])
v1p1_word_list=list(pd.read_csv('pre_contest/v1p2/tags_incomplete.txt',sep='\t',names=['word'])['word'])
v1p2_word_list=list(set(v1_word_list+v1p1_word_list))

In [257]:
len(v1p2_word_list)

2087

## 训练词频向量

In [258]:
frequency_vector_list = []
tag=0
for word in v1p2_word_list:
    if tag%100==0:
        print(tag,datetime.now())
    pattern=re.compile(word)
    frequency_vector = [len(re.findall(pattern,log))  for log in msg_log_list]
    frequency_vector_list.append(frequency_vector)
    tag+=1

0 2022-04-10 19:16:10.549576
100 2022-04-10 19:16:12.330815
200 2022-04-10 19:16:14.187890
300 2022-04-10 19:16:15.934981
400 2022-04-10 19:16:17.644685
500 2022-04-10 19:16:19.403541
600 2022-04-10 19:16:21.137708
700 2022-04-10 19:16:22.896081
800 2022-04-10 19:16:24.802960
900 2022-04-10 19:16:26.533557
1000 2022-04-10 19:16:28.336136
1100 2022-04-10 19:16:30.049310
1200 2022-04-10 19:16:31.702079
1300 2022-04-10 19:16:33.403148
1400 2022-04-10 19:16:35.193203
1500 2022-04-10 19:16:36.915053
1600 2022-04-10 19:16:38.598680
1700 2022-04-10 19:16:40.319796
1800 2022-04-10 19:16:42.039373
1900 2022-04-10 19:16:43.775670
2000 2022-04-10 19:16:45.530355


In [259]:
frequency_vector_df=pd.DataFrame(frequency_vector_list)
frequency_vector_df=frequency_vector_df.T
frequency_vector_df.columns=v1p2_word_list
new_feature_list=list(msg_log_label_df.columns)[2:-2]
frequency_vector_df[new_feature_list]=msg_log_label_df[new_feature_list]

frequency_vector_df['label']=label_list
frequency_vector_df[['sn','fault_time']]=msg_log_label_df[['sn','fault_time']]
feature=np.array(frequency_vector_df[v1p2_word_list+new_feature_list])

In [260]:
frequency_vector_df

Unnamed: 0,cpu0c0_dimm_stat,f12,deasserted fan,bp4_hdd14_status,hdd71,hdd46,cpu0c1_dimm_stat,drive slot / bay front status,hdd_r_29_status,memory memory error,...,msg_hour_median,msg_hour_mode,msg_minute_max,msg_minute_min,msg_minute_avg,msg_minute_median,msg_minute_mode,label,sn,fault_time
0,0,0,0,0,0,0,0,0,0,0,...,8.0,8,59,0,38.333333,54.0,59,1,SERVER_10001,2020-05-01 10:04:00
1,0,0,0,0,0,0,0,0,0,0,...,9.0,9,48,45,46.444444,46.0,46,2,SERVER_10003,2020-03-28 09:48:00
2,0,0,0,0,0,0,0,0,0,0,...,15.0,15,52,51,51.600000,52.0,52,1,SERVER_10008,2020-02-25 16:12:00
3,0,0,0,0,0,0,0,0,0,0,...,16.0,16,47,46,46.333333,46.0,46,2,SERVER_10008,2020-03-11 18:04:00
4,0,0,0,0,0,0,0,0,0,0,...,16.0,16,7,7,7.000000,7.0,7,3,SERVER_10009,2020-05-08 16:37:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16599,0,0,0,0,0,0,0,0,0,0,...,20.5,19,52,46,49.000000,49.0,46,2,SERVER_9991,2020-08-04 22:49:00
16600,0,0,0,0,0,0,0,0,0,0,...,18.0,18,14,14,14.000000,14.0,14,2,SERVER_9991,2020-10-07 18:42:00
16601,0,0,0,0,0,0,0,0,0,0,...,23.0,23,48,43,45.500000,45.5,43,2,SERVER_9993,2020-05-14 23:50:00
16602,0,0,0,0,0,0,0,0,0,0,...,11.0,11,19,4,11.500000,11.5,4,2,SERVER_9998,2020-05-29 11:25:00


## 训练xgb模型

In [261]:
# xgb模型参数
xgb_params = {
    'booster':'gbtree',
    'objective':'multi:softmax',   # 多分类问题
    'num_class':4,  # 类别数，与multi softmax并用
    'gamma':0.1,    # 用于控制是否后剪枝的参数，越大越保守，一般0.1 0.2的样子
    'max_depth':6,  # 构建树的深度，越大越容易过拟合
    'lambda':2,  # 控制模型复杂度的权重值的L2 正则化项参数，参数越大，模型越不容易过拟合
    'subsample':1, # 随机采样训练样本
    'colsample_bytree':1,# 这个参数默认为1，是每个叶子里面h的和至少是多少
    # 对于正负样本不均衡时的0-1分类而言，假设h在0.01附近，min_child_weight为1
    #意味着叶子节点中最少需要包含100个样本。这个参数非常影响结果，
    # 控制叶子节点中二阶导的和的最小值，该参数值越小，越容易过拟合
    'silent':0,  # 设置成1 则没有运行信息输入，最好是设置成0
    'eta':0.3,  # 如同学习率
    'seed':1000,
    'nthread':16,  #CPU线程数
    #'eval_metric':'auc'
}

# 指标评估
def macro_f1(label,prediction)  -> float:

    """
    计算得分
    :param target_df: [sn,fault_time,label]
    :param submit_df: [sn,fault_time,label]
    :return:
    """

    weights =  [3  /  7,  2  /  7,  1  /  7,  1  /  7]
    macro_F1 =  0.
    for i in  range(len(weights)):
        TP =  np.sum((label==i) & (prediction==i))
        FP =  np.sum((label!= i) & (prediction == i))
        FN =  np.sum((label == i) & (prediction!= i))
        precision = TP /  (TP + FP)  if  (TP + FP)  >  0  else  0
        recall = TP /  (TP + FN)  if  (TP + FN)  >  0  else  0
        F1 =  2  * precision * recall /  (precision + recall)  if  (precision + recall)  >  0  else  0
        macro_F1 += weights[i]  * F1
        
        print('Task %d:\n Prcesion %.2f, Recall %.2f, F1 %.2f' % (i+1, precision, recall, F1))
        
    return macro_F1

In [262]:
random.seed(0)
label=np.array(label_list)
val_mask = [random.random() < 0.3 for _ in range(len(feature))]
train_mask = [not xx for xx in val_mask]
val_feature = feature[val_mask]
val_label = label[val_mask]
train_feature = feature[train_mask]
train_label = label[train_mask]
train_data=xgb.DMatrix(train_feature,label=train_label)
train_feature=xgb.DMatrix(train_feature)
val_feature=xgb.DMatrix(val_feature)

In [263]:
xgb_model=xgb.train(xgb_params,train_data,num_boost_round=500)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [264]:
train_pred=xgb_model.predict(train_feature)
val_pred=xgb_model.predict(val_feature)

In [265]:
macro_f1(train_label,train_pred)

Task 1:
 Prcesion 1.00, Recall 1.00, F1 1.00
Task 2:
 Prcesion 1.00, Recall 1.00, F1 1.00
Task 3:
 Prcesion 1.00, Recall 1.00, F1 1.00
Task 4:
 Prcesion 1.00, Recall 1.00, F1 1.00


0.9986586832723325

In [266]:
macro_f1(val_label,val_pred)
#  报错
# 0.6695408389371176
#  最近邻
# 0.6539482152175896   1000 * 24
# 0.6595777908225926   10

Task 1:
 Prcesion 0.59, Recall 0.34, F1 0.43
Task 2:
 Prcesion 0.71, Recall 0.78, F1 0.74
Task 3:
 Prcesion 0.93, Recall 0.96, F1 0.95
Task 4:
 Prcesion 0.90, Recall 0.90, F1 0.90


0.6595777908225926

## 保存验证集，sn+fault_time+日志+标签+重要性前50特征
## 保存验证集类别0_1的验证集，sn+fault_time+日志+标签+重要性前50特征

In [206]:
total_gain_dict=xgb_model.get_score(importance_type='total_gain')
total_gain_dict=dict(sorted(total_gain_dict.items(),key=lambda item:item[1],reverse=True))
feature_names=list(frequency_vector_df.columns)
feature_importance_top_50_dict={}
i=0
for key in total_gain_dict:
    if i<50:
        feature_name=feature_names[int(key[1:])]
        feature_importance_top_50_dict[feature_name]=total_gain_dict[key]
        i+=1
    else:
        break
feature_importance_top_50_dict

{'memory': 11394.44808374511,
 'processor': 5348.197021740199,
 'nearest_msg_fault_time_delta': 3986.4107156517935,
 'or': 3770.4642459560014,
 'e': 2647.828372532794,
 'cpu': 2062.0292756921026,
 'mcerr': 1609.3938167659999,
 'server_model': 1234.664254869728,
 'caterr': 861.5511079291002,
 'fault_minute': 698.3931628420004,
 'msg_minute_avg': 611.3326207036007,
 'msg_minute_max': 575.2656649697997,
 'msg_minute_min': 567.434461238399,
 'msg_cnt': 565.5632041980009,
 'fault_hour': 428.51188517470035,
 'ecc': 423.77903267870016,
 'msg_minute_mode': 412.45389282460013,
 'all_msg_3_cnt': 378.6040032361001,
 'msg_hour_avg': 375.59327236489975,
 'deasserted': 370.1253821231,
 'msg_minute_median': 358.0345179657002,
 'uncorrectable ecc': 323.07546452299994,
 'msg_hour_max': 311.85440915660007,
 'msg_hour_min': 298.87588057629966,
 'sta': 277.58500118130024,
 'c': 253.79877627579987,
 'config': 244.84959305039993,
 'controller': 241.39198889449995,
 'configuration': 206.7108899227001,
 '0': 

In [207]:
validation_df = frequency_vector_df[['sn','fault_time']][val_mask]
validation_df['msg_log'] = msg_log_label_df['msg_log'][val_mask]
validation_df['label'] = val_label
validation_df['prediction'] = val_pred
validation_df[list(feature_importance_top_50_dict.keys())] = frequency_vector_df[list(feature_importance_top_50_dict.keys())]
validation_df.to_csv('./pre_contest/v1p8/validation_df.csv',sep=',',index=None)
_0_1_validation_df = validation_df[validation_df['label'].isin([0,1])]
_0_1_validation_df.to_csv('./pre_contest/v1p8/_0_1_validation_df.csv',sep=',',index=None)

In [208]:
_0_1_validation_df

Unnamed: 0,sn,fault_time,msg_log,label,prediction,memory,processor,nearest_msg_fault_time_delta,or,e,...,in,microcontroller/coprocessor,cpu_caterr,power,msg_hour_median,on,status,system,0x19,failure detected
40,SERVER_10082,2020-08-12 20:09:00,processor cpu1_status | ierr | deasserted. sy...,1,1.0,0,1,3753,2,21,...,3,0,0,3,19.0,0,3,3,0,0
43,SERVER_10087,2020-01-23 04:51:00,processor cpu0_status | ierr | asserted. proc...,1,1.0,0,2,3893,2,8,...,0,0,0,0,3.0,0,2,0,0,0
48,SERVER_10094,2020-04-02 10:22:00,processor cpu1_status | ierr | asserted. proc...,1,1.0,0,4,1439,9,60,...,3,0,0,2,9.0,0,6,5,0,0
167,SERVER_102,2020-03-06 02:22:00,processor cpu2 status | uncorrectable machine...,1,1.0,0,2,4104,4,26,...,3,0,0,0,1.0,2,2,2,0,0
249,SERVER_10272,2020-01-13 18:42:00,processor cpu1_status | ierr | asserted. proc...,1,1.0,0,3,3065,4,23,...,1,0,0,2,17.0,0,5,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16358,SERVER_9729,2020-08-24 18:18:00,processor cpu0_status | ierr | asserted. proc...,1,1.0,0,4,3202,9,66,...,5,0,0,3,17.0,0,6,6,0,0
16359,SERVER_9730,2020-10-03 18:33:00,system boot initiated bios_boot_up | initiate...,0,1.0,0,1,1479,2,11,...,2,0,0,0,17.0,2,1,1,0,0
16368,SERVER_9747,2020-06-12 04:04:00,processor cpu_caterr | state asserted | asser...,1,1.0,0,1,3354,2,17,...,1,0,1,2,3.0,0,2,2,0,0
16406,SERVER_9780,2020-04-15 05:47:00,memory cpu1b0_dimm_stat | correctable ecc | d...,1,1.0,2,1,3849,6,24,...,2,0,0,0,4.0,2,1,1,0,0


# 用所有数据进行训练、保存模型、训练集

In [209]:
all_data=xgb.DMatrix(feature,label)
xgb_model_v1p8=xgb.train(xgb_params,all_data,num_boost_round=500)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [210]:
frequency_vector_df.head()

Unnamed: 0,cpu0c0_dimm_stat,f12,deasserted fan,bp4_hdd14_status,hdd71,hdd46,cpu0c1_dimm_stat,drive slot / bay front status,hdd_r_29_status,memory memory error,...,msg_hour_median,msg_hour_mode,msg_minute_max,msg_minute_min,msg_minute_avg,msg_minute_median,msg_minute_mode,label,sn,fault_time
0,0,0,0,0,0,0,0,0,0,0,...,8.0,8,59,0,38.333333,54.0,59,1,SERVER_10001,2020-05-01 10:04:00
1,0,0,0,0,0,0,0,0,0,0,...,9.0,9,48,45,46.444444,46.0,46,2,SERVER_10003,2020-03-28 09:48:00
2,0,0,0,0,0,0,0,0,0,0,...,15.0,15,52,51,51.6,52.0,52,1,SERVER_10008,2020-02-25 16:12:00
3,0,0,0,0,0,0,0,0,0,0,...,16.0,16,52,46,47.75,46.5,46,2,SERVER_10008,2020-03-11 18:04:00
4,0,0,0,0,0,0,0,0,0,0,...,16.0,16,7,7,7.0,7.0,7,3,SERVER_10009,2020-05-08 16:37:00


In [211]:
frequency_vector_df.to_csv('./pre_contest/v1p8/frequency_vector_df.csv',sep=',',index=None)
file = open('./pre_contest/v1p8/xgb_model_v1p8.model','wb')
pickle.dump(xgb_model_v1p8, file)

## 查看特征重要性、保存特征重要性前300的特征

In [212]:
total_gain_dict=xgb_model_v1p8.get_score(importance_type='total_gain')
total_gain_dict=dict(sorted(total_gain_dict.items(),key=lambda item:item[1],reverse=True))
feature_names=list(frequency_vector_df.columns)
feature_importance_top_300_dict={}
i=0
for key in total_gain_dict:
    if i<300:
        feature_name=feature_names[int(key[1:])]
        feature_importance_top_300_dict[feature_name]=total_gain_dict[key]
        i+=1
    else:
        break
feature_importance_top_300_dict

{'memory': 16488.79234242591,
 'processor': 7683.176529273502,
 'nearest_msg_fault_time_delta': 5882.299677995206,
 'or': 5620.9597168625005,
 'cpu': 3225.725822095199,
 'e': 3211.270232207604,
 'mcerr': 2195.3234057532,
 'server_model': 1811.7028504106966,
 'fault_minute': 1084.6685013220015,
 'caterr': 959.4544280596,
 'msg_minute_avg': 889.0359567916006,
 'msg_minute_min': 818.7301254269005,
 'msg_minute_max': 787.4509459530992,
 'ecc': 701.3680745077002,
 'fault_hour': 659.1478617397993,
 'msg_cnt': 658.3097819362993,
 'msg_hour_avg': 644.0338387658,
 'deasserted': 597.3668413853007,
 'all_msg_3_cnt': 596.8849083994,
 'msg_minute_mode': 570.4114441313004,
 'msg_minute_median': 530.9608613283003,
 'c': 528.1311200789005,
 'msg_hour_max': 451.4809337350001,
 'msg_hour_min': 410.08951292170076,
 'microcontroller/coprocessor': 405.8485730635,
 'uncorrectable ecc': 382.1537698276,
 'supply': 340.3359582994,
 'ec': 333.71975424110013,
 'config': 282.06349833300004,
 'sta': 280.2993159849

In [213]:
feature_importance_top_300_df=frequency_vector_df[['sn','fault_time','label']+list(feature_importance_top_300_dict.keys())]

In [102]:
# feature_importance_top_300_df.to_csv('./pre_contest/v1p5/feature_importance_top_300_df.csv',sep=',',index=None)

# 对初赛测试集进行预测

## 读取测试集数据、拼接数据

In [214]:
# 读取选手提交数据
submit_b = pd.read_csv('./pre_contest/dataset_B/preliminary_submit_dataset_b.csv')
submit_log = pd.read_csv('./pre_contest/dataset_B/preliminary_sel_log_dataset_b.csv')

In [215]:
submit_b.drop_duplicates()

Unnamed: 0,sn,fault_time
0,0015fe530ad4,2020-05-01 23:48:17
1,00380f1435b0,2020-07-28 07:51:13
2,0045a71d0221,2020-07-02 06:33:54
3,004d5a7954e7,2020-08-24 08:27:55
4,004d5a7954e7,2020-08-24 09:42:45
...,...,...
3025,ff92904d80f2,2020-08-28 16:48:54
3026,ff9f7a9c5c7e,2020-04-14 08:13:55
3027,ffc2f37539a9,2020-02-21 22:52:54
3028,fff12c95cc99,2020-05-20 10:27:43


In [216]:
submit_b.groupby(['sn','fault_time']).size()

sn            fault_time         
0015fe530ad4  2020-05-01 23:48:17    1
00380f1435b0  2020-07-28 07:51:13    1
0045a71d0221  2020-07-02 06:33:54    1
004d5a7954e7  2020-08-24 08:27:55    1
              2020-08-24 09:42:45    1
                                    ..
ff92904d80f2  2020-08-28 16:48:54    1
ff9f7a9c5c7e  2020-04-14 08:13:55    1
ffc2f37539a9  2020-02-21 22:52:54    1
fff12c95cc99  2020-05-20 10:27:43    1
fff42b378722  2020-05-06 15:02:20    1
Length: 3030, dtype: int64

In [217]:
submit_log.drop_duplicates()

Unnamed: 0,sn,time,msg,server_model
0,0015fe530ad4,2020-05-01 23:48:01,Memory #0xe2 | Correctable ECC | Asserted,SM49
1,0015fe530ad4,2020-05-01 23:47:38,Memory #0xe2 | Correctable ECC | Asserted,SM49
2,0015fe530ad4,2020-05-01 23:48:09,Memory #0xe2 | Correctable ECC | Asserted,SM49
3,0015fe530ad4,2020-05-01 23:47:54,Memory #0xe2 | Correctable ECC | Asserted,SM49
4,0015fe530ad4,2020-05-01 23:48:17,Memory #0xe2 | Correctable ECC | Asserted,SM49
...,...,...,...,...
11124,fff12c95cc99,2020-05-20 10:25:57,Processor CPU1_Status | IERR | Asserted,SM35
11125,fff42b378722,2020-05-06 15:01:31,Memory CPU0E0_DIMM_Stat | Correctable ECC | A...,SM35
11126,fff42b378722,2020-05-06 15:01:26,Memory CPU0E0_DIMM_Stat | Correctable ECC | A...,SM35
11127,fff42b378722,2020-05-06 15:01:32,Memory CPU0E0_DIMM_Stat | Correctable ECC | A...,SM35


In [218]:
len(set(submit_b['sn'].drop_duplicates()).intersection(set(submit_log['sn'].drop_duplicates())))

2891

In [219]:
submit_b['sn'].drop_duplicates()

0       0015fe530ad4
1       00380f1435b0
2       0045a71d0221
3       004d5a7954e7
5       005133de75a0
            ...     
3025    ff92904d80f2
3026    ff9f7a9c5c7e
3027    ffc2f37539a9
3028    fff12c95cc99
3029    fff42b378722
Name: sn, Length: 2891, dtype: object

In [220]:
submit_log['sn'].drop_duplicates()

0        0015fe530ad4
10       00380f1435b0
11       0045a71d0221
16       004d5a7954e7
21       005133de75a0
             ...     
11109    ff92904d80f2
11110    ff9f7a9c5c7e
11111    ffc2f37539a9
11122    fff12c95cc99
11125    fff42b378722
Name: sn, Length: 2891, dtype: object

In [221]:
# 合并日志和标签
submit_log['label'] = ''
submit_b['label'] = -1
submit_b['time'] = submit_b['fault_time']
submit_b['msg'] = ''
submit_b['server_model'] = submit_b['sn'].map(dict(zip(submit_log['sn'],submit_log['server_model'])))
submit_b = submit_b[['sn', 'time', 'msg', 'server_model', 'label']]

submit_log_label_df = pd.concat([submit_log,submit_b], axis = 0).sort_values(by = 'time')
submit_log_label_df['fault_time'] = ''
submit_log_label_df = submit_log_label_df[['sn', 'fault_time', 'msg', 'time', 'server_model', 'label']]

In [222]:
submit_log_label_df

Unnamed: 0,sn,fault_time,msg,time,server_model,label
3614,5312fd6f4690,,Processor CPU1_Status | Configuration Error |...,2019-12-28 00:44:47,SM35,
340,07c55eb2f4c6,,Processor CPU_CATERR | State Asserted | Asserted,2019-12-28 01:13:06,SM22,
336,07c55eb2f4c6,,System ACPI Power State ACPI_PWR_Status | Leg...,2019-12-28 01:13:44,SM22,
329,07c55eb2f4c6,,Processor CPU_CATERR | State Deasserted | Ass...,2019-12-28 01:13:44,SM22,
330,07c55eb2f4c6,,System ACPI Power State ACPI_PWR_Status | Leg...,2019-12-28 01:13:49,SM22,
...,...,...,...,...,...,...
2750,3e09124e81c4,,System Boot Initiated BIOS_Boot_UP | Initiate...,2020-11-25 20:02:17,SM66,
742,3e09124e81c4,,,2020-11-25 20:07:17,SM66,-1
812,11b9b85be36b,,Memory #0xe2 | Correctable ECC | Asserted,2020-11-25 20:59:42,SM49,
813,11b9b85be36b,,Memory #0xe2 | Correctable ECC | Asserted,2020-11-25 21:06:24,SM49,


In [223]:
submit_log_correspond_label_df, submit_no_label_log_list = divideLogByFaultTime(submit_log_label_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  msg_df['label'] = log[log['label'] != '']['label'].iloc[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  msg_df['fault_time'] = log[log['label'] != '']['time'].iloc[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  msg_df['label'] = temp_log[temp_log['label'] != '']['label'].iloc[0]
A value is try

In [224]:
submit_log_correspond_label_df
# 11166 rows × 6 columns

Unnamed: 0,sn,fault_time,msg,time,server_model,label
5121,0015fe530ad4,2020-05-01 23:48:17,Memory #0xe2 | Correctable ECC | Asserted,2020-05-01 23:47:08,SM49,-1
5122,0015fe530ad4,2020-05-01 23:48:17,Memory #0xe2 | Correctable ECC | Asserted,2020-05-01 23:47:15,SM49,-1
5123,0015fe530ad4,2020-05-01 23:48:17,Memory #0xe2 | Correctable ECC | Asserted,2020-05-01 23:47:22,SM49,-1
5124,0015fe530ad4,2020-05-01 23:48:17,Memory #0xe2 | Correctable ECC | Asserted,2020-05-01 23:47:30,SM49,-1
5125,0015fe530ad4,2020-05-01 23:48:17,Memory #0xe2 | Correctable ECC | Asserted,2020-05-01 23:47:38,SM49,-1
...,...,...,...,...,...,...
6002,fff12c95cc99,2020-05-20 10:27:43,Processor CPU1_Status | IERR | Asserted,2020-05-20 10:25:57,SM35,-1
5304,fff42b378722,2020-05-06 15:02:20,Memory CPU0E0_DIMM_Stat | Correctable ECC | A...,2020-05-06 09:09:50,SM35,-1
5318,fff42b378722,2020-05-06 15:02:20,Memory CPU0E0_DIMM_Stat | Correctable ECC | A...,2020-05-06 15:01:26,SM35,-1
5319,fff42b378722,2020-05-06 15:02:20,Memory CPU0E0_DIMM_Stat | Correctable ECC | A...,2020-05-06 15:01:31,SM35,-1


In [225]:
submit_log_correspond_label_df.groupby(['sn', 'fault_time']).size()

sn            fault_time         
0015fe530ad4  2020-05-01 23:48:17    10
00380f1435b0  2020-07-28 07:51:13     1
0045a71d0221  2020-07-02 06:33:54     5
004d5a7954e7  2020-08-24 08:27:55     5
              2020-08-24 09:42:45     1
                                     ..
ff92904d80f2  2020-08-28 16:48:54     1
ff9f7a9c5c7e  2020-04-14 08:13:55     1
ffc2f37539a9  2020-02-21 22:52:54    11
fff12c95cc99  2020-05-20 10:27:43     3
fff42b378722  2020-05-06 15:02:20     4
Length: 3030, dtype: int64

In [226]:
msg_cnt = 0
for item in submit_no_label_log_list:
    msg_cnt += len(item)
msg_cnt

2

## 计算统计特征

In [227]:
submit_log_label_df = calculateStatisticFeature(submit_log_correspond_label_df)
msg_log_list = list(submit_log_label_df['msg_log'])

## 计算词频特征向量

In [228]:
# 统计词频向量
submit_frequency_vector_list = []
tag=0
for word in v1p2_word_list:
    if tag%100==0:
        print(tag, datetime.now())
    pattern=re.compile(word)
    frequency_vector = [len(re.findall(pattern,log))  for log in msg_log_list]
    submit_frequency_vector_list.append(frequency_vector)
    tag+=1                                                              

0 2022-04-10 18:25:54.696155
100 2022-04-10 18:25:55.004544
200 2022-04-10 18:25:55.307047
300 2022-04-10 18:25:55.594008
400 2022-04-10 18:25:55.883789
500 2022-04-10 18:25:56.174592
600 2022-04-10 18:25:56.459353
700 2022-04-10 18:25:56.744794
800 2022-04-10 18:25:57.041933
900 2022-04-10 18:25:57.336735
1000 2022-04-10 18:25:57.618840
1100 2022-04-10 18:25:57.900732
1200 2022-04-10 18:25:58.178272
1300 2022-04-10 18:25:58.456290
1400 2022-04-10 18:25:58.733991
1500 2022-04-10 18:25:59.030711
1600 2022-04-10 18:25:59.306146
1700 2022-04-10 18:25:59.583213
1800 2022-04-10 18:25:59.859562
1900 2022-04-10 18:26:00.138753
2000 2022-04-10 18:26:00.413186


In [229]:
submit_frequency_vector_df=pd.DataFrame(submit_frequency_vector_list)
submit_frequency_vector_df=submit_frequency_vector_df.T
submit_frequency_vector_df.columns=v1p2_word_list
submit_frequency_vector_df[new_feature_list]=submit_log_label_df[new_feature_list]
submit_frequency_vector_df[['sn','fault_time']]=submit_log_label_df[['sn','fault_time']]
feature=np.array(submit_frequency_vector_df[v1p2_word_list+new_feature_list])  

# submit_frequency_vector_df.to_csv('./pre_contest/v1p8/submit_frequency_vector_df.csv',sep=',',index=None)

In [230]:
submit_frequency_vector_df

Unnamed: 0,cpu0c0_dimm_stat,f12,deasserted fan,bp4_hdd14_status,hdd71,hdd46,cpu0c1_dimm_stat,drive slot / bay front status,hdd_r_29_status,memory memory error,...,msg_hour_avg,msg_hour_median,msg_hour_mode,msg_minute_max,msg_minute_min,msg_minute_avg,msg_minute_median,msg_minute_mode,sn,fault_time
0,0,0,0,0,0,0,0,0,0,0,...,23.0,23.0,23,48,47,47.300000,47.0,47,0015fe530ad4,2020-05-01 23:48:17
1,0,0,0,0,0,0,0,0,0,0,...,7.0,7.0,7,16,16,16.000000,16.0,16,00380f1435b0,2020-07-28 07:51:13
2,0,0,0,0,0,0,0,0,0,0,...,5.0,5.0,5,31,28,30.400000,31.0,31,0045a71d0221,2020-07-02 06:33:54
3,0,0,0,0,0,0,0,0,0,0,...,8.0,8.0,8,13,7,8.200000,7.0,7,004d5a7954e7,2020-08-24 08:27:55
4,0,0,0,0,0,0,0,0,0,0,...,9.0,9.0,9,42,42,42.000000,42.0,42,004d5a7954e7,2020-08-24 09:42:45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3025,0,0,0,0,0,0,0,0,0,0,...,15.0,15.0,15,46,46,46.000000,46.0,46,ff92904d80f2,2020-08-28 16:48:54
3026,0,0,0,0,0,0,0,0,0,0,...,6.0,6.0,6,28,28,28.000000,28.0,28,ff9f7a9c5c7e,2020-04-14 08:13:55
3027,0,0,0,0,0,0,0,0,0,0,...,22.0,22.0,22,38,37,37.545455,38.0,38,ffc2f37539a9,2020-02-21 22:52:54
3028,0,0,0,0,0,0,0,0,0,0,...,10.0,10.0,10,25,25,25.000000,25.0,25,fff12c95cc99,2020-05-20 10:27:43


## 对测试集进行预测，保存特征和预测结果

In [231]:
test_feature=xgb.DMatrix(feature)
submit_frequency_vector_df['prediction']=xgb_model_v1p8.predict(test_feature)
preliminary_submit_dataset_b=submit_frequency_vector_df[['sn','fault_time','prediction']]
preliminary_submit_dataset_b['label']=preliminary_submit_dataset_b['prediction'].apply(lambda x : int(x))
preliminary_submit_dataset_b=preliminary_submit_dataset_b[['sn','fault_time','label']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preliminary_submit_dataset_b['label']=preliminary_submit_dataset_b['prediction'].apply(lambda x : int(x))


In [232]:
preliminary_submit_dataset_b.to_csv('pre_contest/v1p8/preliminary_submit_dataset_b_v1p8.csv',sep=',',index=None)