# 背景：在v1p5的基础上，采用清扬的新标签

## 导包、设置根目录

In [62]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import random
import pickle
import multiprocessing
import re
import pickle
from matplotlib import pyplot as plt

from collections import Counter
from itertools import chain
import datetime

# 更改工作目录为当前项目根目录
import sys
import os
os.chdir(os.path.dirname(os.path.dirname(sys.path[0])))
print(os.getcwd())

/Users/jincan02/Projects/Log-diagnosis


## 采用图的方式，构建新日志 - 标签表

In [63]:
# 类定义

class msg_node:
    msg_type = None
    attrs = []
    server_model = None
    msg_time = None
    msg_title = None
    idx = 0
    error_type = None
    
    def __init__(self, msg, log_time, server_model, error_type):
        self.server_model = server_model
        self.attrs = [xx.strip() for xx in msg.split('|')]
        self.msg_time = log_time
        if (len(self.attrs) == 1) and (self.attrs[0] == ''):
            self.error_type = error_type
            self.msg_type = 4
        elif len(self.attrs) == 1:
            self.msg_type = 0
        elif len(self.attrs) == 2:
            self.msg_type = 1
            self.msg_title = self.attrs[0]
        elif (len(self.attrs) == 3) and (self.attrs[-1] in ['Assert','Asserted','Deasserted','Deassert','asserted','deasserted']):
            self.msg_type = 2
            self.msg_title = self.attrs[0]
        elif len(self.attrs) >=3:
            self.msg_type = 3
            self.msg_title = self.attrs[0]
        else:
            print('Error in extracting msg attributes!')
    
    def set_index(self, index):
        self.idx = index

class msg_graph:
    nodes = []  # list of msg_node
    edges = []  # list of tuples (msg_node_index1, msg_node_index2, edge_type, delta_time)
    error_label = []
    
    def __init__(self, nodes):
        # 注！这里默认输入的nodes是已经按照时序排好的
        self.nodes = nodes
        self.edges = []
        
    def assign_index(self):
        for kth, node in enumerate(self.nodes):
            node.set_index(kth)
    
    def build_graph(self, with_time_order = True, time_cutoff=172800):
        edge_stamps = []
        # edge type 1: same msg_title
        unique_edge_types = list(set([xx.msg_title for xx in nodes]))
        edge_type_vector = np.array([xx.msg_title for xx in nodes])
        for kth in range(len(unique_edge_types)):
            temp_nodes = np.array(nodes)[edge_type_vector == unique_edge_types[kth]].tolist()
            for pth in range(len(temp_nodes)-1):
                self.edges.append((temp_nodes[pth].idx, temp_nodes[pth+1].idx, 
                                   unique_edge_types[kth],
                                   (temp_nodes[pth+1].msg_time - temp_nodes[pth].msg_time).seconds))
                edge_stamps.append((temp_nodes[pth].idx, temp_nodes[pth+1].idx))
        self.unique_edge_types = unique_edge_types
        
        # edge type 2: order of time
        if with_time_order:
            for ith in range(len(nodes)-1):
                if not ((ith, ith+1) in edge_stamps):
                    self.edges.append((ith, ith+1, 'time_order', (nodes[ith+1].msg_time - nodes[ith].msg_time).seconds))
        
        # assign error type to the graph
        self.error_label = []
        for ith in range(len(nodes)):
            if not nodes[ith].error_type is None:
                self.error_label.append(nodes[ith].error_type)
        
    def network_info(self):
        node_info = [(xx.idx, {'type': xx.msg_type}) for xx in self.nodes]
        edge_info = [(xx[0], xx[1], {'type': xx[2],'weight': 10 / (xx[3]+1e-6)}) for xx in self.edges]
        return node_info, edge_info, self.error_label

In [136]:
# 日志表

df1 = pd.read_csv('pre_contest/dataset/preliminary_sel_log_dataset.csv',sep=',')
#df3 = pd.read_csv('pre_contest/dataset/preliminary_submit_dataset_a.csv',sep=',')

select_fields = ['sn','time','msg','server_model']
df = df1[select_fields]
df['error_type'] = ''
df['log_time'] = pd.to_datetime(df['time'])
df['msg_len'] = df['msg'].apply(lambda d: len(d.split('|')))

df_server_mapping = df[['sn','server_model']].drop_duplicates()


# 标签表

df1_label = pd.read_csv('pre_contest/dataset/preliminary_train_label_dataset.csv',sep=',')
df2_label = pd.read_csv('pre_contest/dataset/preliminary_train_label_dataset_s.csv',sep=',')

df_label = pd.concat([df1_label, df2_label]).drop_duplicates()
df_label['log_time'] = pd.to_datetime(df_label['fault_time'])
df_label['msg'] = ''
df_label['msg_len'] = 0
df_label['server_model'] = df_label['sn'].map(dict(zip(df_server_mapping['sn'], 
                                                       df_server_mapping['server_model'])))
df_label.columns = ['sn','time','error_type','log_time','msg','msg_len','server_model']


# 合并日志表与标签表

df_comb = pd.concat([df, df_label])
df_comb = df_comb.sort_values(['log_time'], ascending=True)
df_comb = df_comb.reset_index(drop=True)


# 按sn分组，按日志（含标签）时间顺序排列，以10小时间隔划分日志

cutoff = 10 * 3600

grp_store = []
miss_label_store = []
cnt_with_label = 0
cnt_with_multi_label = 0
cnt_miss_label = 0
for name, grp in df_comb.groupby(['sn']):
    if len(grp.error_type.unique()) == 1:
        cnt_miss_label += 1
        miss_label_store.append(grp)
    elif len(grp.error_type.unique()) == 2:
        prev_time = [grp['time'].iloc[0]] + grp['time'].tolist()[:-1]
        grp['prev_time'] = pd.to_datetime(prev_time)
        grp['delta_time'] = grp['log_time'] - grp['prev_time']
        grp['delta_time'] = grp['delta_time'].apply(lambda d: d.days*24*3600+d.seconds)
        cutoff_idx = [0] + grp.loc[grp['delta_time'] > cutoff].index.tolist() + [grp.index.tolist()[-1]+1]
        for kth in range(len(cutoff_idx)-1): 
            temp_grp = grp.loc[(grp.index < cutoff_idx[kth+1]) & (grp.index >= cutoff_idx[kth])]
            if len(temp_grp.error_type.unique()) == 1:
                cnt_miss_label += 1
                miss_label_store.append(temp_grp)
            else:
                cnt_with_label += 1 
                grp_store.append(temp_grp)
    else:
        prev_time = [grp['time'].iloc[0]] + grp['time'].tolist()[:-1]
        grp['prev_time'] = pd.to_datetime(prev_time)
        grp['delta_time'] = grp['log_time'] - grp['prev_time']
        grp['delta_time'] = grp['delta_time'].apply(lambda d: d.days*24*3600+d.seconds)
        cutoff_idx = [0] + grp.loc[grp['delta_time'] > cutoff].index.tolist() + [grp.index.tolist()[-1]+1]
        for kth in range(len(cutoff_idx)-1): 
            temp_grp = grp.loc[(grp.index < cutoff_idx[kth+1]) & (grp.index >= cutoff_idx[kth])]
            if len(temp_grp.error_type.unique()) == 1:
                cnt_miss_label += 1
                miss_label_store.append(temp_grp)
            elif len(temp_grp.error_type.unique()) == 2:
                cnt_with_label += 1 
                grp_store.append(temp_grp)
            else:
                cnt_with_multi_label += 1
                grp_store.append(temp_grp)
                
# 在多个不同时间段报同样错的例子
#     if name == 'SERVER_998':
#         break

                
# 采用图的方式划分日志及其标签

miss_store = []
label_0_store = []
label_1_store = []
label_2_store = []
label_3_store = []

save_fields = ['sn','time','msg','server_model','error_type','msg_len']
for grp in grp_store:
    file_name = grp['sn'].iloc[0] + '__' + grp['time'].iloc[0]
    grp['id'] = file_name
    
    nodes = []
    for ith in range(len(grp)):
        nodes.append(msg_node(grp['msg'].iloc[ith], 
                              grp['log_time'].iloc[ith], 
                              grp['server_model'].iloc[ith],
                              grp['error_type'].iloc[ith]))
    test_graph = msg_graph(nodes)
    test_graph.assign_index()
    test_graph.build_graph()
    
    flag = True
    if len(set(test_graph.error_label)) == 1:
        if 0 in test_graph.error_label:
            label_0_store.append(grp)
            flag = False
        if 1 in test_graph.error_label:
            label_1_store.append(grp)
            flag = False
        if 2 in test_graph.error_label:
            label_2_store.append(grp)
            flag = False
        if 3 in test_graph.error_label:
            label_3_store.append(grp)
            flag = False
    if flag:
        # miss_store 含无标签和多个标签
        miss_store.append(grp)

In [137]:
cnt_with_label, cnt_with_multi_label, cnt_miss_label
# 10h (14924, 427, 799)
# 2h (14090, 342, 5806)

(14924, 427, 799)

In [141]:
for grp in miss_label_store:
    if len(grp[grp['error_type'] != '']) > 0:
        print(grp)
        break

                 sn                 time msg server_model error_type  \
55457  SERVER_10019  2020-01-29 22:38:00             SM53          3   

                 log_time  msg_len           prev_time  delta_time  
55457 2020-01-29 22:38:00        0 2020-01-29 11:43:09       39291  


In [113]:
df[df['sn'] == 'SERVER_10019']

Unnamed: 0,sn,time,msg,server_model,error_type,log_time,msg_len
473207,SERVER_10019,2020-01-29 11:43:09,System Boot Initiated BIOS_Boot_Up | Initiate...,SM53,,2020-01-29 11:43:09,3
473208,SERVER_10019,2020-01-29 11:03:44,System Boot Initiated BIOS_Boot_Up | Initiate...,SM53,,2020-01-29 11:03:44,3


In [114]:
df_label[df_label['sn'] == 'SERVER_10019']

Unnamed: 0,sn,time,error_type,log_time,msg,msg_len,server_model
2823,SERVER_10019,2020-01-29 22:38:00,3,2020-01-29 22:38:00,,0,SM53


In [80]:
grp

Unnamed: 0,sn,time,msg,server_model,error_type,log_time,msg_len,prev_time,delta_time
391199,SERVER_998,2020-09-05 14:07:00,,SM15,3.0,2020-09-05 14:07:00,0,2020-09-05 14:07:00,0
394957,SERVER_998,2020-09-08 14:35:59,Button Button_Pressed | Power Button pressed ...,SM15,,2020-09-08 14:35:59,3,2020-09-05 14:07:00,260939
394958,SERVER_998,2020-09-08 14:35:59,Button Button_Pressed | Power Button pressed ...,SM15,,2020-09-08 14:35:59,3,2020-09-08 14:35:59,0
394961,SERVER_998,2020-09-08 15:00:00,,SM15,3.0,2020-09-08 15:00:00,0,2020-09-08 14:35:59,1441


In [81]:
temp_grp

Unnamed: 0,sn,time,msg,server_model,error_type,log_time,msg_len,prev_time,delta_time
394957,SERVER_998,2020-09-08 14:35:59,Button Button_Pressed | Power Button pressed ...,SM15,,2020-09-08 14:35:59,3,2020-09-05 14:07:00,260939
394958,SERVER_998,2020-09-08 14:35:59,Button Button_Pressed | Power Button pressed ...,SM15,,2020-09-08 14:35:59,3,2020-09-08 14:35:59,0
394961,SERVER_998,2020-09-08 15:00:00,,SM15,3.0,2020-09-08 15:00:00,0,2020-09-08 14:35:59,1441


In [138]:
len(miss_store), len(label_0_store), len(label_1_store),len(label_2_store),len(label_3_store),len(label_0_store)+len(label_1_store)+len(label_2_store)+len(label_3_store)
# 加入提交的标签时的结果  (0, 1471, 3378, 8632, 2306, 15787)

(427, 1463, 2996, 8231, 2234, 14924)

In [139]:
miss_store[0]

Unnamed: 0,sn,time,msg,server_model,error_type,log_time,msg_len,prev_time,delta_time,id
262831,SERVER_10067,2020-06-16 02:01:59,Memory CPU1D0_DIMM_Stat | Correctable ECC | A...,SM55,,2020-06-16 02:01:59,3,2020-06-16 02:01:59,0,SERVER_10067__2020-06-16 02:01:59
262833,SERVER_10067,2020-06-16 02:03:25,Memory CPU1D0_DIMM_Stat | Correctable ECC | A...,SM55,,2020-06-16 02:03:25,3,2020-06-16 02:01:59,86,SERVER_10067__2020-06-16 02:01:59
262839,SERVER_10067,2020-06-16 02:05:39,Memory CPU1D0_DIMM_Stat | Correctable ECC | A...,SM55,,2020-06-16 02:05:39,3,2020-06-16 02:03:25,134,SERVER_10067__2020-06-16 02:01:59
262840,SERVER_10067,2020-06-16 02:06:00,,SM55,2,2020-06-16 02:06:00,0,2020-06-16 02:05:39,21,SERVER_10067__2020-06-16 02:01:59
262841,SERVER_10067,2020-06-16 02:06:10,Memory CPU1D0_DIMM_Stat | Correctable ECC | A...,SM55,,2020-06-16 02:06:10,3,2020-06-16 02:06:00,10,SERVER_10067__2020-06-16 02:01:59
...,...,...,...,...,...,...,...,...,...,...
264883,SERVER_10067,2020-06-16 15:21:57,Processor CPU1_Status | Configuration Error |...,SM55,,2020-06-16 15:21:57,3,2020-06-16 15:21:57,0,SERVER_10067__2020-06-16 02:01:59
264940,SERVER_10067,2020-06-16 15:50:06,Memory CPU1D0_DIMM_Stat | Correctable ECC | A...,SM55,,2020-06-16 15:50:06,3,2020-06-16 15:21:57,1689,SERVER_10067__2020-06-16 02:01:59
265010,SERVER_10067,2020-06-16 16:39:44,Memory CPU1D0_DIMM_Stat | Correctable ECC | A...,SM55,,2020-06-16 16:39:44,3,2020-06-16 15:50:06,2978,SERVER_10067__2020-06-16 02:01:59
265012,SERVER_10067,2020-06-16 16:48:04,System ACPI Power State ACPI_PWR_Status | S4/...,SM55,,2020-06-16 16:48:04,3,2020-06-16 16:39:44,500,SERVER_10067__2020-06-16 02:01:59


In [48]:
df[df['sn'] == 'SERVER_998']
df_label[df_label['sn'] == 'SERVER_998']

Unnamed: 0,sn,time,error_type,log_time,msg,msg_len,server_model
193,SERVER_998,2020-09-05 14:07:00,3,2020-09-05 14:07:00,,0,SM15
201,SERVER_998,2020-09-08 15:00:00,3,2020-09-08 15:00:00,,0,SM15


In [142]:
# 构建新日志-标签表
use_log_label_df = pd.DataFrame(columns = ['sn','fault_time','time','msg','server_model','label']) 

for grp in label_0_store:
    fault_time = grp[grp['error_type'] != '']['time'].iloc[0]
    grp['fault_time'] = fault_time
    grp['label'] = 0
    temp = grp[grp['error_type'] == ''][['sn','fault_time','time','msg','server_model','label']]
    use_log_label_df = pd.concat([use_log_label_df,temp])
for grp in label_1_store:
    fault_time = grp[grp['error_type'] != '']['time'].iloc[0]
    grp['fault_time'] = fault_time
    grp['label'] = 1
    temp = grp[grp['error_type'] == ''][['sn','fault_time','time','msg','server_model','label']]
    use_log_label_df = pd.concat([use_log_label_df,temp])
for grp in label_2_store:
    fault_time = grp[grp['error_type'] != '']['time'].iloc[0]
    grp['fault_time'] = fault_time
    grp['label'] = 2
    temp = grp[grp['error_type'] == ''][['sn','fault_time','time','msg','server_model','label']]
    use_log_label_df = pd.concat([use_log_label_df,temp])
for grp in label_3_store:
    fault_time = grp[grp['error_type'] != '']['time'].iloc[0]
    grp['fault_time'] = fault_time
    grp['label'] = 3
    temp = grp[grp['error_type'] == ''][['sn','fault_time','time','msg','server_model','label']]
    use_log_label_df = pd.concat([use_log_label_df,temp])

In [143]:
use_log_label_df.shape
# (688697, 6)
# (525497, 6)

(419943, 6)

In [39]:
df.shape

(482536, 7)

## 添加时间和日志数的统计特征

In [144]:
# # 读取sel日志数据
# sel_log = pd.read_csv('./pre_contest/dataset/preliminary_sel_log_dataset.csv')
# # 读取训练标签数据：有重复数据！
# train_label1=pd.read_csv('./pre_contest/dataset/preliminary_train_label_dataset.csv')
# train_label2=pd.read_csv('./pre_contest/dataset/preliminary_train_label_dataset_s.csv')
# train_label=pd.concat([train_label1,train_label2],axis=0).drop_duplicates()

# # sn+day仅有一条报错的日志筛选和
# train_label['day']=train_label['fault_time'].apply(lambda x:x[0:10])
# temp=train_label.groupby(['sn','day']).size()
# use_temp=temp[temp.values==1]
# sn_list=[use_temp.index[i][0] for i in range(len(use_temp))]
# day_list=[use_temp.index[i][1] for i in range(len(use_temp))]
# use_temp_df=pd.DataFrame({'sn':sn_list,'day':day_list})
# use_train_label=pd.merge(train_label,use_temp_df,how='inner',on=['sn','day'])
# sel_log['day']=sel_log['time'].apply(lambda x:x[0:10])
# use_log_label_df=pd.merge(sel_log,use_train_label,how='inner',on=['sn','day'])
# columns_order=['sn','day','time','msg','server_model','fault_time','label']
# use_log_label_df=use_log_label_df[columns_order]



use_log_label_df['msg_hour'] = use_log_label_df['time'].apply(lambda x : datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").hour)
use_log_label_df['msg_minute'] = use_log_label_df['time'].apply(lambda x : datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").minute)
use_log_label_df['fault_hour'] = use_log_label_df['fault_time'].apply(lambda x : datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").hour)
use_log_label_df['fault_minute'] = use_log_label_df['fault_time'].apply(lambda x : datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").minute)
# 拼接日志
# 新增 fault_minute，按竖线统计任务数，日志小时和分钟的最大值、最小值、均值、中位数、众数
fault_minute_list = []
msg_1_cnt_list=[]
msg_2_cnt_list=[]
msg_3_cnt_list=[]
msg_4_cnt_list=[]
msg_hour_max_list=[]
msg_hour_min_list=[]
msg_hour_avg_list=[]
msg_hour_median_list=[]
msg_hour_mode_list=[]
msg_minute_max_list=[]
msg_minute_min_list=[]
msg_minute_avg_list=[]
msg_minute_median_list=[]
msg_minute_mode_list=[]


sn_list=[]
day_list=[]
server_model_list=[]
msg_log_list=[]
msg_cnt_list=[]
fault_hour_list=[]
label_list=[]
fault_time_list=[]
for msg_log_df in use_log_label_df.groupby(['sn','fault_time']):
    msg_log_str = ''
    msg_1_cnt = 0
    msg_2_cnt = 0
    msg_3_cnt = 0
    msg_4_cnt = 0
    for info in msg_log_df[1]['msg'].drop_duplicates():
        if info == info:
            msg_log_str=msg_log_str+info.lower()+'.'
            if len(info.split('|')) == 1:
                msg_1_cnt += 1
            elif len(info.split('|')) == 2:
                msg_2_cnt += 1
            elif len(info.split('|')) == 3:
                msg_3_cnt += 1
            else:
                msg_4_cnt += 1
    label=msg_log_df[1].iloc[0]['label']
    sm=int(msg_log_df[1].iloc[0]['server_model'][2:])
    
    sn_list.append(msg_log_df[0][0])
    fault_time_list.append(msg_log_df[0][1])
    server_model_list.append(sm)
    msg_log_list.append(msg_log_str)
    msg_cnt_list.append(len(msg_log_df[1]))
    label_list.append(label)
    
    fault_hour_list.append(msg_log_df[1].iloc[0]['fault_hour'])
    fault_minute_list.append(msg_log_df[1].iloc[0]['fault_minute'])
    
    msg_1_cnt_list.append(msg_1_cnt)
    msg_2_cnt_list.append(msg_2_cnt)
    msg_3_cnt_list.append(msg_3_cnt)
    msg_4_cnt_list.append(msg_4_cnt)
    
    msg_hour_max_list.append(msg_log_df[1]['msg_hour'].max())
    msg_hour_min_list.append(msg_log_df[1]['msg_hour'].min())
    msg_hour_avg_list.append(msg_log_df[1]['msg_hour'].mean())
    msg_hour_median_list.append(msg_log_df[1]['msg_hour'].median())
    msg_hour_mode_list.append(msg_log_df[1]['msg_hour'].mode()[0])
    
    msg_minute_max_list.append(msg_log_df[1]['msg_minute'].max())
    msg_minute_min_list.append(msg_log_df[1]['msg_minute'].min())
    msg_minute_avg_list.append(msg_log_df[1]['msg_minute'].mean())
    msg_minute_median_list.append(msg_log_df[1]['msg_minute'].median())
    msg_minute_mode_list.append(msg_log_df[1]['msg_minute'].mode()[0])
    
msg_log_label_df=pd.DataFrame(
    {
    'sn':sn_list,
    'fault_time':fault_time_list,
    'server_model':server_model_list,
    'msg_cnt':msg_cnt_list,
    'fault_hour':fault_hour_list,
    'fault_minute':fault_minute_list,
    'msg_1_cnt':msg_1_cnt_list,
    'msg_2_cnt':msg_2_cnt_list,
    'msg_3_cnt':msg_3_cnt_list,
    'msg_4_cnt':msg_4_cnt_list,
    'msg_hour_max':msg_hour_max_list,
    'msg_hour_min':msg_hour_min_list,
    'msg_hour_avg':msg_hour_avg_list,
    'msg_hour_median':msg_hour_median_list,
    'msg_hour_mode':msg_hour_mode_list,
    'msg_minute_max':msg_minute_max_list,
    'msg_minute_min':msg_minute_min_list,
    'msg_minute_avg':msg_minute_avg_list,
    'msg_minute_median':msg_minute_median_list,
    'msg_minute_mode':msg_minute_mode_list,
    'msg_log':msg_log_list,
    'label':label_list
    })
# msg_log_label_df.to_csv('./pre_contest/v1p6/msg_log_label_df.csv',sep=',',index=None)

## 读取日志和标签数据

In [147]:
msg_log_label_df = pd.read_csv('./pre_contest/v1p6/msg_log_label_df.csv',sep=',')
msg_log_list=list(msg_log_label_df['msg_log'])
label_list=list(msg_log_label_df['label'])
msg_log_label_df

Unnamed: 0,sn,fault_time,server_model,msg_cnt,fault_hour,fault_minute,msg_1_cnt,msg_2_cnt,msg_3_cnt,msg_4_cnt,...,msg_hour_avg,msg_hour_median,msg_hour_mode,msg_minute_max,msg_minute_min,msg_minute_avg,msg_minute_median,msg_minute_mode,msg_log,label
0,SERVER_10001,2020-05-01 10:04:00,57,9,10,4,0,0,9,0,...,8.333333,8.0,8,59,0,38.333333,54.0,59,processor cpu0_status | ierr | asserted. proc...,1
1,SERVER_10003,2020-03-28 09:48:00,57,117,9,48,0,0,1,0,...,9.000000,9.0,9,48,45,46.444444,46.0,46,memory cpu1d0_dimm_stat | correctable ecc | a...,2
2,SERVER_10008,2020-02-25 16:12:00,53,5,16,12,0,0,3,0,...,15.000000,15.0,15,52,51,51.600000,52.0,52,processor cpu0_status | configuration error |...,1
3,SERVER_10008,2020-03-11 18:04:00,53,3,18,4,0,0,3,0,...,16.000000,16.0,16,47,46,46.333333,46.0,46,memory dimm050_stat | uncorrectable ecc | ass...,2
4,SERVER_10009,2020-05-08 16:37:00,53,4,16,37,0,0,4,0,...,16.000000,16.0,16,7,7,7.000000,7.0,7,drive slot hdd_l_14_status | drive fault | as...,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14919,SERVER_9991,2020-08-04 22:49:00,56,2,22,49,0,0,1,0,...,20.500000,20.5,19,52,46,49.000000,49.0,46,memory cpu0a0_dimm_stat | correctable ecc | a...,2
14920,SERVER_9991,2020-10-07 18:42:00,56,1,18,42,0,0,1,0,...,18.000000,18.0,18,14,14,14.000000,14.0,14,memory cpu1a0_dimm_stat | correctable ecc | a...,2
14921,SERVER_9993,2020-05-14 23:50:00,57,2,23,50,0,0,1,0,...,23.000000,23.0,23,48,43,45.500000,45.5,43,memory cpu1f0_dimm_stat | correctable ecc | a...,2
14922,SERVER_9998,2020-05-29 11:25:00,57,2,11,25,0,0,2,0,...,11.000000,11.0,11,19,4,11.500000,11.5,4,memory cpu1e1_dimm_stat | correctable ecc | a...,2


## 读取v1_baseline用的词和v1p1的新词

In [148]:
v1_word_list=list(pd.read_csv('pre_contest/v1p2/word_frequency_df.txt',sep='\t')['word'])
v1p1_word_list=list(pd.read_csv('pre_contest/v1p2/tags_incomplete.txt',sep='\t',names=['word'])['word'])
v1p2_word_list=list(set(v1_word_list+v1p1_word_list))

In [149]:
len(v1p2_word_list)

2087

## 训练词频向量

In [150]:
frequency_vector_list = []
tag=0
for word in v1p2_word_list:
    if tag%100==0:
        print(tag,datetime.datetime.now())
    pattern=re.compile(word)
    frequency_vector = [len(re.findall(pattern,log))  for log in msg_log_list]
    frequency_vector_list.append(frequency_vector)
    tag+=1

0 2022-04-04 19:01:25.817831
100 2022-04-04 19:01:27.453708
200 2022-04-04 19:01:29.308666
300 2022-04-04 19:01:30.842449
400 2022-04-04 19:01:32.410376
500 2022-04-04 19:01:33.994467
600 2022-04-04 19:01:35.603546
700 2022-04-04 19:01:37.183816
800 2022-04-04 19:01:38.917709
900 2022-04-04 19:01:40.451355
1000 2022-04-04 19:01:41.978375
1100 2022-04-04 19:01:43.524862
1200 2022-04-04 19:01:45.143569
1300 2022-04-04 19:01:46.871358
1400 2022-04-04 19:01:48.733013
1500 2022-04-04 19:01:50.835180
1600 2022-04-04 19:01:52.728265
1700 2022-04-04 19:01:54.358192
1800 2022-04-04 19:01:55.879648
1900 2022-04-04 19:01:57.553641
2000 2022-04-04 19:01:59.370009


In [151]:
frequency_vector_df=pd.DataFrame(frequency_vector_list)
frequency_vector_df=frequency_vector_df.T
frequency_vector_df.columns=v1p2_word_list
new_feature_list=list(msg_log_label_df.columns)[2:-2]
frequency_vector_df[new_feature_list]=msg_log_label_df[new_feature_list]

frequency_vector_df['label']=label_list
frequency_vector_df[['sn','fault_time']]=msg_log_label_df[['sn','fault_time']]
feature=np.array(frequency_vector_df[v1p2_word_list+new_feature_list])

In [152]:
frequency_vector_df

Unnamed: 0,bp1_hdd23_status,15,rear2_3_status,security,252,deasserted system firmware error bios poststatus,drive,threshold,outofrange,0x19,...,msg_hour_median,msg_hour_mode,msg_minute_max,msg_minute_min,msg_minute_avg,msg_minute_median,msg_minute_mode,label,sn,fault_time
0,0,0,0,0,0,0,0,0,0,0,...,8.0,8,59,0,38.333333,54.0,59,1,SERVER_10001,2020-05-01 10:04:00
1,0,0,0,0,0,0,0,0,0,0,...,9.0,9,48,45,46.444444,46.0,46,2,SERVER_10003,2020-03-28 09:48:00
2,0,0,0,0,0,0,0,0,0,0,...,15.0,15,52,51,51.600000,52.0,52,1,SERVER_10008,2020-02-25 16:12:00
3,0,0,0,0,0,0,0,0,0,0,...,16.0,16,47,46,46.333333,46.0,46,2,SERVER_10008,2020-03-11 18:04:00
4,0,0,0,0,0,0,8,0,0,0,...,16.0,16,7,7,7.000000,7.0,7,3,SERVER_10009,2020-05-08 16:37:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14919,0,0,0,0,0,0,0,0,0,0,...,20.5,19,52,46,49.000000,49.0,46,2,SERVER_9991,2020-08-04 22:49:00
14920,0,0,0,0,0,0,0,0,0,0,...,18.0,18,14,14,14.000000,14.0,14,2,SERVER_9991,2020-10-07 18:42:00
14921,0,0,0,0,0,0,0,0,0,0,...,23.0,23,48,43,45.500000,45.5,43,2,SERVER_9993,2020-05-14 23:50:00
14922,0,0,0,0,0,0,0,0,0,0,...,11.0,11,19,4,11.500000,11.5,4,2,SERVER_9998,2020-05-29 11:25:00


## 训练xgb模型

In [153]:
# xgb模型参数
xgb_params = {
    'booster':'gbtree',
    'objective':'multi:softmax',   # 多分类问题
    'num_class':4,  # 类别数，与multi softmax并用
    'gamma':0.1,    # 用于控制是否后剪枝的参数，越大越保守，一般0.1 0.2的样子
    'max_depth':6,  # 构建树的深度，越大越容易过拟合
    'lambda':2,  # 控制模型复杂度的权重值的L2 正则化项参数，参数越大，模型越不容易过拟合
    'subsample':1, # 随机采样训练样本
    'colsample_bytree':1,# 这个参数默认为1，是每个叶子里面h的和至少是多少
    # 对于正负样本不均衡时的0-1分类而言，假设h在0.01附近，min_child_weight为1
    #意味着叶子节点中最少需要包含100个样本。这个参数非常影响结果，
    # 控制叶子节点中二阶导的和的最小值，该参数值越小，越容易过拟合
    'silent':0,  # 设置成1 则没有运行信息输入，最好是设置成0
    'eta':0.3,  # 如同学习率
    'seed':1000,
    'nthread':16,  #CPU线程数
    #'eval_metric':'auc'
}

# 指标评估
def macro_f1(label,prediction)  -> float:

    """
    计算得分
    :param target_df: [sn,fault_time,label]
    :param submit_df: [sn,fault_time,label]
    :return:
    """

    weights =  [3  /  7,  2  /  7,  1  /  7,  1  /  7]
    macro_F1 =  0.
    for i in  range(len(weights)):
        TP =  np.sum((label==i) & (prediction==i))
        FP =  np.sum((label!= i) & (prediction == i))
        FN =  np.sum((label == i) & (prediction!= i))
        precision = TP /  (TP + FP)  if  (TP + FP)  >  0  else  0
        recall = TP /  (TP + FN)  if  (TP + FN)  >  0  else  0
        F1 =  2  * precision * recall /  (precision + recall)  if  (precision + recall)  >  0  else  0
        macro_F1 += weights[i]  * F1
        
        print('Task %d:\n Prcesion %.2f, Recall %.2f, F1 %.2f' % (i+1, precision, recall, F1))
        
    return macro_F1

In [154]:
random.seed(0)
label=np.array(label_list)
val_mask = [random.random() < 0.3 for _ in range(len(feature))]
train_mask = [not xx for xx in val_mask]
val_feature = feature[val_mask]
val_label = label[val_mask]
train_feature = feature[train_mask]
train_label = label[train_mask]
train_data=xgb.DMatrix(train_feature,label=train_label)
train_feature=xgb.DMatrix(train_feature)
val_feature=xgb.DMatrix(val_feature)

In [155]:
xgb_model=xgb.train(xgb_params,train_data,num_boost_round=500)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [156]:
train_pred=xgb_model.predict(train_feature)
val_pred=xgb_model.predict(val_feature)

In [157]:
macro_f1(train_label,train_pred)

Task 1:
 Prcesion 1.00, Recall 1.00, F1 1.00
Task 2:
 Prcesion 1.00, Recall 1.00, F1 1.00
Task 3:
 Prcesion 1.00, Recall 1.00, F1 1.00
Task 4:
 Prcesion 1.00, Recall 1.00, F1 1.00


0.9999999999999998

In [158]:
macro_f1(val_label,val_pred)
# 0.6566607725096727

Task 1:
 Prcesion 0.50, Recall 0.36, F1 0.42
Task 2:
 Prcesion 0.73, Recall 0.78, F1 0.76
Task 3:
 Prcesion 0.95, Recall 0.97, F1 0.96
Task 4:
 Prcesion 0.91, Recall 0.94, F1 0.92


0.6640035152684288

## 保存验证集，sn+fault_time+日志+标签+重要性前50特征
## 保存验证集类别0_1的验证集，sn+fault_time+日志+标签+重要性前50特征

In [167]:
total_gain_dict=xgb_model.get_score(importance_type='total_gain')
total_gain_dict=dict(sorted(total_gain_dict.items(),key=lambda item:item[1],reverse=True))
feature_names=list(frequency_vector_df.columns)
feature_importance_top_50_dict={}
i=0
for key in total_gain_dict:
    if i<50:
        feature_name=feature_names[int(key[1:])]
        feature_importance_top_50_dict[feature_name]=total_gain_dict[key]
        i+=1
    else:
        break
feature_importance_top_50_dict

{'memory': 15638.688353418496,
 'or': 4286.106905462703,
 'cpu': 3987.7502924094006,
 'processor': 2567.1063391391963,
 'config': 912.9961943554002,
 'server_model': 906.2745191886993,
 'fault_minute': 871.3845071674981,
 'msg_minute_avg': 734.6533094284981,
 'caterr': 686.3688510905004,
 'msg_cnt': 680.6180438658004,
 'msg_minute_max': 572.0743919838003,
 'msg_minute_min': 568.4317705392998,
 'fault_hour': 513.1301522292,
 'msg_hour_avg': 473.8788561497997,
 'correctable ecc': 470.8986099933999,
 'e': 443.8740223370002,
 'msg_minute_mode': 413.8862015048998,
 'msg_minute_median': 400.1679765135006,
 'deassert': 335.41932722059994,
 'uncorrectable ecc': 332.76969724370014,
 'c': 330.99264198879985,
 'microcontroller/coprocessor': 300.5963722480001,
 'ec': 299.8555676554004,
 'msg_hour_max': 278.9856655740998,
 'msg_hour_min': 277.2084092764004,
 'sta': 246.52170583269987,
 '0': 244.15984114469995,
 '0x3b': 188.67864560499999,
 'unknown': 185.52709877970005,
 'asserted': 152.90629501059

In [176]:
validation_df = frequency_vector_df[['sn','fault_time']][val_mask]
validation_df['msg_log'] = msg_log_label_df['msg_log'][val_mask]
validation_df['label'] = val_label
validation_df['prediction'] = val_pred
validation_df[list(feature_importance_top_50_dict.keys())] = frequency_vector_df[list(feature_importance_top_50_dict.keys())]
# validation_df.to_csv('./pre_contest/v1p6/validation_df.csv',sep=',',index=None)
_0_1_validation_df = validation_df[validation_df['label'].isin([0,1])]
# _0_1_validation_df.to_csv('./pre_contest/v1p6/_0_1_validation_df.csv',sep=',',index=None)

In [175]:
_0_1_validation_df

Unnamed: 0,sn,fault_time,msg_log,label,prediction,memory,or,cpu,processor,config,...,msg_hour_median,up,power,msg_3_cnt,status,fa,boot,device,0x19,error
12,SERVER_10028,2020-01-21 16:33:00,processor cpu0_status | configuration error |...,1,1.0,0,4,2,2,2,...,15.0,1,0,3,2,0,2,0,0,2
43,SERVER_10093,2020-02-22 12:33:00,memory cpu0b1_dimm_stat | correctable ecc | d...,1,1.0,5,17,7,4,2,...,11.0,3,3,13,6,0,4,2,0,2
135,SERVER_10198,2020-05-24 11:45:00,processor cpu0_status | ierr | asserted. proc...,1,1.0,0,2,2,2,0,...,2.0,0,0,2,2,0,0,0,0,0
169,SERVER_10222,2020-07-08 12:45:00,management subsystem health system_health | s...,1,1.0,0,10,1,1,0,...,11.0,1,2,8,3,0,2,0,0,0
174,SERVER_10226,2020-05-23 20:16:00,processor cpu0_status | ierr | asserted. proc...,1,0.0,0,2,2,2,0,...,11.0,0,0,2,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14657,SERVER_9678,2020-10-08 02:06:00,memory cpu0c0_dimm_stat | correctable ecc | a...,0,2.0,4,12,6,2,2,...,1.0,0,0,6,2,0,0,0,0,2
14675,SERVER_9716,2020-03-01 23:21:00,processor cpu0_status | ierr | asserted. proc...,0,0.0,0,4,2,2,0,...,21.0,0,0,3,2,0,0,0,0,0
14684,SERVER_9727,2020-08-26 11:21:00,event logging disabled sel_status | log area ...,1,1.0,0,8,6,6,1,...,2.0,3,5,13,11,0,1,1,0,1
14687,SERVER_9729,2020-08-24 18:18:00,processor cpu0_status | ierr | asserted. proc...,1,1.0,0,9,4,4,0,...,17.0,3,3,10,6,0,4,0,0,0


# 用所有数据进行训练、保存模型、训练集

In [159]:
all_data=xgb.DMatrix(feature,label)
xgb_model_v1p6=xgb.train(xgb_params,all_data,num_boost_round=500)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [177]:
frequency_vector_df.head()

Unnamed: 0,bp1_hdd23_status,15,rear2_3_status,security,252,deasserted system firmware error bios poststatus,drive,threshold,outofrange,0x19,...,msg_hour_median,msg_hour_mode,msg_minute_max,msg_minute_min,msg_minute_avg,msg_minute_median,msg_minute_mode,label,sn,fault_time
0,0,0,0,0,0,0,0,0,0,0,...,8.0,8,59,0,38.333333,54.0,59,1,SERVER_10001,2020-05-01 10:04:00
1,0,0,0,0,0,0,0,0,0,0,...,9.0,9,48,45,46.444444,46.0,46,2,SERVER_10003,2020-03-28 09:48:00
2,0,0,0,0,0,0,0,0,0,0,...,15.0,15,52,51,51.6,52.0,52,1,SERVER_10008,2020-02-25 16:12:00
3,0,0,0,0,0,0,0,0,0,0,...,16.0,16,47,46,46.333333,46.0,46,2,SERVER_10008,2020-03-11 18:04:00
4,0,0,0,0,0,0,8,0,0,0,...,16.0,16,7,7,7.0,7.0,7,3,SERVER_10009,2020-05-08 16:37:00


In [161]:
frequency_vector_df.to_csv('./pre_contest/v1p6/frequency_vector_df.csv',sep=',',index=None)
file = open('./pre_contest/v1p6/xgb_model_v1p6.model','wb')
pickle.dump(xgb_model_v1p6, file)

## 查看特征重要性、保存特征重要性前300的特征

In [162]:
total_gain_dict=xgb_model_v1p6.get_score(importance_type='total_gain')
total_gain_dict=dict(sorted(total_gain_dict.items(),key=lambda item:item[1],reverse=True))
feature_names=list(frequency_vector_df.columns)
feature_importance_top_300_dict={}
i=0
for key in total_gain_dict:
    if i<300:
        feature_name=feature_names[int(key[1:])]
        feature_importance_top_300_dict[feature_name]=total_gain_dict[key]
        i+=1
    else:
        break
feature_importance_top_300_dict

{'memory': 22207.870354995295,
 'or': 6173.164130015699,
 'cpu': 5820.829303034005,
 'processor': 3735.560395604702,
 'config': 1348.8281070910007,
 'fault_minute': 1267.0539518629002,
 'server_model': 1199.7780455850998,
 'caterr': 1085.3047328632993,
 'msg_minute_avg': 957.5440701405026,
 'msg_cnt': 899.730107435098,
 'msg_minute_max': 855.8012383971012,
 'msg_minute_min': 738.193022992599,
 'msg_hour_avg': 713.2222798615979,
 'fault_hour': 690.9168246005988,
 'e': 644.0394151110005,
 'correctable ecc': 603.6458398336002,
 'msg_minute_mode': 584.3420134322008,
 'c': 534.3769730587993,
 'msg_minute_median': 532.8831313272003,
 'uncorrectable ecc': 521.9578034249002,
 'deassert': 517.4673607651996,
 'msg_hour_max': 461.5146673354003,
 'msg_hour_min': 443.56284308240043,
 'microcontroller/coprocessor': 366.7927114856999,
 'ec': 341.4673784176002,
 '0x3b': 319.4135091712001,
 '0': 258.62340454699995,
 'device disabled': 243.7474478899,
 'on': 234.74257847250013,
 'asserted': 226.00724908

In [101]:
feature_importance_top_300_df=frequency_vector_df[['sn','day','label']+list(feature_importance_top_300_dict.keys())]

In [102]:
# feature_importance_top_300_df.to_csv('./pre_contest/v1p5/feature_importance_top_300_df.csv',sep=',',index=None)

# 对初赛测试集进行预测

## 读取测试集数据、拼接数据

In [178]:
# 读取选手提交数据
submit_a=pd.read_csv('./pre_contest/dataset/preliminary_submit_dataset_a.csv')
submit_log=pd.read_csv('./pre_contest/dataset/preliminary_sel_log_dataset_a.csv')

In [209]:
submit_a

Unnamed: 0,sn,time,error_type,log_time,msg,msg_len,server_model
0,000d33b21436,2020-09-02 16:42:54,-1,2020-09-02 16:42:54,,0,SM40
1,005c5a9218ba,2020-06-28 19:05:16,-1,2020-06-28 19:05:16,,0,SM99
2,0079283bde6e,2020-04-26 21:32:44,-1,2020-04-26 21:32:44,,0,SM14
3,007bdf23b62f,2020-06-16 18:40:39,-1,2020-06-16 18:40:39,,0,SM93
4,00a577a8e54f,2020-04-07 07:16:55,-1,2020-04-07 07:16:55,,0,SM13
...,...,...,...,...,...,...,...
3006,ffbf46b4af21,2019-12-28 20:10:01,-1,2019-12-28 20:10:01,,0,SM35
3007,ffc229b6cd9a,2020-06-27 02:39:08,-1,2020-06-27 02:39:08,,0,SM49
3008,ffd44698a52b,2020-01-21 15:46:56,-1,2020-01-21 15:46:56,,0,SM66
3009,fff73a9e5bd5,2020-03-01 22:43:43,-1,2020-03-01 22:43:43,,0,SM92


In [211]:
submit_a.groupby(['sn','time']).size()

sn            time               
000d33b21436  2020-09-02 16:42:54    1
005c5a9218ba  2020-06-28 19:05:16    1
0079283bde6e  2020-04-26 21:32:44    1
007bdf23b62f  2020-06-16 18:40:39    1
00a577a8e54f  2020-04-07 07:16:55    1
                                    ..
ffbf46b4af21  2019-12-28 20:10:01    1
ffc229b6cd9a  2020-06-27 02:39:08    1
ffd44698a52b  2020-01-21 15:46:56    1
fff73a9e5bd5  2020-03-01 22:43:43    1
fffd22fffe19  2020-01-21 19:22:56    1
Length: 3011, dtype: int64

In [180]:
submit_log

Unnamed: 0,sn,time,msg,server_model
0,000d33b21436,2020-09-02 11:38:40,System Boot Initiated BIOS_Boot_Up | Initiate...,SM40
1,000d33b21436,2020-09-02 15:46:23,System Boot Initiated BIOS_Boot_Up | Initiate...,SM40
2,005c5a9218ba,2020-06-28 18:26:25,Memory Memory_Status | Correctable ECC | Asse...,SM99
3,005c5a9218ba,2020-06-28 18:40:26,System ACPI Power State #0x7d | S0/G0: workin...,SM99
4,005c5a9218ba,2020-06-28 18:26:26,Memory Memory_Status | Correctable ECC | Asse...,SM99
...,...,...,...,...
10986,fffd22fffe19,2020-01-21 19:16:01,Microcontroller/Coprocessor #0x16 | Transitio...,SM16
10987,fffd22fffe19,2020-01-21 19:17:03,System Event #0x10 | Timestamp Clock Sync | A...,SM16
10988,fffd22fffe19,2020-01-21 18:32:59,Memory #0xf9 | Uncorrectable ECC | Asserted,SM16
10989,fffd22fffe19,2020-01-21 19:18:14,System Boot Initiated BIOS_Boot_Up | Initiate...,SM16


In [188]:
len(set(submit_a['sn'].drop_duplicates()).intersection(set(submit_log['sn'].drop_duplicates())))

2883

In [291]:
# 读取提交日志数据
submit_log = pd.read_csv('./pre_contest/dataset/preliminary_sel_log_dataset_a.csv')

submit_log['error_type'] = ''
submit_log['log_time'] = pd.to_datetime(submit_log['time'])
submit_log['msg_len'] = submit_log['msg'].apply(lambda d: len(d.split('|')))

submit_log_server_mapping = submit_log[['sn','server_model']].drop_duplicates()


# 读取提交标签数据
submit_a = pd.read_csv('./pre_contest/dataset/preliminary_submit_dataset_a.csv')

submit_a['error_type'] = -1
submit_a['log_time'] = pd.to_datetime(submit_a['fault_time'])
submit_a['msg'] = ''
submit_a['msg_len'] = 0
submit_a['server_model'] = submit_a['sn'].map(dict(zip(submit_log_server_mapping['sn'], 
                                                       submit_log_server_mapping['server_model'])))
submit_a.columns = ['sn','time','error_type','log_time','msg','msg_len','server_model']


# 合并日志表与标签表
submit_combination_df = pd.concat([submit_log, submit_a])
submit_combination_df = submit_combination_df.sort_values(['log_time'], ascending=True)
submit_combination_df = submit_combination_df.reset_index(drop=True)


# 按sn分组，按日志（含标签）时间顺序排列，以报错划分日志，取此次报错前的日志合并

submit_grp_store = []
submit_miss_log_store = []
submit_miss_label_store = []
cnt_miss_label = 0
cnt_with_label = 0
for name, grp in submit_combination_df.groupby(['sn']):
    if len(grp[grp['error_type'] == -1]) == 0:
        cnt_miss_label += 1
        submit_miss_log_store.append(grp)
    elif len(grp[grp['error_type'] == -1]) == 1:
        submit_grp_store.append(grp)
    else:
        cutoff_idx = [-1] + grp.loc[grp['error_type'] == -1].index.tolist() + [grp.index.tolist()[-1]+1]
        for kth in range(len(cutoff_idx)-1): 
            temp_grp = grp.loc[(grp.index <= cutoff_idx[kth+1]) & (grp.index > cutoff_idx[kth])]
            if len(temp_grp) > 0:
                if len(temp_grp[temp_grp['error_type'] == -1]) == 0:
                    cnt_miss_label += 1
                    submit_miss_label_store.append(temp_grp)
                else:
                    cnt_with_label += 1 
                    submit_grp_store.append(temp_grp)
                
                
# 构建新日志-标签表
submit_log_df = pd.DataFrame(columns = ['sn','fault_time','time','msg','server_model']) 

for grp in submit_grp_store:
    fault_time = grp[grp['error_type'] == -1]['time'].iloc[0]
    grp['fault_time'] = fault_time
    if len(grp) > 1:
        temp = grp[grp['error_type'] == ''][['sn','fault_time','time','msg','server_model']]
        submit_log_df = pd.concat([submit_log_df,temp])
    else:
        submit_log_df = pd.concat([submit_log_df,grp])


In [292]:
len(submit_grp_store),len(submit_miss_log_store),len(submit_miss_label_store)

(3011, 0, 1)

## 计算统计特征

In [293]:
submit_log_df['msg_hour'] = submit_log_df['time'].apply(lambda x : datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").hour)
submit_log_df['msg_minute'] = submit_log_df['time'].apply(lambda x : datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").minute)
submit_log_df['fault_hour'] = submit_log_df['fault_time'].apply(lambda x : datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").hour)
submit_log_df['fault_minute'] = submit_log_df['fault_time'].apply(lambda x : datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").minute)
# 拼接日志
# 新增 fault_minute，按竖线统计任务数，日志小时和分钟的最大值、最小值、均值、中位数、众数
fault_minute_list = []
msg_1_cnt_list=[]
msg_2_cnt_list=[]
msg_3_cnt_list=[]
msg_4_cnt_list=[]
msg_hour_max_list=[]
msg_hour_min_list=[]
msg_hour_avg_list=[]
msg_hour_median_list=[]
msg_hour_mode_list=[]
msg_minute_max_list=[]
msg_minute_min_list=[]
msg_minute_avg_list=[]
msg_minute_median_list=[]
msg_minute_mode_list=[]


sn_list=[]
fault_time_list=[]
server_model_list=[]
msg_log_list=[]
msg_cnt_list=[]
fault_hour_list=[]
label_list=[]

for msg_log_df in submit_log_label_df.groupby(['sn','fault_time']):
    msg_log_str = ''
    msg_1_cnt = 0
    msg_2_cnt = 0
    msg_3_cnt = 0
    msg_4_cnt = 0
    for info in msg_log_df[1]['msg'].drop_duplicates():
        if info == info:
            msg_log_str=msg_log_str+info.lower()+'.'
            if len(info.split('|')) == 1:
                msg_1_cnt += 1
            elif len(info.split('|')) == 2:
                msg_2_cnt += 1
            elif len(info.split('|')) == 3:
                msg_3_cnt += 1
            else:
                msg_4_cnt += 1
    sm=int(msg_log_df[1].iloc[0]['server_model'][2:])
    
    sn_list.append(msg_log_df[0][0])
    fault_time_list.append(msg_log_df[0][1])
    server_model_list.append(sm)
    msg_log_list.append(msg_log_str)
    msg_cnt_list.append(len(msg_log_df[1]))
    label_list.append(label)
    
    fault_hour_list.append(msg_log_df[1].iloc[0]['fault_hour'])
    fault_minute_list.append(msg_log_df[1].iloc[0]['fault_minute'])
    
    msg_1_cnt_list.append(msg_1_cnt)
    msg_2_cnt_list.append(msg_2_cnt)
    msg_3_cnt_list.append(msg_3_cnt)
    msg_4_cnt_list.append(msg_4_cnt)
    
    msg_hour_max_list.append(msg_log_df[1]['msg_hour'].max())
    msg_hour_min_list.append(msg_log_df[1]['msg_hour'].min())
    msg_hour_avg_list.append(msg_log_df[1]['msg_hour'].mean())
    msg_hour_median_list.append(msg_log_df[1]['msg_hour'].median())
    msg_hour_mode_list.append(msg_log_df[1]['msg_hour'].mode()[0])
    
    msg_minute_max_list.append(msg_log_df[1]['msg_minute'].max())
    msg_minute_min_list.append(msg_log_df[1]['msg_minute'].min())
    msg_minute_avg_list.append(msg_log_df[1]['msg_minute'].mean())
    msg_minute_median_list.append(msg_log_df[1]['msg_minute'].median())
    msg_minute_mode_list.append(msg_log_df[1]['msg_minute'].mode()[0])
    
submit_msg_log_df=pd.DataFrame(
    {
    'sn':sn_list,
    'fault_time':fault_time_list,
    'server_model':server_model_list,
    'msg_cnt':msg_cnt_list,
    'fault_hour':fault_hour_list,
    'fault_minute':fault_minute_list,
    'msg_1_cnt':msg_1_cnt_list,
    'msg_2_cnt':msg_2_cnt_list,
    'msg_3_cnt':msg_3_cnt_list,
    'msg_4_cnt':msg_4_cnt_list,
    'msg_hour_max':msg_hour_max_list,
    'msg_hour_min':msg_hour_min_list,
    'msg_hour_avg':msg_hour_avg_list,
    'msg_hour_median':msg_hour_median_list,
    'msg_hour_mode':msg_hour_mode_list,
    'msg_minute_max':msg_minute_max_list,
    'msg_minute_min':msg_minute_min_list,
    'msg_minute_avg':msg_minute_avg_list,
    'msg_minute_median':msg_minute_median_list,
    'msg_minute_mode':msg_minute_mode_list,
    'msg_log':msg_log_list
    })

In [294]:
submit_msg_log_df

Unnamed: 0,sn,fault_time,server_model,msg_cnt,fault_hour,fault_minute,msg_1_cnt,msg_2_cnt,msg_3_cnt,msg_4_cnt,...,msg_hour_min,msg_hour_avg,msg_hour_median,msg_hour_mode,msg_minute_max,msg_minute_min,msg_minute_avg,msg_minute_median,msg_minute_mode,msg_log
0,000d33b21436,2020-09-02 16:42:54,40,2,16,42,0,0,2,0,...,11,13.000000,13.0,11,46,38,42.000000,42.0,38,system boot initiated bios_boot_up | initiate...
1,005c5a9218ba,2020-06-28 19:05:16,99,10,19,5,0,0,4,0,...,18,18.000000,18.0,18,40,26,31.400000,26.0,26,memory memory_status | correctable ecc | asse...
2,0079283bde6e,2020-04-26 21:32:44,14,1,21,32,0,0,1,0,...,20,20.000000,20.0,20,54,54,54.000000,54.0,54,power supply psu1_supply | failure detected |...
3,007bdf23b62f,2020-06-16 18:40:39,93,19,18,40,0,0,5,0,...,17,17.000000,17.0,17,45,4,24.421053,16.0,16,memory #0xe2 | correctable ecc | asserted. un...
4,00a577a8e54f,2020-04-07 07:16:55,13,6,7,16,0,0,6,0,...,6,6.000000,6.0,6,52,43,45.666667,45.0,43,memory mem_chg1_status | correctable ecc | as...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3006,ffbf46b4af21,2019-12-28 20:10:01,35,3,20,10,0,0,1,0,...,20,20.000000,20.0,20,9,8,8.666667,9.0,9,memory cpu0c0_dimm_stat | correctable ecc | a...
3007,ffc229b6cd9a,2020-06-27 02:39:08,49,2,2,39,0,0,1,0,...,2,2.000000,2.0,2,36,29,32.500000,32.5,29,memory #0xe2 | correctable ecc | asserted.
3008,ffd44698a52b,2020-01-21 15:46:56,66,2,15,46,0,0,2,0,...,13,13.000000,13.0,13,57,57,57.000000,57.0,57,unknown chassis_control | | asserted. system...
3009,fff73a9e5bd5,2020-03-01 22:43:43,92,6,22,43,0,0,1,0,...,13,13.000000,13.0,13,50,18,44.333333,49.5,50,memory #0x87 | correctable ecc | asserted.


## 计算词频特征向量

In [289]:
# 统计词频向量
submit_frequency_vector_list = []
tag=0
for word in v1p2_word_list:
    if tag%100==0:
        print(tag,datetime.datetime.now())
    pattern=re.compile(word)
    frequency_vector = [len(re.findall(pattern,log))  for log in msg_log_list]
    submit_frequency_vector_list.append(frequency_vector)
    tag+=1                                                              

0 2022-04-04 23:20:59.972675
100 2022-04-04 23:21:00.396766
200 2022-04-04 23:21:00.792601
300 2022-04-04 23:21:01.159176
400 2022-04-04 23:21:01.497585
500 2022-04-04 23:21:01.885660
600 2022-04-04 23:21:02.235019
700 2022-04-04 23:21:02.579013
800 2022-04-04 23:21:03.006614
900 2022-04-04 23:21:03.306641
1000 2022-04-04 23:21:03.616905
1100 2022-04-04 23:21:03.983513
1200 2022-04-04 23:21:04.293546
1300 2022-04-04 23:21:04.651483
1400 2022-04-04 23:21:04.995835
1500 2022-04-04 23:21:05.326449
1600 2022-04-04 23:21:05.617902
1700 2022-04-04 23:21:05.939220
1800 2022-04-04 23:21:06.260225
1900 2022-04-04 23:21:06.578668
2000 2022-04-04 23:21:06.903447


In [295]:
submit_frequency_vector_df=pd.DataFrame(submit_frequency_vector_list)
submit_frequency_vector_df=submit_frequency_vector_df.T
submit_frequency_vector_df.columns=v1p2_word_list
submit_frequency_vector_df[new_feature_list]=submit_msg_log_df[new_feature_list]
submit_frequency_vector_df[['sn','fault_time']]=submit_msg_log_df[['sn','fault_time']]
feature=np.array(submit_frequency_vector_df[v1p2_word_list+new_feature_list])  

# submit_frequency_vector_df.to_csv('./pre_contest/v1p6/submit_frequency_vector_df.csv',sep=',',index=None)

In [296]:
submit_frequency_vector_df

Unnamed: 0,bp1_hdd23_status,15,rear2_3_status,security,252,deasserted system firmware error bios poststatus,drive,threshold,outofrange,0x19,...,msg_hour_avg,msg_hour_median,msg_hour_mode,msg_minute_max,msg_minute_min,msg_minute_avg,msg_minute_median,msg_minute_mode,sn,fault_time
0,0,0,0,0,0,0,0,0,0,0,...,13.000000,13.0,11,46,38,42.000000,42.0,38,000d33b21436,2020-09-02 16:42:54
1,0,0,0,0,0,0,0,0,0,0,...,18.000000,18.0,18,40,26,31.400000,26.0,26,005c5a9218ba,2020-06-28 19:05:16
2,0,0,0,0,0,0,0,0,0,0,...,20.000000,20.0,20,54,54,54.000000,54.0,54,0079283bde6e,2020-04-26 21:32:44
3,0,0,0,0,0,0,0,0,0,0,...,17.000000,17.0,17,45,4,24.421053,16.0,16,007bdf23b62f,2020-06-16 18:40:39
4,0,0,0,0,0,0,0,0,0,0,...,6.000000,6.0,6,52,43,45.666667,45.0,43,00a577a8e54f,2020-04-07 07:16:55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3006,0,0,0,0,0,0,0,0,0,0,...,20.000000,20.0,20,9,8,8.666667,9.0,9,ffbf46b4af21,2019-12-28 20:10:01
3007,0,0,0,0,0,0,0,0,0,0,...,2.000000,2.0,2,36,29,32.500000,32.5,29,ffc229b6cd9a,2020-06-27 02:39:08
3008,0,0,0,0,0,0,0,0,0,0,...,13.000000,13.0,13,57,57,57.000000,57.0,57,ffd44698a52b,2020-01-21 15:46:56
3009,0,0,0,0,0,0,0,0,0,0,...,13.000000,13.0,13,50,18,44.333333,49.5,50,fff73a9e5bd5,2020-03-01 22:43:43


## 对测试集进行预测，保存特征和预测结果

In [298]:
test_feature=xgb.DMatrix(feature)
submit_frequency_vector_df['prediction']=xgb_model_v1p6.predict(test_feature)
preliminary_submit_dataset_a=submit_frequency_vector_df[['sn','fault_time','prediction']]
preliminary_submit_dataset_a['label']=preliminary_submit_dataset_a['prediction'].apply(lambda x : int(x))
preliminary_submit_dataset_a=preliminary_submit_dataset_a[['sn','fault_time','label']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preliminary_submit_dataset_a['label']=preliminary_submit_dataset_a['prediction'].apply(lambda x : int(x))


In [299]:
preliminary_submit_dataset_a.to_csv('pre_contest/v1p6/preliminary_submit_dataset_a_v1p6.csv',sep=',',index=None)