# 背景：记录Graph Dataset Prep and Analysis 存在的一些问题

In [4]:
import pandas as pd
import numpy as np
import datetime
import networkx as nx
# 更改工作目录为当前项目根目录
import sys
import os
os.chdir(os.path.dirname(os.path.dirname(sys.path[0])))
print(os.getcwd())

/Users/jincan02/Projects/Log-diagnosis


In [44]:
class msg_node:
    msg_type = None
    attrs = []
    server_model = None
    msg_time = None
    msg_title = None
    idx = 0
    error_type = None
    
    def __init__(self, msg, log_time, server_model, error_type):
        self.server_model = server_model
        self.attrs = [xx.strip() for xx in msg.split('|')]
        self.msg_time = log_time
        if (len(self.attrs) == 1) and (self.attrs[0] == ''):
            self.error_type = error_type
            self.msg_type = 4
        elif len(self.attrs) == 1:
            self.msg_type = 0
        elif len(self.attrs) == 2:
            self.msg_type = 1
            self.msg_title = self.attrs[0]
        elif (len(self.attrs) == 3) and (self.attrs[-1] in ['Assert','Asserted','Deasserted','Deassert','asserted','deasserted']):
            self.msg_type = 2
            self.msg_title = self.attrs[0]
        elif len(self.attrs) >=3:
            self.msg_type = 3
            self.msg_title = self.attrs[0]
        else:
            print('Error in extracting msg attributes!')
    
    def set_index(self, index):
        self.idx = index

class msg_graph:
    nodes = []  # list of msg_node
    edges = []  # list of tuples (msg_node_index1, msg_node_index2, edge_type, delta_time)
    error_label = []
    
    def __init__(self, nodes):
        # 注！这里默认输入的nodes是已经按照时序排好的
        self.nodes = nodes
        self.edges = []
        
    def assign_index(self):
        for kth, node in enumerate(self.nodes):
            node.set_index(kth)
    
    def build_graph(self, with_time_order = True, time_cutoff=172800):
        edge_stamps = []
        # edge type 1: same msg_title
        unique_edge_types = list(set([xx.msg_title for xx in nodes]))
        edge_type_vector = np.array([xx.msg_title for xx in nodes])
        for kth in range(len(unique_edge_types)):
            temp_nodes = np.array(nodes)[edge_type_vector == unique_edge_types[kth]].tolist()
            for pth in range(len(temp_nodes)-1):
                self.edges.append((temp_nodes[pth].idx, temp_nodes[pth+1].idx, 
                                   unique_edge_types[kth],
                                   (temp_nodes[pth+1].msg_time - temp_nodes[pth].msg_time).seconds))
                edge_stamps.append((temp_nodes[pth].idx, temp_nodes[pth+1].idx))
        self.unique_edge_types = unique_edge_types
        
        # edge type 2: order of time
        if with_time_order:
            for ith in range(len(nodes)-1):
                if not ((ith, ith+1) in edge_stamps):
                    self.edges.append((ith, ith+1, 'time_order', (nodes[ith+1].msg_time - nodes[ith].msg_time).seconds))
        
        # assign error type to the graph
        self.error_label = []
        for ith in range(len(nodes)):
            if not nodes[ith].error_type is None:
                self.error_label.append(nodes[ith].error_type)
        
    def network_info(self):
        node_info = [(xx.idx, {'type': xx.msg_type}) for xx in self.nodes]
        edge_info = [(xx[0], xx[1], {'type': xx[2],'weight': 10 / (xx[3]+1e-6)}) for xx in self.edges]
        return node_info, edge_info, self.error_label

In [6]:
df1 = pd.read_csv('pre_contest/dataset/preliminary_sel_log_dataset.csv',sep=',')

# 日志表在训练集中不能选用preliminary_sel_log_dataset_a，是测试提交日志。这会导致sn出现miss_label，与之前sn与标签都能匹配矛盾
df2 = pd.read_csv('pre_contest/dataset/preliminary_sel_log_dataset_a.csv',sep=',')
#df3 = pd.read_csv('pre_contest/dataset/preliminary_submit_dataset_a.csv',sep=',')

select_fields = ['sn','time','msg','server_model']
df = pd.concat([df1[select_fields], df2[select_fields]])
df['error_type'] = ''
df['log_time'] = pd.to_datetime(df['time'])
df['msg_len'] = df['msg'].apply(lambda d: len(d.split('|')))

df_server_mapping = df[['sn','server_model']].drop_duplicates()

In [18]:
df.drop_duplicates()

Unnamed: 0,sn,time,msg,server_model,error_type,log_time,msg_len
0,SERVER_25698,2020-10-09 08:32:21,System Boot Initiated BIOS_Boot_Up | State As...,SM0,,2020-10-09 08:32:21,3
1,SERVER_25698,2020-10-09 07:43:48,System Boot Initiated BIOS_Boot_Up | State As...,SM0,,2020-10-09 07:43:48,3
2,SERVER_25698,2020-10-09 08:16:22,System Boot Initiated BIOS_Boot_Up | State As...,SM0,,2020-10-09 08:16:22,3
3,SERVER_25698,2020-10-09 05:46:41,System Boot Initiated BIOS_Boot_Up | State As...,SM0,,2020-10-09 05:46:41,3
4,SERVER_25698,2020-10-09 12:59:13,System Boot Initiated BIOS_Boot_Up | State As...,SM0,,2020-10-09 12:59:13,3
...,...,...,...,...,...,...,...
10986,fffd22fffe19,2020-01-21 19:16:01,Microcontroller/Coprocessor #0x16 | Transitio...,SM16,,2020-01-21 19:16:01,3
10987,fffd22fffe19,2020-01-21 19:17:03,System Event #0x10 | Timestamp Clock Sync | A...,SM16,,2020-01-21 19:17:03,3
10988,fffd22fffe19,2020-01-21 18:32:59,Memory #0xf9 | Uncorrectable ECC | Asserted,SM16,,2020-01-21 18:32:59,3
10989,fffd22fffe19,2020-01-21 19:18:14,System Boot Initiated BIOS_Boot_Up | Initiate...,SM16,,2020-01-21 19:18:14,3


In [9]:
df1_label = pd.read_csv('pre_contest/dataset/preliminary_train_label_dataset.csv',sep=',')
df2_label = pd.read_csv('pre_contest/dataset/preliminary_train_label_dataset_s.csv',sep=',')


# 合并时需去除重复
df_label = pd.concat([df1_label, df2_label])
df_label['log_time'] = pd.to_datetime(df_label['fault_time'])
df_label['msg'] = ''
df_label['msg_len'] = 0
df_label['server_model'] = df_label['sn'].map(dict(zip(df_server_mapping['sn'], 
                                                       df_server_mapping['server_model'])))
df_label.columns = ['sn','time','error_type','log_time','msg','msg_len','server_model']

In [12]:
df_label

Unnamed: 0,sn,time,error_type,log_time,msg,msg_len,server_model
0,SERVER_25698,2020-10-09 13:43:00,0,2020-10-09 13:43:00,,0,SM0
1,SERVER_25699,2020-08-25 18:50:00,0,2020-08-25 18:50:00,,0,SM3
2,SERVER_25712,2020-03-16 13:20:00,0,2020-03-16 13:20:00,,0,SM4
3,SERVER_25708,2020-07-25 12:44:00,0,2020-07-25 12:44:00,,0,SM4
4,SERVER_25711,2020-03-16 16:51:00,0,2020-03-16 16:51:00,,0,SM4
...,...,...,...,...,...,...,...
4404,SERVER_24971,2020-03-04 21:09:00,3,2020-03-04 21:09:00,,0,SM102
4405,SERVER_24971,2020-11-12 20:49:00,3,2020-11-12 20:49:00,,0,SM102
4406,SERVER_24962,2020-09-12 12:18:00,3,2020-09-12 12:18:00,,0,SM102
4407,SERVER_24971,2020-10-04 17:41:00,3,2020-10-04 17:41:00,,0,SM102


In [13]:
df_comb = pd.concat([df, df_label])
df_comb = df_comb.sort_values(['log_time'], ascending=True)
df_comb = df_comb.reset_index(drop=True)

In [17]:
df_comb

Unnamed: 0,sn,time,msg,server_model,error_type,log_time,msg_len
0,SERVER_10657,2019-12-27 23:38:05,Memory CPU1C0_DIMM_Stat | Correctable ECC | A...,SM54,,2019-12-27 23:38:05,3
1,SERVER_10657,2019-12-27 23:38:19,Memory CPU1C0_DIMM_Stat | Correctable ECC | A...,SM54,,2019-12-27 23:38:19,3
2,SERVER_10657,2019-12-27 23:38:33,Memory CPU1C0_DIMM_Stat | Correctable ECC | A...,SM54,,2019-12-27 23:38:33,3
3,SERVER_10657,2019-12-27 23:38:46,Memory CPU1C0_DIMM_Stat | Correctable ECC | A...,SM54,,2019-12-27 23:38:46,3
4,SERVER_10657,2019-12-27 23:39:00,Memory CPU1C0_DIMM_Stat | Correctable ECC | A...,SM54,,2019-12-27 23:39:00,3
...,...,...,...,...,...,...,...
510191,SERVER_20339,2020-11-25 23:10:19,Management Subsystem Health System_Health | S...,SM35,,2020-11-25 23:10:19,3
510192,SERVER_20339,2020-11-25 23:10:19,Management Subsys Health System_Health | Sens...,SM35,,2020-11-25 23:10:19,3
510193,SERVER_20339,2020-11-25 23:11:23,System Boot Initiated BIOS_Boot_Up | Initiate...,SM35,,2020-11-25 23:11:23,3
510194,SERVER_20339,2020-11-25 23:21:06,System Boot Initiated BIOS_Boot_Up | Initiate...,SM35,,2020-11-25 23:21:06,3


In [51]:

# 为什么选10小时？
cutoff = 10 * 3600

grp_store = []
cnt_with_label = 0
cnt_with_multi_label = 0

# miss_label目前有两种情形，一种是sn无标签，一种是时间间隔超过10小时的部分日志无标签，第一种情形理论是没有的，实际也是没有的
cnt_miss_label = 0
for name, grp in df_comb.groupby(['sn']):
    # 不能直接append，会导致出现重复
    grp_store.append(grp)
    if len(grp.error_type.unique()) == 1:
        cnt_miss_label += 1
        break
    elif len(grp.error_type.unique()) == 2:
        # 只有一个标签时也需要按照10小时进行划分，有在多个不同时间段报同样错的例子
        cnt_with_label += 1
    else:
        prev_time = [grp['time'].iloc[0]] + grp['time'].tolist()[:-1]
        grp['prev_time'] = pd.to_datetime(prev_time)
        grp['delta_time'] = grp['log_time'] - grp['prev_time']
        
        # 计算时间间隔 需加上d.days*24*3600
        grp['delta_time'] = grp['delta_time'].apply(lambda d: d.seconds)
        cutoff_idx = [0] + grp.loc[grp['delta_time'] > cutoff].index.tolist() + [grp.index.tolist()[-1]+1]
        for kth in range(len(cutoff_idx)-1): 
            temp_grp = grp.loc[(grp.index < cutoff_idx[kth+1]) & (grp.index >= cutoff_idx[kth])]
            grp_store.append(temp_grp)
            # 括号位置出错，导致miss误判
            if len(temp_grp.error_type.unique() == 1):
                cnt_miss_label += 1
            elif len(temp_grp.error_type.unique()) == 2:
                cnt_with_label += 1 
            else:
                cnt_with_multi_label += 1

In [52]:
grp

Unnamed: 0,sn,time,msg,server_model,error_type,log_time,msg_len
392005,000d33b21436,2020-09-02 11:38:40,System Boot Initiated BIOS_Boot_Up | Initiate...,SM40,,2020-09-02 11:38:40,3
393007,000d33b21436,2020-09-02 15:46:23,System Boot Initiated BIOS_Boot_Up | Initiate...,SM40,,2020-09-02 15:46:23,3


In [49]:
df_label[df_label['sn'] == '000d33b21436']

Unnamed: 0,sn,time,error_type,log_time,msg,msg_len,server_model


In [50]:
df[df['sn'] == '000d33b21436']

Unnamed: 0,sn,time,msg,server_model,error_type,log_time,msg_len
0,000d33b21436,2020-09-02 11:38:40,System Boot Initiated BIOS_Boot_Up | Initiate...,SM40,,2020-09-02 11:38:40,3
1,000d33b21436,2020-09-02 15:46:23,System Boot Initiated BIOS_Boot_Up | Initiate...,SM40,,2020-09-02 15:46:23,3


In [37]:
cutoff_idx

[0, 110099, 113378, 113493]

In [45]:
miss_store = []
label_0_store = []
label_1_store = []
label_2_store = []
label_3_store = []

save_fields = ['sn','time','msg','server_model','error_type','msg_len']
for grp in grp_store:
    file_name = grp['sn'].iloc[0] + '__' + grp['time'].iloc[0]
    grp['id'] = file_name
    
    nodes = []
    for ith in range(len(grp)):
        nodes.append(msg_node(grp['msg'].iloc[ith], 
                              grp['log_time'].iloc[ith], 
                              grp['server_model'].iloc[ith],
                              grp['error_type'].iloc[ith]))
    test_graph = msg_graph(nodes)
    test_graph.assign_index()
    test_graph.build_graph()
    
    flag = True
    if 0 in test_graph.error_label:
        label_0_store.append(grp)
        flag = False
    if 1 in test_graph.error_label:
        label_1_store.append(grp)
        flag = False
    if 2 in test_graph.error_label:
        label_2_store.append(grp)
        flag = False
    if 3 in test_graph.error_label:
        label_3_store.append(grp)
        flag = False
    if flag:
        miss_store.append(grp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grp['id'] = file_name


In [46]:
len(miss_store), len(label_0_store), len(label_1_store),len(label_2_store),len(label_3_store)
# (2914, 1466, 3946, 8577, 2432)

(2925, 1466, 3961, 8604, 2433)

In [18]:
select_fields = ['sn','time','msg','server_model','error_type','msg_len','id','log_time']
df0 = pd.concat(label_0_store)
df0[select_fields].to_pickle('pre_contest/graph_dataset/label_0_dat.p')

In [21]:
df1 = pd.concat(label_1_store)
df1[select_fields].to_pickle('pre_contest/graph_dataset/label_1_dat.p')

df2 = pd.concat(label_2_store)
df2[select_fields].to_pickle('pre_contest/graph_dataset/label_2_dat.p')

df3 = pd.concat(label_3_store)
df3[select_fields].to_pickle('pre_contest/graph_dataset/label_3_dat.p')

In [22]:
df_miss = pd.concat(miss_store)
df_miss[select_fields].to_pickle('pre_contest/graph_dataset/label_miss_dat.p')