In [1]:
import pandas as pd
import numpy as np
import datetime
import networkx as nx

In [2]:
class msg_node:
    msg_type = None
    attrs = []
    server_model = None
    msg_time = None
    msg_title = None
    idx = 0
    error_type = None
    
    def __init__(self, msg, log_time, server_model, error_type):
        self.server_model = server_model
        self.attrs = [xx.strip() for xx in msg.split('|')]
        self.msg_time = log_time
        if (len(self.attrs) == 1) and (self.attrs[0] == ''):
            self.error_type = error_type
            self.msg_type = 4
        elif len(self.attrs) == 1:
            self.msg_type = 0
        elif len(self.attrs) == 2:
            self.msg_type = 1
            self.msg_title = self.attrs[0]
        elif (len(self.attrs) == 3) and (self.attrs[-1] in ['Assert','Asserted','Deasserted','Deassert','asserted','deasserted']):
            self.msg_type = 2
            self.msg_title = self.attrs[0]
        elif len(self.attrs) >=3:
            self.msg_type = 3
            self.msg_title = self.attrs[0]
        else:
            print('Error in extracting msg attributes!')
    
    def set_index(self, index):
        self.idx = index

class msg_graph:
    nodes = []  # list of msg_node
    edges = []  # list of tuples (msg_node_index1, msg_node_index2, edge_type, delta_time)
    error_label = []
    
    def __init__(self, nodes):
        # 注！这里默认输入的nodes是已经按照时序排好的
        self.nodes = nodes
        self.edges = []
        
    def assign_index(self):
        for kth, node in enumerate(self.nodes):
            node.set_index(kth)
    
    def build_graph(self, with_time_order = True, time_cutoff=172800):
        edge_stamps = []
        # edge type 1: same msg_title
        unique_edge_types = list(set([xx.msg_title for xx in nodes]))
        edge_type_vector = np.array([xx.msg_title for xx in nodes])
        for kth in range(len(unique_edge_types)):
            temp_nodes = np.array(nodes)[edge_type_vector == unique_edge_types[kth]].tolist()
            for pth in range(len(temp_nodes)-1):
                self.edges.append((temp_nodes[pth].idx, temp_nodes[pth+1].idx, 
                                   unique_edge_types[kth],
                                   (temp_nodes[pth+1].msg_time - temp_nodes[pth].msg_time).seconds))
                edge_stamps.append((temp_nodes[pth].idx, temp_nodes[pth+1].idx))
        self.unique_edge_types = unique_edge_types
        
        # edge type 2: order of time
        if with_time_order:
            for ith in range(len(nodes)-1):
                if not ((ith, ith+1) in edge_stamps):
                    self.edges.append((ith, ith+1, 'time_order', (nodes[ith+1].msg_time - nodes[ith].msg_time).seconds))
        
        # assign error type to the graph
        self.error_label = []
        for ith in range(len(nodes)):
            if not nodes[ith].error_type is None:
                self.error_label.append(nodes[ith].error_type)
        
    def network_info(self):
        node_info = [(xx.idx, {'type': xx.msg_type}) for xx in self.nodes]
        edge_info = [(xx[0], xx[1], {'type': xx[2],'weight': 10 / (xx[3]+1e-6)}) for xx in self.edges]
        return node_info, edge_info, self.error_label

In [3]:
df1 = pd.read_csv('pre_contest/dataset/preliminary_sel_log_dataset.csv',sep=',')
df2 = pd.read_csv('pre_contest/dataset/preliminary_sel_log_dataset_a.csv',sep=',')
#df3 = pd.read_csv('pre_contest/dataset/preliminary_submit_dataset_a.csv',sep=',')

select_fields = ['sn','time','msg','server_model']
df = pd.concat([df1[select_fields], df2[select_fields]])
df['error_type'] = ''
df['log_time'] = pd.to_datetime(df['time'])
df['msg_len'] = df['msg'].apply(lambda d: len(d.split('|')))

df_server_mapping = df[['sn','server_model']].drop_duplicates()

In [4]:
df1_label = pd.read_csv('pre_contest/dataset/preliminary_train_label_dataset.csv',sep=',')
df2_label = pd.read_csv('pre_contest/dataset/preliminary_train_label_dataset_s.csv',sep=',')

df_label = pd.concat([df1_label, df2_label])
df_label['log_time'] = pd.to_datetime(df_label['fault_time'])
df_label['msg'] = ''
df_label['msg_len'] = 0
df_label['server_model'] = df_label['sn'].map(dict(zip(df_server_mapping['sn'], 
                                                       df_server_mapping['server_model'])))
df_label.columns = ['sn','time','error_type','log_time','msg','msg_len','server_model']

In [5]:
df_comb = pd.concat([df, df_label])
df_comb = df_comb.sort_values(['log_time'], ascending=True)
df_comb = df_comb.reset_index(drop=True)

In [6]:
cutoff = 10 * 3600

grp_store = []
cnt_with_label = 0
cnt_with_multi_label = 0
cnt_miss_label = 0
for name, grp in df_comb.groupby(['sn']):
    grp_store.append(grp)
    if len(grp.error_type.unique()) == 1:
        cnt_miss_label += 1
    elif len(grp.error_type.unique()) == 2:
        cnt_with_label += 1
    else:
        prev_time = [grp['time'].iloc[0]] + grp['time'].tolist()[:-1]
        grp['prev_time'] = pd.to_datetime(prev_time)
        grp['delta_time'] = grp['log_time'] - grp['prev_time']
        grp['delta_time'] = grp['delta_time'].apply(lambda d: d.seconds)
        cutoff_idx = [0] + grp.loc[grp['delta_time'] > cutoff].index.tolist() + [grp.index.tolist()[-1]+1]
        for kth in range(len(cutoff_idx)-1): 
            temp_grp = grp.loc[(grp.index < cutoff_idx[kth+1]) & (grp.index >= cutoff_idx[kth])]
            grp_store.append(temp_grp)
            if len(temp_grp.error_type.unique() == 1):
                cnt_miss_label += 1
            elif len(temp_grp.error_type.unique()) == 2:
                cnt_with_label += 1
            else:
                cnt_with_multi_label += 1

In [12]:
miss_store = []
label_0_store = []
label_1_store = []
label_2_store = []
label_3_store = []

save_fields = ['sn','time','msg','server_model','error_type','msg_len']
for grp in grp_store:
    file_name = grp['sn'].iloc[0] + '__' + grp['time'].iloc[0]
    grp['id'] = file_name
    
    nodes = []
    for ith in range(len(grp)):
        nodes.append(msg_node(grp['msg'].iloc[ith], 
                              grp['log_time'].iloc[ith], 
                              grp['server_model'].iloc[ith],
                              grp['error_type'].iloc[ith]))
    test_graph = msg_graph(nodes)
    test_graph.assign_index()
    test_graph.build_graph()
    
    flag = True
    if 0 in test_graph.error_label:
        label_0_store.append(grp)
        flag = False
    if 1 in test_graph.error_label:
        label_1_store.append(grp)
        flag = False
    if 2 in test_graph.error_label:
        label_2_store.append(grp)
        flag = False
    if 3 in test_graph.error_label:
        label_3_store.append(grp)
        flag = False
    if flag:
        miss_store.append(grp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [13]:
len(miss_store), len(label_0_store), len(label_1_store),len(label_2_store),len(label_3_store)

(2914, 1466, 3946, 8577, 2432)

In [18]:
select_fields = ['sn','time','msg','server_model','error_type','msg_len','id','log_time']
df0 = pd.concat(label_0_store)
df0[select_fields].to_pickle('pre_contest/graph_dataset/label_0_dat.p')

In [21]:
df1 = pd.concat(label_1_store)
df1[select_fields].to_pickle('pre_contest/graph_dataset/label_1_dat.p')

df2 = pd.concat(label_2_store)
df2[select_fields].to_pickle('pre_contest/graph_dataset/label_2_dat.p')

df3 = pd.concat(label_3_store)
df3[select_fields].to_pickle('pre_contest/graph_dataset/label_3_dat.p')

In [22]:
df_miss = pd.concat(miss_store)
df_miss[select_fields].to_pickle('pre_contest/graph_dataset/label_miss_dat.p')