In [1]:
import pandas as pd
import lightgbm as lgb

In [2]:
# Read the training log file and the "a" test log file, then stack them
# row-wise into a single frame.
# NOTE(review): pd.concat is called without ignore_index, so the combined
# frame keeps each file's own row labels (labels may repeat) - confirm that
# nothing downstream relies on a unique index.
data_train = pd.read_csv('./data/preliminary_sel_log_dataset.csv')
data_test = pd.read_csv('./data/preliminary_sel_log_dataset_a.csv')
data = pd.concat([data_train, data_test])

In [3]:
from drain3 import TemplateMiner # open-source online log-parsing framework
from drain3.file_persistence import FilePersistence
from drain3.template_miner_config import TemplateMinerConfig

# Build the miner configuration from the ini file and turn profiling off.
config = TemplateMinerConfig()
config.load('./drain3.ini') ## this file is available in the drain3 GitHub repository
config.profiling_enabled = False

In [4]:
# Create the template miner with a FilePersistence handler pointed at
# '<drain_file>.bin' and the configuration loaded earlier.
drain_file = 'comp_a_sellog'
persistence = FilePersistence(drain_file + '.bin')
template_miner = TemplateMiner(persistence, config=config)

In [5]:
# Feed every log message (train and test) to the miner, one at a time.
for msg in data.msg.tolist():
    template_miner.add_log_message(msg)
# Number of clusters the miner holds after this pass.
temp_count = len(template_miner.drain.clusters)

In [6]:
# Sizes of the 200 largest clusters, largest first. Keeping the top 200 by
# size is only the simplest possible example of a selection rule.
size_list = sorted(
    (c.size for c in template_miner.drain.clusters),
    reverse=True,
)[:200]
# The smallest size that still made the cut is the admission threshold.
min_size = size_list[-1]

# Keep every template whose cluster reaches the threshold, as
# cluster_id -> cluster size. Clusters tied at the threshold are all kept.
template_dic = {
    c.cluster_id: c.size
    for c in template_miner.drain.clusters
    if c.size >= min_size
}

In [7]:
# Number of templates retained in template_dic.
temp_count_f = len(template_dic)

In [9]:
def match_template(df, template_miner, template_dic):
    """Tag one log row with the template it matches.

    Args:
        df: a single row (accessed as ``df.msg``); it is modified in place.
        template_miner: object whose ``match(msg)`` returns a cluster or a
            falsy value when nothing matches.
        template_dic: container of the cluster ids that were retained.

    Returns:
        The same row with two extra fields, ``template_id`` and ``template``.
        Both are the string ``'None'`` when the message matches no cluster or
        matches a cluster that was not retained.
    """
    hit = template_miner.match(df.msg)  # matching is provided by the open-source tool

    # Unmatched rows are still recorded; 'None' later serves as a feature value too.
    if not hit or hit.cluster_id not in template_dic:
        df['template_id'] = 'None'
        df['template'] = 'None'
        return df

    df['template_id'] = hit.cluster_id  # template id
    df['template'] = hit.get_template()  # the concrete template text
    return df

In [10]:
# Run match_template on every row (axis=1), adding the template_id and
# template columns, then cache the tagged frame as a pickle whose name is
# prefixed with drain_file.
data = data.apply(match_template, template_miner=template_miner, template_dic=template_dic, axis=1)
data.to_pickle('./' + drain_file +'_result_match_data.pkl')

In [11]:
# Reload the tagged log frame from its pickle cache.
# NOTE(review): only unpickle files produced by this notebook; pickle is not safe on untrusted input.
df_data = pd.read_pickle('./' + drain_file + '_result_match_data.pkl')

In [12]:
# Preview the first rows whose template_id is not the 'None' placeholder.
df_data[df_data['template_id']!='None'].head()

Unnamed: 0,sn,time,msg,server_model,template_id,template
0,SERVER_25698,2020-10-09 08:32:21,System Boot Initiated BIOS_Boot_Up | State As...,SM0,81,System Boot Initiated <:*:> <:*:> <:*:> <:*:> ...
1,SERVER_25698,2020-10-09 07:43:48,System Boot Initiated BIOS_Boot_Up | State As...,SM0,81,System Boot Initiated <:*:> <:*:> <:*:> <:*:> ...
2,SERVER_25698,2020-10-09 08:16:22,System Boot Initiated BIOS_Boot_Up | State As...,SM0,81,System Boot Initiated <:*:> <:*:> <:*:> <:*:> ...
3,SERVER_25698,2020-10-09 05:46:41,System Boot Initiated BIOS_Boot_Up | State As...,SM0,81,System Boot Initiated <:*:> <:*:> <:*:> <:*:> ...
4,SERVER_25698,2020-10-09 12:59:13,System Boot Initiated BIOS_Boot_Up | State As...,SM0,81,System Boot Initiated <:*:> <:*:> <:*:> <:*:> ...


In [13]:
# (rows, columns) of the tagged log frame.
df_data.shape

(493527, 6)

In [14]:
def feature_generation(df_data, gap_list, model_name, log_source, win_list, func_list):
    """Build rolling-window template-count features for every requested time gap.

    For each gap in ``gap_list`` the rows are bucketed by ``sn`` and by
    ``collect_time`` rounded up to that gap, the one-hot ``template_id``
    columns are summed per bucket, ``feature_win_fun`` adds the rolling-window
    statistics, and the result is pickled to
    ``./cpu_diag_comp_sel_log_all_feature<gap>_<win_list>_<func_list>.pkl``.

    Earlier versions returned from inside the loop, so only the first gap was
    ever processed; now every gap is processed from the untouched input frame.

    Args:
        df_data: frame with ``sn``, ``collect_time`` and ``template_id``
            columns. It is not modified.
        gap_list: comma-separated frequency strings passed to ``dt.ceil``,
            e.g. ``'1h'``.
        model_name: unused; kept so existing calls keep working.
        log_source: unused; kept so existing calls keep working.
        win_list: comma-separated window lengths, forwarded to
            ``feature_win_fun`` and used in the output file name.
        func_list: comma-separated aggregation names, forwarded to
            ``feature_win_fun`` and used in the output file name.

    Returns:
        The feature frame of the last gap processed (with a single gap this is
        exactly what was returned before).
    """
    gaps = gap_list.split(',')

    dummy_list = set(df_data.template_id.unique())
    dummy_col = ['template_id_' + str(x) for x in dummy_list]

    # Parsing the timestamps does not depend on the gap, so do it once.
    collect_time = pd.to_datetime(df_data.collect_time)

    features = None
    for gap in gaps:
        # assign() works on a copy, so the caller's frame is left untouched and
        # every gap starts again from the raw rows.
        bucketed = df_data.assign(collect_time_gap=collect_time.dt.ceil(gap))
        bucketed = template_dummy(bucketed).reset_index(drop=True)

        # Template occurrences per (server, time bucket). The groupby sum
        # method replaces agg(<builtin sum>), which pandas has deprecated.
        counts = bucketed.groupby(['sn', 'collect_time_gap']).sum().reset_index()
        features = feature_win_fun(counts, dummy_col, win_list, func_list, gap)

        # Persist the finished features for this gap.
        features.to_pickle('./cpu_diag_comp_sel_log_all_feature' + gap + '_' + win_list + '_' + func_list +'.pkl')
    return features
    
def template_dummy(df):
    """One-hot encode ``template_id`` next to the grouping keys.

    Args:
        df: frame containing ``sn``, ``collect_time_gap`` and ``template_id``.

    Returns:
        A new frame holding ``sn``, ``collect_time_gap`` and one
        ``template_id_<value>`` indicator column per distinct template id.
        All other input columns are dropped.
    """
    key_part = df[['sn', 'collect_time_gap']]
    indicator_part = pd.get_dummies(df['template_id'], prefix='template_id')
    return pd.concat([key_part, indicator_part], axis=1)

def feature_win_fun(df, dummy_col, win_list, func_list, gap):
    """Attach rolling-window statistics of the template columns to the key columns.

    For every (window, function) pair, ``rolling_funcs`` is applied per ``sn``
    group and the resulting columns are renamed to
    ``<column>_<function>_<window>`` before being appended.

    NOTE(review): the rolled values are re-attached after
    ``reset_index(drop=True)``, i.e. by position; this assumes ``df`` is
    already ordered by ``sn`` then ``collect_time_gap`` with a 0..n-1 index -
    confirm against the caller.

    Args:
        df: frame with ``sn``, ``collect_time_gap`` and the ``dummy_col`` columns.
        dummy_col: names of the columns to aggregate.
        win_list: comma-separated window lengths.
        func_list: comma-separated aggregation names.
        gap: not used inside this function.

    Returns:
        Frame made of ``sn``, ``collect_time_gap`` and the renamed rolled columns.
    """
    windows = win_list.split(',')
    funcs = func_list.split(',')
    group_keys = ['sn']
    time_keys = ['collect_time_gap']
    result = df[group_keys + time_keys]

    for win in windows:
        for func in funcs:
            suffix = '_' + func + '_' + win
            renamed = {col: col + suffix for col in dummy_col}
            rolled = df.groupby(group_keys).apply(rolling_funcs, win, func, dummy_col)
            rolled = rolled.reset_index(drop=True).rename(columns=renamed)
            result = pd.concat([result, rolled], axis=1)
    return result

def rolling_funcs(df, window, func, fea_col):
    """Apply a time-based rolling aggregation to the feature columns of ``df``.

    Args:
        df: frame containing a datetime ``collect_time_gap`` column and every
            column listed in ``fea_col``.
        window: window length in hours; ``str(window) + 'h'`` is used as the
            rolling offset.
        func: aggregation name. Only ``'sum'`` is supported.
        fea_col: list of column names to aggregate.

    Returns:
        Frame indexed by ``collect_time_gap`` in ascending order, holding the
        rolling aggregate of each column in ``fea_col``.

    Raises:
        ValueError: if ``func`` is not a supported aggregation. Previously this
            case printed a message and then failed with ``UnboundLocalError``.
    """
    # Reject unknown aggregations up front instead of failing at the return.
    if func not in ('sum',):
        raise ValueError("func not existed: %r (supported: 'sum')" % (func,))

    ordered = df.sort_values('collect_time_gap').set_index('collect_time_gap')[fea_col]
    windows = ordered.rolling(str(window) + 'h')

    # The built-in rolling sum replaces a per-window Python-level sum; for the
    # NaN-free count columns built in this notebook the values are the same.
    return windows.sum()

def sum_func(series):
    """Return the total of the values in ``series`` (0 for an empty input)."""
    total = 0
    for value in series:
        total = total + value
    return total

In [15]:
# feature_generation reads the timestamp from a column named 'collect_time',
# so rename 'time' first (in place).
df_data.rename(columns={'time':'collect_time'},inplace=True)
# Arguments: gap '1h', empty model_name and log_source, window '3', function 'sum'.
feature_generation(df_data, '1h','', '', '3', 'sum')

Unnamed: 0,sn,collect_time_gap,template_id_1_sum_3,template_id_2_sum_3,template_id_3_sum_3,template_id_4_sum_3,template_id_6_sum_3,template_id_7_sum_3,template_id_8_sum_3,template_id_9_sum_3,...,template_id_218_sum_3,template_id_220_sum_3,template_id_221_sum_3,template_id_222_sum_3,template_id_223_sum_3,template_id_224_sum_3,template_id_225_sum_3,template_id_226_sum_3,template_id_227_sum_3,template_id_228_sum_3
0,000d33b21436,2020-09-02 12:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000d33b21436,2020-09-02 16:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,005c5a9218ba,2020-06-28 19:00:00,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0079283bde6e,2020-04-26 21:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,007bdf23b62f,2020-06-16 18:00:00,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35175,ffc229b6cd9a,2020-06-27 03:00:00,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35176,ffd44698a52b,2020-01-21 14:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35177,fff73a9e5bd5,2020-03-01 14:00:00,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35178,fffd22fffe19,2020-01-21 19:00:00,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Load the feature frame pickled by feature_generation for gap '1h', window '3', function 'sum'.
# Note that this rebinds df_data from the tagged log rows to the feature table.
df_data = pd.read_pickle('./cpu_diag_comp_sel_log_all_feature1h_3_sum.pkl')

In [18]:
# Read the two label files, stack them, and drop rows that repeat the same
# (sn, fault_time, label) combination.
df_train_label = pd.read_csv('./data/preliminary_train_label_dataset.csv')
df_train_label_s = pd.read_csv('./data/preliminary_train_label_dataset_s.csv')
df_train_label = pd.concat([df_train_label, df_train_label_s])
df_train_label = df_train_label.drop_duplicates(['sn','fault_time','label'])

In [19]:
# Keep the feature rows of servers that have a label and left-join the labels on 'sn'.
# NOTE(review): the join key is 'sn' only, so a server with several label rows
# produces one output row per (feature row, label row) pair - confirm this is intended.
df_data_train = pd.merge(df_data[df_data.sn.isin(df_train_label.sn)],df_train_label, on='sn', how='left')
# Target vector, and the feature matrix with identifier, time and label columns removed.
y = df_data_train['label']
x = df_data_train.drop(['sn','collect_time_gap','fault_time','label'],axis=1)

In [29]:
# (rows, columns) of the labelled training frame.
df_data_train.shape

(44110, 183)

In [25]:
# Read the submission file; index_col=0 followed by reset_index() turns the
# first CSV column back into an ordinary column.
df_test_df = pd.read_csv('./data/preliminary_submit_dataset_a.csv',index_col=0).reset_index()
# Keep the feature rows of servers listed in the submission file and left-join its columns on 'sn'.
df_test = pd.merge(df_data[df_data.sn.isin(df_test_df.sn)], df_test_df, on='sn', how='left')

In [31]:
# (rows, columns) of the test feature frame.
df_test.shape

(4292, 182)

In [27]:
# Write the test features to CSV without the index column.
df_test.to_csv('./test.csv', index=False)

In [28]:
# Write the labelled training features to CSV without the index column.
df_data_train.to_csv('./train.csv', index=False)