In [1]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
label1 = pd.read_csv('../data/preliminary_train_label_dataset.csv')
label2 = pd.read_csv('../data/preliminary_train_label_dataset_s.csv')
label_df = pd.concat([label1, label2]).reset_index(drop=True)
label_df = label_df.drop_duplicates().reset_index(drop=True)

In [3]:
submit_df = pd.read_csv('../data/preliminary_submit_dataset_b.csv')

In [4]:
log_df = pd.read_csv('./log_template.csv')

In [5]:
log_df['time'] = pd.to_datetime(log_df['time'])
label_df['fault_time'] = pd.to_datetime(label_df['fault_time'])
submit_df['fault_time'] = pd.to_datetime(submit_df['fault_time'])

log_df['time_ts'] = log_df["time"].values.astype(np.int64) // 10 ** 9
label_df['fault_time_ts'] = label_df["fault_time"].values.astype(np.int64) // 10 ** 9
submit_df['fault_time_ts'] = submit_df["fault_time"].values.astype(np.int64) // 10 ** 9

In [6]:
label_df = label_df.merge(log_df[['sn', 'server_model']].drop_duplicates(), on=['sn'], how='left')
submit_df = submit_df.merge(log_df[['sn', 'server_model']].drop_duplicates(), on=['sn'], how='left')

In [7]:
def get_statistical_features(df1, df2, suffix):
    cnt1 = df1.shape[0]
    cnt2 = df2.shape[0]
    percent = cnt1 / cnt2 if cnt2 > 0 else 0

    msg_nunique1 = df1['msg_id'].nunique()
    template_unique1 = df1['template_id'].nunique()
    msg_nunique2 = df2['msg_id'].nunique()
    template_unique2 = df2['template_id'].nunique()

    msg_precent1 = msg_nunique1 / cnt1 if cnt1 > 0 else 0    
    template_precent1 = template_unique1 / cnt1 if cnt1 > 0 else 0
    
    template_to_msg_percent = template_unique1 / msg_nunique1  if msg_nunique1 > 0 else 0
    msg_to_msg_percent = msg_nunique1 / msg_nunique2 if msg_nunique2 > 0 else 0
    template_to_template_percent = template_unique1 / template_unique2  if template_unique2 > 0 else 0
    
    return {
        'cnt_%s'%suffix: cnt1,
        'percent_%s'%suffix: percent,
        'msg_nunique_%s'%suffix: msg_nunique1,
        'template_nunique_%s'%suffix: template_unique1,
        'msg_percent_%s'%suffix: msg_precent1,
        'template_percent_%s'%suffix: template_precent1,
        'template_to_msg_percent_%s'%suffix: template_to_msg_percent,
        'msg_to_msg_percent_%s'%suffix: msg_to_msg_percent,
        'template_to_template_percent_%s'%suffix: template_to_template_percent,
    }

In [8]:
def get_time_feature(df, fault_time_ts, suffix):
    if df.shape[0] == 0:
        second_span = np.nan
        time_diffs_avg = np.nan
        time_diffs_max = np.nan
        time_diffs_min = np.nan
        time_diffs_std = np.nan
        max_time_diff = np.nan
    else:
        second_span = fault_time_ts - df.iloc[-1]['time_ts']
        time_diffs = df['time_ts'].diff().iloc[1:]
        time_diffs_avg = np.mean(time_diffs) if time_diffs.shape[0] > 0 else np.nan
        time_diffs_max = np.max(time_diffs) if time_diffs.shape[0] > 0 else np.nan
        time_diffs_min = np.min(time_diffs) if time_diffs.shape[0] > 0 else np.nan
        time_diffs_std = np.std(time_diffs) if time_diffs.shape[0] > 0 else np.nan
        max_time_diff = df['time_ts'].iloc[-1] - df['time_ts'].iloc[0]
    return {
        'second_span_%s'%suffix: second_span,
        'time_diffs_avg_%s'%suffix: time_diffs_avg,
        'time_diffs_max_%s'%suffix: time_diffs_max,
        'time_diffs_min_%s'%suffix: time_diffs_min,
        'time_diffs_std_%s'%suffix: time_diffs_std,
        'max_time_diff_%s'%suffix: max_time_diff
    }

In [9]:
def make_dataset(dataset, data_type='train'):
    ret = []
    for idx in tqdm(range(dataset.shape[0])):
        row = dataset.iloc[idx]
        sn = row['sn']
        fault_time = row['fault_time']
        fault_time_ts = row['fault_time_ts']
        server_model = row['server_model']
        sub_log = log_df[(log_df['sn'] == sn) & (log_df['time_ts'] <= fault_time_ts)]
        sub_log = sub_log.sort_values(by='time')
        last_msg_id = str(sub_log['msg_id'].values[-1]) if sub_log.shape[0] > 0 else 'NULL'
        last_template_id = str(sub_log['template_id'].values[-1]) if sub_log.shape[0] > 0 else 'NULL'
        data = {
            'sn': sn,
            'fault_time': fault_time,
            'server_model': server_model,
            'last_msg_id': last_msg_id,
            'last_template_id': last_template_id
        }
        df_tmp1 = sub_log[sub_log['time_ts'] - fault_time_ts >= -60 * 60 * 2]
        df_tmp2 = sub_log[sub_log['time_ts'] - fault_time_ts >= -60 * 60 * 24]
        data_tmp = get_statistical_features(df_tmp1, df_tmp2, '2h')
        data.update(data_tmp)
        
        data_tmp = get_time_feature(df_tmp1, fault_time_ts, '2h')
        data.update(data_tmp)
        
        if data_type == 'train':
            data['label'] = row['label']
        ret.append(data)
    return ret

In [10]:
train = make_dataset(label_df, data_type='train')
df_train = pd.DataFrame(train)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 16604/16604 [09:23<00:00, 29.47it/s]


In [11]:
test = make_dataset(submit_df, data_type='test')
df_test = pd.DataFrame(test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 3030/3030 [01:42<00:00, 29.68it/s]


In [12]:
df_train.to_csv('train1.csv', index=False)
df_test.to_csv('test1.csv', index=False)