In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm

In [2]:
label1 = pd.read_csv('../data/preliminary_train_label_dataset.csv')
label2 = pd.read_csv('../data/preliminary_train_label_dataset_s.csv')
label_df = pd.concat([label1, label2]).reset_index(drop=True)
label_df = label_df.drop_duplicates().reset_index(drop=True)

In [3]:
log_df = pd.read_csv('./log_template.csv')

In [4]:
submit_df = pd.read_csv('../data/preliminary_submit_dataset_a.csv')

In [5]:
log_df['time'] = pd.to_datetime(log_df['time'])
label_df['fault_time'] = pd.to_datetime(label_df['fault_time'])
submit_df['fault_time'] = pd.to_datetime(submit_df['fault_time'])

log_df['time_ts'] = log_df["time"].values.astype(np.int64) // 10 ** 9
label_df['fault_time_ts'] = label_df["fault_time"].values.astype(np.int64) // 10 ** 9
submit_df['fault_time_ts'] = submit_df["fault_time"].values.astype(np.int64) // 10 ** 9

In [6]:
log_df.shape, label_df.shape, submit_df.shape

((493527, 9), (16604, 4), (3011, 3))

In [7]:
log_df.sort_values(by=['sn', 'time'], inplace=True)

In [8]:
key_words = ['memory', 'processor', 'disabled', 'failure', 'cpu1_status', 'cpu0_status',
             'shutdown', '-', 'ecc', 'down', 'timestamp', 'cpu0_proc_hot', 'oem_ps1_temp']

In [9]:
def make_dataset(dataset, data_type='train'):
    ret = []
    for idx, row in tqdm(dataset.iterrows()):
        sn = row['sn']
        fault_time = row['fault_time']
        ts = row['fault_time_ts']
        
        if data_type == 'train':
            label = row['label']

        df = log_df[log_df['sn'] == sn].copy()

        df = df[df['time_ts'] <= ts].copy()
        df = df.sort_values(by='time_ts').reset_index(drop=True)
        df = df.tail(50).copy()
        
        feature = np.zeros((len(key_words), ))
        for i in range(len(key_words)):
            for msglist in df['msg_lower'].str.split(' '):
                if key_words[i] in msglist:
                    feature[i] += 1
                    
        feature2 = feature
        if df.shape[0] > 0:
            feature2 /= df.shape[0]
        if data_type == 'train':
            ret.append([sn, fault_time] + feature.tolist() + [label])
        else:
            ret.append([sn, fault_time] + feature.tolist())
    return ret

In [10]:
train = make_dataset(label_df, data_type='train')
df_train = pd.DataFrame(train)

16604it [08:38, 32.02it/s]


In [11]:
test = make_dataset(submit_df, data_type='test')
df_test = pd.DataFrame(test)

3011it [01:35, 31.45it/s]


In [12]:
# columns = ['%s_cnt'%kw for kw in key_words] + ['%s_proba'%kw for kw in key_words]
columns = ['%s_cnt'%kw for kw in key_words]

In [13]:
df_train.columns = ['sn', 'fault_time'] + columns + ['label']
df_test.columns = ['sn', 'fault_time'] + columns

In [14]:
df_train.to_csv('train5.csv', index=False)
df_test.to_csv('test5.csv', index=False)