In [1]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

In [2]:
label1 = pd.read_csv('../data/preliminary_train_label_dataset.csv')
label2 = pd.read_csv('../data/preliminary_train_label_dataset_s.csv')
label_df = pd.concat([label1, label2]).reset_index(drop=True)
label_df = label_df.drop_duplicates().reset_index(drop=True)

In [3]:
submit_df = pd.read_csv('../data/preliminary_submit_dataset_a.csv')

In [4]:
log_df = pd.read_csv('./new_log.csv')

In [5]:
log_df['time'] = pd.to_datetime(log_df['time'])
label_df['fault_time'] = pd.to_datetime(label_df['fault_time'])
submit_df['fault_time'] = pd.to_datetime(submit_df['fault_time'])

log_df['time_ts'] = log_df["time"].values.astype(np.int64) // 10 ** 9
label_df['fault_time_ts'] = label_df["fault_time"].values.astype(np.int64) // 10 ** 9
submit_df['fault_time_ts'] = submit_df["fault_time"].values.astype(np.int64) // 10 ** 9

In [6]:
label_df = label_df.merge(log_df[['sn', 'server_model']].drop_duplicates(), on=['sn'], how='left')
submit_df = submit_df.merge(log_df[['sn', 'server_model']].drop_duplicates(), on=['sn'], how='left')

In [7]:
label_cnt_df = label_df.groupby('label').size().reset_index().rename({0: 'label_cnt'}, axis=1)

label_model_cnt_df = label_df.groupby(['server_model', 'label']).size().reset_index()\
    .rename({0: 'label_model_cnt'}, axis=1)

label_model_cnt_df = label_model_cnt_df.merge(label_cnt_df, on='label', how='left')

label_model_cnt_df['model/label'] = label_model_cnt_df['label_model_cnt'] / label_model_cnt_df['label_cnt']

In [8]:
def make_dataset(dataset, data_type='train'):
    ret = []
    for idx, row in tqdm(dataset.iterrows()):
        sn = row['sn']
        fault_time = row['fault_time']
        ts = row['fault_time_ts']
        
        if data_type == 'train':
            label = row['label']

        df = log_df[log_df['sn'] == sn].copy()
        df = df[df['time_ts'] <= ts].copy()
        df = df.sort_values(by='time_ts').reset_index(drop=True)
        df = df.tail(20).copy()

        if df.shape[0] > 0:
            server_model = df['server_model'].values[0]
            sub = label_model_cnt_df[label_model_cnt_df['server_model'] == server_model]
            probas = [0, 0, 0, 0]
            for item in sub.values:
                probas[item[1]] = item[4]
            server_model = int(server_model[2:])
        else:
            server_model = np.nan
            probas = [0, 0, 0, 0]

        if data_type == 'train':
            ret.append([sn, fault_time] + [server_model] + probas + [label])
        else:
            ret.append([sn, fault_time] + [server_model] + probas)
    return ret

In [9]:
train = make_dataset(label_df, data_type='train')
df_train = pd.DataFrame(train)

16604it [08:39, 31.94it/s]


In [10]:
test = make_dataset(submit_df, data_type='test')
df_test = pd.DataFrame(test)

3011it [01:34, 32.00it/s]


In [11]:
df_train.columns = ['sn', 'fault_time', 'server_model', 'p0', 'p1', 'p2', 'p3', 'label']
df_test.columns = ['sn', 'fault_time', 'server_model', 'p0', 'p1', 'p2', 'p3',]

In [12]:
df_train.to_csv('train3.csv', index=False)
df_test.to_csv('test3.csv', index=False)

In [13]:
df_train

Unnamed: 0,sn,fault_time,server_model,p0,p1,p2,p3,label
0,SERVER_25698,2020-10-09 13:43:00,0.0,0.000678,0.000000,0.000000,0.000000,0
1,SERVER_25699,2020-08-25 18:50:00,3.0,0.000678,0.000295,0.000215,0.000000,0
2,SERVER_25712,2020-03-16 13:20:00,4.0,0.006098,0.000000,0.002253,0.003717,0
3,SERVER_25708,2020-07-25 12:44:00,4.0,0.006098,0.000000,0.002253,0.003717,0
4,SERVER_25711,2020-03-16 16:51:00,4.0,0.006098,0.000000,0.002253,0.003717,0
...,...,...,...,...,...,...,...,...
16599,SERVER_24971,2020-03-04 21:09:00,102.0,0.000000,0.000000,0.000000,0.005783,3
16600,SERVER_24971,2020-11-12 20:49:00,102.0,0.000000,0.000000,0.000000,0.005783,3
16601,SERVER_24962,2020-09-12 12:18:00,102.0,0.000000,0.000000,0.000000,0.005783,3
16602,SERVER_24971,2020-10-04 17:41:00,102.0,0.000000,0.000000,0.000000,0.005783,3
