In [1]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
# Read both label files, stack them, and keep one copy of each distinct row.
label1, label2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/preliminary_train_label_dataset.csv',
        '../data/preliminary_train_label_dataset_s.csv',
    )
)
label_df = (
    pd.concat([label1, label2])
    .reset_index(drop=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [3]:
# Load the rows to build test features for; later cells read its `sn` and
# `fault_time` columns.
submit_df = pd.read_csv('../data/preliminary_submit_dataset_b.csv')

In [4]:
# Load the log table. Later cells read its `sn`, `time`, `server_model`,
# `template_id` and `msg_id` columns.
log_df = pd.read_csv('./log_template.csv')

In [5]:
def _epoch_seconds(timestamps):
    """Return whole seconds since the Unix epoch for a datetime Series, as int64."""
    # Cast to second resolution before reinterpreting as integers, so the
    # result does not depend on the resolution the column is stored at
    # (dividing the raw integers by 10 ** 9 is only right for nanoseconds).
    return timestamps.values.astype('datetime64[s]').astype(np.int64)


# Parse the timestamp columns so they can be compared and sorted as datetimes.
log_df['time'] = pd.to_datetime(log_df['time'])
label_df['fault_time'] = pd.to_datetime(label_df['fault_time'])
submit_df['fault_time'] = pd.to_datetime(submit_df['fault_time'])

# Integer epoch seconds, used for the time-window arithmetic in make_dataset.
log_df['time_ts'] = _epoch_seconds(log_df['time'])
label_df['fault_time_ts'] = _epoch_seconds(label_df['fault_time'])
submit_df['fault_time_ts'] = _epoch_seconds(submit_df['fault_time'])

In [6]:
# Distinct (sn, server_model) pairs seen in the logs, attached to both tables.
# NOTE(review): an sn that appears with more than one server_model would
# duplicate its rows in the left join below; confirm the pairs are unique.
sn_server_model = log_df[['sn', 'server_model']].drop_duplicates()
label_df = label_df.merge(sn_server_model, on=['sn'], how='left')
submit_df = submit_df.merge(sn_server_model, on=['sn'], how='left')

In [7]:
def _ranked_counts(values):
    """Count ``values`` and return ``(value, count)`` pairs, most frequent first.

    Ties on count are broken by the larger value first, so the ranking is
    deterministic for a given set of values.
    """
    return sorted(Counter(values).items(), key=lambda pair: (pair[1], pair[0]), reverse=True)


def _longest_run(values):
    """Return ``(value, length)`` of the longest run of consecutive equal values.

    The earliest run wins a tie. An empty input yields ``('NULL', 0)``.
    """
    best_value, best_length = 'NULL', 0
    run_value, run_length = None, 0
    for value in values:
        # ``run_length`` is 0 only before the first element, which therefore
        # always starts a new run (even if it happens to equal the sentinel).
        if run_length and value == run_value:
            run_length += 1
        else:
            run_value, run_length = value, 1
        if run_length > best_length:
            best_value, best_length = run_value, run_length
    return best_value, best_length


def make_dataset(dataset, data_type='train', window_seconds=60 * 60 * 2):
    """Build one feature record per row of ``dataset`` from the logs before it.

    For each row, the logs in the module-level ``log_df`` that share the row's
    ``sn`` and whose ``time_ts`` lies within ``window_seconds`` before (or at)
    the row's ``fault_time_ts`` are collected and ordered by ``time``.

    Args:
        dataset: DataFrame with ``sn``, ``fault_time`` and ``fault_time_ts``
            columns, plus ``label`` when ``data_type`` is ``'train'``.
        data_type: ``'train'`` copies the row's ``label`` into the record;
            any other value leaves it out.
        window_seconds: Length of the look-back window in seconds
            (default: two hours).

    Returns:
        A list of dicts, one per input row and in input order, with keys:
        ``sn``, ``fault_time``; ``tmp_appearance_{1,2,3}`` and
        ``msg_appearance_{1,2,3}`` (the three most frequent ``template_id`` /
        ``msg_id`` values as strings, ``'NULL'`` when absent) with matching
        ``*_percent`` shares of the window (0 when absent);
        ``max_continuous_msg`` / ``max_continuous_time`` (the ``msg_id`` of the
        longest run of identical consecutive messages and that run's length;
        a message that is not repeated counts as a run of length 1, and an
        empty window gives ``'NULL'`` / 0); and ``label`` for training data.
    """
    # Row positions of every serial number's logs, computed once up front
    # instead of scanning the whole log table for each fault row.
    positions_by_sn = log_df.groupby('sn').indices
    no_logs = log_df.iloc[0:0]

    ret = []
    for idx in tqdm(range(dataset.shape[0])):
        row = dataset.iloc[idx]
        sn = row['sn']
        fault_time = row['fault_time']
        fault_time_ts = row['fault_time_ts']

        positions = positions_by_sn.get(sn)
        sn_log = no_logs if positions is None else log_df.iloc[positions]
        offset = sn_log['time_ts'] - fault_time_ts
        window = sn_log[(offset <= 0) & (offset >= -window_seconds)]
        # A stable sort keeps logs with identical timestamps in file order,
        # which makes the run-length feature below reproducible.
        window = window.sort_values(by='time', kind='mergesort')
        n_logs = window.shape[0]

        template_ranked = _ranked_counts(window['template_id'].values)
        msg_ranked = _ranked_counts(window['msg_id'].values)

        data = {'sn': sn, 'fault_time': fault_time}
        for prefix, ranked in (('tmp', template_ranked), ('msg', msg_ranked)):
            for rank in range(1, 4):
                name = '{}_appearance_{}'.format(prefix, rank)
                if len(ranked) >= rank:
                    value, count = ranked[rank - 1]
                    data[name] = str(value)
                    # ``ranked`` is non-empty here, so ``n_logs`` is > 0.
                    data[name + '_percent'] = count / n_logs
                else:
                    data[name] = 'NULL'
                    data[name + '_percent'] = 0

        data['max_continuous_msg'], data['max_continuous_time'] = _longest_run(
            window['msg_id'].values
        )

        if data_type == 'train':
            data['label'] = row['label']
        ret.append(data)
    return ret

In [8]:
# Build the training feature records (each carries its `label`) and put them
# in a DataFrame, one row per record.
train = make_dataset(label_df, data_type='train')
df_train = pd.DataFrame(train)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 16604/16604 [09:07<00:00, 30.33it/s]


In [9]:
# Build the same feature records for the submission rows; no `label` key is
# added because data_type is not 'train'.
test = make_dataset(submit_df, data_type='test')
df_test = pd.DataFrame(test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 3030/3030 [01:44<00:00, 29.10it/s]


In [10]:
# Write both feature tables to CSV in the working directory, leaving out the
# DataFrame index.
df_train.to_csv('train2.csv', index=False)
df_test.to_csv('test2.csv', index=False)