In [1]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
label1 = pd.read_csv('../data/preliminary_train_label_dataset.csv')
label2 = pd.read_csv('../data/preliminary_train_label_dataset_s.csv')
label_df = pd.concat([label1, label2]).reset_index(drop=True)
label_df = label_df.drop_duplicates().reset_index(drop=True)

In [3]:
submit_df = pd.read_csv('../data/preliminary_submit_dataset_a.csv')

In [4]:
log_df = pd.read_csv('./log_template.csv')

In [5]:
label_df.shape, log_df.shape, submit_df.shape

((16604, 3), (493527, 8), (3011, 2))

In [6]:
log_df['time'] = pd.to_datetime(log_df['time'])
log_df.sort_values(by=['sn', 'time'], inplace=True)
log_df.reset_index(drop=True, inplace=True)

In [7]:
label_df['fault_time'] = pd.to_datetime(label_df['fault_time'])

In [8]:
submit_df['fault_time'] = pd.to_datetime(submit_df['fault_time'])

In [9]:
log_df['time_ts'] = log_df["time"].values.astype(np.int64) // 10 ** 9

In [10]:
label_df['fault_time_ts'] = label_df["fault_time"].values.astype(np.int64) // 10 ** 9

In [11]:
submit_df['fault_time_ts'] = submit_df["fault_time"].values.astype(np.int64) // 10 ** 9

In [12]:
dummy_list = set(log_df.template_id.unique())
dummy_col = ['template_id_' + str(x) for x in dummy_list]

In [13]:
def make_dataset2(dataset, data_type='train'):
    ret = []
    for idx, row in tqdm(dataset.iterrows()):
        sn = row['sn']
        fault_time = row['fault_time']
        ts = row['fault_time_ts']
        
        if data_type == 'train':
            label = row['label']

        df = log_df[log_df['sn'] == sn].copy()

        df = df[df['time_ts'] <= ts].copy()
        df = df.sort_values(by='time_ts').reset_index(drop=True)
        df = df.tail(50).copy()
        
        data = np.zeros(len(dummy_list))
        for item in df.groupby('template_id').size().reset_index().values:
            data[np.where(np.array(dummy_col) == 'template_id_%s'%item[0])[0][0]] = item[1]
        
        if data_type == 'train':
            ret.append([sn, fault_time] + data.tolist() + [label])
        else:
            ret.append([sn, fault_time] + data.tolist())
    return ret

In [14]:
train = make_dataset2(label_df, data_type='train')
df_train = pd.DataFrame(train)

16604it [08:27, 32.71it/s]


In [15]:
test = make_dataset2(submit_df, data_type='test')
df_test = pd.DataFrame(test)

3011it [01:34, 31.98it/s]


In [16]:
df_train.columns = ['sn', 'fault_time'] + dummy_col + ['label']

In [17]:
df_test.columns = ['sn', 'fault_time'] + dummy_col

In [18]:
df_train.shape, df_test.shape

((16604, 190), (3011, 189))

In [19]:
df_train.to_csv('train6.csv', index=False)
df_test.to_csv('test6.csv', index=False)