In [2]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [3]:
label1 = pd.read_csv('../data/preliminary_train_label_dataset.csv')
label2 = pd.read_csv('../data/preliminary_train_label_dataset_s.csv')
label_df = pd.concat([label1, label2]).reset_index(drop=True)
label_df = label_df.drop_duplicates().reset_index(drop=True)

In [4]:
submit_df = pd.read_csv('../data/preliminary_submit_dataset_a.csv')

In [5]:
log_df = pd.read_csv('./log_template.csv')

In [8]:
log_df['time'] = pd.to_datetime(log_df['time'])
label_df['fault_time'] = pd.to_datetime(label_df['fault_time'])
submit_df['fault_time'] = pd.to_datetime(submit_df['fault_time'])

log_df['time_ts'] = log_df["time"].values.astype(np.int64) // 10 ** 9
label_df['fault_time_ts'] = label_df["fault_time"].values.astype(np.int64) // 10 ** 9
submit_df['fault_time_ts'] = submit_df["fault_time"].values.astype(np.int64) // 10 ** 9

In [9]:
data = []
for i in tqdm(range(label_df.shape[0])):
    row = label_df.iloc[i]
    sn = row['sn']
    fault_time = row['fault_time']
    tmp = log_df[log_df['sn'] == sn]
    tmp = tmp[(tmp['time'] <= fault_time)].tail(10).sort_values(by='time', ascending=False)
    data.append(tmp['msg_lower'].values.tolist())

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16604/16604 [08:29<00:00, 32.56it/s]


In [10]:
for i in tqdm(range(submit_df.shape[0])):
    row = submit_df.iloc[i]
    sn = row['sn']
    fault_time = row['fault_time']
    tmp = log_df[log_df['sn'] == sn]
    tmp = tmp[(tmp['time'] <= fault_time)].tail(10).sort_values(by='time', ascending=False)
    data.append(tmp['msg_lower'].values.tolist())

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3011/3011 [01:33<00:00, 32.36it/s]


In [11]:
data = ['.'.join(msg) for msg in data]

In [13]:
tokenized_sent = [word_tokenize(s) for s in data]

In [14]:
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_sent)]
model = Doc2Vec(tagged_data, vector_size=64, window=2, min_count=1, epochs=10)

In [20]:
tr_data = []

for i in tqdm(range(label_df.shape[0])):
    tr_data.append([])
    row = label_df.iloc[i]
    sn = row['sn']
    fault_time = row['fault_time']
    tmp = log_df[log_df['sn'] == sn]
    tmp = tmp[(tmp['time'] <= fault_time)].tail(10).sort_values(by='time', ascending=False)
    
    vec = model.infer_vector(word_tokenize('.'.join(tmp['msg_lower'])))
    tr_data[-1] += vec.tolist() + [row['sn'], row['fault_time'], row['label']]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16604/16604 [08:46<00:00, 31.56it/s]


In [21]:
te_data = []

for i in tqdm(range(submit_df.shape[0])):
    te_data.append([])
    row = submit_df.iloc[i]
    sn = row['sn']
    fault_time = row['fault_time']
    tmp = log_df[log_df['sn'] == sn]
    tmp = tmp[(tmp['time'] <= fault_time)].tail(10).sort_values(by='time', ascending=False)
    
    vec = model.infer_vector(word_tokenize('.'.join(tmp['msg_lower'])))
    te_data[-1] += vec.tolist() + [row['sn'], row['fault_time']]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3011/3011 [01:34<00:00, 31.88it/s]


In [29]:
columns = ['doc_vec_%d'%i for i in range(64)] + ['sn', 'fault_time', 'label']

In [30]:
train_df = pd.DataFrame(tr_data)
train_df.columns = columns

In [31]:
test_df = pd.DataFrame(te_data)
test_df.columns = columns[:-1]

In [32]:
train_df.to_csv('train7.csv', index=False)
test_df.to_csv('test7.csv', index=False)