In [1]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
label1 = pd.read_csv('../data/preliminary_train_label_dataset.csv')
label2 = pd.read_csv('../data/preliminary_train_label_dataset_s.csv')
label_df = pd.concat([label1, label2]).reset_index(drop=True)
label_df = label_df.drop_duplicates().reset_index(drop=True)

In [3]:
submit_df = pd.read_csv('../data/preliminary_submit_dataset_a.csv')

In [4]:
log_df = pd.read_csv('./log_template.csv')

In [5]:
log_df['time'] = pd.to_datetime(log_df['time'])
label_df['fault_time'] = pd.to_datetime(label_df['fault_time'])
submit_df['fault_time'] = pd.to_datetime(submit_df['fault_time'])

log_df['time_ts'] = log_df["time"].values.astype(np.int64) // 10 ** 9
label_df['fault_time_ts'] = label_df["fault_time"].values.astype(np.int64) // 10 ** 9
submit_df['fault_time_ts'] = submit_df["fault_time"].values.astype(np.int64) // 10 ** 9

In [6]:
label_df = label_df.merge(log_df[['sn', 'server_model']].drop_duplicates(), on=['sn'], how='left')
submit_df = submit_df.merge(log_df[['sn', 'server_model']].drop_duplicates(), on=['sn'], how='left')

In [7]:
log_df['time_gap'] = log_df['time'].dt.ceil('2h')

In [8]:
sentences_list = list()
for info, group in log_df.groupby(['sn', 'time_gap']):
    group = group.sort_values(by='time')
    sentences_list.append(" ".join(group['msg_id'].values.astype(str)))

In [9]:
sentences = list()
for s in sentences_list:
    sentences.append([w for w in s.split()])

In [10]:
w2v_model = Word2Vec(sentences, vector_size=64, window=3, min_count=2, sg=0, hs=1, seed=2022)

In [11]:
def get_w2v_mean(sentences):
    emb_matrix = list()
    vec = list()
    for w in sentences.split():
        if w in w2v_model.wv:
            vec.append(w2v_model.wv[w])
    if len(vec) > 0:
        emb_matrix.append(np.mean(vec, axis=0))
    else:
        emb_matrix.append([0] * w2v_model.vector_size)
    return emb_matrix

In [12]:
def make_dataset(dataset, data_type='train'):
    ret = []
    for idx in tqdm(range(dataset.shape[0])):
        row = dataset.iloc[idx]
        sn = row['sn']
        fault_time = row['fault_time']
        fault_time_ts = row['fault_time_ts']
        sub_log = log_df[(log_df['sn'] == sn) & (log_df['time_ts'] <= fault_time_ts)]
        sub_log = sub_log.sort_values(by='time')

        df_tmp1 = sub_log[sub_log['time_ts'] - fault_time_ts >= -60 * 60 * 2]
        
        data = {
            'sn': sn,
            'fault_time': fault_time,
        }
        vec = get_w2v_mean(' '.join(df_tmp1['msg_id'].values.astype(str)))[0]
        for i in range(64):
            data['msgid_w2v_%d'%i] = vec[i]
        if data_type == 'train':
            data['label'] = row['label']
        ret.append(data)
    return ret

In [13]:
train = make_dataset(label_df, data_type='train')
df_train = pd.DataFrame(train)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16604/16604 [08:59<00:00, 30.76it/s]


In [14]:
test = make_dataset(submit_df, data_type='test')
df_test = pd.DataFrame(test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3011/3011 [01:36<00:00, 31.15it/s]


In [15]:
df_train.to_csv('train7.csv', index=False)
df_test.to_csv('test7.csv', index=False)