In [3]:
import os
import json
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter
from tsfresh.feature_extraction.feature_calculators import abs_energy, benford_correlation, count_above, \
    count_above_mean, mean_abs_change, mean_change, percentage_of_reoccurring_datapoints_to_all_datapoints, \
    percentage_of_reoccurring_values_to_all_values, sample_entropy

from scipy.stats import skew, kurtosis
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

stage = 'test'

label1 = pd.read_csv('../data/preliminary_train_label_dataset.csv')
label2 = pd.read_csv('../data/preliminary_train_label_dataset_s.csv')
label_df = pd.concat([label1, label2]).reset_index(drop=True)
label_df = label_df.drop_duplicates().reset_index(drop=True)

if stage == 'final_a':
    submit_df = pd.read_csv('/tcdata/final_submit_dataset_a.csv')
else:
    submit_df = pd.read_csv('../data/preliminary_submit_dataset_b.csv')

print(label_df.shape, submit_df.shape)

log_df = pd.read_csv('../user_data/log_template.csv')
log_df['msg_lower'] = log_df['msg_lower'].astype(str)
log_df['server_model'] = log_df['server_model'].astype(str)

log_df['time'] = pd.to_datetime(log_df['time'])
label_df['fault_time'] = pd.to_datetime(label_df['fault_time'])
submit_df['fault_time'] = pd.to_datetime(submit_df['fault_time'])

log_df['time_ts'] = log_df["time"].values.astype(np.int64) // 10 ** 9
label_df['fault_time_ts'] = label_df["fault_time"].values.astype(np.int64) // 10 ** 9
submit_df['fault_time_ts'] = submit_df["fault_time"].values.astype(np.int64) // 10 ** 9

if stage == 'final_a':
    crashdump_df1 = pd.read_csv('../data/preliminary_crashdump_dataset.csv')
    venus_df1 = pd.read_csv('../data/preliminary_venus_dataset.csv')
    crashdump_df2 = pd.read_csv('/tcdata/final_crashdump_dataset_a.csv')
    venus_df2 = pd.read_csv('/tcdata/final_venus_dataset_a.csv')
    crashdump_df = pd.concat([crashdump_df1, crashdump_df2]).reset_index(drop=True)
    venus_df = pd.concat([venus_df1, venus_df2]).reset_index(drop=True)
else:
    crashdump_df = pd.read_csv('../data/preliminary_crashdump_dataset.csv')
    venus_df = pd.read_csv('../data/preliminary_venus_dataset.csv')
crashdump_df['fault_time'] = pd.to_datetime(crashdump_df['fault_time'])
venus_df['fault_time'] = pd.to_datetime(venus_df['fault_time'])
crashdump_df['fault_time_ts'] = crashdump_df["fault_time"].values.astype(np.int64) // 10 ** 9
venus_df['fault_time_ts'] = venus_df["fault_time"].values.astype(np.int64) // 10 ** 9

label_df = label_df.merge(log_df[['sn', 'server_model']].drop_duplicates(), on=['sn'], how='left')
submit_df = submit_df.merge(log_df[['sn', 'server_model']].drop_duplicates(), on=['sn'], how='left')
label_df = label_df.fillna('MISSING')
submit_df = submit_df.fillna('MISSING')
print(label_df.shape, submit_df.shape)

label_cnt_df = label_df.groupby('label').size().reset_index().rename({0: 'label_cnt'}, axis=1)
label_model_cnt_df = label_df.groupby(['server_model', 'label']).size().reset_index()\
    .rename({0: 'label_model_cnt'}, axis=1)
label_model_cnt_df = label_model_cnt_df.merge(label_cnt_df, on='label', how='left')
label_model_cnt_df['model/label'] = label_model_cnt_df['label_model_cnt'] / label_model_cnt_df['label_cnt']

# counter_map = {}
# for idx in tqdm(range(label_df.shape[0])):
#     row = label_df.iloc[idx]
#     sn = row['sn']
#     fault_time_ts = row['fault_time_ts']
#     sub_log = log_df[(log_df['sn'] == sn) & (log_df['time_ts'] <= fault_time_ts)]
#     sub_log = sub_log.sort_values(by='time')
#     df_tmp = sub_log.tail(20)
#     k = '%d'%(row['label'])
#     if not k in counter_map:
#         counter_map[k] = Counter()
#     counter_map[k].update(np.unique(df_tmp['msg_id'].values.tolist()))
# for k in counter_map:
#     counter_map[k] = [item[0] for item in counter_map[k].most_common()[:50]]

def safe_split(strs, n, sep='|'):
    str_li = strs.split(sep)
    if len(str_li) >= n + 1:
        return str_li[n].strip()
    else:
        return ''

log_df['time_gap'] = log_df['time'].dt.ceil('30T')
log_df['msg_split_0'] = log_df['msg_lower'].apply(lambda x: safe_split(x, 0))
log_df['msg_split_1'] = log_df['msg_lower'].apply(lambda x: safe_split(x, 1))
log_df['msg_split_2'] = log_df['msg_lower'].apply(lambda x: safe_split(x, 2))

sentences_list = list()
for info, group in log_df.groupby(['sn', 'time_gap']):
    group = group.sort_values(by='time')
    sentences_list.append("\n".join(group['msg_lower'].values.astype(str)))

sentences = list()
for s in sentences_list:
    sentences.append([w for w in s.split()])

w2v_model = Word2Vec(sentences, vector_size=64, window=3, min_count=2, sg=0, hs=1, workers=1, seed=2022)

# tokenized_sent = [word_tokenize(s.lower()) for s in sentences_list]
# tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_sent)]
# d2v_model = Doc2Vec(tagged_data, vector_size=32, window=3, min_count=2, workers=1, seed=2022)
# d2v_model.random.seed(2022)

tfv = TfidfVectorizer(ngram_range=(1,3), min_df=5, max_features=50000)
tfv.fit(sentences_list)
X_tfidf = tfv.transform(sentences_list)
svd = TruncatedSVD(n_components=16, random_state=42)
svd.fit(X_tfidf)


(16604, 3) (3030, 2)
(16604, 5) (3030, 4)


TruncatedSVD(n_components=16, random_state=42)

In [4]:
label_df.shape

(16604, 5)

In [13]:
sub_log_cnt = []
for idx in tqdm(range(label_df.shape[0])):
    row = label_df.iloc[idx]
    sn = row['sn']
    fault_time = row['fault_time']
    fault_time_ts = row['fault_time_ts']
    server_model = row['server_model']
    sub_log = log_df[(log_df['sn'] == sn) & (log_df['time_ts'] <= fault_time_ts)]
    sub_log = sub_log.sort_values(by='time')
    
    df_tmp1 = sub_log.tail(10)
    sub_log_cnt.append(df_tmp1.shape[0])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 16604/16604 [10:12<00:00, 27.10it/s]


In [14]:
test_sub_log_cnt = []
for idx in tqdm(range(submit_df.shape[0])):
    row = submit_df.iloc[idx]
    sn = row['sn']
    fault_time = row['fault_time']
    fault_time_ts = row['fault_time_ts']
    server_model = row['server_model']
    sub_log = log_df[(log_df['sn'] == sn) & (log_df['time_ts'] <= fault_time_ts)]
    sub_log = sub_log.sort_values(by='time')

    df_tmp1 = sub_log.tail(10)
    test_sub_log_cnt.append(df_tmp1.shape[0])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 3030/3030 [01:42<00:00, 29.61it/s]


In [15]:
np.mean(sub_log_cnt), np.mean(test_sub_log_cnt)

(6.603228137798121, 3.436963696369637)