In [1]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

from scipy.stats import skew, kurtosis
from gensim.models.word2vec import Word2Vec
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def _to_epoch_seconds(timestamps):
    """Return whole seconds since the Unix epoch for a datetime64 Series.

    The values are cast to second resolution before being viewed as integers,
    so the result does not depend on the storage unit of the column. Dividing
    the raw int64 values by 10 ** 9 is only correct for nanosecond storage,
    and pandas 2.0+ can hold datetimes at s/ms/us resolution as well.

    Parameters
    ----------
    timestamps : pd.Series
        A column that has already been converted with ``pd.to_datetime``.

    Returns
    -------
    np.ndarray
        int64 array, positionally aligned with ``timestamps``.
        NOTE(review): NaT entries come out as an integer sentinel, not NaN;
        confirm none of the time columns contain missing values.
    """
    return timestamps.values.astype('datetime64[s]').astype(np.int64)


# --- Labels (two training files, de-duplicated) and submission targets ------
label1 = pd.read_csv('../data/preliminary_train_label_dataset.csv')
label2 = pd.read_csv('../data/preliminary_train_label_dataset_s.csv')
label_df = (
    pd.concat([label1, label2], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

submit_df = pd.read_csv('../data/preliminary_submit_dataset_b.csv')

print(label_df.shape, submit_df.shape)

# --- Templated log messages --------------------------------------------------
log_df = pd.read_csv('../user_data/log_template.csv')
# Force string dtype so missing values become the literal 'nan' instead of float NaN.
log_df['msg_lower'] = log_df['msg_lower'].astype(str)
log_df['server_model'] = log_df['server_model'].astype(str)

# --- Auxiliary crashdump / venus tables --------------------------------------
crashdump_df = pd.read_csv('../data/preliminary_crashdump_dataset.csv')
venus_df = pd.read_csv('../data/preliminary_venus_dataset.csv')

# --- Parse timestamps and add an integer epoch-seconds companion column ------
log_df['time'] = pd.to_datetime(log_df['time'])
log_df['time_ts'] = _to_epoch_seconds(log_df['time'])
for fault_frame in (label_df, submit_df, crashdump_df, venus_df):
    fault_frame['fault_time'] = pd.to_datetime(fault_frame['fault_time'])
    fault_frame['fault_time_ts'] = _to_epoch_seconds(fault_frame['fault_time'])

# --- Attach the server model of each serial number ---------------------------
# Build the sn -> server_model lookup once and reuse it for both merges.
sn_model_df = log_df[['sn', 'server_model']].drop_duplicates()
label_df = label_df.merge(sn_model_df, on=['sn'], how='left')
submit_df = submit_df.merge(sn_model_df, on=['sn'], how='left')
# A left merge leaves NaN for serial numbers absent from the log; note that
# fillna is applied to every column of the frame, not only server_model.
label_df = label_df.fillna('MISSING')
submit_df = submit_df.fillna('MISSING')
print(label_df.shape, submit_df.shape)

# --- Share of each label's samples that falls on each server model -----------
label_cnt_df = label_df.groupby('label').size().reset_index(name='label_cnt')
label_model_cnt_df = (
    label_df.groupby(['server_model', 'label'])
    .size()
    .reset_index(name='label_model_cnt')
    .merge(label_cnt_df, on='label', how='left')
)
label_model_cnt_df['model/label'] = (
    label_model_cnt_df['label_model_cnt'] / label_model_cnt_df['label_cnt']
)


(16604, 3) (3030, 2)
(16604, 5) (3030, 4)


In [2]:
# Show the labelled training frame after the server_model merge; pandas
# truncates the rendered output to the first and last rows.
label_df

Unnamed: 0,sn,fault_time,label,fault_time_ts,server_model
0,SERVER_25698,2020-10-09 13:43:00,0,1602250980,SM0
1,SERVER_25699,2020-08-25 18:50:00,0,1598381400,SM3
2,SERVER_25712,2020-03-16 13:20:00,0,1584364800,SM4
3,SERVER_25708,2020-07-25 12:44:00,0,1595681040,SM4
4,SERVER_25711,2020-03-16 16:51:00,0,1584377460,SM4
...,...,...,...,...,...
16599,SERVER_24971,2020-03-04 21:09:00,3,1583356140,SM102
16600,SERVER_24971,2020-11-12 20:49:00,3,1605214140,SM102
16601,SERVER_24962,2020-09-12 12:18:00,3,1599913080,SM102
16602,SERVER_24971,2020-10-04 17:41:00,3,1601833260,SM102


In [3]:
# Inspect the first row as a Series to see every column and its value at a glance.
label_df.iloc[0]

sn                      SERVER_25698
fault_time       2020-10-09 13:43:00
label                              0
fault_time_ts             1602250980
server_model                     SM0
Name: 0, dtype: object

In [11]:
# Hour of day (0-23) at which the fault was recorded, taken from fault_time.
label_df['hour'] = label_df['fault_time'].dt.hour

In [13]:
# Weekday of the fault as an integer; pandas encodes Monday as 0 and Sunday as 6.
label_df['dayofweek'] = label_df['fault_time'].dt.dayofweek

In [22]:
import seaborn as sns
import matplotlib.pyplot as plt

In [33]:
# Fraction of label-0 faults on each weekday: per-day row counts divided by
# the total number of label-0 rows.
label0_df = label_df[label_df['label'] == 0]
label0_df.groupby('dayofweek').size() / len(label0_df)

dayofweek
0    0.109079
1    0.143631
2    0.149051
3    0.153117
4    0.159214
5    0.149051
6    0.136856
dtype: float64

In [34]:
# Fraction of label-1 faults on each weekday: per-day row counts divided by
# the total number of label-1 rows.
label1_rows = label_df[label_df['label'] == 1]
label1_rows.groupby('dayofweek').size() / len(label1_rows)

dayofweek
0    0.115441
1    0.149985
2    0.151166
3    0.138175
4    0.139061
5    0.168586
6    0.137585
dtype: float64