In [1]:
import os
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm

In [2]:
submit_df = pd.read_csv('./data/preliminary_submit_dataset_a.csv')

In [3]:
label1 = pd.read_csv('./data/preliminary_train_label_dataset.csv')
label2 = pd.read_csv('./data/preliminary_train_label_dataset_s.csv')

In [4]:
label_df = pd.concat([label1, label2]).reset_index(drop=True)

In [5]:
train_log_df = pd.read_csv('./data/preliminary_sel_log_dataset.csv')
test_log_df = pd.read_csv('./data/preliminary_sel_log_dataset_a.csv')

In [6]:
log_df = pd.concat([train_log_df, test_log_df]).reset_index(drop=True)

In [11]:
log_df['msg'].values[0].split('|')

[' System Boot Initiated BIOS_Boot_Up ', ' State Asserted ', ' Asserted']

In [7]:
tail_msg_list = []
for sn, group in log_df.groupby('sn'):
    group = group.sort_values(by='time')
    tail_msg_list.append('.'.join(group['msg'].tail(10).to_list()))

In [8]:
tokenized_sent = [word_tokenize(s.lower()) for s in tail_msg_list]

In [9]:
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_sent)]
model = Doc2Vec(tagged_data, vector_size=64, window=2, min_count=1, epochs=10)

In [10]:
def get_time_feature(df):
    if df.shape[0] <= 1:
        return [df.shape[0], np.nan, np.nan, np.nan, np.nan]
    times = pd.to_datetime(df['time'])
    time_diffs = (times.diff() / np.timedelta64(1, 's')).values[1:]
    return [df.shape[0], np.mean(time_diffs), np.max(time_diffs), np.min(time_diffs), np.sum(time_diffs)]

In [11]:
train_feature = []
train_label = []
model_feature = []
time_feature = []
for sn, group in train_log_df.groupby('sn'):
    group = group.sort_values(by='time')
    sub_label_df = label_df[label_df['sn'] == sn]
    for item in sub_label_df.values:
        label = item[0]
        fault_time = item[1]
        label = item[2]
        sub_df = group[group['time'] <= fault_time].tail(10)
        time_feature.append(get_time_feature(sub_df))
        if sub_df.shape[0] > 0:
            model_feature.append(int(sub_df['server_model'].values[0][2:]))
        else:
            model_feature.append(np.nan)
        train_feature.append(model.infer_vector(word_tokenize('. s'.join(sub_df['msg']).lower())))
        train_label.append(label)

In [12]:
train_feature = np.array(train_feature)
model_feature = np.array(model_feature)
time_feature = np.array(time_feature)
train_label = np.array(train_label)
print(train_feature.shape, model_feature.shape, time_feature.shape, train_label.shape)

(16669, 64) (16669,) (16669, 5) (16669,)


In [13]:
train_feature = np.concatenate([train_feature, model_feature.reshape((model_feature.shape[0], 1))], axis=1)
train_feature = np.concatenate([train_feature, time_feature], axis=1)

In [14]:
test_log_df = test_log_df.sort_values(by='time')

In [15]:
test_data = []
model_feature = []
time_feature = []
for i, row in submit_df.iterrows():
    sub_df = test_log_df[(test_log_df['sn']==row['sn'])&(test_log_df['time']<=row['fault_time'])].tail(10)
    time_feature.append(get_time_feature(sub_df))
    if sub_df.shape[0] > 0:
        model_feature.append(int(sub_df['server_model'].values[0][2:]))
    else:
        model_feature.append(np.nan)
    test_data.append(model.infer_vector(word_tokenize('. '.join(sub_df['msg']).lower())))

In [16]:
test_feature = np.array(test_data)
model_feature = np.array(model_feature)
time_feature = np.array(time_feature)

test_feature = np.concatenate([test_feature, model_feature.reshape((model_feature.shape[0], 1))], axis=1)
test_feature = np.concatenate([test_feature, time_feature], axis=1)
test_feature.shape

(3011, 70)

In [17]:
def macro_f1(y_true, y_pred) -> float:
    """
    计算得分
    :param target_df: [sn,fault_time,label]
    :param submit_df: [sn,fault_time,label]
    :return:
    """
    weights =  [3  /  7,  2  /  7,  1  /  7,  1  /  7]
    overall_df = pd.DataFrame([y_true, y_pred]).T
    overall_df.columns = ['label_gt', 'label_pr']

    macro_F1 =  0.
    for i in  range(len(weights)):
        TP =  len(overall_df[(overall_df['label_gt'] == i) & (overall_df['label_pr'] == i)])
        FP =  len(overall_df[(overall_df['label_gt'] != i) & (overall_df['label_pr'] == i)])
        FN =  len(overall_df[(overall_df['label_gt'] == i) & (overall_df['label_pr'] != i)])
        precision = TP /  (TP + FP)  if  (TP + FP)  >  0  else  0
        recall = TP /  (TP + FN)  if  (TP + FP)  >  0  else  0
        F1 =  2  * precision * recall /  (precision + recall)  if  (precision + recall)  >  0  else  0
        macro_F1 += weights[i]  * F1
    return macro_F1

In [18]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

In [19]:
preds = np.zeros((test_feature.shape[0], 4))
val_preds = np.zeros((train_feature.shape[0], 4))

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in kf.split(train_feature, train_label):
    xtrain, ytrain = train_feature[train_index], train_label[train_index]
    xtest, ytest = train_feature[test_index], train_label[test_index]
    
    dtrain = lgb.Dataset(xtrain, label=ytrain)
    dvalid = lgb.Dataset(xtest, label=ytest)
    param = {
        'objective': 'multiclass',
        'num_class': 4,
        'metric': 'multi_logloss',
        'early_stopping_rounds': 20,
        'learning_rate': 0.03,
        'random_state': 42
    }
    gbm = lgb.train(
        param, dtrain, valid_sets=[dtrain, dvalid], num_boost_round=100000, verbose_eval=10
    )
    
    val_preds[test_index] = gbm.predict(xtest)
    preds += gbm.predict(test_feature) / 5

print(macro_f1(train_label, np.argmax(val_preds, axis=1)))



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17432
[LightGBM] [Info] Number of data points in the train set: 13335, number of used features: 70
[LightGBM] [Info] Start training from score -2.424031
[LightGBM] [Info] Start training from score -1.593813
[LightGBM] [Info] Start training from score -0.578828
[LightGBM] [Info] Start training from score -1.912359
Training until validation scores don't improve for 20 rounds
[10]	training's multi_logloss: 0.961539	valid_1's multi_logloss: 0.983434
[20]	training's multi_logloss: 0.846718	valid_1's multi_logloss: 0.887026
[30]	training's multi_logloss: 0.764388	valid_1's multi_logloss: 0.82073
[40]	training's multi_logloss: 0.702164	valid_1's multi_logloss: 0.772603
[50]	training's multi_logloss: 0.652227	valid_1's multi_logloss: 0.737861
[60]	training's multi_logloss: 0.610253	valid_1's multi_logloss: 0.709654
[70]	training's multi_logloss: 0.574908	valid_1's multi_logloss: 0.687979
[80]	training's mult

[40]	training's multi_logloss: 0.699566	valid_1's multi_logloss: 0.776746
[50]	training's multi_logloss: 0.650105	valid_1's multi_logloss: 0.742736
[60]	training's multi_logloss: 0.609409	valid_1's multi_logloss: 0.716355
[70]	training's multi_logloss: 0.574809	valid_1's multi_logloss: 0.694578
[80]	training's multi_logloss: 0.544365	valid_1's multi_logloss: 0.67657
[90]	training's multi_logloss: 0.518067	valid_1's multi_logloss: 0.662229
[100]	training's multi_logloss: 0.49339	valid_1's multi_logloss: 0.650131
[110]	training's multi_logloss: 0.471063	valid_1's multi_logloss: 0.639487
[120]	training's multi_logloss: 0.451	valid_1's multi_logloss: 0.631567
[130]	training's multi_logloss: 0.43276	valid_1's multi_logloss: 0.624528
[140]	training's multi_logloss: 0.415695	valid_1's multi_logloss: 0.618723
[150]	training's multi_logloss: 0.399571	valid_1's multi_logloss: 0.613372
[160]	training's multi_logloss: 0.384735	valid_1's multi_logloss: 0.609676
[170]	training's multi_logloss: 0.370

[100]	training's multi_logloss: 0.487037	valid_1's multi_logloss: 0.668956
[110]	training's multi_logloss: 0.464874	valid_1's multi_logloss: 0.660082
[120]	training's multi_logloss: 0.444531	valid_1's multi_logloss: 0.652107
[130]	training's multi_logloss: 0.42588	valid_1's multi_logloss: 0.646491
[140]	training's multi_logloss: 0.408557	valid_1's multi_logloss: 0.641475
[150]	training's multi_logloss: 0.392517	valid_1's multi_logloss: 0.636885
[160]	training's multi_logloss: 0.37766	valid_1's multi_logloss: 0.633011
[170]	training's multi_logloss: 0.363947	valid_1's multi_logloss: 0.629544
[180]	training's multi_logloss: 0.350778	valid_1's multi_logloss: 0.626634
[190]	training's multi_logloss: 0.338829	valid_1's multi_logloss: 0.624098
[200]	training's multi_logloss: 0.327591	valid_1's multi_logloss: 0.622887
[210]	training's multi_logloss: 0.316761	valid_1's multi_logloss: 0.621473
[220]	training's multi_logloss: 0.30655	valid_1's multi_logloss: 0.62007
[230]	training's multi_loglos

In [21]:
submit_df['label'] = np.argmax(preds, axis=1)
submit_df[['sn', 'fault_time', 'label']].to_csv('./preliminary_pred_a.csv', index=0)

In [20]:
0.5090071657946984

0.5090071657946984