In [1]:
import warnings
warnings.simplefilter('ignore')

import gc
import re

import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
pd.set_option('max_rows', 500)
pd.set_option('max_colwidth', 200)
from tqdm.notebook import tqdm

import nltk
from nltk.tokenize import word_tokenize

from gensim.models.word2vec import Word2Vec

from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold

from catboost import CatBoostClassifier

# loading data

In [2]:
sel_data = pd.read_csv('D:/AI/TianCHI/competition_baselines-master/competitions/tianchi_aiops2022/Rawdata/preliminary_sel_log_dataset.csv')
sel_data2 = pd.read_csv('D:/AI/TianCHI/competition_baselines-master/competitions/tianchi_aiops2022/Rawdata//preliminary_sel_log_dataset_a.csv')
sel_data = pd.concat([sel_data, sel_data2])
sel_data['time'] = pd.to_datetime(sel_data['time'])
sel_data.sort_values(by=['sn', 'time'], inplace=True)
sel_data.reset_index(drop=True, inplace=True)

print(sel_data.shape)
sel_data.head(10)

(493527, 4)


Unnamed: 0,sn,time,msg,server_model
0,000d33b21436,2020-09-02 11:38:40,System Boot Initiated BIOS_Boot_Up | Initiated by warm reset | Asserted,SM40
1,000d33b21436,2020-09-02 15:46:23,System Boot Initiated BIOS_Boot_Up | Initiated by power up | Asserted,SM40
2,005c5a9218ba,2020-06-28 18:26:14,Memory Memory_Status | Correctable ECC | Asserted,SM99
3,005c5a9218ba,2020-06-28 18:26:15,Memory Memory_Status | Correctable ECC | Asserted,SM99
4,005c5a9218ba,2020-06-28 18:26:20,Memory Memory_Status | Correctable ECC | Asserted,SM99
5,005c5a9218ba,2020-06-28 18:26:25,Memory Memory_Status | Correctable ECC | Asserted,SM99
6,005c5a9218ba,2020-06-28 18:26:26,Memory Memory_Status | Correctable ECC | Asserted,SM99
7,005c5a9218ba,2020-06-28 18:26:30,Memory Memory_Status | Correctable ECC | Asserted,SM99
8,005c5a9218ba,2020-06-28 18:38:49,System ACPI Power State #0x7d | S4/S5: soft-off | Asserted,SM99
9,005c5a9218ba,2020-06-28 18:40:26,System ACPI Power State #0x7d | S0/G0: working | Asserted,SM99


In [3]:
# sel_data['msg'] = sel_data['msg'].astype(str).apply(lambda x: x.replace('_', ' '))

In [3]:
train_data = pd.read_csv('D:/AI/TianCHI/competition_baselines-master/competitions/tianchi_aiops2022/Rawdata//preliminary_train_label_dataset.csv')
train_data2 = pd.read_csv('D:/AI/TianCHI/competition_baselines-master/competitions/tianchi_aiops2022/Rawdata//preliminary_train_label_dataset_s.csv')
train_data = pd.concat([train_data, train_data2])
train_data['fault_time'] = pd.to_datetime(train_data['fault_time'])
train_data.sort_values(by=['sn', 'fault_time'], inplace=True)
train_data.reset_index(drop=True, inplace=True)

print(train_data.shape)
train_data.head(10)

(16669, 3)


Unnamed: 0,sn,fault_time,label
0,SERVER_10001,2020-05-01 10:04:00,1
1,SERVER_10003,2020-03-28 09:48:00,2
2,SERVER_10008,2020-02-25 16:12:00,1
3,SERVER_10008,2020-03-11 18:04:00,2
4,SERVER_10009,2020-05-08 16:37:00,3
5,SERVER_10012,2020-07-13 03:32:00,3
6,SERVER_10017,2020-06-11 15:52:00,3
7,SERVER_10017,2020-06-11 15:52:00,3
8,SERVER_10018,2020-05-31 03:33:00,3
9,SERVER_10019,2020-01-29 22:38:00,3


In [4]:
test_data = pd.read_csv('D:/AI/TianCHI/competition_baselines-master/competitions/tianchi_aiops2022/Rawdata//preliminary_submit_dataset_a.csv')
test_data['fault_time'] = pd.to_datetime(test_data['fault_time'])

print(test_data.shape)
test_data.head(10)

(3011, 2)


Unnamed: 0,sn,fault_time
0,000d33b21436,2020-09-02 16:42:54
1,005c5a9218ba,2020-06-28 19:05:16
2,0079283bde6e,2020-04-26 21:32:44
3,007bdf23b62f,2020-06-16 18:40:39
4,00a577a8e54f,2020-04-07 07:16:55
5,00a85fb232bf,2020-05-27 03:24:09
6,00ae2639c426,2019-12-30 05:24:54
7,00b9c343ace4,2020-11-13 01:29:55
8,00bdcf2207d5,2020-01-04 13:39:40
9,00c76d7884f5,2020-07-16 21:22:54


# w2v model

In [5]:
tmp = sel_data.groupby(['sn'], as_index=False)['msg'].agg(list)
tmp['text'] = tmp['msg'].apply(lambda x: ("\n".join([i for i in x])).lower())
sentences_list = tmp['text'].values.tolist()

sentences = list()
for s in sentences_list:
    sentences.append([w for w in s.split()])

In [6]:
%%time

#w2v_model = Word2Vec(sentences, size=32, window=3, min_count=5, sg=0, hs=1, seed=2022)
w2v_model = Word2Vec(sentences,vector_size=32, window=3, min_count=5, sg=0, hs=1, seed=2022)

Wall time: 5.52 s


In [7]:
def get_w2v_mean(sentences):
    emb_matrix = list()
    vec = list()
    for w in sentences.split():
        if w in w2v_model.wv:
            vec.append(w2v_model.wv[w])
    if len(vec) > 0:
        emb_matrix.append(np.mean(vec, axis=0))
    else:
        emb_matrix.append([0] * model.vector_size)
    return emb_matrix

# tf-idf model

In [8]:
%%time

X = list(tmp['text'].values)
tfv = TfidfVectorizer(ngram_range=(1,3), min_df=5, max_features=50000)
tfv.fit(X)

Wall time: 3.85 s


TfidfVectorizer(max_features=50000, min_df=5, ngram_range=(1, 3))

In [9]:
%%time

X_tfidf = tfv.transform(X)
svd = TruncatedSVD(n_components=16)
svd.fit(X_tfidf)

Wall time: 4.28 s


TruncatedSVD(n_components=16)

In [10]:
def get_tfidf_svd(sentences, n_components=16):
    X_tfidf = tfv.transform(sentences)
    X_svd = svd.transform(X_tfidf)
    return np.mean(X_svd, axis=0)

# other features

In [11]:
sel_data['time_ts'] = sel_data["time"].values.astype(np.int64) // 10 ** 9
train_data['fault_time_ts'] = train_data["fault_time"].values.astype(np.int64) // 10 ** 9

In [12]:
def safe_split(strs, n, sep='|'):
    str_li = strs.split(sep)
    if len(str_li) >= n + 1:
        return str_li[n]
    else:
        return ''

sel_data['msg_split_0'] = sel_data['msg'].apply(lambda x: safe_split(x, 0))
sel_data['msg_split_1'] = sel_data['msg'].apply(lambda x: safe_split(x, 1))
sel_data['msg_split_2'] = sel_data['msg'].apply(lambda x: safe_split(x, 2))

sel_data['category'] = sel_data['msg'].apply(lambda x: x.split()[0])

In [13]:
cate_map = {
    'Memory': 0,
    'System': 1,
    'Processor': 2,
    'Temperature': 3,
    'Drive': 4,
    'Power': 5,
    'Unknown': 6,
    'Microcontroller': 7,
    'OS': 8,
    'Watchdog2': 9,
    'OEM': 10,
    'Button': 11,
    'Slot/Connector': 12,
    'Microcontroller/Coprocessor': 13,
    'Management': 14,
    'Event': 15,
    'Watchdog': 16,
    'Slot': 17,
    'Fan': 18,
    'Critical': 19,
    'device': 20,
    'LAN': 21,
    'Version': 22,
    'Add-in': 23,
    'Terminator': 24,
    'Chassis': 25,
    'reserved': 26,
    'Physical': 27,
    'Session': 28,
    'Reserved': 29,
    'Cable/Interconnect': 30,
    'Cable': 31,
    'Chip': 32,
    'Battery': 33
}

# make dataset

In [14]:
train_data.head()

Unnamed: 0,sn,fault_time,label,fault_time_ts
0,SERVER_10001,2020-05-01 10:04:00,1,1588327440
1,SERVER_10003,2020-03-28 09:48:00,2,1585388880
2,SERVER_10008,2020-02-25 16:12:00,1,1582647120
3,SERVER_10008,2020-03-11 18:04:00,2,1583949840
4,SERVER_10009,2020-05-08 16:37:00,3,1588955820


In [15]:
sel_data.head()

Unnamed: 0,sn,time,msg,server_model,time_ts,msg_split_0,msg_split_1,msg_split_2,category
0,000d33b21436,2020-09-02 11:38:40,System Boot Initiated BIOS_Boot_Up | Initiated by warm reset | Asserted,SM40,1599046720,System Boot Initiated BIOS_Boot_Up,Initiated by warm reset,Asserted,System
1,000d33b21436,2020-09-02 15:46:23,System Boot Initiated BIOS_Boot_Up | Initiated by power up | Asserted,SM40,1599061583,System Boot Initiated BIOS_Boot_Up,Initiated by power up,Asserted,System
2,005c5a9218ba,2020-06-28 18:26:14,Memory Memory_Status | Correctable ECC | Asserted,SM99,1593368774,Memory Memory_Status,Correctable ECC,Asserted,Memory
3,005c5a9218ba,2020-06-28 18:26:15,Memory Memory_Status | Correctable ECC | Asserted,SM99,1593368775,Memory Memory_Status,Correctable ECC,Asserted,Memory
4,005c5a9218ba,2020-06-28 18:26:20,Memory Memory_Status | Correctable ECC | Asserted,SM99,1593368780,Memory Memory_Status,Correctable ECC,Asserted,Memory


In [16]:
def make_dataset(dataset, data_type='train'):
    ret = list()

    for idx, row in tqdm(dataset.iterrows()):
        sn = row['sn']
        fault_time = row['fault_time']
        ts = row['fault_time_ts']
        
        if data_type == 'train':
            label = row['label']

        df = sel_data[sel_data['sn'] == sn].copy()

        df = df[df['time_ts'] <= ts].copy()
        df = df.sort_values(by='time_ts').reset_index(drop=True)
        df = df.tail(40).copy()        # TODO: could change last 40 logs here

        # make some features

        logs_count = len(df)

        if logs_count > 0:
            msg_nunique = df['msg'].nunique()
            msg_category_nunique = df['category'].nunique()
            msg_split_0_nunique = df['msg_split_0'].nunique()
            msg_split_1_nunique = df['msg_split_1'].nunique()
            msg_split_2_nunique = df['msg_split_2'].nunique()
            last_category = df['category'].value_counts().index[0]
            last_category = cate_map[last_category] if last_category in cate_map else len(cate_map)

            s = df['time_ts'].values
            if len(s) > 0:
                seconds_span = s[-1] - s[0] 
            else:
                seconds_span = 0

            df['time_ts_shift_1'] = df['time_ts'].shift(1)
            df['time_ts_diffs_1'] = df['time_ts'] - df['time_ts_shift_1']
            s = df['time_ts_diffs_1'].values
            if len(s) > 1:
                log_time_diffs_avg = np.mean(s[1:])
                log_time_diffs_max = np.max(s[1:])
                log_time_diffs_min = np.min(s[1:])
                log_time_diffs_std = np.std(s[1:])
            else:
                try:
                    log_time_diffs_avg = log_time_diffs_max = log_time_diffs_min = s[0]
                    log_time_diffs_std = 0
                except:
                    log_time_diffs_avg = log_time_diffs_max = log_time_diffs_min = log_time_diffs_std = 0

            all_msg = "\n".join(df['msg'].values.tolist()).lower()
            w2v_emb = get_w2v_mean(all_msg)[0]
            tfv_emb = get_tfidf_svd([s.lower() for s in df['msg'].values.tolist()])

        else:
            logs_count = 0
            msg_nunique = 0
            msg_category_nunique = 0
            msg_split_0_nunique = 0
            msg_split_1_nunique = 0
            msg_split_2_nunique = 0
            last_category = 0
            seconds_span = 0
            log_time_diffs_avg = 0
            log_time_diffs_max = 0
            log_time_diffs_min = 0
            log_time_diffs_std = 0
            w2v_emb = [0] * 32
            tfv_emb = [0] * 16


        # format dataset
        data = {
            'sn': sn,
            'fault_time': fault_time,
            'logs_count': logs_count,
            'msg_nunique': msg_nunique,
            'msg_category_nunique': msg_category_nunique,
            'msg_split_0_nunique': msg_split_0_nunique,
            'msg_split_1_nunique': msg_split_1_nunique,
            'msg_split_2_nunique': msg_split_2_nunique,
            'last_category': last_category,
            'seconds_span': seconds_span,
            'log_time_diffs_avg': log_time_diffs_avg,
            'log_time_diffs_max': log_time_diffs_max,
            'log_time_diffs_min': log_time_diffs_min,
            'log_time_diffs_std': log_time_diffs_std,
        }

        for i in range(32):
            data[f'msg_w2v_{i}'] = w2v_emb[i]
        for i in range(16):
            data[f'msg_tfv_{i}'] = tfv_emb[i]
            
        if data_type == 'train':
            data['label'] = label

        ret.append(data)
        
    return ret

In [17]:
train = make_dataset(train_data, data_type='train')
df_train = pd.DataFrame(train)

print(df_train.shape)
df_train.head()

0it [00:00, ?it/s]

(16669, 63)


Unnamed: 0,sn,fault_time,logs_count,msg_nunique,msg_category_nunique,msg_split_0_nunique,msg_split_1_nunique,msg_split_2_nunique,last_category,seconds_span,log_time_diffs_avg,log_time_diffs_max,log_time_diffs_min,log_time_diffs_std,msg_w2v_0,msg_w2v_1,msg_w2v_2,msg_w2v_3,msg_w2v_4,msg_w2v_5,msg_w2v_6,msg_w2v_7,msg_w2v_8,msg_w2v_9,msg_w2v_10,msg_w2v_11,msg_w2v_12,msg_w2v_13,msg_w2v_14,msg_w2v_15,msg_w2v_16,msg_w2v_17,msg_w2v_18,msg_w2v_19,msg_w2v_20,msg_w2v_21,msg_w2v_22,msg_w2v_23,msg_w2v_24,msg_w2v_25,msg_w2v_26,msg_w2v_27,msg_w2v_28,msg_w2v_29,msg_w2v_30,msg_w2v_31,msg_tfv_0,msg_tfv_1,msg_tfv_2,msg_tfv_3,msg_tfv_4,msg_tfv_5,msg_tfv_6,msg_tfv_7,msg_tfv_8,msg_tfv_9,msg_tfv_10,msg_tfv_11,msg_tfv_12,msg_tfv_13,msg_tfv_14,msg_tfv_15,label
0,SERVER_10001,2020-05-01 10:04:00,9,9,3,5,5,2,2,660,82.5,332.0,0.0,131.264047,-0.192678,-0.422263,-0.623276,-1.575063,0.119791,-1.418376,-0.392275,0.803006,-0.139611,-0.101934,0.131073,-0.305873,0.063804,0.888717,0.458124,0.403122,-0.102679,-0.058097,0.348334,0.291018,-0.886966,0.047955,0.941204,0.337638,-0.582843,0.558431,-0.239136,0.330266,0.487814,-0.251389,-0.158772,0.101647,0.151897,-0.159091,-0.04585,-0.13712,0.037563,0.017992,-0.026371,-0.061369,0.196829,-0.066196,0.15843,0.150945,0.005486,-0.045674,0.028717,0.003497,1
1,SERVER_10003,2020-03-28 09:48:00,40,1,1,1,1,1,0,57,1.461538,2.0,1.0,0.498519,-0.185774,0.184353,-1.364618,-0.344243,0.652793,-1.114465,1.284625,0.570601,-0.40307,0.499811,-0.600623,0.166945,-1.323626,0.039155,-0.522155,0.224448,-0.23682,-0.477922,-0.721437,1.768058,-1.106367,0.699869,0.720213,0.653124,0.103541,0.987464,0.510914,0.297919,-0.414473,-0.656889,0.528691,-0.92171,0.214217,0.163643,0.088819,-0.070341,0.039226,0.040002,-0.023293,-0.117804,-0.116215,-0.099459,-0.030253,-0.027286,0.023109,-0.107916,0.066018,0.118433,2
2,SERVER_10008,2020-02-25 16:12:00,5,3,2,2,3,1,2,38,9.5,33.0,0.0,13.720423,-0.424774,0.017928,-0.546295,-1.172927,0.914821,-0.621449,0.462552,1.290344,-0.248379,1.18412,-0.144578,0.177405,-0.847192,0.435682,-0.187527,0.498358,0.47669,-0.610104,-0.802261,1.104439,-1.29076,-0.001748,0.944741,0.423207,-0.400046,-0.177233,1.232935,0.562668,0.065899,-1.420155,-0.499073,-0.407388,0.133858,-0.04579,-0.007332,-0.14199,0.267678,0.13356,-0.034543,-0.001381,0.052571,0.077571,0.02986,-0.057572,-0.022454,-0.023218,-0.001505,-0.11832,1
3,SERVER_10008,2020-03-11 18:04:00,9,4,3,3,4,1,2,1299319,162414.875,1245629.0,0.0,409792.273236,-0.314116,-0.025452,-0.758655,-1.206952,0.805616,-0.767379,0.168234,1.036363,-0.134858,1.011028,-0.007454,0.124957,-0.780979,0.519951,-0.125957,0.68622,0.443623,-0.500455,-0.64912,1.302308,-1.057668,0.152681,0.985002,0.622225,-0.434803,-0.029057,1.010772,0.761046,0.264369,-1.243474,-0.496989,-0.126545,0.161708,-0.08611,-0.01697,-0.117737,0.249082,0.107375,0.009076,-2.4e-05,0.06151,0.055417,0.023456,-0.041575,-0.018602,0.005665,-0.011278,-0.067618,2
4,SERVER_10009,2020-05-08 16:37:00,4,4,1,2,1,2,4,21,7.0,21.0,0.0,9.899495,-0.608514,-0.376773,0.806442,0.71213,-1.452448,-0.647234,0.361486,0.982217,0.172178,1.409562,-0.611899,0.322953,-1.645572,0.727755,0.861701,0.179649,-1.008179,-0.996238,-0.119675,1.07593,-0.466934,0.124938,1.103678,0.885442,0.325571,-0.030377,-0.26262,-0.009778,-0.410922,1.195756,-0.479456,-0.918985,0.011797,-0.006899,-0.000903,-0.01031,-0.005546,0.011294,-0.003612,-0.02373,-0.011572,0.035312,-0.012004,0.016295,0.023304,0.226142,0.394136,-0.037734,3


In [18]:
df_train = df_train[df_train['logs_count'] > 0].copy()
df_train.shape

(16571, 63)

In [19]:
test_data['fault_time_ts'] = test_data["fault_time"].values.astype(np.int64) // 10 ** 9

test = make_dataset(test_data, data_type='test')

df_test = pd.DataFrame(test)
print(df_test.shape)
df_test.head()

0it [00:00, ?it/s]

(3011, 62)


Unnamed: 0,sn,fault_time,logs_count,msg_nunique,msg_category_nunique,msg_split_0_nunique,msg_split_1_nunique,msg_split_2_nunique,last_category,seconds_span,log_time_diffs_avg,log_time_diffs_max,log_time_diffs_min,log_time_diffs_std,msg_w2v_0,msg_w2v_1,msg_w2v_2,msg_w2v_3,msg_w2v_4,msg_w2v_5,msg_w2v_6,msg_w2v_7,msg_w2v_8,msg_w2v_9,msg_w2v_10,msg_w2v_11,msg_w2v_12,msg_w2v_13,msg_w2v_14,msg_w2v_15,msg_w2v_16,msg_w2v_17,msg_w2v_18,msg_w2v_19,msg_w2v_20,msg_w2v_21,msg_w2v_22,msg_w2v_23,msg_w2v_24,msg_w2v_25,msg_w2v_26,msg_w2v_27,msg_w2v_28,msg_w2v_29,msg_w2v_30,msg_w2v_31,msg_tfv_0,msg_tfv_1,msg_tfv_2,msg_tfv_3,msg_tfv_4,msg_tfv_5,msg_tfv_6,msg_tfv_7,msg_tfv_8,msg_tfv_9,msg_tfv_10,msg_tfv_11,msg_tfv_12,msg_tfv_13,msg_tfv_14,msg_tfv_15
0,000d33b21436,2020-09-02 16:42:54,2,2,1,1,2,1,1,14863,14863.0,14863.0,14863.0,0.0,0.176774,-0.14397,-1.879156,-1.400867,0.419286,-1.625964,-1.059834,-0.267002,0.404677,0.378096,0.611487,-0.048592,-0.467053,0.93307,0.07873,1.780381,0.296158,-0.038601,0.081927,2.116284,0.147627,1.128596,1.164737,1.701428,-0.585442,0.725838,-0.069003,1.683462,1.031518,-0.405131,-0.609699,1.127796,0.365154,-0.356821,-0.030162,0.148505,0.194267,-0.112638,0.221041,-0.002965,0.120015,-0.108876,-0.131008,0.012952,0.022818,0.298617,-0.126675,0.371314
1,005c5a9218ba,2020-06-28 19:05:16,10,4,2,3,4,1,0,867,96.333333,739.0,1.0,229.054579,-0.194304,-0.029266,-1.043357,-1.148937,0.730639,-1.537647,0.037677,0.25494,-0.294306,0.471668,-0.026515,0.254326,-0.972557,0.371941,-0.098951,0.861207,-0.0451,-0.128952,-0.327682,1.482235,-0.379215,0.795666,1.127212,1.078994,-0.157503,0.958982,0.384364,0.710294,0.610424,-0.635941,0.190232,-0.02438,0.237742,0.013483,0.062169,-0.031689,0.038237,-0.009766,-0.012814,-0.14845,-0.105099,-0.235243,-0.090182,0.16756,-0.3549,0.189179,-0.094731,-0.045445
2,0079283bde6e,2020-04-26 21:32:44,1,1,1,1,1,1,5,0,,,,0.0,-0.518338,-0.306188,-0.116333,-0.217269,-0.303796,-1.620146,-0.212126,-0.043083,0.175438,-0.094694,0.033129,-1.658059,-0.843689,0.631087,-0.087674,0.206705,-0.290902,0.226296,-0.428127,1.310052,-1.579618,-0.399393,2.099555,0.656982,0.671247,-0.102938,-0.528643,1.027944,0.628172,-0.54532,-0.042515,0.040593,0.052623,-0.044621,-0.018084,-0.061261,-0.093756,0.022443,0.0038,-0.14059,-0.038552,0.227581,-0.077383,0.082166,0.007342,0.007063,-0.070818,0.047824
3,007bdf23b62f,2020-06-16 18:40:39,19,5,3,4,5,1,0,2477,137.611111,760.0,0.0,232.859552,-0.061551,-0.111422,-1.015283,-1.055931,0.431537,-1.467788,0.258986,0.164879,-0.208852,0.54228,0.057156,0.291574,-1.169061,0.379286,-0.340612,0.71819,-0.344978,-0.067532,-0.18813,1.641221,-0.80234,1.039242,1.106396,0.895073,-0.028114,1.114943,0.322838,0.574481,0.448971,-0.656013,0.397151,-0.182813,0.479957,0.359418,-0.236788,0.064044,-0.070913,-0.024194,-0.016141,0.031433,0.088708,-0.006893,0.021145,-0.018,-0.003136,0.024449,-0.019045,-0.006545
4,00a577a8e54f,2020-04-07 07:16:55,6,6,3,5,5,1,8,563,112.6,369.0,0.0,134.117262,0.345196,0.332425,-0.201089,-0.335762,0.756313,-0.51218,-0.501818,0.537034,-0.197411,0.751408,0.232717,-0.273736,-0.088164,0.198667,-0.882762,0.465789,-0.58542,-0.148375,-0.425711,1.720286,-0.661216,0.740227,0.391946,1.406725,0.676276,0.210513,0.125926,0.964489,-0.339875,-0.676076,-0.3552,0.17519,0.106396,-0.035529,0.036857,0.088155,0.053826,-0.097669,-0.078714,-0.070725,-0.138587,0.062167,0.221864,-0.056011,-0.006704,0.039166,-0.02214,-0.003005


In [20]:
df_test[df_test['logs_count'] == 0].shape

(0, 62)

# catboost model

In [21]:
classes = np.unique(df_train['label'])
weights = compute_class_weight(class_weight='balanced', classes=classes, y=df_train['label'])
class_weights = dict(zip(classes, weights))

class_weights

{0: 2.8086440677966102,
 1: 1.2249408633944412,
 2: 0.4468503937007874,
 3: 1.6957634056487925}

In [22]:
NUM_CLASSES = df_train['label'].nunique()
FOLDS = 10
TARGET = 'label'
use_features = [col for col in df_train.columns if col not in ['sn', 'fault_time', TARGET]]

def run_ctb(df_train, df_test, use_features):
    target = TARGET
    oof_pred = np.zeros((len(df_train), NUM_CLASSES))
    y_pred = np.zeros((len(df_test), NUM_CLASSES))
    
    folds = GroupKFold(n_splits=FOLDS)
    for fold, (tr_ind, val_ind) in enumerate(folds.split(df_train, df_train[TARGET], df_train['sn'])):
        print(f'Fold {fold + 1}') 
        x_train, x_val = df_train[use_features].iloc[tr_ind], df_train[use_features].iloc[val_ind] 
        y_train, y_val = df_train[target].iloc[tr_ind], df_train[target].iloc[val_ind]
        
        params = { 
            'task_type': 'GPU', 
            'bootstrap_type': 'Bernoulli',
            'learning_rate': 0.1, 
            'eval_metric': 'MultiClass', 
            'loss_function': 'MultiClass', 
            'classes_count': NUM_CLASSES, 
            'iterations': 1000, 
            'random_seed': 2022, 
            'depth': 8, 
            'subsample': 0.8, 
            'leaf_estimation_iterations': 8,
            'reg_lambda': 0.5,
            'class_weights': class_weights,
            'early_stopping_rounds': 100 
        }
        model = CatBoostClassifier(**params)
        
        model.fit(x_train, 
                  y_train, 
                  eval_set=(x_val, y_val), 
                  verbose=100) 
        oof_pred[val_ind] = model.predict_proba(x_val) 
        y_pred += model.predict_proba(df_test[use_features]) / folds.n_splits
        
        score = f1_score(y_val, oof_pred[val_ind].argmax(axis=1), average='macro')
        print(f'F1 score: {score}')
        
        print("Features importance...")
        feat_imp = pd.DataFrame({'imp': model.feature_importances_, 'feature': use_features})
        print(feat_imp.sort_values(by='imp').reset_index(drop=True))
        
        del x_train, x_val, y_train, y_val
        gc.collect()
        
    return y_pred, oof_pred

In [23]:
y_pred, oof_pred = run_ctb(df_train, df_test, use_features)

Fold 1
0:	learn: 1.2317220	test: 1.2382863	best: 1.2382863 (0)	total: 22.8ms	remaining: 22.8s
100:	learn: 0.3171021	test: 0.6582002	best: 0.6525340 (52)	total: 2.17s	remaining: 19.3s
bestTest = 0.6525339549
bestIteration = 52
Shrink model to first 53 iterations.
F1 score: 0.6826822738326324
Features importance...
         imp               feature
0   0.000000             msg_w2v_0
1   0.000000             msg_w2v_5
2   0.000000            msg_w2v_20
3   0.129654            msg_w2v_27
4   0.191768             msg_tfv_9
5   0.207824            msg_w2v_21
6   0.307522    log_time_diffs_min
7   0.325502            msg_w2v_11
8   0.352618            msg_w2v_10
9   0.374556             msg_w2v_1
10  0.380666             msg_w2v_8
11  0.538449            msg_w2v_25
12  0.582474             msg_w2v_9
13  0.642404            msg_w2v_28
14  0.644349    log_time_diffs_avg
15  0.654871            msg_tfv_10
16  0.691338            msg_w2v_13
17  0.745404             msg_w2v_2
18  0.753682        

59  5.759592             msg_tfv_5
Fold 5
0:	learn: 1.2235342	test: 1.2269452	best: 1.2269452 (0)	total: 24.8ms	remaining: 24.8s
100:	learn: 0.3212863	test: 0.7015341	best: 0.6702790 (52)	total: 2.25s	remaining: 20s
bestTest = 0.6702790461
bestIteration = 52
Shrink model to first 53 iterations.
F1 score: 0.6757176373072458
Features importance...
         imp               feature
0   0.000000            msg_tfv_13
1   0.047632            msg_w2v_13
2   0.157341             msg_w2v_5
3   0.176406            msg_w2v_10
4   0.265256             msg_w2v_7
5   0.274454    log_time_diffs_min
6   0.412177             msg_w2v_0
7   0.421207            msg_w2v_25
8   0.473801            msg_w2v_20
9   0.496342             msg_w2v_8
10  0.616795             msg_w2v_4
11  0.627176            msg_w2v_12
12  0.636784            msg_w2v_15
13  0.727529             msg_tfv_9
14  0.774731            msg_w2v_21
15  0.775410            msg_w2v_18
16  0.805849    log_time_diffs_avg
17  0.844560          

59  5.473008            msg_w2v_26
Fold 9
0:	learn: 1.2293520	test: 1.2479586	best: 1.2479586 (0)	total: 24.8ms	remaining: 24.8s
100:	learn: 0.3288086	test: 0.6732452	best: 0.6675154 (67)	total: 2.18s	remaining: 19.4s
bestTest = 0.6675153933
bestIteration = 67
Shrink model to first 68 iterations.
F1 score: 0.6883821554180884
Features importance...
         imp               feature
0   0.082168             msg_w2v_5
1   0.159297            msg_w2v_12
2   0.248081             msg_w2v_8
3   0.248934            msg_w2v_10
4   0.257172            msg_w2v_25
5   0.290173    log_time_diffs_min
6   0.313146             msg_w2v_0
7   0.320615            msg_w2v_20
8   0.458926             msg_w2v_1
9   0.498312            msg_w2v_13
10  0.612113            msg_w2v_18
11  0.733532            msg_w2v_15
12  0.750895   msg_split_2_nunique
13  0.818366            msg_w2v_11
14  0.856523            msg_w2v_21
15  0.865357            msg_w2v_31
16  0.874718            msg_w2v_28
17  0.905760        

In [24]:
target_df = df_train[['sn', 'fault_time', 'label']].copy()
oof_df = target_df.copy()
oof_df['label'] = oof_pred.argmax(axis=1)

def  macro_f1(target_df: pd.DataFrame,  submit_df: pd.DataFrame)  -> float:

    """
    计算得分
    :param target_df: [sn,fault_time,label]
    :param submit_df: [sn,fault_time,label]
    :return:
    """

    weights =  [3/7,  2/7,  1/7,  1/7]

    overall_df = target_df.merge(submit_df, how='left', on=['sn', 'fault_time'], suffixes=['_gt', '_pr'])
    overall_df.fillna(-1)

    macro_F1 =  0.
    for i in  range(len(weights)):
        TP =  len(overall_df[(overall_df['label_gt'] == i) & (overall_df['label_pr'] == i)])
        FP =  len(overall_df[(overall_df['label_gt'] != i) & (overall_df['label_pr'] == i)])
        FN =  len(overall_df[(overall_df['label_gt'] == i) & (overall_df['label_pr'] != i)])
        precision = TP /  (TP + FP)  if  (TP + FP)  >  0  else  0
        recall = TP /  (TP + FN)  if  (TP + FN)  >  0  else  0
        F1 =  2  * precision * recall /  (precision + recall)  if  (precision + recall)  >  0  else  0
        macro_F1 += weights[i]  * F1
    return macro_F1


macro_f1(target_df, oof_df)

0.5935358389927934

In [25]:
sub = df_test[['sn', 'fault_time']].copy()
sub['label'] = y_pred.argmax(axis=1)
display(sub.head())
sub['label'].value_counts()

Unnamed: 0,sn,fault_time,label
0,000d33b21436,2020-09-02 16:42:54,3
1,005c5a9218ba,2020-06-28 19:05:16,2
2,0079283bde6e,2020-04-26 21:32:44,3
3,007bdf23b62f,2020-06-16 18:40:39,2
4,00a577a8e54f,2020-04-07 07:16:55,2


2    1620
3     573
1     510
0     308
Name: label, dtype: int64

In [26]:
sub.to_csv('baseline2_gkf_sn.csv', index=False)