In [1]:
import warnings
warnings.simplefilter('ignore')

import gc
import re

import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
pd.set_option('max_rows', 500)
pd.set_option('max_colwidth', 200)
from tqdm.notebook import tqdm

import nltk
from nltk.tokenize import word_tokenize

from gensim.models.word2vec import Word2Vec

from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold

from catboost import CatBoostClassifier

In [2]:
sel_data = pd.read_csv('data/preliminary_sel_log_dataset.csv')
sel_data2 = pd.read_csv('data/preliminary_sel_log_dataset_a.csv')
sel_data = pd.concat([sel_data, sel_data2])
sel_data['time'] = pd.to_datetime(sel_data['time'])
sel_data.sort_values(by=['sn', 'time'], inplace=True)
sel_data.reset_index(drop=True, inplace=True)

print(sel_data.shape)
sel_data.head(10)

(493527, 4)


Unnamed: 0,sn,time,msg,server_model
0,000d33b21436,2020-09-02 11:38:40,System Boot Initiated BIOS_Boot_Up | Initiated by warm reset | Asserted,SM40
1,000d33b21436,2020-09-02 15:46:23,System Boot Initiated BIOS_Boot_Up | Initiated by power up | Asserted,SM40
2,005c5a9218ba,2020-06-28 18:26:14,Memory Memory_Status | Correctable ECC | Asserted,SM99
3,005c5a9218ba,2020-06-28 18:26:15,Memory Memory_Status | Correctable ECC | Asserted,SM99
4,005c5a9218ba,2020-06-28 18:26:20,Memory Memory_Status | Correctable ECC | Asserted,SM99
5,005c5a9218ba,2020-06-28 18:26:25,Memory Memory_Status | Correctable ECC | Asserted,SM99
6,005c5a9218ba,2020-06-28 18:26:26,Memory Memory_Status | Correctable ECC | Asserted,SM99
7,005c5a9218ba,2020-06-28 18:26:30,Memory Memory_Status | Correctable ECC | Asserted,SM99
8,005c5a9218ba,2020-06-28 18:38:49,System ACPI Power State #0x7d | S4/S5: soft-off | Asserted,SM99
9,005c5a9218ba,2020-06-28 18:40:26,System ACPI Power State #0x7d | S0/G0: working | Asserted,SM99


In [3]:
train_data = pd.read_csv('data/preliminary_train_label_dataset.csv')
train_data2 = pd.read_csv('data/preliminary_train_label_dataset_s.csv')
train_data = pd.concat([train_data, train_data2])
train_data['fault_time'] = pd.to_datetime(train_data['fault_time'])
train_data.sort_values(by=['sn', 'fault_time'], inplace=True)
train_data.reset_index(drop=True, inplace=True)

print(train_data.shape)
train_data.head(10)

(16669, 3)


Unnamed: 0,sn,fault_time,label
0,SERVER_10001,2020-05-01 10:04:00,1
1,SERVER_10003,2020-03-28 09:48:00,2
2,SERVER_10008,2020-02-25 16:12:00,1
3,SERVER_10008,2020-03-11 18:04:00,2
4,SERVER_10009,2020-05-08 16:37:00,3
5,SERVER_10012,2020-07-13 03:32:00,3
6,SERVER_10017,2020-06-11 15:52:00,3
7,SERVER_10017,2020-06-11 15:52:00,3
8,SERVER_10018,2020-05-31 03:33:00,3
9,SERVER_10019,2020-01-29 22:38:00,3


In [4]:
test_data = pd.read_csv('data/preliminary_submit_dataset_a.csv')
test_data['fault_time'] = pd.to_datetime(test_data['fault_time'])

print(test_data.shape)
test_data.head(10)

(3011, 2)


Unnamed: 0,sn,fault_time
0,000d33b21436,2020-09-02 16:42:54
1,005c5a9218ba,2020-06-28 19:05:16
2,0079283bde6e,2020-04-26 21:32:44
3,007bdf23b62f,2020-06-16 18:40:39
4,00a577a8e54f,2020-04-07 07:16:55
5,00a85fb232bf,2020-05-27 03:24:09
6,00ae2639c426,2019-12-30 05:24:54
7,00b9c343ace4,2020-11-13 01:29:55
8,00bdcf2207d5,2020-01-04 13:39:40
9,00c76d7884f5,2020-07-16 21:22:54


In [5]:
tmp = sel_data.groupby(['sn'], as_index=False)['msg'].agg(list)
tmp['text'] = tmp['msg'].apply(lambda x: ("\n".join([i for i in x])).lower())
sentences_list = tmp['text'].values.tolist()

sentences = list()
for s in sentences_list:
    sentences.append([w for w in s.split()])

In [6]:
w2v_model = Word2Vec(sentences, vector_size=32, window=3, min_count=5, sg=0, hs=1, seed=2022)

In [7]:
def get_w2v_mean(sentences):
    emb_matrix = list()
    vec = list()
    for w in sentences.split():
        if w in w2v_model.wv:
            vec.append(w2v_model.wv[w])
    if len(vec) > 0:
        emb_matrix.append(np.mean(vec, axis=0))
    else:
        emb_matrix.append([0] * model.vector_size)
    return emb_matrix

In [8]:
X = list(tmp['text'].values)
tfv = TfidfVectorizer(ngram_range=(1,3), min_df=5, max_features=50000)
tfv.fit(X)

TfidfVectorizer(max_features=50000, min_df=5, ngram_range=(1, 3))

In [9]:
X_tfidf = tfv.transform(X)
svd = TruncatedSVD(n_components=16)
svd.fit(X_tfidf)

TruncatedSVD(n_components=16)

In [10]:
def get_tfidf_svd(sentences, n_components=16):
    X_tfidf = tfv.transform(sentences)
    X_svd = svd.transform(X_tfidf)
    return np.mean(X_svd, axis=0)

In [11]:
sel_data['time_ts'] = sel_data["time"].values.astype(np.int64) // 10 ** 9
train_data['fault_time_ts'] = train_data["fault_time"].values.astype(np.int64) // 10 ** 9

In [12]:
def safe_split(strs, n, sep='|'):
    str_li = strs.split(sep)
    if len(str_li) >= n + 1:
        return str_li[n]
    else:
        return ''

sel_data['msg_split_0'] = sel_data['msg'].apply(lambda x: safe_split(x, 0))
sel_data['msg_split_1'] = sel_data['msg'].apply(lambda x: safe_split(x, 1))
sel_data['msg_split_2'] = sel_data['msg'].apply(lambda x: safe_split(x, 2))

sel_data['category'] = sel_data['msg'].apply(lambda x: x.split()[0])

In [13]:
cate_map = {
    'Memory': 0,
    'System': 1,
    'Processor': 2,
    'Temperature': 3,
    'Drive': 4,
    'Power': 5,
    'Unknown': 6,
    'Microcontroller': 7,
    'OS': 8,
    'Watchdog2': 9,
    'OEM': 10,
    'Button': 11,
    'Slot/Connector': 12,
    'Microcontroller/Coprocessor': 13,
    'Management': 14,
    'Event': 15,
    'Watchdog': 16,
    'Slot': 17,
    'Fan': 18,
    'Critical': 19,
    'device': 20,
    'LAN': 21,
    'Version': 22,
    'Add-in': 23,
    'Terminator': 24,
    'Chassis': 25,
    'reserved': 26,
    'Physical': 27,
    'Session': 28,
    'Reserved': 29,
    'Cable/Interconnect': 30,
    'Cable': 31,
    'Chip': 32,
    'Battery': 33
}

In [14]:
def make_dataset(dataset, data_type='train'):
    ret = list()

    for idx, row in tqdm(dataset.iterrows()):
        sn = row['sn']
        fault_time = row['fault_time']
        ts = row['fault_time_ts']
        
        if data_type == 'train':
            label = row['label']

        df = sel_data[sel_data['sn'] == sn].copy()

        df = df[df['time_ts'] <= ts].copy()
        df = df.sort_values(by='time_ts').reset_index(drop=True)
        df = df.tail(40).copy()        # TODO: could change last 40 logs here

        # make some features

        logs_count = len(df)

        if logs_count > 0:
            msg_nunique = df['msg'].nunique()
            msg_category_nunique = df['category'].nunique()
            msg_split_0_nunique = df['msg_split_0'].nunique()
            msg_split_1_nunique = df['msg_split_1'].nunique()
            msg_split_2_nunique = df['msg_split_2'].nunique()
            last_category = df['category'].value_counts().index[0]
            last_category = cate_map[last_category] if last_category in cate_map else len(cate_map)

            s = df['time_ts'].values
            if len(s) > 0:
                seconds_span = s[-1] - s[0] 
            else:
                seconds_span = 0

            df['time_ts_shift_1'] = df['time_ts'].shift(1)
            df['time_ts_diffs_1'] = df['time_ts'] - df['time_ts_shift_1']
            s = df['time_ts_diffs_1'].values
            if len(s) > 1:
                log_time_diffs_avg = np.mean(s[1:])
                log_time_diffs_max = np.max(s[1:])
                log_time_diffs_min = np.min(s[1:])
                log_time_diffs_std = np.std(s[1:])
            else:
                try:
                    log_time_diffs_avg = log_time_diffs_max = log_time_diffs_min = s[0]
                    log_time_diffs_std = 0
                except:
                    log_time_diffs_avg = log_time_diffs_max = log_time_diffs_min = log_time_diffs_std = 0

            all_msg = "\n".join(df['msg'].values.tolist()).lower()
            w2v_emb = get_w2v_mean(all_msg)[0]
            tfv_emb = get_tfidf_svd([s.lower() for s in df['msg'].values.tolist()])

        else:
            logs_count = 0
            msg_nunique = 0
            msg_category_nunique = 0
            msg_split_0_nunique = 0
            msg_split_1_nunique = 0
            msg_split_2_nunique = 0
            last_category = 0
            seconds_span = 0
            log_time_diffs_avg = 0
            log_time_diffs_max = 0
            log_time_diffs_min = 0
            log_time_diffs_std = 0
            w2v_emb = [0] * 32
            tfv_emb = [0] * 16


        # format dataset
        data = {
            'sn': sn,
            'fault_time': fault_time,
            'logs_count': logs_count,
            'msg_nunique': msg_nunique,
            'msg_category_nunique': msg_category_nunique,
            'msg_split_0_nunique': msg_split_0_nunique,
            'msg_split_1_nunique': msg_split_1_nunique,
            'msg_split_2_nunique': msg_split_2_nunique,
            'last_category': last_category,
            'seconds_span': seconds_span,
            'log_time_diffs_avg': log_time_diffs_avg,
            'log_time_diffs_max': log_time_diffs_max,
            'log_time_diffs_min': log_time_diffs_min,
            'log_time_diffs_std': log_time_diffs_std,
        }

        for i in range(32):
            data[f'msg_w2v_{i}'] = w2v_emb[i]
        for i in range(16):
            data[f'msg_tfv_{i}'] = tfv_emb[i]
            
        if data_type == 'train':
            data['label'] = label

        ret.append(data)
        
    return ret

In [15]:
train = make_dataset(train_data, data_type='train')
df_train = pd.DataFrame(train)

print(df_train.shape)
df_train.head()

0it [00:00, ?it/s]

(16669, 63)


Unnamed: 0,sn,fault_time,logs_count,msg_nunique,msg_category_nunique,msg_split_0_nunique,msg_split_1_nunique,msg_split_2_nunique,last_category,seconds_span,log_time_diffs_avg,log_time_diffs_max,log_time_diffs_min,log_time_diffs_std,msg_w2v_0,msg_w2v_1,msg_w2v_2,msg_w2v_3,msg_w2v_4,msg_w2v_5,msg_w2v_6,msg_w2v_7,msg_w2v_8,msg_w2v_9,msg_w2v_10,msg_w2v_11,msg_w2v_12,msg_w2v_13,msg_w2v_14,msg_w2v_15,msg_w2v_16,msg_w2v_17,msg_w2v_18,msg_w2v_19,msg_w2v_20,msg_w2v_21,msg_w2v_22,msg_w2v_23,msg_w2v_24,msg_w2v_25,msg_w2v_26,msg_w2v_27,msg_w2v_28,msg_w2v_29,msg_w2v_30,msg_w2v_31,msg_tfv_0,msg_tfv_1,msg_tfv_2,msg_tfv_3,msg_tfv_4,msg_tfv_5,msg_tfv_6,msg_tfv_7,msg_tfv_8,msg_tfv_9,msg_tfv_10,msg_tfv_11,msg_tfv_12,msg_tfv_13,msg_tfv_14,msg_tfv_15,label
0,SERVER_10001,2020-05-01 10:04:00,9,9,3,5,5,2,2,660,82.5,332.0,0.0,131.264047,0.700379,-0.629923,-0.952765,-0.93668,-0.166198,-1.550375,-0.316409,1.273225,-0.389465,-0.015804,0.373918,-1.03656,-0.281342,1.079894,-0.016628,0.196102,0.181302,0.096729,0.6622,0.188445,-0.360879,0.027016,0.513861,0.160221,-0.010354,0.729322,-0.51292,-0.117447,0.026307,-0.038081,-0.186998,0.392833,0.151897,-0.159091,-0.045854,-0.137123,0.037584,0.017939,-0.02636,-0.061342,0.196764,-0.066509,0.158092,0.151791,0.006769,-0.046361,0.025703,0.005171,1
1,SERVER_10003,2020-03-28 09:48:00,40,1,1,1,1,1,0,57,1.461538,2.0,1.0,0.498519,0.712487,0.459921,-2.033084,-0.038312,0.07129,-0.689347,0.427789,1.318271,0.244873,0.056716,0.008596,-1.466636,-0.674577,0.079415,-0.959297,-0.151059,-0.272708,-0.750014,-0.896591,0.72261,-0.265073,0.58536,0.768186,0.585732,-0.069446,1.118316,-0.395495,-0.138185,-0.303303,-0.652371,1.496285,-0.690089,0.214217,0.163642,0.08883,-0.070367,0.039201,0.040025,-0.022257,-0.118042,-0.116782,-0.099236,-0.035794,-0.015759,0.024922,-0.104765,0.046493,0.100924,2
2,SERVER_10008,2020-02-25 16:12:00,5,3,2,2,3,1,2,38,9.5,33.0,0.0,13.720423,0.684653,0.298548,-1.566285,-0.436122,0.321197,-1.27784,0.104868,2.084399,0.072692,0.561428,0.091299,-1.279943,-0.974046,0.703925,-0.62534,0.24009,0.160568,-0.29299,-0.742377,0.405687,-0.339603,0.623597,0.303128,0.364009,-0.324919,-0.426674,0.424732,-0.019926,-0.376095,-1.347809,0.066909,-0.376634,0.133858,-0.04579,-0.007332,-0.141995,0.267752,0.133512,-0.034925,-0.001206,0.05288,0.077277,0.030138,-0.058774,-0.026617,-0.015883,-0.005363,-0.115501,1
3,SERVER_10008,2020-03-11 18:04:00,9,4,3,3,4,1,2,1299319,162414.875,1245629.0,0.0,409792.273236,0.827885,0.024599,-1.737865,-0.487622,0.223473,-1.292184,-0.073425,1.838596,0.013211,0.355664,0.177133,-1.164628,-0.888056,0.68931,-0.522854,0.286734,0.320673,-0.187947,-0.633161,0.7109,-0.222134,0.667596,0.347415,0.53808,-0.155449,-0.141348,0.280759,0.081551,-0.173286,-1.216581,-0.037672,-0.096993,0.161708,-0.08611,-0.016971,-0.117738,0.249145,0.107349,0.008729,0.000154,0.061809,0.055261,0.023551,-0.042634,-0.022529,0.013462,-0.015185,-0.066865,2
4,SERVER_10009,2020-05-08 16:37:00,4,4,1,2,1,2,4,21,7.0,21.0,0.0,9.899495,-0.458744,0.458298,0.66093,0.365559,-0.955961,-0.843295,0.39489,1.562269,0.707169,0.261565,-1.238765,-0.302514,-1.433187,0.558801,0.876107,-0.277186,-0.07353,-1.176077,-0.510104,1.33259,-0.952384,-0.023651,0.173969,0.471626,0.109892,0.460979,-0.878275,0.961492,-0.495348,1.199852,0.42726,-0.614269,0.011797,-0.006899,-0.000905,-0.010315,-0.005557,0.011228,-0.00323,-0.024015,-0.011856,0.035666,-0.012139,0.01893,0.028291,0.194176,0.415143,-0.031964,3


In [19]:
df_train = df_train[df_train['logs_count'] > 0].copy()
df_train.shape

(16571, 63)

In [20]:
test_data['fault_time_ts'] = test_data["fault_time"].values.astype(np.int64) // 10 ** 9

test = make_dataset(test_data, data_type='test')

df_test = pd.DataFrame(test)
print(df_test.shape)
df_test.head()

0it [00:00, ?it/s]

(3011, 62)


Unnamed: 0,sn,fault_time,logs_count,msg_nunique,msg_category_nunique,msg_split_0_nunique,msg_split_1_nunique,msg_split_2_nunique,last_category,seconds_span,log_time_diffs_avg,log_time_diffs_max,log_time_diffs_min,log_time_diffs_std,msg_w2v_0,msg_w2v_1,msg_w2v_2,msg_w2v_3,msg_w2v_4,msg_w2v_5,msg_w2v_6,msg_w2v_7,msg_w2v_8,msg_w2v_9,msg_w2v_10,msg_w2v_11,msg_w2v_12,msg_w2v_13,msg_w2v_14,msg_w2v_15,msg_w2v_16,msg_w2v_17,msg_w2v_18,msg_w2v_19,msg_w2v_20,msg_w2v_21,msg_w2v_22,msg_w2v_23,msg_w2v_24,msg_w2v_25,msg_w2v_26,msg_w2v_27,msg_w2v_28,msg_w2v_29,msg_w2v_30,msg_w2v_31,msg_tfv_0,msg_tfv_1,msg_tfv_2,msg_tfv_3,msg_tfv_4,msg_tfv_5,msg_tfv_6,msg_tfv_7,msg_tfv_8,msg_tfv_9,msg_tfv_10,msg_tfv_11,msg_tfv_12,msg_tfv_13,msg_tfv_14,msg_tfv_15
0,000d33b21436,2020-09-02 16:42:54,2,2,1,1,2,1,1,14863,14863.0,14863.0,14863.0,0.0,1.477323,-1.154896,-2.65644,-0.779042,-0.111181,-1.342488,-0.880156,0.763793,-0.282595,-0.511595,0.701681,-0.445886,-0.373602,0.562188,-0.150481,0.677995,1.146524,0.262212,-0.255928,2.031761,0.423715,1.185801,0.734177,1.528122,0.559244,1.316649,-0.615345,0.685247,0.710253,-0.467329,-0.549227,1.086135,0.365154,-0.356821,-0.030163,0.148521,0.194186,-0.112608,0.220981,-0.002671,0.120277,-0.108615,-0.131744,0.008496,0.020198,0.305241,-0.113058,0.352372
1,005c5a9218ba,2020-06-28 19:05:16,10,4,2,3,4,1,0,867,96.333333,739.0,1.0,229.054579,0.978063,-0.299149,-1.965629,-0.475017,0.062735,-1.16313,-0.074698,1.111549,-0.296958,0.138031,0.29494,-0.802941,-0.842891,0.309227,-0.409757,0.447437,0.060409,0.110904,-0.36295,1.018337,0.036472,0.811588,0.827642,1.156492,0.260533,1.184866,-0.178854,0.011725,0.490181,-0.359292,0.48854,0.216592,0.237742,0.013483,0.062169,-0.031689,0.038234,-0.009736,-0.012661,-0.148432,-0.105018,-0.235047,-0.090137,0.170675,-0.351866,0.196224,-0.073757,-0.050536
2,0079283bde6e,2020-04-26 21:32:44,1,1,1,1,1,1,5,0,,,,0.0,-0.018298,-0.673039,-1.200413,-0.056358,-0.583385,-1.091062,-0.08651,0.615469,-0.183456,-0.127628,0.090854,-2.186912,-0.914238,0.875823,-0.178919,0.629768,-0.454986,0.505771,-0.223511,1.130023,-1.249034,-0.834266,1.127253,0.434101,1.178299,-0.505045,-1.199978,0.328664,0.319481,0.249258,0.276962,0.830689,0.052623,-0.044621,-0.018092,-0.061271,-0.093727,0.022376,0.003805,-0.140836,-0.038172,0.22694,-0.07881,0.082023,0.000271,0.015674,-0.075058,0.035136
3,007bdf23b62f,2020-06-16 18:40:39,19,5,3,4,5,1,0,2477,137.611111,760.0,0.0,232.859552,0.983501,-0.199246,-1.859158,-0.433996,-0.044436,-0.847487,-0.114836,1.152563,-0.151098,0.079367,0.421615,-0.713111,-0.95615,0.246108,-0.6297,0.5652,-0.155588,-0.317526,-0.396239,1.172649,-0.243707,0.891384,0.964237,0.933389,0.230908,1.287999,-0.44967,0.051,0.249176,-0.480669,0.919069,0.018412,0.479957,0.359418,-0.236788,0.064042,-0.070916,-0.024199,-0.016152,0.031398,0.088715,-0.007019,0.020846,-0.017083,-0.003531,0.024628,-0.012061,-0.004482
4,00a577a8e54f,2020-04-07 07:16:55,6,6,3,5,5,1,8,563,112.6,369.0,0.0,134.117262,1.037564,0.20643,-0.612729,0.149342,0.379141,-0.628021,-0.621325,1.003117,-0.158813,0.099166,0.277389,-0.812496,0.088793,-0.123625,-1.276569,0.194934,0.01789,-0.460229,-0.60051,1.461289,0.023221,0.662383,0.099721,1.336204,0.898689,0.424996,-0.387441,0.283573,-0.420355,-0.647553,-0.092452,0.372737,0.106396,-0.035529,0.036857,0.088154,0.053833,-0.097678,-0.078732,-0.070648,-0.138498,0.062238,0.222043,-0.054833,-0.006731,0.042304,-0.021616,-0.003545


In [21]:
df_test[df_test['logs_count'] == 0].shape

(0, 62)

In [22]:
classes = np.unique(df_train['label'])
weights = compute_class_weight(class_weight='balanced', classes=classes, y=df_train['label'])
class_weights = dict(zip(classes, weights))

class_weights

{0: 2.8086440677966102,
 1: 1.2249408633944412,
 2: 0.4468503937007874,
 3: 1.6957634056487925}

In [23]:
NUM_CLASSES = df_train['label'].nunique()
FOLDS = 10
TARGET = 'label'
use_features = [col for col in df_train.columns if col not in ['sn', 'fault_time', TARGET]]

def run_ctb(df_train, df_test, use_features):
    target = TARGET
    oof_pred = np.zeros((len(df_train), NUM_CLASSES))
    y_pred = np.zeros((len(df_test), NUM_CLASSES))
    
    folds = GroupKFold(n_splits=FOLDS)
    for fold, (tr_ind, val_ind) in enumerate(folds.split(df_train, df_train[TARGET], df_train['sn'])):
        print(f'Fold {fold + 1}') 
        x_train, x_val = df_train[use_features].iloc[tr_ind], df_train[use_features].iloc[val_ind] 
        y_train, y_val = df_train[target].iloc[tr_ind], df_train[target].iloc[val_ind]
        
        params = { 
            'task_type': 'GPU', 
            'bootstrap_type': 'Bernoulli',
            'learning_rate': 0.1, 
            'eval_metric': 'MultiClass', 
            'loss_function': 'MultiClass', 
            'classes_count': NUM_CLASSES, 
            'iterations': 1000, 
            'random_seed': 2022, 
            'depth': 8, 
            'subsample': 0.8, 
            'leaf_estimation_iterations': 8,
            'reg_lambda': 0.5,
            'class_weights': class_weights,
            'early_stopping_rounds': 100 
        }
        model = CatBoostClassifier(**params)
        
        model.fit(x_train, 
                  y_train, 
                  eval_set=(x_val, y_val), 
                  verbose=100) 
        oof_pred[val_ind] = model.predict_proba(x_val) 
        y_pred += model.predict_proba(df_test[use_features]) / folds.n_splits
        
        score = f1_score(y_val, oof_pred[val_ind].argmax(axis=1), average='macro')
        print(f'F1 score: {score}')
        
        print("Features importance...")
        feat_imp = pd.DataFrame({'imp': model.feature_importances_, 'feature': use_features})
        print(feat_imp.sort_values(by='imp').reset_index(drop=True))
        
        del x_train, x_val, y_train, y_val
        gc.collect()
        
    return y_pred, oof_pred

In [24]:
y_pred, oof_pred = run_ctb(df_train, df_test, use_features)

Fold 1
0:	learn: 1.2166115	test: 1.2194828	best: 1.2194828 (0)	total: 45.8ms	remaining: 45.7s
100:	learn: 0.3366573	test: 0.6689908	best: 0.6532619 (47)	total: 4s	remaining: 35.6s


KeyboardInterrupt: 

In [None]:
target_df = df_train[['sn', 'fault_time', 'label']].copy()
oof_df = target_df.copy()
oof_df['label'] = oof_pred.argmax(axis=1)

def  macro_f1(target_df: pd.DataFrame,  submit_df: pd.DataFrame)  -> float:

    """
    计算得分
    :param target_df: [sn,fault_time,label]
    :param submit_df: [sn,fault_time,label]
    :return:
    """

    weights =  [3/7,  2/7,  1/7,  1/7]

    overall_df = target_df.merge(submit_df, how='left', on=['sn', 'fault_time'], suffixes=['_gt', '_pr'])
    overall_df.fillna(-1)

    macro_F1 =  0.
    for i in  range(len(weights)):
        TP =  len(overall_df[(overall_df['label_gt'] == i) & (overall_df['label_pr'] == i)])
        FP =  len(overall_df[(overall_df['label_gt'] != i) & (overall_df['label_pr'] == i)])
        FN =  len(overall_df[(overall_df['label_gt'] == i) & (overall_df['label_pr'] != i)])
        precision = TP /  (TP + FP)  if  (TP + FP)  >  0  else  0
        recall = TP /  (TP + FN)  if  (TP + FN)  >  0  else  0
        F1 =  2  * precision * recall /  (precision + recall)  if  (precision + recall)  >  0  else  0
        macro_F1 += weights[i]  * F1
    return macro_F1


macro_f1(target_df, oof_df)

In [None]:
sub = df_test[['sn', 'fault_time']].copy()
sub['label'] = y_pred.argmax(axis=1)
display(sub.head())
sub['label'].value_counts()

In [None]:
sub.to_csv('baseline2_gkf_sn.csv', index=False)