In [1]:
import gc
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier
import lightgbm as lgb
from tqdm import tqdm

In [2]:
# Read both label files and stack them into one frame with a fresh 0..n-1 index.
label1, label2 = map(
    pd.read_csv,
    (
        './data/preliminary_train_label_dataset.csv',
        './data/preliminary_train_label_dataset_s.csv',
    ),
)
label_df = pd.concat([label1, label2], ignore_index=True)

In [3]:
# train_log_df = pd.read_csv('./data/preliminary_sel_log_dataset.csv')
# test_log_df = pd.read_csv('./data/preliminary_sel_log_dataset_a.csv')

# log_df = pd.concat([train_log_df, test_log_df]).reset_index(drop=True)
log_df = pd.read_csv('./log_template_new.csv')

In [4]:
log_df.head()

Unnamed: 0,sn,time,msg,server_model,new_msg,template_id,template
0,SERVER_25698,2020-10-09 08:32:21,System Boot Initiated BIOS_Boot_Up | State As...,SM0,system boot initiated bios boot up | state ass...,1,system boot initiated <:*:> <:*:> <:*:> <:*:> ...
1,SERVER_25698,2020-10-09 07:43:48,System Boot Initiated BIOS_Boot_Up | State As...,SM0,system boot initiated bios boot up | state ass...,1,system boot initiated <:*:> <:*:> <:*:> <:*:> ...
2,SERVER_25698,2020-10-09 08:16:22,System Boot Initiated BIOS_Boot_Up | State As...,SM0,system boot initiated bios boot up | state ass...,1,system boot initiated <:*:> <:*:> <:*:> <:*:> ...
3,SERVER_25698,2020-10-09 05:46:41,System Boot Initiated BIOS_Boot_Up | State As...,SM0,system boot initiated bios boot up | state ass...,1,system boot initiated <:*:> <:*:> <:*:> <:*:> ...
4,SERVER_25698,2020-10-09 12:59:13,System Boot Initiated BIOS_Boot_Up | State As...,SM0,system boot initiated bios boot up | state ass...,1,system boot initiated <:*:> <:*:> <:*:> <:*:> ...


In [5]:
submit_df = pd.read_csv('./data/preliminary_submit_dataset_a.csv')

In [6]:
log_df.shape, label_df.shape, submit_df.shape

((493527, 7), (16669, 3), (3011, 2))

In [7]:
label_df = label_df.drop_duplicates().reset_index(drop=True)

In [8]:
# Parse the timestamps, then order the logs by server and time with a clean 0..n-1 index.
log_df = (
    log_df
    .assign(time=pd.to_datetime(log_df['time']))
    .sort_values(by=['sn', 'time'])
    .reset_index(drop=True)
)

In [9]:
# Parse the fault timestamps, then order the labels by server and fault time.
label_df = (
    label_df
    .assign(fault_time=pd.to_datetime(label_df['fault_time']))
    .sort_values(by=['sn', 'fault_time'])
    .reset_index(drop=True)
)

In [10]:
submit_df['fault_time'] = pd.to_datetime(submit_df['fault_time'])

In [11]:
log_df.shape, label_df.shape, submit_df.shape

((493527, 7), (16604, 3), (3011, 2))

In [15]:
# One document per server: all of its `new_msg` values joined by newlines and lower-cased.
tmp = log_df.groupby(['sn'], as_index=False)['new_msg'].agg(list)
tmp['text'] = tmp['new_msg'].apply(lambda msgs: "\n".join(msgs).lower())
sentences_list = tmp['text'].values.tolist()

# Whitespace-tokenised documents, the input format Word2Vec is trained on below.
sentences = [doc.split() for doc in sentences_list]

In [16]:
w2v_model = Word2Vec(sentences, vector_size=32, window=3, min_count=5, sg=0, hs=1, seed=2022)

In [17]:
def get_w2v_mean(sentences):
    """Average the Word2Vec vectors of the whitespace-separated tokens of a text.

    Parameters
    ----------
    sentences : str
        Text to embed. It is split on whitespace; tokens missing from
        `w2v_model.wv` are ignored.

    Returns
    -------
    list
        A one-element list. The element is the mean vector of the known
        tokens, or a list of `w2v_model.vector_size` zeros when no token is
        in the vocabulary (including empty text).
    """
    vec = [w2v_model.wv[w] for w in sentences.split() if w in w2v_model.wv]
    if vec:
        return [np.mean(vec, axis=0)]
    # Fallback for text without any known token. The previous code read
    # `model.vector_size` here, but no `model` exists in this notebook, so
    # this branch raised NameError instead of returning zeros.
    return [[0] * w2v_model.vector_size]

In [18]:
X = list(tmp['text'].values)
tfv = TfidfVectorizer(ngram_range=(1,3), min_df=5, max_features=50000)
tfv.fit(X)

TfidfVectorizer(max_features=50000, min_df=5, ngram_range=(1, 3))

In [19]:
X_tfidf = tfv.transform(X)
# TruncatedSVD's default solver is randomized; without a fixed seed the
# msg_tfv_* features change between runs. 2022 matches the Word2Vec seed above.
svd = TruncatedSVD(n_components=16, random_state=2022)
svd.fit(X_tfidf)

TruncatedSVD(n_components=16)

In [20]:
def get_tfidf_svd(sentences, n_components=16):
    """Return the column-wise mean of the SVD-reduced TF-IDF rows of `sentences`.

    `sentences` is an iterable of texts. Each one is vectorised with the
    module-level `tfv` and projected with the module-level `svd`; the
    projected rows are then averaged over axis 0.

    `n_components` is accepted but not used by the body; the output width is
    whatever `svd.transform` produces.
    """
    reduced = svd.transform(tfv.transform(sentences))
    return np.mean(reduced, axis=0)

In [21]:
log_df['time_ts'] = log_df["time"].values.astype(np.int64) // 10 ** 9
label_df['fault_time_ts'] = label_df["fault_time"].values.astype(np.int64) // 10 ** 9

In [27]:
# Integer code for each known log category; the code is the position of the
# name in the tuple below (0 for 'system' ... 32 for 'chassis').
cate_map = {
    name: code
    for code, name in enumerate((
        'system',
        'processor',
        'memory',
        'cable',
        'add-in',
        'button',
        'power',
        'drive',
        'battery',
        'physical',
        'slot',
        'management',
        'microcontroller',
        'terminator',
        'event',
        'temperature',
        'boot',
        'unknown',
        'critical',
        'os',
        'chip',
        'fan',
        'session',
        'oem',
        'watchdog',
        'device',
        'watchdog2',
        '<:HEX:>',
        'reserved',
        'version',
        'lan',
        'request',
        'chassis',
    ))
}

In [28]:
def safe_split(strs, n, sep='|'):
    """Return field `n` of `strs` split on `sep`, or '' if there are too few fields."""
    fields = strs.split(sep)
    if n + 1 > len(fields):
        return ''
    return fields[n]

# Keep the first three '|'-separated fields of each template as their own columns.
for part_no in range(3):
    log_df[f'msg_split_{part_no}'] = log_df['template'].apply(safe_split, args=(part_no,))

# The first whitespace-delimited token of the template is used as the category.
log_df['category'] = log_df['template'].apply(lambda tpl: tpl.split()[0])

In [29]:
def make_dataset(dataset, data_type='train', max_logs=50):
    """Build one feature record per row of `dataset` from that server's recent logs.

    For every row, the logs in the module-level `log_df` with the same `sn`
    and `time_ts <= fault_time_ts` are selected and ordered by `time_ts`; the
    last `max_logs` of them form the window the features are computed on.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs the columns `sn`, `fault_time` and `fault_time_ts`, plus `label`
        when `data_type == 'train'`.
    data_type : str
        'train' copies the row's `label` into the record; any other value
        leaves the label out.
    max_logs : int
        Number of most recent logs kept per row. The default of 50 is the
        value that was previously hard-coded.

    Returns
    -------
    list of dict
        One dict per input row, in input order, holding `sn`, `fault_time`,
        the count/uniqueness/timing features, `msg_w2v_*` and `msg_tfv_*`
        embedding components and, for training data, `label`.
    """
    # Embedding widths come from the fitted models instead of literal 32 / 16,
    # so changing the model configuration cannot silently truncate features.
    w2v_dim = w2v_model.vector_size
    tfv_dim = svd.n_components

    # Positional row numbers of every server's logs, computed once. Filtering
    # `log_df` with `log_df['sn'] == sn` inside the loop re-scanned the whole
    # table for each row of `dataset`.
    rows_by_sn = log_df.groupby('sn', sort=False).indices
    no_rows = np.empty(0, dtype=np.intp)

    ret = list()

    for idx, row in tqdm(dataset.iterrows(), total=len(dataset)):
        sn = row['sn']
        fault_time = row['fault_time']
        ts = row['fault_time_ts']

        if data_type == 'train':
            label = row['label']

        # Logs of this server up to the fault time, oldest first, last `max_logs` kept.
        df = log_df.iloc[rows_by_sn.get(sn, no_rows)]
        df = df[df['time_ts'] <= ts]
        df = df.sort_values(by='time_ts').reset_index(drop=True)
        df = df.tail(max_logs)

        logs_count = len(df)

        if logs_count > 0:
            msg_nunique = df['template'].nunique()
            msg_category_nunique = df['category'].nunique()
            msg_split_0_nunique = df['msg_split_0'].nunique()
            msg_split_1_nunique = df['msg_split_1'].nunique()
            msg_split_2_nunique = df['msg_split_2'].nunique()
            # NOTE(review): despite its name this is the MOST FREQUENT category
            # in the window (first entry of value_counts), not the category of
            # the last log. Kept as is because the model is trained on it.
            last_category = df['category'].value_counts().index[0]
            # Categories missing from cate_map share the extra code len(cate_map).
            last_category = cate_map[last_category] if last_category in cate_map else len(cate_map)

            times = df['time_ts'].values
            seconds_span = times[-1] - times[0]

            if logs_count > 1:
                # Gaps between consecutive logs, as floats like the former
                # shift-and-subtract computation produced.
                gaps = np.diff(times).astype(float)
                log_time_diffs_avg = np.mean(gaps)
                log_time_diffs_max = np.max(gaps)
                log_time_diffs_min = np.min(gaps)
                log_time_diffs_std = np.std(gaps)
            else:
                # A single log has no gap. The previous code ended up with NaN
                # for avg/max/min here (its first diff was NaN) and 0 for std;
                # the same values are now set explicitly. The old bare
                # `except:` fallback could never run and has been removed.
                log_time_diffs_avg = log_time_diffs_max = log_time_diffs_min = np.nan
                log_time_diffs_std = 0

            all_msg = "\n".join(df['new_msg'].values.tolist()).lower()
            w2v_emb = get_w2v_mean(all_msg)[0]
            tfv_emb = get_tfidf_svd([s.lower() for s in df['new_msg'].values.tolist()])

        else:
            # No log at or before the fault time: every feature is zero.
            msg_nunique = 0
            msg_category_nunique = 0
            msg_split_0_nunique = 0
            msg_split_1_nunique = 0
            msg_split_2_nunique = 0
            last_category = 0
            seconds_span = 0
            log_time_diffs_avg = 0
            log_time_diffs_max = 0
            log_time_diffs_min = 0
            log_time_diffs_std = 0
            w2v_emb = [0] * w2v_dim
            tfv_emb = [0] * tfv_dim

        # format dataset
        data = {
            'sn': sn,
            'fault_time': fault_time,
            'logs_count': logs_count,
            'msg_nunique': msg_nunique,
            'msg_category_nunique': msg_category_nunique,
            'msg_split_0_nunique': msg_split_0_nunique,
            'msg_split_1_nunique': msg_split_1_nunique,
            'msg_split_2_nunique': msg_split_2_nunique,
            'last_category': last_category,
            'seconds_span': seconds_span,
            'log_time_diffs_avg': log_time_diffs_avg,
            'log_time_diffs_max': log_time_diffs_max,
            'log_time_diffs_min': log_time_diffs_min,
            'log_time_diffs_std': log_time_diffs_std,
        }

        for i in range(w2v_dim):
            data[f'msg_w2v_{i}'] = w2v_emb[i]
        for i in range(tfv_dim):
            data[f'msg_tfv_{i}'] = tfv_emb[i]

        if data_type == 'train':
            data['label'] = label

        ret.append(data)

    return ret

In [30]:
train = make_dataset(label_df, data_type='train')
df_train = pd.DataFrame(train)

16604it [09:07, 30.34it/s]


In [31]:
submit_df['fault_time_ts'] = submit_df["fault_time"].values.astype(np.int64) // 10 ** 9
test = make_dataset(submit_df, data_type='test')
df_test = pd.DataFrame(test)

3011it [01:40, 29.91it/s]


In [32]:
label_df.shape, df_train.shape, submit_df.shape, df_test.shape

((16604, 4), (16604, 63), (3011, 3), (3011, 62))

In [33]:
df_train.head()

Unnamed: 0,sn,fault_time,logs_count,msg_nunique,msg_category_nunique,msg_split_0_nunique,msg_split_1_nunique,msg_split_2_nunique,last_category,seconds_span,...,msg_tfv_7,msg_tfv_8,msg_tfv_9,msg_tfv_10,msg_tfv_11,msg_tfv_12,msg_tfv_13,msg_tfv_14,msg_tfv_15,label
0,SERVER_10001,2020-05-01 10:04:00,9,4,3,4,1,1,1,660,...,-0.098684,-0.155633,-0.089223,0.168824,-0.068493,0.185579,-0.050645,0.01134,-0.007487,1
1,SERVER_10003,2020-03-28 09:48:00,50,1,1,1,1,1,2,72,...,0.009596,0.038074,-0.054012,-0.005263,-0.038151,0.067867,-0.011294,0.007414,0.052355,2
2,SERVER_10008,2020-02-25 16:12:00,5,3,2,3,1,1,1,38,...,-0.046816,-0.096398,0.044391,0.002898,-0.025588,-0.07006,-0.010874,-0.007836,-0.124227,1
3,SERVER_10008,2020-03-11 18:04:00,9,4,3,4,1,1,1,1299319,...,-0.016333,-0.104602,0.025963,0.013769,-0.012813,-0.052424,-0.001034,-0.007312,-0.05449,2
4,SERVER_10009,2020-05-08 16:37:00,4,2,1,2,1,1,7,21,...,-0.0144,-0.031011,0.056183,-0.007552,0.035564,0.000809,0.083856,0.560552,-0.027072,3


In [34]:
df_test.head()

Unnamed: 0,sn,fault_time,logs_count,msg_nunique,msg_category_nunique,msg_split_0_nunique,msg_split_1_nunique,msg_split_2_nunique,last_category,seconds_span,...,msg_tfv_6,msg_tfv_7,msg_tfv_8,msg_tfv_9,msg_tfv_10,msg_tfv_11,msg_tfv_12,msg_tfv_13,msg_tfv_14,msg_tfv_15
0,000d33b21436,2020-09-02 16:42:54,2,1,1,1,1,1,0,14863,...,-0.228361,0.161018,-0.107763,-0.136676,-0.02343,0.162639,-0.039877,0.121325,-0.003281,0.491803
1,005c5a9218ba,2020-06-28 19:05:16,10,3,2,3,1,1,2,867,...,-0.075177,-0.046408,-0.017672,0.083341,0.245686,0.345558,-0.122429,-0.271064,0.020521,0.100849
2,0079283bde6e,2020-04-26 21:32:44,1,1,1,1,1,1,6,0,...,0.033053,-0.027693,-0.11964,0.23306,-0.170246,0.044565,0.084041,-0.005454,-0.059084,0.049326
3,007bdf23b62f,2020-06-16 18:40:39,19,4,3,4,1,1,2,2477,...,0.012236,-0.029873,-0.019956,-0.05628,-0.001559,-0.027748,-0.017789,0.004808,0.000123,-0.001833
4,00a577a8e54f,2020-04-07 07:16:55,6,4,3,4,1,1,19,563,...,-0.116088,-0.111855,0.069101,0.194338,0.135794,-0.130125,-0.051303,0.052286,-0.00917,0.01438


In [35]:
log_df['time'] = pd.to_datetime(log_df['time'])
log_df['time_ts'] = log_df["time"].values.astype(np.int64) // 10 ** 9

In [36]:
dummy_list = set(log_df.template_id.unique())
dummy_col = ['template_id_' + str(x) for x in dummy_list]

In [37]:
def make_dataset2(dataset, data_type='train', max_logs=50):
    """Count, per row of `dataset`, how often each template id occurs in the recent logs.

    The log window is chosen as in `make_dataset`: logs in the module-level
    `log_df` with the same `sn` and `time_ts <= fault_time_ts`, ordered by
    `time_ts`, last `max_logs` kept.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs the columns `sn`, `fault_time` and `fault_time_ts`, plus `label`
        when `data_type == 'train'`.
    data_type : str
        'train' appends the row's `label` to each output row.
    max_logs : int
        Number of most recent logs kept per row (50 was previously hard-coded).

    Returns
    -------
    list of list
        `[sn, fault_time, <one count per entry of dummy_col>, (label)]` for
        every input row, in input order.

    Raises
    ------
    KeyError
        If a template id in the window has no matching name in `dummy_col`.
    """
    # Column name -> position, built once. The previous code rebuilt
    # np.array(dummy_col) and searched it linearly for every template id of
    # every row. setdefault keeps the first position, as np.where(...)[0][0] did.
    col_pos = {}
    for pos, name in enumerate(dummy_col):
        col_pos.setdefault(name, pos)

    # Positional row numbers of every server's logs, computed once instead of
    # re-scanning `log_df` for each row of `dataset`.
    rows_by_sn = log_df.groupby('sn', sort=False).indices
    no_rows = np.empty(0, dtype=np.intp)

    ret = []
    for idx, row in tqdm(dataset.iterrows(), total=len(dataset)):
        sn = row['sn']
        fault_time = row['fault_time']
        ts = row['fault_time_ts']

        if data_type == 'train':
            label = row['label']

        df = log_df.iloc[rows_by_sn.get(sn, no_rows)]
        df = df[df['time_ts'] <= ts]
        df = df.sort_values(by='time_ts').reset_index(drop=True)
        df = df.tail(max_logs)

        data = np.zeros(len(dummy_col))
        for template_id, count in df.groupby('template_id').size().items():
            data[col_pos['template_id_%s' % template_id]] = count

        if data_type == 'train':
            ret.append([sn, fault_time] + data.tolist() + [label])
        else:
            ret.append([sn, fault_time] + data.tolist())
    return ret

In [38]:
train2 = make_dataset2(label_df, data_type='train')
df_train2 = pd.DataFrame(train2)

16604it [08:25, 32.82it/s]


In [39]:
test2 = make_dataset2(submit_df, data_type='test')
df_test2 = pd.DataFrame(test2)

3011it [01:32, 32.50it/s]


In [40]:
df_train2.columns = ['sn', 'fault_time'] + dummy_col + ['label']

In [41]:
df_test2.columns = ['sn', 'fault_time'] + dummy_col

In [42]:
df_train.shape, df_train2.shape, df_test.shape, df_test2.shape

((16604, 63), (16604, 202), (3011, 62), (3011, 201))

In [43]:
df_train = df_train.merge(df_train2, on=['sn', 'fault_time', 'label'])

In [44]:
df_test = df_test.merge(df_test2, on=['sn', 'fault_time'])

In [45]:
df_train.shape, df_train2.shape, df_test.shape, df_test2.shape

((16604, 262), (16604, 202), (3011, 261), (3011, 201))

In [53]:
# tr_proba_df = pd.read_csv('./tr_proba_df.csv')

In [54]:
# te_proba_df = pd.read_csv('./te_proba_df.csv')

In [55]:
# df_train = pd.concat([df_train, tr_proba_df], axis=1)
# df_test = pd.concat([df_test, te_proba_df], axis=1)

In [56]:
# df_train.shape, df_train2.shape, df_test.shape, df_test2.shape

In [46]:
classes = np.unique(df_train['label'])
weights = compute_class_weight(class_weight='balanced', classes=classes, y=df_train['label'])
class_weights = dict(zip(classes, weights))

In [47]:
df_train.shape, df_test.shape

((16604, 262), (3011, 261))

In [59]:
df_train.to_csv('train.csv', index=False)
df_test.to_csv('test.csv', index=False)

In [60]:
# df_train = df_train.iloc[:, :-12]
# df_test = df_test.iloc[:, :-12]

In [1]:
import gc
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier
import lightgbm as lgb
from tqdm import tqdm

import optuna
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
df_train.shape, df_test.shape

((16604, 267), (3011, 266))

In [4]:
# df_train = df_train.iloc[:, :-12]
# df_test = df_test.iloc[:, :-12]

In [5]:
tr_proba_df = pd.read_csv('./tr_proba_df.csv')
te_proba_df = pd.read_csv('./te_proba_df.csv')

In [6]:
df_train = pd.concat([df_train, tr_proba_df], axis=1)
df_test = pd.concat([df_test, te_proba_df], axis=1)

In [7]:
# df_train2 = pd.read_csv('./nezha-base-count3/pretrain/bert_train.csv')
# df_test2 = pd.read_csv('./nezha-base-count3/pretrain/bert_test.csv')

In [8]:
# df_train = df_train.merge(df_train2, how='left', on=['sn', 'fault_time'])
# df_test = df_test.merge(df_test2, how='left', on=['sn', 'fault_time'])

In [9]:
use_features = [col for col in df_train.columns if col not in ['sn', 'fault_time', 'label']]

X = df_train[use_features].values
y = df_train['label'].values

In [10]:
# import optuna  # pip install optuna
# from optuna.integration import LightGBMPruningCallback

# def macro_f1(y_true, y_pred) -> float:
#     """
#     计算得分
#     :param target_df: [sn,fault_time,label]
#     :param submit_df: [sn,fault_time,label]
#     :return:
#     """
#     weights =  [3  /  7,  2  /  7,  1  /  7,  1  /  7]
#     overall_df = pd.DataFrame([y_true, y_pred]).T
#     overall_df.columns = ['label_gt', 'label_pr']

#     macro_F1 =  0.
#     for i in  range(len(weights)):
#         TP =  len(overall_df[(overall_df['label_gt'] == i) & (overall_df['label_pr'] == i)])
#         FP =  len(overall_df[(overall_df['label_gt'] != i) & (overall_df['label_pr'] == i)])
#         FN =  len(overall_df[(overall_df['label_gt'] == i) & (overall_df['label_pr'] != i)])
#         precision = TP /  (TP + FP)  if  (TP + FP)  >  0  else  0
#         recall = TP /  (TP + FN)  if  (TP + FP)  >  0  else  0
#         F1 =  2  * precision * recall /  (precision + recall)  if  (precision + recall)  >  0  else  0
#         macro_F1 += weights[i]  * F1
#     return macro_F1

# def objective(trial, X, y):
#     # 参数网格
#     param_grid = {
#         "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
#         "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
#         "max_depth": trial.suggest_int("max_depth", 3, 12),
#         "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
#         "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
#         "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
#         "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
#         "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.95, step=0.1),
#         "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
#         "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.95, step=0.1),
#         "random_state": 2021,
#     }
#     # 5折交叉验证
#     cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1121218)
    
#     cv_scores = np.empty(5)
#     for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
#         X_train, X_test = X[train_idx], X[test_idx]
#         y_train, y_test = y[train_idx], y[test_idx]

#         # LGBM建模
#         model = lgb.LGBMClassifier(objective="multi_class", **param_grid)
#         model.fit(
#             X_train,
#             y_train,
#             eval_set=[(X_test, y_test)],
#             eval_metric="multi_logloss",
#             early_stopping_rounds=100,
#             callbacks=[
#                 LightGBMPruningCallback(trial, "multi_logloss")
#             ],
#         )
#         # 模型预测
#         preds = model.predict_proba(X_test)
#         # 优化指标logloss最小
#         print(macro_f1(y_test, np.argmax(preds, axis=1)))
#         cv_scores[idx] = 1 - macro_f1(y_test, np.argmax(preds, axis=1))
        
#     return np.mean(cv_scores)

In [11]:
# study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")
# func = lambda trial: objective(trial, X, y)
# study.optimize(func, n_trials=200)

In [12]:
# print(f"\tBest value (rmse): {study.best_value:.5f}")
# print(f"\tBest params:")

# for key, value in study.best_params.items():
#     print(f"\t\t{key}: {value}")

In [13]:
# 'balanced' weight per label value, keyed by label; passed to CatBoost in run_ctb.
classes = np.unique(df_train['label'])
weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=df_train['label'],
)
class_weights = {cls: weight for cls, weight in zip(classes, weights)}

In [14]:
def macro_f1(y_true, y_pred) -> float:
    """Compute the class-weighted F1 score over labels 0..3.

    The per-class F1 values are combined with the fixed weights
    3/7, 2/7, 1/7 and 1/7 for labels 0, 1, 2 and 3.

    :param y_true: ground-truth labels (list, array or Series), compared by position
    :param y_pred: predicted labels, same length as ``y_true``
    :return: weighted sum of the four per-class F1 scores
    :raises ValueError: if the two inputs do not have the same shape
    """
    weights = [3 / 7, 2 / 7, 1 / 7, 1 / 7]
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )

    macro_F1 = 0.
    for i in range(len(weights)):
        is_true = y_true == i
        is_pred = y_pred == i
        TP = int(np.sum(is_true & is_pred))
        FP = int(np.sum(~is_true & is_pred))
        FN = int(np.sum(is_true & ~is_pred))
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        # Recall is guarded by its own denominator. The previous check used
        # (TP + FP), which divided by zero whenever a class was predicted but
        # never occurred in y_true.
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        F1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        macro_F1 += weights[i] * F1
    return macro_F1

In [15]:
NUM_CLASSES = df_train['label'].nunique()
FOLDS = 10
TARGET = 'label'
use_features = [col for col in df_train.columns if col not in ['sn', 'fault_time', TARGET]]

def run_ctb(df_train, df_test, use_features):
    """Train one CatBoost classifier per GroupKFold split and collect its predictions.

    Folds are grouped by `sn`, so all rows of one server fall in the same fold.
    The module-level NUM_CLASSES, FOLDS, TARGET and class_weights are used.
    For every fold the macro F1 on the validation rows is printed and the
    feature importances are written to '<fold>_imp.csv' and printed.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows; must contain `use_features`, the TARGET column and `sn`.
    df_test : pandas.DataFrame
        Rows to predict; must contain `use_features`.
    use_features : list of str
        Names of the feature columns.

    Returns
    -------
    (y_pred, oof_pred)
        `y_pred` is the test-set class probability averaged over the folds;
        `oof_pred` holds, for every training row, the probabilities from the
        fold in which that row was held out.
    """
    oof_pred = np.zeros((len(df_train), NUM_CLASSES))
    y_pred = np.zeros((len(df_test), NUM_CLASSES))

    feature_frame = df_train[use_features]
    label_series = df_train[TARGET]

    folds = GroupKFold(n_splits=FOLDS)
    splits = folds.split(df_train, label_series, df_train['sn'])
    for fold, (tr_ind, val_ind) in enumerate(splits):
        print(f'Fold {fold + 1}')
        x_train, y_train = feature_frame.iloc[tr_ind], label_series.iloc[tr_ind]
        x_val, y_val = feature_frame.iloc[val_ind], label_series.iloc[val_ind]

        model = CatBoostClassifier(
            task_type='GPU',
            bootstrap_type='Bernoulli',
            learning_rate=0.05,
            eval_metric='MultiClass',
            loss_function='MultiClass',
            classes_count=NUM_CLASSES,
            iterations=10000,
            random_seed=2022,
            depth=8,
            subsample=0.8,
            leaf_estimation_iterations=8,
            reg_lambda=0.5,
            class_weights=class_weights,
            early_stopping_rounds=100,
        )
        model.fit(x_train, y_train, eval_set=(x_val, y_val), verbose=100)

        # Held-out probabilities for this fold; test probabilities are averaged over folds.
        val_proba = model.predict_proba(x_val)
        oof_pred[val_ind] = val_proba
        y_pred += model.predict_proba(df_test[use_features]) / folds.n_splits

        score = f1_score(y_val, val_proba.argmax(axis=1), average='macro')
        print(f'F1 score: {score}')

        print("Features importance...")
        feat_imp = pd.DataFrame({'imp': model.feature_importances_, 'feature': use_features})
        ranked_imp = feat_imp.sort_values(by='imp')
        ranked_imp.to_csv('%d_imp.csv'%fold, index=False)
        print(ranked_imp.reset_index(drop=True))

        # Release the per-fold slices before the next split.
        del x_train, x_val, y_train, y_val
        gc.collect()

    return y_pred, oof_pred

In [16]:
y_pred, oof_pred = run_ctb(df_train, df_test, use_features)

Fold 1
0:	learn: 1.3071505	test: 1.3066389	best: 1.3066389 (0)	total: 46.6ms	remaining: 7m 46s
100:	learn: 0.4008818	test: 0.5684082	best: 0.5684082 (100)	total: 4.25s	remaining: 6m 56s
200:	learn: 0.2882983	test: 0.5469100	best: 0.5454677 (191)	total: 8.22s	remaining: 6m 40s
bestTest = 0.5454676554
bestIteration = 191
Shrink model to first 192 iterations.
F1 score: 0.7627130172339165
Features importance...
          imp          feature
0    0.000000  template_id_204
1    0.000000   template_id_49
2    0.000000   template_id_48
3    0.000000  template_id_122
4    0.000000   template_id_45
..        ...              ...
271  3.163952        msg_tfv_5
272  3.231562       msg_w2v_28
273  3.838391    last_category
274  4.261104   template_id_78
275  4.781632       msg_w2v_30

[276 rows x 2 columns]
Fold 2
0:	learn: 1.3113620	test: 1.3150125	best: 1.3150125 (0)	total: 43.5ms	remaining: 7m 14s
100:	learn: 0.4012031	test: 0.5465361	best: 0.5465361 (100)	total: 4.26s	remaining: 6m 57s
200:	le

bestTest = 0.546430372
bestIteration = 253
Shrink model to first 254 iterations.
F1 score: 0.7623080557700848
Features importance...
          imp         feature
0    0.000000  template_id_73
1    0.000000  template_id_65
2    0.000000  template_id_67
3    0.000000  template_id_68
4    0.000000  template_id_69
..        ...             ...
271  2.648899  template_id_28
272  3.809597  template_id_78
273  4.101997      msg_w2v_28
274  4.111339   last_category
275  5.073246      msg_w2v_30

[276 rows x 2 columns]


In [18]:
print(macro_f1(df_train[TARGET], np.argmax(oof_pred, axis=1)))

0.7105531465354477


In [19]:
submit_df = pd.read_csv('./data/preliminary_submit_dataset_a.csv')

In [20]:
sub = submit_df[['sn', 'fault_time']].copy()
sub['label'] = y_pred.argmax(axis=1)
display(sub.head())
sub['label'].value_counts()

Unnamed: 0,sn,fault_time,label
0,000d33b21436,2020-09-02 16:42:54,3
1,005c5a9218ba,2020-06-28 19:05:16,2
2,0079283bde6e,2020-04-26 21:32:44,3
3,007bdf23b62f,2020-06-16 18:40:39,2
4,00a577a8e54f,2020-04-07 07:16:55,2


2    1629
1     702
3     567
0     113
Name: label, dtype: int64

In [21]:
sub.to_csv('baseline3_gkf_sn.csv', index=False)

In [24]:
label1 = pd.read_csv('./data/preliminary_train_label_dataset.csv')
label2 = pd.read_csv('./data/preliminary_train_label_dataset_s.csv')
label_df = pd.concat([label1, label2]).reset_index(drop=True)
label_df['label'].value_counts()

2    9343
1    3387
3    2463
0    1476
Name: label, dtype: int64

In [47]:
0.6132878338098

0.6132878338098