In [1]:
import gc
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier
import lightgbm as lgb
from tqdm import tqdm

import optuna
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Feature tables 1-5 for train and test. The ID-like columns of tables 1 and 2
# are read as strings; tables 3-5 use pandas' inferred dtypes.
df_train1 = pd.read_csv(
    'train1.csv',
    dtype=dict.fromkeys(['server_model', 'last_msg_id', 'last_template_id'], 'str'),
)
df_test1 = pd.read_csv(
    'test1.csv',
    dtype=dict.fromkeys(['server_model', 'last_msg_id', 'last_template_id'], 'str'),
)

df_train2 = pd.read_csv(
    'train2.csv',
    dtype=dict.fromkeys(['appearance_1', 'appearance_2', 'appearance_3'], 'str'),
)
df_test2 = pd.read_csv(
    'test2.csv',
    dtype=dict.fromkeys(['appearance_1', 'appearance_2', 'appearance_3'], 'str'),
)

df_train3 = pd.read_csv('train3.csv')
df_test3 = pd.read_csv('test3.csv')

df_train4 = pd.read_csv('train4.csv')
df_test4 = pd.read_csv('test4.csv')

df_train5 = pd.read_csv('train5.csv')
df_test5 = pd.read_csv('test5.csv')

# Join the five tables on their shared keys (default inner join). The train
# tables additionally carry the label column as part of the key.
df_train = (
    df_train1
    .merge(df_train2, on=['sn', 'fault_time', 'label'])
    .merge(df_train3, on=['sn', 'fault_time', 'label'])
    .merge(df_train4, on=['sn', 'fault_time', 'label'])
    .merge(df_train5, on=['sn', 'fault_time', 'label'])
)
df_test = (
    df_test1
    .merge(df_test2, on=['sn', 'fault_time'])
    .merge(df_test3, on=['sn', 'fault_time'])
    .merge(df_test4, on=['sn', 'fault_time'])
    .merge(df_test5, on=['sn', 'fault_time'])
)

In [3]:
bert_train = pd.read_csv('../bert/train.csv')
bert_test = pd.read_csv('../bert/test.csv')

# pd.concat(axis=1) aligns rows on the index and outer-joins them, so a
# row-count mismatch would silently produce NaN-padded rows instead of failing.
assert len(bert_train) == len(df_train), (
    f'bert_train has {len(bert_train)} rows but df_train has {len(df_train)}'
)
assert len(bert_test) == len(df_test), (
    f'bert_test has {len(bert_test)} rows but df_test has {len(df_test)}'
)

# Append the BERT columns from position 9 onward as extra features.
# NOTE(review): presumably the first 9 columns are keys/metadata already
# present in df_train / df_test -- confirm against the BERT export.
# NOTE: this cell is not idempotent; re-running it appends the columns again.
df_train = pd.concat([df_train, bert_train.iloc[:, 9:]], axis=1)
df_test = pd.concat([df_test, bert_test.iloc[:, 9:]], axis=1)

In [4]:
# Row/column counts of the assembled train and test frames.
df_train.shape, df_test.shape

((16604, 487), (3011, 486))

In [22]:
from autox.autox_competition.feature_selection import AdversarialValidation

In [26]:
# Adversarial validation over the model features, with `sn` as the id column
# and `label` as the target.
# NOTE: `use_features` is assigned in a cell that appears further down this
# notebook, so that cell must have been executed before this one.
adversarialValidation = AdversarialValidation()
adversarialValidation.fit(
    train=df_train[['sn', 'label', *use_features]],
    test=df_test[['sn', *use_features]],
    id_=['sn'],
    target='label',
)

   INFO ->  used_features: ['logs_count', 'msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'seconds_span', 'log_time_diffs_avg', 'log_time_diffs_max', 'log_time_diffs_min', 'log_time_diffs_std', 'msg_w2v_0', 'msg_w2v_1', 'msg_w2v_2', 'msg_w2v_3', 'msg_w2v_4', 'msg_w2v_5', 'msg_w2v_6', 'msg_w2v_7', 'msg_w2v_8', 'msg_w2v_9', 'msg_w2v_10', 'msg_w2v_11', 'msg_w2v_12', 'msg_w2v_13', 'msg_w2v_14', 'msg_w2v_15', 'msg_w2v_16', 'msg_w2v_17', 'msg_w2v_18', 'msg_w2v_19', 'msg_w2v_20', 'msg_w2v_21', 'msg_w2v_22', 'msg_w2v_23', 'msg_w2v_24', 'msg_w2v_25', 'msg_w2v_26', 'msg_w2v_27', 'msg_w2v_28', 'msg_w2v_29', 'msg_w2v_30', 'msg_w2v_31', 'msg_w2v_32', 'msg_w2v_33', 'msg_w2v_34', 'msg_w2v_35', 'msg_w2v_36', 'msg_w2v_37', 'msg_w2v_38', 'msg_w2v_39', 'msg_w2v_40', 'msg_w2v_41', 'msg_w2v_42', 'msg_w2v_43', 'msg_w2v_44', 'msg_w2v_45', 'msg_w2v_46', 'msg_w2v_47', 'msg_w2v_48', 'msg_w2v_49', 'msg_w2v_50', 'msg_w2v_51', 'msg_w2v_52

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.9285108286391208
   INFO ->  Fold 1 finished in 0:00:06.301209
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.9171390190691661
   INFO ->  Fold 2 finished in 0:00:06.295649
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.9321355072029411
   INFO ->  Fold 3 finished in 0:00:06.286834
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.9263895341790562
   INFO ->  Fold 4 finished in 0:00:06.287544
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.9407907750404604
   INFO ->  Fold 5 finished in 0:00:06.244381
   INFO ->  [0.9285108286391208, 0.9171390190691661, 0.9321355072029411, 0.9263895341790562, 0.9407907750404604]
   INFO ->  Mean AUC: 0.9289931328261488
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'seconds_span', 'log_time_di

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.92983061511389
   INFO ->  Fold 1 finished in 0:00:06.280748
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.9085498141564319
   INFO ->  Fold 2 finished in 0:00:06.306962
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.9277947611340887
   INFO ->  Fold 3 finished in 0:00:06.297985
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.922027007918552
   INFO ->  Fold 4 finished in 0:00:06.348925
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.9333575595916002
   INFO ->  Fold 5 finished in 0:00:06.319637
   INFO ->  [0.92983061511389, 0.9085498141564319, 0.9277947611340887, 0.922027007918552, 0.9333575595916002]
   INFO ->  Mean AUC: 0.9243119515829126
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'seconds_span', 'log_time_diffs_av

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.9255846875276492
   INFO ->  Fold 1 finished in 0:00:06.294218
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.9092262746444731
   INFO ->  Fold 2 finished in 0:00:06.361212
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.9271751183839837
   INFO ->  Fold 3 finished in 0:00:06.287750
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.9171433116515837
   INFO ->  Fold 4 finished in 0:00:06.354040
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.9340910907310835
   INFO ->  Fold 5 finished in 0:00:06.277852
   INFO ->  [0.9255846875276492, 0.9092262746444731, 0.9271751183839837, 0.9171433116515837, 0.9340910907310835]
   INFO ->  Mean AUC: 0.9226440965877547
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'seconds_span', 'log_time_di

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.9219259051719901
   INFO ->  Fold 1 finished in 0:00:06.284342
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.9062277795733678
   INFO ->  Fold 2 finished in 0:00:06.335599
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.9265764550740274
   INFO ->  Fold 3 finished in 0:00:06.308517
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.9203332053975436
   INFO ->  Fold 4 finished in 0:00:06.324907
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.9339374912585666
   INFO ->  Fold 5 finished in 0:00:06.295025
   INFO ->  [0.9219259051719901, 0.9062277795733678, 0.9265764550740274, 0.9203332053975436, 0.9339374912585666]
   INFO ->  Mean AUC: 0.9218001672950992
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'seconds_span', 'log_time_di

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.9275012042744369
   INFO ->  Fold 1 finished in 0:00:06.300070
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.9071532098416291
   INFO ->  Fold 2 finished in 0:00:06.273325
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.930421686746988
   INFO ->  Fold 3 finished in 0:00:06.301963
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.9197395664996768
   INFO ->  Fold 4 finished in 0:00:06.302069
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.9316619712681572
   INFO ->  Fold 5 finished in 0:00:06.237308
   INFO ->  [0.9275012042744369, 0.9071532098416291, 0.930421686746988, 0.9197395664996768, 0.9316619712681572]
   INFO ->  Mean AUC: 0.9232955277261776
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'seconds_span', 'log_time_diff

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.9261157971313692
   INFO ->  Fold 1 finished in 0:00:06.244792
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.9063302965416935
   INFO ->  Fold 2 finished in 0:00:06.362571
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.9290200603408659
   INFO ->  Fold 3 finished in 0:00:06.383287
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.9193888372656754
   INFO ->  Fold 4 finished in 0:00:06.384296
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.932701452576475
   INFO ->  Fold 5 finished in 0:00:06.246974
   INFO ->  [0.9261157971313692, 0.9063302965416935, 0.9290200603408659, 0.9193888372656754, 0.932701452576475]
   INFO ->  Mean AUC: 0.9227112887712158
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'seconds_span', 'log_time_diff

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.9295654289674699
   INFO ->  Fold 1 finished in 0:00:06.229645
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.9043877767453136
   INFO ->  Fold 2 finished in 0:00:06.279089
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.9271389038742032
   INFO ->  Fold 3 finished in 0:00:06.240274
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.9182707457983195
   INFO ->  Fold 4 finished in 0:00:06.228281
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.932327818737637
   INFO ->  Fold 5 finished in 0:00:06.225491
   INFO ->  [0.9295654289674699, 0.9043877767453136, 0.9271389038742032, 0.9182707457983195, 0.932327818737637]
   INFO ->  Mean AUC: 0.9223381348245885
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'seconds_span', 'log_time_diff

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.9241599571376609
   INFO ->  Fold 1 finished in 0:00:06.226506
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.907709730526826
   INFO ->  Fold 2 finished in 0:00:06.222900
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.9279665927391159
   INFO ->  Fold 3 finished in 0:00:06.244726
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.9190646210407241
   INFO ->  Fold 4 finished in 0:00:06.210716
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.9341070750664349
   INFO ->  Fold 5 finished in 0:00:06.225380
   INFO ->  [0.9241599571376609, 0.907709730526826, 0.9279665927391159, 0.9190646210407241, 0.9341070750664349]
   INFO ->  Mean AUC: 0.9226015953021524
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'seconds_span', 'log_time_diff

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.925798999223366
   INFO ->  Fold 1 finished in 0:00:06.212660
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.9023351648351648
   INFO ->  Fold 2 finished in 0:00:06.280793
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.9298305160942275
   INFO ->  Fold 3 finished in 0:00:06.266186
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.916230001616031
   INFO ->  Fold 4 finished in 0:00:06.199852
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.9301177346200724
   INFO ->  Fold 5 finished in 0:00:06.220907
   INFO ->  [0.925798999223366, 0.9023351648351648, 0.9298305160942275, 0.916230001616031, 0.9301177346200724]
   INFO ->  Mean AUC: 0.9208624832777723
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'seconds_span', 'log_time_diffs_

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.9251322244177701
   INFO ->  Fold 1 finished in 0:00:06.277205
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.9043011675824176
   INFO ->  Fold 2 finished in 0:00:06.261388
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.9293132729924674
   INFO ->  Fold 3 finished in 0:00:06.225750
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.9186113748383968
   INFO ->  Fold 4 finished in 0:00:06.214737
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.9309409278906671
   INFO ->  Fold 5 finished in 0:00:06.243971
   INFO ->  [0.9251322244177701, 0.9043011675824176, 0.9293132729924674, 0.9186113748383968, 0.9309409278906671]
   INFO ->  Mean AUC: 0.9216597935443438
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'seconds_span', 'log_time_di

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.9224368616116634
   INFO ->  Fold 1 finished in 0:00:06.223998
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.9061394028765353
   INFO ->  Fold 2 finished in 0:00:06.245049
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.9269403484585108
   INFO ->  Fold 3 finished in 0:00:06.259372
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.9179018362152553
   INFO ->  Fold 4 finished in 0:00:06.194885
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.9313078183380288
   INFO ->  Fold 5 finished in 0:00:06.223543
   INFO ->  [0.9224368616116634, 0.9061394028765353, 0.9269403484585108, 0.9179018362152553, 0.9313078183380288]
   INFO ->  Mean AUC: 0.9209452534999987
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'seconds_span', 'log_time_di

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.92437746384719
   INFO ->  Fold 1 finished in 0:00:06.182969
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.9066933985132515
   INFO ->  Fold 2 finished in 0:00:06.214520
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.9289269016363962
   INFO ->  Fold 3 finished in 0:00:06.230285
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.91573357708468
   INFO ->  Fold 4 finished in 0:00:06.202662
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.9311022697756199
   INFO ->  Fold 5 finished in 0:00:06.243166
   INFO ->  [0.92437746384719, 0.9066933985132515, 0.9289269016363962, 0.91573357708468, 0.9311022697756199]
   INFO ->  Mean AUC: 0.9213667221714275
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'seconds_span', 'log_time_diffs_avg'

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.9251971077751889
   INFO ->  Fold 1 finished in 0:00:06.202117
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.9054505696509374
   INFO ->  Fold 2 finished in 0:00:06.230406
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.929711382844812
   INFO ->  Fold 3 finished in 0:00:06.219073
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.9146185156755009
   INFO ->  Fold 4 finished in 0:00:06.228362
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.929578513057204
   INFO ->  Fold 5 finished in 0:00:06.264011
   INFO ->  [0.9251971077751889, 0.9054505696509374, 0.929711382844812, 0.9146185156755009, 0.929578513057204]
   INFO ->  Mean AUC: 0.9209112178007286
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'seconds_span', 'log_time_diffs_

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.9219534314448343
   INFO ->  Fold 1 finished in 0:00:06.173183
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.9050793875242404
   INFO ->  Fold 2 finished in 0:00:06.227992
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.9276181841795041
   INFO ->  Fold 3 finished in 0:00:06.196453
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.9156924187944409
   INFO ->  Fold 4 finished in 0:00:06.238424
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.9341859977222321
   INFO ->  Fold 5 finished in 0:00:06.174606
   INFO ->  [0.9219534314448343, 0.9050793875242404, 0.9276181841795041, 0.9156924187944409, 0.9341859977222321]
   INFO ->  Mean AUC: 0.9209058839330504
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'log_time_diffs_avg', 'log_t

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.9237723773852007
   INFO ->  Fold 1 finished in 0:00:06.180208
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.9018195499353587
   INFO ->  Fold 2 finished in 0:00:06.268322
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.9273234729964634
   INFO ->  Fold 3 finished in 0:00:06.190826
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.9139337225274726
   INFO ->  Fold 4 finished in 0:00:06.210099
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.9341030789825971
   INFO ->  Fold 5 finished in 0:00:06.238531
   INFO ->  [0.9237723773852007, 0.9018195499353587, 0.9273234729964634, 0.9139337225274726, 0.9341030789825971]
   INFO ->  Mean AUC: 0.9201904403654184
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'log_time_diffs_avg', 'log_t

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.9204879031861661
   INFO ->  Fold 1 finished in 0:00:06.171579
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.9026520584195218
   INFO ->  Fold 2 finished in 0:00:06.192061
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.926601430598014
   INFO ->  Fold 3 finished in 0:00:06.183811
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.9149427319004525
   INFO ->  Fold 4 finished in 0:00:06.204684
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.9332179464125157
   INFO ->  Fold 5 finished in 0:00:06.216410
   INFO ->  [0.9204879031861661, 0.9026520584195218, 0.926601430598014, 0.9149427319004525, 0.9332179464125157]
   INFO ->  Mean AUC: 0.919580414103334
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'log_time_diffs_avg', 'log_time

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.9227244128547694
   INFO ->  Fold 1 finished in 0:00:06.161740
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.9071926005979315
   INFO ->  Fold 2 finished in 0:00:06.163958
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.9267924933565106
   INFO ->  Fold 3 finished in 0:00:06.168007
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.9168110152714932
   INFO ->  Fold 4 finished in 0:00:06.212791
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.930637974784711
   INFO ->  Fold 5 finished in 0:00:06.213708
   INFO ->  [0.9227244128547694, 0.9071926005979315, 0.9267924933565106, 0.9168110152714932, 0.930637974784711]
   INFO ->  Mean AUC: 0.9208316993730831
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'log_time_diffs_avg', 'log_tim

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.922498795725563
   INFO ->  Fold 1 finished in 0:00:06.204304
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.905732112556561
   INFO ->  Fold 2 finished in 0:00:06.210195
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.9278714359927271
   INFO ->  Fold 3 finished in 0:00:06.197703
   INFO ->  Training on fold 4
   INFO ->  AUC: 0.9144372171945702
   INFO ->  Fold 4 finished in 0:00:06.139573
   INFO ->  Training on fold 5
   INFO ->  AUC: 0.9306409718475894
   INFO ->  Fold 5 finished in 0:00:06.174066
   INFO ->  [0.922498795725563, 0.905732112556561, 0.9278714359927271, 0.9144372171945702, 0.9306409718475894]
   INFO ->  Mean AUC: 0.9202361066634023
   INFO ->  ##################################################
   INFO ->  used_features: ['msg_nunique', 'msg_category_nunique', 'msg_split_0_nunique', 'msg_split_1_nunique', 'msg_split_2_nunique', 'last_category', 'log_time_diffs_avg', 'log_time_

   INFO ->  categorical_features: []
   INFO ->  Training on fold 1
   INFO ->  AUC: 0.9249488797790034
   INFO ->  Fold 1 finished in 0:00:06.131633
   INFO ->  Training on fold 2
   INFO ->  AUC: 0.9030946994182288
   INFO ->  Fold 2 finished in 0:00:06.181031
   INFO ->  Training on fold 3
   INFO ->  AUC: 0.9252112929329257
   INFO ->  Fold 3 finished in 0:00:06.179275
   INFO ->  Training on fold 4


KeyboardInterrupt: 

In [27]:
# Features the validator reports as removed (the fit run above ended with a
# KeyboardInterrupt, so this reflects its state at that point).
adversarialValidation.removed_features

['logs_count',
 '20',
 'msg_w2v_12',
 'msg_w2v_0',
 'msg_w2v_10',
 'msg_w2v_29',
 'msg_w2v_8',
 'msg_w2v_24',
 'msg_w2v_19',
 'msg_w2v_25',
 'msg_w2v_53',
 'msg_w2v_23',
 'msg_w2v_48',
 'seconds_span',
 'msg_w2v_21',
 'msg_w2v_46',
 'msg_w2v_49',
 'msg_w2v_38']

In [5]:
# 'balanced' class weights computed from the training labels, keyed by class
# label.
classes = np.unique(df_train['label'])
weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=df_train['label'],
)
class_weights = {label: weight for label, weight in zip(classes, weights)}

In [6]:
def macro_f1(y_true, y_pred) -> float:
    """
    Compute the class-weighted macro F1 score.

    A per-class F1 is computed for the labels 0, 1, 2 and 3 and the four values
    are combined with the fixed weights 3/7, 2/7, 1/7 and 1/7 respectively.
    Labels are compared position by position; any pandas index on the inputs
    is ignored.

    :param y_true: ground-truth labels (array-like)
    :param y_pred: predicted labels, same length and order as ``y_true``
    :return: weighted sum of the per-class F1 scores (0.0 for empty input)
    :raises ValueError: if ``y_true`` and ``y_pred`` differ in length
    """
    weights = [3 / 7, 2 / 7, 1 / 7, 1 / 7]
    # Going through numpy keeps the comparison positional; building a DataFrame
    # from a Series would align rows by index label instead.
    label_gt = np.asarray(y_true).ravel()
    label_pr = np.asarray(y_pred).ravel()
    if label_gt.shape != label_pr.shape:
        raise ValueError(
            f'y_true and y_pred must have the same length, '
            f'got {label_gt.size} and {label_pr.size}'
        )

    macro_F1 = 0.
    for i, weight in enumerate(weights):
        is_gt = label_gt == i
        is_pr = label_pr == i
        TP = int(np.sum(is_gt & is_pr))
        FP = int(np.sum(~is_gt & is_pr))
        FN = int(np.sum(is_gt & ~is_pr))
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        # Guard on the recall denominator (TP + FN). Guarding on TP + FP would
        # divide by zero for a class that is predicted but never present in
        # the ground truth.
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        F1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        macro_F1 += weight * F1
    return macro_F1

In [7]:
# Inspect the per-class weights.
class_weights

{0: 2.812330623306233,
 1: 1.2255683495718925,
 2: 0.4453862660944206,
 3: 1.714580751755473}

In [10]:
# Cross-validation / modelling configuration.
TARGET = 'label'
FOLDS = 10
NUM_CLASSES = df_train[TARGET].nunique()

# Model inputs: every column except the two identifiers and the target.
use_features = [col for col in df_train.columns if col not in {'sn', 'fault_time', TARGET}]

# Columns converted to the pandas 'category' dtype in both frames.
categorical_feats = [
    'server_model', 'last_msg_id', 'last_template_id',
    'appearance_1', 'appearance_2', 'appearance_3',
]
for c in categorical_feats:
    df_train[c], df_test[c] = df_train[c].astype('category'), df_test[c].astype('category')

def run_ctb(df_train, df_test, use_features):
    """
    Fit one LightGBM classifier per GroupKFold split and gather predictions.

    Despite the ``ctb`` in its name, the model trained here is
    ``lgb.LGBMClassifier``. Splits are grouped by the ``sn`` column. The
    function reads the notebook-level names TARGET, FOLDS, NUM_CLASSES and
    class_weights, prints per-fold diagnostics and writes one
    ``<fold>_imp.csv`` feature-importance file per fold (0-based fold number).

    :param df_train: training frame holding ``use_features``, TARGET and ``sn``
    :param df_test: test frame holding ``use_features``
    :param use_features: names of the feature columns fed to the model
    :return: ``(y_pred, oof_pred)`` -- fold-averaged class probabilities for
        ``df_test`` and out-of-fold class probabilities for ``df_train``
    """
    labels = df_train[TARGET]
    train_features = df_train[use_features]
    test_features = df_test[use_features]

    oof_pred = np.zeros((len(df_train), NUM_CLASSES))
    y_pred = np.zeros((len(df_test), NUM_CLASSES))

    # Same hyper-parameters for every fold.
    clf_params = dict(
        objective='multiclass',
        n_estimators=10000,
        learning_rate=0.03,
        max_depth=7,
        random_state=42,
        subsample=0.8,
        subsample_freq=5,
        colsample_bytree=0.8,
        reg_alpha=5,
        reg_lambda=5,
        class_weight=class_weights,
    )

    folds = GroupKFold(n_splits=FOLDS)
    for fold, (tr_ind, val_ind) in enumerate(folds.split(df_train, labels, df_train['sn'])):
        print(f'Fold {fold + 1}')
        fit_x, fit_y = train_features.iloc[tr_ind], labels.iloc[tr_ind]
        hold_x, hold_y = train_features.iloc[val_ind], labels.iloc[val_ind]

        model = lgb.LGBMClassifier(**clf_params)
        model.fit(
            fit_x, fit_y,
            eval_set=[(fit_x, fit_y), (hold_x, hold_y)],
            callbacks=[lgb.early_stopping(50)],
        )

        # Out-of-fold probabilities for this split's validation rows, plus this
        # fold's share of the averaged test prediction.
        hold_proba = model.predict_proba(hold_x)
        oof_pred[val_ind] = hold_proba
        y_pred += model.predict_proba(test_features) / folds.n_splits

        # Per-fold diagnostic: sklearn's unweighted macro F1.
        score = f1_score(hold_y, hold_proba.argmax(axis=1), average='macro')
        print(f'F1 score: {score}')

        importance_df = pd.DataFrame({
            'feature': model.feature_name_,
            'importance': model.feature_importances_,
        }).sort_values('importance', ascending=False)
        print(importance_df)
        importance_df.to_csv('%d_imp.csv'%fold, index=False)

        # Drop the per-fold slices before the next iteration.
        del fit_x, hold_x, fit_y, hold_y
        gc.collect()

    return y_pred, oof_pred

In [11]:
# Run the cross-validated training: y_pred holds the fold-averaged test
# probabilities, oof_pred the out-of-fold probabilities for the training rows.
y_pred, oof_pred = run_ctb(df_train, df_test, use_features)

Fold 1
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[429]	training's multi_logloss: 0.186981	valid_1's multi_logloss: 0.383362
F1 score: 0.749172113380854
            feature  importance
0      server_model        2504
1       last_msg_id        2269
12   second_span_2h        1992
21     appearance_3        1320
19     appearance_2        1142
..              ...         ...
306             208          26
435             337          24
447             349          23
327             229          21
215             117          19

[484 rows x 2 columns]
Fold 2
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[500]	training's multi_logloss: 0.167287	valid_1's multi_logloss: 0.400786
F1 score: 0.7558661093138329
            feature  importance
0      server_model        2889
1       last_msg_id        2603
12   second_span_2h        2337
21     appearance_3        1534
19     appearance_2

In [12]:
# Weighted macro F1 of the out-of-fold predictions over the whole training set.
print(macro_f1(df_train[TARGET], np.argmax(oof_pred, axis=1)))

0.6648718360662993


In [13]:
# Submission template; its sn / fault_time columns provide the keys of the
# submission file.
submit_df = pd.read_csv('../data/preliminary_submit_dataset_a.csv')

In [14]:
# y_pred follows the row order of df_test, while the keys below come from
# submit_df. Labels are attached purely by position, so verify that both
# frames list the same `sn` values in the same order before combining them.
assert np.array_equal(submit_df['sn'].to_numpy(), df_test['sn'].to_numpy()), \
    'submit_df and df_test rows are not in the same order'

sub = submit_df[['sn', 'fault_time']].copy()
# The predicted class is the one with the highest averaged probability.
sub['label'] = y_pred.argmax(axis=1)
display(sub.head())
# Share of each predicted label in the submission.
sub['label'].value_counts() / sub.shape[0]

Unnamed: 0,sn,fault_time,label
0,000d33b21436,2020-09-02 16:42:54,3
1,005c5a9218ba,2020-06-28 19:05:16,2
2,0079283bde6e,2020-04-26 21:32:44,3
3,007bdf23b62f,2020-06-16 18:40:39,2
4,00a577a8e54f,2020-04-07 07:16:55,2


2    0.540684
3    0.183992
1    0.183328
0    0.091996
Name: label, dtype: float64

In [15]:
# Write the submission file without the index column.
sub.to_csv('baseline3_gkf_sn.csv', index=False)

In [16]:
label1 = pd.read_csv('../data/preliminary_train_label_dataset.csv')
label2 = pd.read_csv('../data/preliminary_train_label_dataset_s.csv')

# Stack both label files and keep a single copy of each fully duplicated row.
label_df = (
    pd.concat([label1, label2], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Share of each label among the de-duplicated rows.
label_df['label'].value_counts() / label_df.shape[0]

2    0.561311
1    0.203987
3    0.145808
0    0.088894
Name: label, dtype: float64