In [1]:
import gc
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier
import lightgbm as lgb
from tqdm import tqdm

import optuna
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Read the four train/test feature tables (same read order as before:
# train1, test1, train2, test2, ...).
(df_train1, df_test1,
 df_train2, df_test2,
 df_train3, df_test3,
 df_train4, df_test4) = (
    pd.read_csv(csv_path)
    for csv_path in ('train1.csv', 'test1.csv',
                     'train2.csv', 'test2.csv',
                     'train3.csv', 'test3.csv',
                     'train4.csv', 'test4.csv')
)

# A fifth feature set (train6.csv / test6.csv) used to be merged in the same
# way as the four below; it is currently disabled.

# Inner-join the train tables on the sample key plus label.
df_train = (
    df_train1
    .merge(df_train2, on=['sn', 'fault_time', 'label'])
    .merge(df_train3, on=['sn', 'fault_time', 'label'])
    .merge(df_train4, on=['sn', 'fault_time', 'label'])
)
# Test tables carry no label, so they are joined on the sample key only.
df_test = (
    df_test1
    .merge(df_test2, on=['sn', 'fault_time'])
    .merge(df_test3, on=['sn', 'fault_time'])
    .merge(df_test4, on=['sn', 'fault_time'])
)

In [3]:
# Remove the 'server_model' column from both frames; a string-typed version
# is merged back in from ../new_src/train1.csv and test1.csv afterwards.
df_train = df_train.drop(columns=['server_model'])
df_test = df_test.drop(columns=['server_model'])

In [4]:
# Re-read feature set 1 from ../new_src, forcing the three id-like columns to
# be parsed as strings. Note: this rebinds df_train1 / df_test1.
df_train1, df_test1 = (
    pd.read_csv(csv_path, dtype={'server_model': 'str',
                                 'last_msg_id': 'str',
                                 'last_template_id': 'str'})
    for csv_path in ('../new_src/train1.csv', '../new_src/test1.csv')
)

In [5]:
# Inner-join the freshly loaded set-1 features onto the running feature frames.
df_train = pd.merge(df_train, df_train1, on=['sn', 'fault_time', 'label'])
df_test = pd.merge(df_test, df_test1, on=['sn', 'fault_time'])

In [6]:
# Load feature set 4 from ../new_src (rebinding df_train1 / df_test1 again)
# and inner-join it onto the running feature frames.
df_train1, df_test1 = (
    pd.read_csv(csv_path)
    for csv_path in ('../new_src/train4.csv', '../new_src/test4.csv')
)

df_train = pd.merge(df_train, df_train1, on=['sn', 'fault_time', 'label'])
df_test = pd.merge(df_test, df_test1, on=['sn', 'fault_time'])

In [7]:
# Load the feature tables stored under ../bert for train and test.
bert_train, bert_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../bert/train.csv', '../bert/test.csv')
)

In [8]:
# The bert tables are attached column-wise with no join key, so rows are paired
# purely by index. pd.concat(axis=1) outer-joins on the index: a row-count
# mismatch would NOT raise but silently append NaN-padded rows, so fail loudly.
# NOTE(review): this only checks row counts; it is still assumed that the bert
# files are in the same row order as df_train / df_test — TODO confirm.
if len(df_train) != len(bert_train):
    raise ValueError(
        f'row count mismatch: df_train has {len(df_train)} rows, '
        f'bert_train has {len(bert_train)}'
    )
if len(df_test) != len(bert_test):
    raise ValueError(
        f'row count mismatch: df_test has {len(df_test)} rows, '
        f'bert_test has {len(bert_test)}'
    )

# Keep only the columns from position 9 onwards of each bert table.
df_train = pd.concat([df_train, bert_train.iloc[:, 9:]], axis=1)
df_test = pd.concat([df_test, bert_test.iloc[:, 9:]], axis=1)

In [9]:
# tr_proba_df = pd.read_csv('./tr_proba_df.csv')
# te_proba_df = pd.read_csv('./te_proba_df.csv')

In [10]:
# df_train = pd.merge(df_train, tr_proba_df, on=['sn', 'fault_time', 'label'])
# df_test = pd.merge(df_test, te_proba_df, on=['sn', 'fault_time'])

In [11]:
# df_train = pd.read_csv('../train.csv')
# df_test = pd.read_csv('../test.csv')

In [12]:
# tr_proba_df = pd.read_csv('../tr_proba_df.csv')
# te_proba_df = pd.read_csv('../te_proba_df.csv')

In [13]:
# df_train = pd.concat([df_train, tr_proba_df], axis=1)
# df_test = pd.concat([df_test, te_proba_df], axis=1)

In [14]:
# Sanity check: (rows, columns) of the assembled train and test frames.
(df_train.shape, df_test.shape)

((16604, 1685), (3011, 1684))

In [15]:
# 'balanced' per-class weights computed over the full training labels,
# stored as a {class_label: weight} mapping.
classes = np.unique(df_train['label'])
weights = compute_class_weight('balanced', classes=classes, y=df_train['label'])
class_weights = {label: weight for label, weight in zip(classes, weights)}

In [16]:
def macro_f1(y_true, y_pred) -> float:
    """
    Compute a class-weighted F1 score over the labels 0..3.

    A per-class F1 is computed for each of the four classes and the results
    are combined with the fixed weights 3/7, 2/7, 1/7, 1/7.

    :param y_true: ground-truth class labels (array-like)
    :param y_pred: predicted class labels (array-like), same length as
                   ``y_true``; the two are compared element by element
                   by position
    :return: weighted sum of the per-class F1 scores
    :raises ValueError: if ``y_true`` and ``y_pred`` differ in length
    """
    weights = [3 / 7, 2 / 7, 1 / 7, 1 / 7]

    # Work on plain arrays so the comparison is positional and independent of
    # any pandas index the inputs may carry.
    label_gt = np.asarray(y_true).ravel()
    label_pr = np.asarray(y_pred).ravel()
    if label_gt.shape != label_pr.shape:
        raise ValueError(
            f'y_true and y_pred must have the same length, '
            f'got {label_gt.shape[0]} and {label_pr.shape[0]}'
        )

    macro_F1 = 0.
    for i, weight in enumerate(weights):
        TP = int(np.sum((label_gt == i) & (label_pr == i)))
        FP = int(np.sum((label_gt != i) & (label_pr == i)))
        FN = int(np.sum((label_gt == i) & (label_pr != i)))
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        # Guard on the recall denominator (TP + FN). Guarding on (TP + FP)
        # divides by zero when class i is predicted but never occurs in y_true.
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        F1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        macro_F1 += weight * F1
    return macro_F1

In [17]:
# Replace missing values in the two "last ..." id columns with the literal
# string 'NULL' (both columns are later passed to CatBoost as categorical).
df_train[['last_msg_id', 'last_template_id']] = (
    df_train[['last_msg_id', 'last_template_id']].fillna('NULL')
)
df_test[['last_msg_id', 'last_template_id']] = (
    df_test[['last_msg_id', 'last_template_id']].fillna('NULL')
)

In [18]:
# Training configuration shared by run_ctb.
TARGET = 'label'                           # name of the label column
FOLDS = 10                                 # number of GroupKFold splits
NUM_CLASSES = df_train[TARGET].nunique()   # distinct labels in the training set
# Every column except the sample key and the label is a model feature.
use_features = [name for name in df_train.columns
                if name not in ('sn', 'fault_time', TARGET)]

def run_ctb(df_train, df_test, use_features):
    """
    Train one CatBoost multiclass model per GroupKFold split, grouped by 'sn'.

    Relies on the module-level TARGET, NUM_CLASSES, FOLDS and class_weights.
    Side effects: prints per-fold progress, the fold's macro F1 and its
    feature-importance table, and writes '<fold>_imp.csv' (0-based fold index)
    into the working directory.

    :param df_train: training frame containing use_features, TARGET and 'sn'
    :param df_test: test frame containing use_features
    :param use_features: list of feature column names fed to the model
    :return: (y_pred, oof_pred) - test-set class probabilities averaged over
             the folds, and out-of-fold class probabilities for the training
             rows; both have NUM_CLASSES columns
    """
    oof_pred = np.zeros((len(df_train), NUM_CLASSES))
    y_pred = np.zeros((len(df_test), NUM_CLASSES))

    labels = df_train[TARGET]
    splitter = GroupKFold(n_splits=FOLDS)
    # Grouping by 'sn' keeps all rows with the same 'sn' in the same fold.
    for fold, (train_idx, valid_idx) in enumerate(
            splitter.split(df_train, labels, df_train['sn'])):
        print(f'Fold {fold + 1}')
        features = df_train[use_features]
        x_train, y_train = features.iloc[train_idx], labels.iloc[train_idx]
        x_val, y_val = features.iloc[valid_idx], labels.iloc[valid_idx]

        clf = CatBoostClassifier(
            task_type='GPU',
            bootstrap_type='Bayesian',
            learning_rate=0.03,
            eval_metric='MultiClass',
            loss_function='MultiClass',
            classes_count=NUM_CLASSES,
            iterations=10000,
            random_state=42,
            depth=8,
            leaf_estimation_iterations=16,
            reg_lambda=5,
            class_weights=class_weights,
            early_stopping_rounds=100,
            cat_features=['server_model', 'last_msg_id', 'last_template_id'],
        )
        # The validation fold doubles as the early-stopping evaluation set.
        clf.fit(x_train, y_train, eval_set=(x_val, y_val), verbose=100)

        val_proba = clf.predict_proba(x_val)
        oof_pred[valid_idx] = val_proba
        # Each fold contributes an equal share to the averaged test prediction.
        y_pred += clf.predict_proba(df_test[use_features]) / splitter.n_splits

        score = f1_score(y_val, val_proba.argmax(axis=1), average='macro')
        print(f'F1 score: {score}')

        print("Features importance...")
        feat_imp = pd.DataFrame({'imp': clf.feature_importances_, 'feature': use_features})
        ranked_imp = feat_imp.sort_values(by='imp')
        ranked_imp.to_csv('%d_imp.csv' % fold, index=False)
        print(ranked_imp.reset_index(drop=True))

        # Release the per-fold slices before the next fold is materialised.
        del features, x_train, x_val, y_train, y_val
        gc.collect()

    return y_pred, oof_pred

In [19]:
# Run the cross-validated CatBoost training; returns averaged test
# probabilities and out-of-fold train probabilities.
y_pred, oof_pred = run_ctb(df_train=df_train, df_test=df_test, use_features=use_features)

Fold 1
0:	learn: 1.3348487	test: 1.3358912	best: 1.3358912 (0)	total: 275ms	remaining: 45m 49s
100:	learn: 0.4885029	test: 0.5965707	best: 0.5965707 (100)	total: 24.7s	remaining: 40m 20s
200:	learn: 0.3972062	test: 0.5677962	best: 0.5677962 (200)	total: 47.5s	remaining: 38m 34s
300:	learn: 0.3408203	test: 0.5582896	best: 0.5581989 (297)	total: 1m 9s	remaining: 37m 29s
400:	learn: 0.3031201	test: 0.5550179	best: 0.5550179 (400)	total: 1m 31s	remaining: 36m 30s
500:	learn: 0.2710425	test: 0.5541181	best: 0.5540655 (498)	total: 1m 53s	remaining: 35m 53s
bestTest = 0.5540655327
bestIteration = 498
Shrink model to first 499 iterations.
F1 score: 0.747944077012585
Features importance...
           imp         feature
0     0.000000      msg_id_507
1     0.000000      msg_id_681
2     0.000000      msg_id_682
3     0.000000      msg_id_683
4     0.000000      msg_id_684
...        ...             ...
1677  2.220330  msg_nunique_2h
1678  2.300893      msg_w2v_27
1679  3.178258              p1


0:	learn: 1.3329752	test: 1.3359733	best: 1.3359733 (0)	total: 274ms	remaining: 45m 39s
100:	learn: 0.4854706	test: 0.6395293	best: 0.6395293 (100)	total: 25s	remaining: 40m 45s
200:	learn: 0.3990136	test: 0.6107168	best: 0.6107168 (200)	total: 47.7s	remaining: 38m 47s
300:	learn: 0.3447900	test: 0.6020791	best: 0.6020791 (300)	total: 1m 10s	remaining: 37m 42s
400:	learn: 0.3045876	test: 0.6000405	best: 0.5992145 (391)	total: 1m 32s	remaining: 36m 53s
500:	learn: 0.2707626	test: 0.6019944	best: 0.5991719 (440)	total: 1m 54s	remaining: 36m 17s
bestTest = 0.5991718714
bestIteration = 440
Shrink model to first 441 iterations.
F1 score: 0.7535258854424287
Features importance...
           imp     feature
0     0.000000  msg_id_593
1     0.000000  msg_id_560
2     0.000000  msg_id_561
3     0.000000  msg_id_562
4     0.000000  msg_id_563
...        ...         ...
1677  2.326708  msg_w2v_44
1678  2.513897   msg_tfv_4
1679  3.107176          p1
1680  4.543391          20
1681  5.478466      

In [20]:
# Weighted F1 of the out-of-fold predictions over the whole training set.
print(macro_f1(df_train[TARGET], oof_pred.argmax(axis=1)))

0.6576120056779414


In [21]:
# Load the submission template; its 'sn' / 'fault_time' columns are used to
# build the submission file.
submit_df = pd.read_csv(filepath_or_buffer='../data/preliminary_submit_dataset_a.csv')

In [22]:
# Attach the most probable class per test row to the submission keys.
# NOTE(review): predictions are assigned by position, so this assumes df_test
# rows are in the same order as submit_df — TODO confirm.
sub = submit_df[['sn', 'fault_time']].assign(label=np.argmax(y_pred, axis=1))
display(sub.head())
# Share of each predicted class in the submission.
sub['label'].value_counts() / len(sub)

Unnamed: 0,sn,fault_time,label
0,000d33b21436,2020-09-02 16:42:54,3
1,005c5a9218ba,2020-06-28 19:05:16,2
2,0079283bde6e,2020-04-26 21:32:44,3
3,007bdf23b62f,2020-06-16 18:40:39,2
4,00a577a8e54f,2020-04-07 07:16:55,2


2    0.544337
3    0.186981
1    0.179010
0    0.089671
Name: label, dtype: float64

In [23]:
# Write the submission (without the index column) to the working directory.
sub.to_csv(path_or_buf='baseline3_gkf_sn.csv', index=False)

In [24]:
# Combine both training-label files, drop exact duplicate rows, and show the
# share of each class for comparison with the predicted distribution.
label1, label2 = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/preliminary_train_label_dataset.csv',
                     '../data/preliminary_train_label_dataset_s.csv')
)
label_df = (
    pd.concat([label1, label2], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

label_df['label'].value_counts() / len(label_df)

2    0.561311
1    0.203987
3    0.145808
0    0.088894
Name: label, dtype: float64

In [25]:
# NOTE(review): bare tuple kept as a scratch record of scores; presumably
# (offline CV score, leaderboard score) — TODO confirm and move to markdown.
0.6425479819983193, 0.7030

(0.6425479819983193, 0.703)

In [26]:
# NOTE(review): bare tuple kept as a scratch record; looks like two offline
# scores from different runs — TODO confirm what each value refers to.
0.6433593099900003, 0.6425804458852135

(0.6433593099900003, 0.6425804458852135)